From 07ac2da6153df150eb2bd15f814118793582943e Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 11 Sep 2023 13:49:40 -0700 Subject: [PATCH 01/26] Implements necessary sycl utilities for custom reductions --- .../libtensor/include/utils/sycl_utils.hpp | 265 ++++++++++++++++++ 1 file changed, 265 insertions(+) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 2fc7b02efa..b490c8ed14 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -28,12 +28,111 @@ #include #include +#include "math_utils.hpp" + namespace dpctl { namespace tensor { namespace sycl_utils { +namespace detail +{ + +template struct TypeList; + +template struct TypeList +{ + using head = Head; + using tail = TypeList; +}; + +using NullTypeList = TypeList<>; +template +struct IsNullTypeList : std::conditional_t, + std::true_type, + std::false_type> +{ +}; + +// recursively check if type is contained in given TypeList +template +struct IsContained + : std::conditional_t< + std::is_same_v>, + std::true_type, + IsContained> +{ +}; + +template <> struct TypeList<> +{ +}; + +// std::false_type when last case has been checked for membership +template struct IsContained : std::false_type +{ +}; + +template struct IsComplex : std::false_type +{ +}; +template struct IsComplex> : std::true_type +{ +}; + +} // namespace detail + +template +using sycl_ops = detail::TypeList, + sycl::bit_or, + sycl::bit_xor, + sycl::bit_and, + sycl::maximum, + sycl::minimum, + sycl::multiplies>; + +template struct IsSyclOp +{ + static constexpr bool value = + detail::IsContained>>::value || + detail::IsContained>>::value || + detail::IsContained>::value; +}; + +struct AtomicSupport +{ + bool operator()(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type, + bool require_atomic64 = false) const + { + bool supports_atomics = false; + + const sycl::device &dev = exec_q.get_device(); + if (require_atomic64) { + if (!dev.has(sycl::aspect::atomic64)) + return false; + } + + switch (usm_alloc_type) { + case sycl::usm::alloc::shared: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_shared_allocations); + break; + case sycl::usm::alloc::host: + supports_atomics = + dev.has(sycl::aspect::usm_atomic_host_allocations); + break; + case sycl::usm::alloc::device: + supports_atomics = true; + break; + default: + supports_atomics = false; + } + + return supports_atomics; + } +}; /*! 
@brief Find the smallest multiple of supported sub-group size larger than * nelems */ @@ -66,6 +165,172 @@ size_t choose_workgroup_size(const size_t nelems, return wg; } +template +T custom_reduce_over_group(GroupT wg, + LocAccT local_mem_acc, + T local_val, + OpT op) +{ + size_t wgs = wg.get_local_linear_range(); + local_mem_acc[wg.get_local_linear_id()] = local_val; + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + T red_val_over_wg = local_mem_acc[0]; + if (wg.leader()) { + for (size_t i = 1; i < wgs; ++i) { + red_val_over_wg = op(red_val_over_wg, local_mem_acc[i]); + } + } + + sycl::group_barrier(wg, sycl::memory_scope::work_group); + + return sycl::group_broadcast(wg, red_val_over_wg); +} + +// Reduction functors + +// Maximum + +template struct Maximum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::max_complex; + return max_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x > y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x || y; + } + else { + return (x > y) ? x : y; + } + } +}; + +// Minimum + +template struct Minimum +{ + T operator()(const T &x, const T &y) const + { + if constexpr (detail::IsComplex::value) { + using dpctl::tensor::math_utils::min_complex; + return min_complex(x, y); + } + else if constexpr (std::is_floating_point_v || + std::is_same_v) { + return (std::isnan(x) || x < y) ? x : y; + } + else if constexpr (std::is_same_v) { + return x && y; + } + else { + return (x < y) ? x : y; + } + } +}; + +// Define identities and operator checking structs + +template struct GetIdentity +{ +}; + +// Maximum + +template +using IsMaximum = std::bool_constant> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? static_cast(-std::numeric_limits::infinity()) + : std::numeric_limits::lowest()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = false; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{-std::numeric_limits::infinity(), + -std::numeric_limits::infinity()}; +}; + +// Minimum + +template +using IsMinimum = std::bool_constant> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = + static_cast(std::numeric_limits::has_infinity + ? 
static_cast(std::numeric_limits::infinity()) + : std::numeric_limits::max()); +}; + +template +struct GetIdentity::value>> +{ + static constexpr bool value = true; +}; + +template +struct GetIdentity, + std::enable_if_t, Op>::value>> +{ + static constexpr std::complex value{std::numeric_limits::infinity(), + std::numeric_limits::infinity()}; +}; + +// Plus + +template +using IsPlus = std::bool_constant< + std::is_same_v> || std::is_same_v> || + std::is_same_v> || std::is_same_v>>; + +// Identity + +template struct Identity +{ +}; + +template +struct Identity::value>> +{ + static constexpr T value = GetIdentity::value; +}; + +template +struct Identity::value>> +{ + static constexpr T value = sycl::known_identity::value; +}; + } // namespace sycl_utils } // namespace tensor } // namespace dpctl From 78f7aba6eaaa9484a29f8a8a5d2ba1a881b3251c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 11 Sep 2023 14:07:45 -0700 Subject: [PATCH 02/26] Implements dpctl.tensor.max and dpctl.tensor.min --- dpctl/tensor/CMakeLists.txt | 2 + dpctl/tensor/__init__.py | 4 +- dpctl/tensor/_reduction.py | 59 + .../libtensor/include/kernels/reductions.hpp | 921 +++++++++---- .../include/kernels/sum_reductions.hpp | 1172 +++++++++++++++++ .../libtensor/source/reduction_over_axis.cpp | 218 +++ .../libtensor/source/reduction_over_axis.hpp | 394 ++++++ .../libtensor/source/sum_reductions.cpp | 6 +- .../libtensor/source/sum_reductions.hpp | 4 +- dpctl/tensor/libtensor/source/tensor_py.cpp | 2 + 10 files changed, 2527 insertions(+), 255 deletions(-) create mode 100644 dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp create mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.cpp create mode 100644 dpctl/tensor/libtensor/source/reduction_over_axis.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 456eebdbaa..234626abd5 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -51,6 +51,7 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) set(_clang_prefix "") if (WIN32) @@ -60,6 +61,7 @@ set_source_files_properties( ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/full_ctor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/linear_sequences.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp PROPERTIES COMPILE_OPTIONS "${_clang_prefix}-fno-fast-math") if (UNIX) set_source_files_properties( diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index f0930004ec..d5c2672d46 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -160,7 +160,7 @@ tanh, trunc, ) -from ._reduction import sum +from ._reduction import max, min, sum from ._testing import allclose __all__ = [ @@ -309,4 +309,6 @@ "allclose", "repeat", "tile", + "max", + "min", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index d9bd6b5b2b..dc4cf64dcc 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -171,3 +171,62 @@ def sum(arr, axis=None, dtype=None, keepdims=False): dpctl.SyclEvent.wait_for(host_tasks_list) return res + + +def _same_dtype_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected 
dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt.copy(x) + x_tmp = x + res_shape = tuple() + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt.copy(x) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + res_dtype = x.dtype + + res = dpt.empty( + res_shape, + dtype=res_dtype, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + hev.wait() + return res + + +def max(x, axis=None, keepdims=False): + return _same_dtype_reduction(x, axis, keepdims, ti._max_over_axis) + + +def min(x, axis=None, keepdims=False): + return _same_dtype_reduction(x, axis, keepdims, ti._min_over_axis) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 7dfc956492..c33f1fab24 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -32,6 +32,7 @@ #include #include "pybind11/pybind11.h" +#include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" @@ -39,6 +40,7 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; +namespace su_ns = dpctl::tensor::sycl_utils; namespace dpctl { @@ -153,7 +155,7 @@ struct ReductionOverGroupWithAtomicFunctor const size_t reduction_lid = it.get_local_id(0); const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - // work-items sums over input with indices + // work-items operate over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -191,11 +193,15 @@ struct ReductionOverGroupWithAtomicFunctor sycl::memory_scope::device, sycl::access::address_space::global_space> res_ref(out_[out_iter_offset]); - if constexpr (std::is_same_v> || - std::is_same_v>) - { + if constexpr (su_ns::IsPlus::value) { res_ref += red_val_over_wg; } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } else { outT read_val = res_ref.load(); outT new_val{}; @@ -207,7 +213,114 @@ struct ReductionOverGroupWithAtomicFunctor } }; -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( +/* === Reduction, using custom_reduce_over_group, and sycl::atomic_ref === */ + +template +struct CustomReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + 
InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items operate over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (su_ns::IsPlus::value) { + res_ref += red_val_over_wg; + } + else if constexpr (su_ns::IsMaximum::value) { + res_ref.fetch_max(red_val_over_wg); + } + else if constexpr (su_ns::IsMinimum::value) { + res_ref.fetch_min(red_val_over_wg); + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +typedef sycl::event (*reduction_strided_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -223,27 +336,51 @@ typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( const std::vector &); template -class sum_reduction_over_group_with_atomics_krn; +class reduction_over_group_with_atomics_krn; + +template +class custom_reduction_over_group_with_atomics_krn; -template -class sum_reduction_over_group_with_atomics_init_krn; +template +class reduction_over_group_with_atomics_init_krn; template -class sum_reduction_seq_strided_krn; +class reduction_seq_strided_krn; template -class sum_reduction_seq_contig_krn; +class reduction_seq_contig_krn; template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; +class reduction_axis0_over_group_with_atomics_contig_krn; + +template +class custom_reduction_axis0_over_group_with_atomics_contig_krn; template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; +class reduction_axis1_over_group_with_atomics_contig_krn; + +template +class 
custom_reduction_axis1_over_group_with_atomics_contig_krn; using dpctl::tensor::sycl_utils::choose_workgroup_size; -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( +template +sycl::event reduction_over_group_with_atomics_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -263,8 +400,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -285,7 +421,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, reduction_shape_stride}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -308,8 +444,8 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; + class reduction_over_group_with_atomics_init_krn; cgh.depends_on(depends); cgh.parallel_for( @@ -347,18 +483,37 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); return comp_ev; @@ -367,7 +522,7 @@ sycl::event sum_reduction_over_group_with_atomics_strided_impl( // Contig -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( +typedef sycl::event (*reduction_contig_impl_fn_ptr)( sycl::queue &, size_t, size_t, @@ -379,8 +534,8 @@ typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( const std::vector &); /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( +template +sycl::event 
reduction_axis1_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. of rows in a matrix // when reducing over rows) @@ -397,8 +552,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -422,7 +576,7 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; - cgh.parallel_for>( sycl::range<1>(iter_nelems), @@ -470,28 +624,46 @@ sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = + class reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } /* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( +template +sycl::event reduction_axis0_over_group_with_atomics_contig_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
of cols in a matrix // when reducing over cols) @@ -508,8 +680,8 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( iter_arg_offset + reduction_arg_offset; resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; + ; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -551,21 +723,39 @@ sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = + class reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class + custom_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupWithAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } } @@ -618,7 +808,7 @@ struct ReductionOverGroupNoAtomicFunctor const size_t reduction_batch_id = it.get_group(0) / iter_gws_; const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - // work-items sums over input with indices + // work-items operates over input with indices // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg // + reduction_lid // for 0 <= m < reductions_per_wi @@ -658,11 +848,110 @@ struct ReductionOverGroupNoAtomicFunctor } }; -template -class sum_reduction_over_group_temps_krn; +/* = Reduction, using custom_reduce_over_group and not using atomic_ref*/ + +template +struct CustomReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : 
inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } -template -sycl::event sum_reduction_over_group_temps_strided_impl( + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template +class reduction_over_group_temps_krn; + +template +class custom_reduction_over_group_temps_krn; + +template +sycl::event reduction_over_group_temps_strided_impl( sycl::queue &exec_q, size_t iter_nelems, // number of reductions (num. 
of rows in a matrix // when reducing over rows) @@ -682,8 +971,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( const argTy *arg_tp = reinterpret_cast(arg_cp); resTy *res_tp = reinterpret_cast(res_cp); - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; + constexpr resTy identity_val = su_ns::Identity::value; const sycl::device &d = exec_q.get_device(); const auto &sg_sizes = d.get_info(); @@ -694,7 +982,7 @@ sycl::event sum_reduction_over_group_temps_strided_impl( size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res + // reduction only requries 1 work-group, can output directly to res sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(depends); @@ -722,19 +1010,35 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(arg_tp, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } }); - return comp_ev; } else { @@ -789,17 +1093,36 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, 
in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } }); size_t remaining_reduction_nelems = reduction_groups; @@ -817,34 +1140,33 @@ sycl::event sum_reduction_over_group_temps_strided_impl( assert(reduction_groups_ > 1); // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; cgh.parallel_for( @@ -856,7 +1178,25 @@ sycl::event sum_reduction_over_group_temps_strided_impl( in_out_iter_indexer, reduction_indexer, remaining_reduction_nelems, iter_nelems, preferrered_reductions_per_wi)); - }); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + 
local_memory, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); remaining_reduction_nelems = reduction_groups_; std::swap(temp_arg, temp2_arg); @@ -900,18 +1240,36 @@ sycl::event sum_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>(temp_arg, res_tp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, + remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } }); sycl::event cleanup_host_task_event = @@ -931,69 +1289,26 @@ sycl::event sum_reduction_over_group_temps_strided_impl( } } -/* @brief Types supported by plus-reduction code based on atomic_ref */ +/* @brief Types supported by comparison-reduction code based on atomic_ref */ template -struct TypePairSupportDataForSumReductionAtomic +struct TypePairSupportDataForCompReductionAtomic { /* value if true a kernel for must be instantiated, false * otherwise */ static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, + // by DPC++ // input int32 td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint32 td_ns::TypePairDefinedEntry, - 
td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int64 td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint64 td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input float td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input double td_ns::TypePairDefinedEntry, // fall-through @@ -1001,55 +1316,29 @@ struct TypePairSupportDataForSumReductionAtomic }; template -struct TypePairSupportDataForSumReductionTemps +struct TypePairSupportDataForCompReductionTemps { static constexpr bool is_defined = std::disjunction< // disjunction is C++17 // feature, supported // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - + td_ns::TypePairDefinedEntry, // input int8_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint8_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int16_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input uint16_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int32_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32_t td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, // input int64_t td_ns::TypePairDefinedEntry, @@ -1059,55 +1348,158 @@ struct TypePairSupportDataForSumReductionTemps // input half td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, // input float td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, // input double td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, // input std::complex td_ns::TypePairDefinedEntry, outTy, std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, td_ns::TypePairDefinedEntry, outTy, std::complex>, - // fall-throug + // fall-through td_ns::NotDefinedEntry>::is_defined; }; template -struct SumOverAxisAtomicStridedFactory +struct MaxOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_integral_v && + !std::is_same_v) { + using 
ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MaxOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForCompReductionAtomic< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::maximum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct MinOverAxisAtomicStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForCompReductionAtomic< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl< + srcTy, dstTy, ReductionOpT>; + } } else { return nullptr; @@ -1116,14 +1508,27 @@ struct SumOverAxisAtomicStridedFactory }; template -struct SumOverAxisTempsStridedFactory +struct MinOverAxisTempsStridedFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; + if constexpr (TypePairSupportDataForCompReductionTemps< + srcTy, dstTy>::is_defined) + { + using dpctl::tensor::type_utils::is_complex; + if constexpr (std::is_integral_v && + !std::is_same_v) { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } } else { return nullptr; @@ -1132,16 +1537,25 @@ struct SumOverAxisTempsStridedFactory }; template -struct SumOverAxis1AtomicContigFactory +struct MinOverAxis1AtomicContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForCompReductionAtomic< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + 
reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } } else { return nullptr; @@ -1150,16 +1564,25 @@ struct SumOverAxis1AtomicContigFactory }; template -struct SumOverAxis0AtomicContigFactory +struct MinOverAxis0AtomicContigFactory { fnT get() const { - if constexpr (TypePairSupportDataForSumReductionAtomic< + if constexpr (TypePairSupportDataForCompReductionAtomic< srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; + if constexpr (std::is_floating_point::value) { + using ReductionOpT = su_ns::Minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + using ReductionOpT = sycl::minimum; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } } else { return nullptr; diff --git a/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp new file mode 100644 index 0000000000..0ebbd8b308 --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp @@ -0,0 +1,1172 @@ +//=== sum_reductions.hpp - Implementation of sum kernels ------- *-C++-*/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines kernels for summing tensors along axis. 
+//===----------------------------------------------------------------------===// + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "pybind11/pybind11.h" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch.hpp" +#include "utils/type_utils.hpp" + +namespace py = pybind11; +namespace td_ns = dpctl::tensor::type_dispatch; + +namespace dpctl +{ +namespace tensor +{ +namespace kernels +{ + +template +struct SequentialReduction +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + +public: + SequentialReduction(const argT *inp, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size) + : inp_(inp), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size) + { + } + + void operator()(sycl::id<1> id) const + { + + auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); + const py::ssize_t &inp_iter_offset = + inp_out_iter_offsets_.get_first_offset(); + const py::ssize_t &out_iter_offset = + inp_out_iter_offsets_.get_second_offset(); + + outT red_val(identity_); + for (size_t m = 0; m < reduction_max_gid_; ++m) { + const py::ssize_t inp_reduction_offset = + inp_reduced_dims_indexer_(m); + const py::ssize_t inp_offset = + inp_iter_offset + inp_reduction_offset; + + red_val = reduction_op_(red_val, inp_[inp_offset]); + } + + out_[out_iter_offset] = red_val; + } +}; + +/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ + +/* + This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 + if the device has aspect atomic64 and only with those supported by + sycl::atomic_ref +*/ +template +struct ReductionOverGroupWithAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + ReductionOverGroupWithAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + // work-items sums over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto 
inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + size_t arg_reduce_gid_max = std::min( + reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); + + for (size_t arg_reduce_gid = arg_reduce_gid0; + arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) + { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if (work_group.leader()) { + sycl::atomic_ref + res_ref(out_[out_iter_offset]); + if constexpr (std::is_same_v> || + std::is_same_v>) + { + res_ref += red_val_over_wg; + } + else { + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); + } + } + } +}; + +typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class sum_reduction_over_group_with_atomics_krn; + +template +class sum_reduction_over_group_with_atomics_init_krn; + +template +class sum_reduction_seq_strided_krn; + +template +class sum_reduction_seq_contig_krn; + +template +class sum_reduction_axis0_over_group_with_atomics_contig_krn; + +template +class sum_reduction_axis1_over_group_with_atomics_contig_krn; + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event sum_reduction_over_group_with_atomics_strided_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { + using IndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + + const py::ssize_t *const &res_shape = iter_shape_and_strides; + const py::ssize_t *const &res_strides = + iter_shape_and_strides + 2 * iter_nd; + IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, + res_strides); + using InitKernelName = + class sum_reduction_over_group_with_atomics_init_krn; + cgh.depends_on(depends); + + cgh.parallel_for( + sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { + auto res_offset = res_indexer(id[0]); + res_tp[res_offset] = identity_val; + }); + }); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + constexpr size_t preferrered_reductions_per_wi = 4; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? 
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_with_atomics_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +// Contig + +typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( + sycl::queue &, + size_t, + size_t, + const char *, + char *, + py::ssize_t, + py::ssize_t, + py::ssize_t, + const std::vector &); + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + if (reduction_nelems < wg) { + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIterIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIterIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + InputOutputIterIndexerT in_out_iter_indexer{ + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, + NoOpIndexerT{}}; + ReductionIndexerT reduction_indexer{}; + + cgh.parallel_for>( + sycl::range<1>(iter_nelems), + SequentialReduction( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems)); + }); + + return comp_ev; + } + else { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + RowsIndexerT, NoOpIndexerT>; + using ReductionIndexerT = NoOpIndexerT; + + RowsIndexerT rows_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_nelems)}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + 
(reduction_nelems < preferrered_reductions_per_wi * wg) + ? std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis1_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* @brief Reduce rows in a matrix */ +template +sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of cols in a matrix + // when reducing over cols) + size_t reduction_nelems, // size of each reduction (length of cols, i.e. + // number of rows) + const char *arg_cp, + char *res_cp, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp) + + iter_arg_offset + reduction_arg_offset; + resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + { + sycl::event res_init_ev = exec_q.fill( + res_tp, resTy(identity_val), iter_nelems, depends); + + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(res_init_ev); + + using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + NoOpIndexerT, NoOpIndexerT>; + using ReductionIndexerT = ColsIndexerT; + + NoOpIndexerT columns_indexer{}; + NoOpIndexerT result_indexer{}; + InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, + result_indexer}; + ReductionIndexerT reduction_indexer{ + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; + + constexpr size_t preferrered_reductions_per_wi = 8; + size_t reductions_per_wi = + (reduction_nelems < preferrered_reductions_per_wi * wg) + ? 
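// Layout note for this column-wise (axis-0) contiguous path: in a row-major
// matrix with iter_nelems columns, column j is the strided sequence
// j, j + iter_nelems, j + 2 * iter_nelems, ..., hence the reduction uses a
// Strided1DIndexer with step iter_nelems and the per-column output needs no
// extra indexing. For example, with iter_nelems = 5 columns and
// reduction_nelems = 4 rows, column 2 reads offsets 2, 7, 12 and 17.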
std::max(1, (reduction_nelems + wg - 1) / wg) + : preferrered_reductions_per_wi; + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = + class sum_reduction_axis0_over_group_with_atomics_contig_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupWithAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } +} + +/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ + +template +struct ReductionOverGroupNoAtomicFunctor +{ +private: + const argT *inp_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + outT identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + ReductionOverGroupNoAtomicFunctor( + const argT *data, + outT *res, + ReductionOp reduction_op, + const outT &identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), out_(res), reduction_op_(reduction_op), + identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items sums over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + outT local_red_val(identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + + local_red_val = reduction_op_(local_red_val, val); + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + outT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + red_val_over_wg; + } + } +}; + +template +class 
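// Indexing in ReductionOverGroupNoAtomicFunctor above, worked through: the
// work-item with local id reduction_lid in group reduction_batch_id reads
// the reduction indices
//   reduction_batch_id * wg * reductions_per_wi + m * wg + reduction_lid
// for m = 0, ..., reductions_per_wi - 1. Assuming wg = 4 and
// reductions_per_wi = 2, the work-item with reduction_lid = 1 in batch 3
// reads indices 25 and 29. Each group stores its partial result at
// out_[out_iter_offset * n_reduction_groups + reduction_batch_id], so the
// partials belonging to one output element end up contiguous, ready to be
// consumed by the next reduction pass.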
sum_reduction_over_group_temps_krn; + +template +sycl::event sum_reduction_over_group_temps_strided_impl( + sycl::queue &exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. + // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + using ReductionOpT = sycl::plus; + constexpr resTy identity_val = resTy{0}; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + constexpr size_t preferrered_reductions_per_wi = 4; + size_t max_wg = d.get_info(); + + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requires 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor( + arg_tp, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, reductions_per_wi)); + }); + + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if (partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unabled to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + 
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor( + arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{ + inp_indexer, res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + resTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + temp_arg, temp2_arg, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + dependent_ev = std::move(partial_reduction_ev); + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = 
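// Tree-reduction sketch: the first pass writes reduction_groups partial
// values per output into partially_reduced_tmp; the loop above then keeps
// folding those partials by a factor of roughly
// preferrered_reductions_per_wi * wg per pass, ping-ponging between the two
// halves of the temporary allocation, until no more than
// preferrered_reductions_per_wi * max_wg values remain. This final
// submission reduces the survivors with a single work-group per output and
// writes through the strided result indexer directly into res_tp.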
dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /*s trides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + using KernelName = class sum_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor( + temp_arg, res_tp, ReductionOpT(), identity_val, + in_out_iter_indexer, reduction_indexer, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + const sycl::context &ctx = exec_q.get_context(); + + cgh.host_task([ctx, partially_reduced_tmp] { + sycl::free(partially_reduced_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + 
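// How this table reads: is_defined evaluates to true exactly when the
// (input, output) pair matches one of the TypePairDefinedEntry entries
// above; the trailing NotDefinedEntry is the fall-through that makes the
// disjunction false for every other combination. Note that this atomic
// variant lists only real-valued outputs; complex accumulators are handled
// by the temps-based support table that follows.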
td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + return dpctl::tensor::kernels:: + sum_reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis1_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + return dpctl::tensor::kernels:: + sum_reduction_axis0_over_group_with_atomics_contig_impl; + } + else { + return nullptr; + } + } +}; + +} // namespace kernels +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp new file mode 100644 index 0000000000..4072d266d3 --- /dev/null +++ 
b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -0,0 +1,218 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include "kernels/reductions.hpp" +#include "reduction_over_axis.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +namespace td_ns = dpctl::tensor::type_dispatch; +// Max +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + max_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + max_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + max_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_max_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MaxOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(max_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(max_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(max_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MaxOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(max_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +// Min +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + min_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + min_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static 
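// Dispatch-table pattern: each table is a num_types x num_types array of
// function pointers indexed by (source type id, destination type id).
// DispatchTableBuilder instantiates the factory for every pair, and the
// factories return nullptr for unsupported pairs, so a lookup such as
//   auto fn = max_over_axis_strided_atomic_dispatch_table[src_id][dst_id];
// yields either a ready-to-call kernel launcher or nullptr, letting the
// caller fall back to the temps-based table or report an error.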
reduction_contig_impl_fn_ptr + min_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_min_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::MinOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(min_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(min_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(min_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::MinOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(min_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + +namespace py = pybind11; + +void init_reduction_functions(py::module_ m) +{ + using arrayT = dpctl::tensor::usm_ndarray; + using event_vecT = std::vector; + + namespace impl = dpctl::tensor::py_internal::impl; + + // MAX + { + using dpctl::tensor::py_internal::impl:: + populate_max_over_axis_dispatch_tables; + populate_max_over_axis_dispatch_tables(); + using impl::max_over_axis0_contig_atomic_dispatch_table; + using impl::max_over_axis1_contig_atomic_dispatch_table; + using impl::max_over_axis_strided_atomic_dispatch_table; + using impl::max_over_axis_strided_temps_dispatch_table; + + auto max_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + max_over_axis_strided_atomic_dispatch_table, + max_over_axis_strided_temps_dispatch_table, + max_over_axis0_contig_atomic_dispatch_table, + max_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_max_over_axis", max_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // MIN + { + using dpctl::tensor::py_internal::impl:: + populate_min_over_axis_dispatch_tables; + populate_min_over_axis_dispatch_tables(); + using impl::min_over_axis0_contig_atomic_dispatch_table; + using impl::min_over_axis1_contig_atomic_dispatch_table; + using impl::min_over_axis_strided_atomic_dispatch_table; + using impl::min_over_axis_strided_temps_dispatch_table; + + auto min_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + min_over_axis_strided_atomic_dispatch_table, + min_over_axis_strided_temps_dispatch_table, + min_over_axis0_contig_atomic_dispatch_table, + min_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_min_over_axis", min_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } +} + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp new file mode 100644 
index 0000000000..0a83f4aa92 --- /dev/null +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -0,0 +1,394 @@ +//===----------- Implementation of _tensor_impl module ---------*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions, +/// specifically functions for reductions. +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "dpctl4pybind11.hpp" +#include +#include +#include + +#include "kernels/reductions.hpp" +#include "simplify_iteration_space.hpp" +#include "utils/memory_overlap.hpp" +#include "utils/offset_utils.hpp" +#include "utils/sycl_utils.hpp" +#include "utils/type_dispatch.hpp" + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template +std::pair py_reduction_over_axis( + dpctl::tensor::usm_ndarray src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends, + const strided_fnT &atomic_dispatch_table, + const strided_fnT &temps_dispatch_table, + const contig_fnT &axis0_dispatch_table, + const contig_fnT &axis1_dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + 
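// Ampleness check, for illustration: get_minmax_offsets() returns the
// smallest and largest element offsets reachable through dst's strides, so
// range + 1 is the span of offsets the destination can address. Requiring
// that span to be at least dst_nelems rules out destinations too small to
// hold one value per reduced output; e.g. a C-contiguous 6-element
// destination has offsets (0, 5) and passes with range + 1 == 6.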
static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + int dst_itemsize = dst.get_elemsize(); + bool supports_atomics = false; + + switch (dst_itemsize) { + case sizeof(float): + { + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + using dpctl::tensor::sycl_utils::AtomicSupport; + const auto &check_atomic_support = AtomicSupport{}; + supports_atomics = check_atomic_support(exec_q, usm_type); + } break; + case sizeof(double): + { + void *data_ptr = dst.get_data(); + const auto &ctx = exec_q.get_context(); + auto usm_type = sycl::get_pointer_type(data_ptr, ctx); + + constexpr bool require_atomic64 = true; + using dpctl::tensor::sycl_utils::AtomicSupport; + const auto &check_atomic_support = AtomicSupport{}; + supports_atomics = + check_atomic_support(exec_q, usm_type, require_atomic64); + } break; + } + + // handle special case when both reduction and iteration are 1D contiguous + // and can be done with atomics + if (supports_atomics) { + bool is_src_c_contig = src.is_c_contiguous(); + bool is_dst_c_contig = dst.is_c_contiguous(); + bool is_src_f_contig = src.is_f_contiguous(); + + if ((is_src_c_contig && is_dst_c_contig) || + (is_src_f_contig && dst_nelems == 1)) + { + auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; + + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + else if (is_src_f_contig && + ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) + { + auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + size_t iter_nelems = dst_nelems; + + constexpr py::ssize_t zero_offset = 0; + + sycl::event reduction_over_axis_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), + zero_offset, // iteration_src_offset + zero_offset, // iteration_dst_offset + zero_offset, // reduction_src_offset + depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis_contig_ev); + } + } + } + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT = std::vector; + shT 
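// Note on the atomics check above: the switch keys on the destination item
// size, so 4-byte results use the 32-bit atomic path, 8-byte results also
// require the device's atomic64 aspect, and any other item size leaves
// supports_atomics false, steering the call toward the temps-based
// implementation further down. The USM kind of dst's allocation is part of
// the check as well, since atomic support differs for shared, host and
// device allocations.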
reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT simplified_reduction_shape; + shT simplified_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + simplify_iteration_space_1( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + simplified_reduction_shape, simplified_reduction_src_strides, + reduction_src_offset); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { + bool mat_reduce_over_axis1 = false; + bool mat_reduce_over_axis0 = false; + bool array_reduce_all_elems = false; + size_t iter_nelems = dst_nelems; + + if (simplified_reduction_src_strides[0] == 1) { + array_reduce_all_elems = (simplified_iteration_shape[0] == 1); + mat_reduce_over_axis1 = + (simplified_iteration_dst_strides[0] == 1) && + (static_cast(simplified_iteration_src_strides[0]) == + reduction_nelems); + } + else if (static_cast(simplified_reduction_src_strides[0]) == + iter_nelems) + { + mat_reduce_over_axis0 = + (simplified_iteration_dst_strides[0] == 1) && + (simplified_iteration_src_strides[0] == 1); + } + + if (mat_reduce_over_axis1 || array_reduce_all_elems) { + auto fn = axis1_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis1_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis1_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis1_contig_ev); + } + } + else if (mat_reduce_over_axis0) { + auto fn = axis0_dispatch_table[src_typeid][dst_typeid]; + if (fn != nullptr) { + sycl::event reduction_over_axis0_contig_ev = + fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_src_offset, + iteration_dst_offset, reduction_src_offset, depends); + + sycl::event keep_args_event = dpctl::utils::keep_args_alive( + exec_q, {src, dst}, {reduction_over_axis0_contig_ev}); + + return std::make_pair(keep_args_event, + reduction_over_axis0_contig_ev); + } + } + } + + // remove_all_extents gets underlying type of table + using strided_fn_ptr_T = + typename std::remove_all_extents::type; + strided_fn_ptr_T fn = nullptr; + + if (supports_atomics) { + fn = atomic_dispatch_table[src_typeid][dst_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = 
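// Recap of the contiguous special cases above: once iteration and reduction
// have each been simplified to a single dimension, a unit reduction stride
// combined with an iteration stride equal to reduction_nelems means every
// output consumes one contiguous row (axis-1 kernel), while a reduction
// stride equal to iter_nelems with unit iteration strides means the data is
// walked down columns (axis-0 kernel). Anything else falls through to the
// generic strided implementations selected here.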
temps_dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + simplified_reduction_shape, simplified_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto reduction_ev = + fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), dst.get_data(), + iteration_nd, iter_shape_and_strides, iteration_src_offset, + iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(reduction_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, reduction_ev); +} + +extern void init_reduction_functions(py::module_ m); + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp index 529096f5b6..e4b6595d66 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.cpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -34,7 +34,7 @@ #include #include -#include "kernels/reductions.hpp" +#include "kernels/sum_reductions.hpp" #include "sum_reductions.hpp" #include "simplify_iteration_space.hpp" @@ -524,7 +524,7 @@ void populate_sum_over_axis_dispatch_table(void) namespace py = pybind11; -void init_reduction_functions(py::module_ m) +void init_sum_reduction_functions(py::module_ m) { populate_sum_over_axis_dispatch_table(); diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/sum_reductions.hpp index ac612ec1f7..6c34160fb6 100644 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ b/dpctl/tensor/libtensor/source/sum_reductions.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,7 +33,7 @@ namespace tensor namespace py_internal { -extern void init_reduction_functions(py::module_ m); +extern void init_sum_reduction_functions(py::module_ m); } // namespace py_internal } // namespace tensor diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 2ce7c72add..8b687a6d1d 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -46,6 +46,7 @@ #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" #include "linear_sequences.hpp" +#include "reduction_over_axis.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" #include "sum_reductions.hpp" @@ -412,5 +413,6 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); + dpctl::tensor::py_internal::init_sum_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); } From 41671ae7b7b2ce85c6836518ed009fd8ea453562 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 14 Sep 2023 11:45:08 -0700 Subject: [PATCH 03/26] Adds tests for min and max --- dpctl/tests/test_usm_ndarray_reductions.py | 107 +++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 dpctl/tests/test_usm_ndarray_reductions.py diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py new file mode 100644 index 0000000000..87c32e90fe --- /dev/null +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -0,0 +1,107 @@ +# Data Parallel Control (dpctl) +# +# Copyright 2020-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import dpctl.tensor as dpt +from dpctl.tests.helper import get_queue_or_skip, skip_if_dtype_not_supported + + +def test_max_min_axis(): + get_queue_or_skip() + + x = dpt.reshape( + dpt.arange((3 * 4 * 5 * 6 * 7), dtype="i4"), (3, 4, 5, 6, 7) + ) + + m = dpt.max(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, -1, -1, :, -1]) + + m = dpt.min(x, axis=(1, 2, -1)) + assert m.shape == (3, 6) + assert dpt.all(m == x[:, 0, 0, :, 0]) + + +def test_reduction_keepdims(): + get_queue_or_skip() + + x = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + m = dpt.max(x, axis=(1, 2, -1), keepdims=True) + + assert m.shape == (3, 1, 1, 6, 1) + assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + + +def test_max_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.max(x) + + assert m.shape == () + assert x == m + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.reshape(dpt.arange(24 * 1025, dtype=arg_dtype), (24, 1025)) + + m = dpt.max(x) + assert m == x[-1, -1] + m = dpt.max(x, axis=0) + assert dpt.all(m == x[-1, :]) + m = dpt.max(x, axis=1) + assert dpt.all(m == x[:, -1]) + + m = dpt.min(x) + assert m == x[0, 0] + m = dpt.min(x, axis=0) + assert dpt.all(m == x[0, :]) + m = dpt.min(x, axis=1) + assert dpt.all(m == x[:, 0]) + + +def test_max_min_nan_propagation(): + get_queue_or_skip() + + # float, finites + x = dpt.arange(4, dtype="f4") + x[0] = dpt.nan + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + # float, infinities + x[1:] = dpt.inf + assert dpt.isnan(dpt.max(x)) + x[1:] = -dpt.inf + assert dpt.isnan(dpt.min(x)) + + # complex + x = dpt.arange(4, dtype="c8") + x[0] = complex(dpt.nan, 0) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) + + x[0] = complex(0, dpt.nan) + assert dpt.isnan(dpt.max(x)) + assert dpt.isnan(dpt.min(x)) From 093fcca16ad19564ab44400137cfaa70f7fc2c78 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 14 Sep 2023 15:14:25 -0700 Subject: [PATCH 04/26] Reductions now set max_wg to the minimum of the max work group size and 2048 - This prevents running out of resources when using local memory on CPU --- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c33f1fab24..1693cdab7d 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -978,7 +978,10 @@ sycl::event reduction_over_group_temps_strided_impl( size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); + // max_max_wg prevents running out of resources on CPU + constexpr size_t max_max_wg = 2048; + size_t max_wg = std::min( + max_max_wg, d.get_info()); size_t reductions_per_wi(preferrered_reductions_per_wi); if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { From 82688ed6972f850fe699fe09701f35a93bfcac3c Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Thu, 14 Sep 2023 17:52:53 -0700 Subject: [PATCH 05/26] max and min nan propagation fixed for CPU devices - drops use of 
fetch_max/fetch_min for floats, which do not handle nans correctly --- .../libtensor/include/kernels/reductions.hpp | 27 +++++++------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 1693cdab7d..e3c4adeead 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -196,10 +196,12 @@ struct ReductionOverGroupWithAtomicFunctor if constexpr (su_ns::IsPlus::value) { res_ref += red_val_over_wg; } - else if constexpr (su_ns::IsMaximum::value) { + else if constexpr (std::is_same_v>) + { res_ref.fetch_max(red_val_over_wg); } - else if constexpr (su_ns::IsMinimum::value) { + else if constexpr (std::is_same_v>) + { res_ref.fetch_min(red_val_over_wg); } else { @@ -300,22 +302,11 @@ struct CustomReductionOverGroupWithAtomicFunctor sycl::memory_scope::device, sycl::access::address_space::global_space> res_ref(out_[out_iter_offset]); - if constexpr (su_ns::IsPlus::value) { - res_ref += red_val_over_wg; - } - else if constexpr (su_ns::IsMaximum::value) { - res_ref.fetch_max(red_val_over_wg); - } - else if constexpr (su_ns::IsMinimum::value) { - res_ref.fetch_min(red_val_over_wg); - } - else { - outT read_val = res_ref.load(); - outT new_val{}; - do { - new_val = reduction_op_(read_val, red_val_over_wg); - } while (!res_ref.compare_exchange_strong(read_val, new_val)); - } + outT read_val = res_ref.load(); + outT new_val{}; + do { + new_val = reduction_op_(read_val, red_val_over_wg); + } while (!res_ref.compare_exchange_strong(read_val, new_val)); } } }; From e5a39cf7a54a47829478f7c09eed122a466749c2 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 15 Sep 2023 00:37:48 -0700 Subject: [PATCH 06/26] Tweak to test_reduction_kernels --- dpctl/tests/test_usm_ndarray_reductions.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 87c32e90fe..8200d05c58 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -64,7 +64,9 @@ def test_reduction_kernels(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) - x = dpt.reshape(dpt.arange(24 * 1025, dtype=arg_dtype), (24, 1025)) + x = dpt.reshape( + dpt.arange(24 * 1025, dtype=arg_dtype, sycl_queue=q), (24, 1025) + ) m = dpt.max(x) assert m == x[-1, -1] From 3af754c63edb0bb2369ba349f9b641d1a8456a1f Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 18 Sep 2023 18:24:35 -0700 Subject: [PATCH 07/26] Implements dpctl.tensor.argmax and argmin --- dpctl/tensor/__init__.py | 4 +- dpctl/tensor/_reduction.py | 59 + .../libtensor/include/kernels/reductions.hpp | 1009 ++++++++++++++++- .../libtensor/source/reduction_over_axis.cpp | 86 ++ .../libtensor/source/reduction_over_axis.hpp | 185 +++ 5 files changed, 1287 insertions(+), 56 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index d5c2672d46..b5f356ab30 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -160,7 +160,7 @@ tanh, trunc, ) -from ._reduction import max, min, sum +from ._reduction import argmax, argmin, max, min, sum from ._testing import allclose __all__ = [ @@ -311,4 +311,6 @@ "tile", "max", "min", + "argmax", + "argmin", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index dc4cf64dcc..7e18a63042 100644 --- a/dpctl/tensor/_reduction.py 
+++ b/dpctl/tensor/_reduction.py @@ -230,3 +230,62 @@ def max(x, axis=None, keepdims=False): def min(x, axis=None, keepdims=False): return _same_dtype_reduction(x, axis, keepdims, ti._min_over_axis) + + +def _argmax_argmin_reduction(x, axis, keepdims, func): + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + + nd = x.ndim + if axis is None: + red_nd = nd + # case of a scalar + if red_nd == 0: + return dpt.copy(x) + x_tmp = x + res_shape = tuple() + perm = list(range(nd)) + else: + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + + red_nd = len(axis) + # check for axis=() + if red_nd == 0: + return dpt.copy(x) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] + + exec_q = x.sycl_queue + res_usm_type = x.usm_type + res_dtype = dpt.int64 + + res = dpt.empty( + res_shape, + dtype=res_dtype, + usm_type=res_usm_type, + sycl_queue=exec_q, + ) + hev, _ = func( + src=x_tmp, + trailing_dims_to_reduce=red_nd, + dst=res, + sycl_queue=exec_q, + ) + + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res = dpt.permute_dims(dpt.reshape(res, res_shape), inv_perm) + hev.wait() + return res + + +def argmax(x, axis=None, keepdims=False): + return _argmax_argmin_reduction(x, axis, keepdims, ti._argmax_over_axis) + + +def argmin(x, axis=None, keepdims=False): + return _argmax_argmin_reduction(x, axis, keepdims, ti._argmin_over_axis) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index e3c4adeead..3e83725cd2 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -24,6 +24,7 @@ #pragma once #include +#include #include #include #include @@ -1059,65 +1060,68 @@ sycl::event reduction_over_group_temps_strided_impl( partially_reduced_tmp + reduction_groups * iter_nelems; } - const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler - &cgh) { - cgh.depends_on(depends); + const sycl::event &first_reduction_ev = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + using InputIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT 
noop_tmp_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{ + red_nd, reduction_arg_offset, reduction_shape_stride}; - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { - using KernelName = class reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, in_out_iter_indexer, reduction_indexer, - reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - } - else { - using SlmT = sycl::local_accessor; - SlmT local_memory = SlmT(localRange, cgh); - using KernelName = class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - CustomReductionOverGroupNoAtomicFunctor< + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, in_out_iter_indexer, reduction_indexer, - local_memory, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - } - }); + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); size_t remaining_reduction_nelems = reduction_groups; @@ -1399,7 +1403,6 @@ struct MaxOverAxisTempsStridedFactory if constexpr (TypePairSupportDataForCompReductionTemps< srcTy, dstTy>::is_defined) { - using dpctl::tensor::type_utils::is_complex; if constexpr (std::is_integral_v && !std::is_same_v) { using ReductionOpT = sycl::maximum; @@ -1509,7 +1512,6 @@ struct MinOverAxisTempsStridedFactory if constexpr (TypePairSupportDataForCompReductionTemps< srcTy, dstTy>::is_defined) { - using dpctl::tensor::type_utils::is_complex; if constexpr (std::is_integral_v && !std::is_same_v) { using ReductionOpT = sycl::minimum; @@ -1584,6 +1586,903 @@ struct 
MinOverAxis0AtomicContigFactory } }; +// Argmax and Argmin + +/* = Search reduction using reduce_over_group*/ + +template +struct SearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + SearchReduction(const argT *data, + argT *vals, + const outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + reduction_max_gid_(reduction_size), iter_gws_(iteration_size), + reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = sycl::reduce_over_group( + work_group, local_red_val, identity_, reduction_op_); + + if constexpr (std::is_integral_v) { + local_idx = + (red_val_over_wg == local_red_val) ? local_idx : idx_identity_; + } + else { + local_idx = + (red_val_over_wg == local_red_val || + std::isnan(red_val_over_wg) || std::isnan(local_red_val)) + ? 
local_idx + : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +/* = Search reduction using custom_reduce_over_group*/ + +template +struct CustomSearchReduction +{ +private: + const argT *inp_ = nullptr; + argT *vals_ = nullptr; + const outT *inds_ = nullptr; + outT *out_ = nullptr; + ReductionOp reduction_op_; + argT identity_; + IdxReductionOp idx_reduction_op_; + outT idx_identity_; + InputOutputIterIndexerT inp_out_iter_indexer_; + InputRedIndexerT inp_reduced_dims_indexer_; + SlmT local_mem_; + size_t reduction_max_gid_ = 0; + size_t iter_gws_ = 1; + size_t reductions_per_wi = 16; + +public: + CustomSearchReduction(const argT *data, + argT *vals, + outT *inds, + outT *res, + ReductionOp reduction_op, + const argT &identity_val, + IdxReductionOp idx_reduction_op, + const outT &idx_identity_val, + InputOutputIterIndexerT arg_res_iter_indexer, + InputRedIndexerT arg_reduced_dims_indexer, + SlmT local_mem, + size_t reduction_size, + size_t iteration_size, + size_t reduction_size_per_wi) + : inp_(data), vals_(vals), inds_(inds), out_(res), + reduction_op_(reduction_op), identity_(identity_val), + idx_reduction_op_(idx_reduction_op), idx_identity_(idx_identity_val), + inp_out_iter_indexer_(arg_res_iter_indexer), + inp_reduced_dims_indexer_(arg_reduced_dims_indexer), + local_mem_(local_mem), reduction_max_gid_(reduction_size), + iter_gws_(iteration_size), reductions_per_wi(reduction_size_per_wi) + { + } + + void operator()(sycl::nd_item<1> it) const + { + const size_t reduction_lid = it.get_local_id(0); + const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg + + const size_t iter_gid = it.get_group(0) % iter_gws_; + const size_t reduction_batch_id = it.get_group(0) / iter_gws_; + const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; + + // work-items operates over input with indices + // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg + // + reduction_lid + // for 0 <= m < reductions_per_wi + + auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); + const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); + const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); + + argT local_red_val(identity_); + outT local_idx(idx_identity_); + size_t arg_reduce_gid0 = + reduction_lid + reduction_batch_id * wg * reductions_per_wi; + for (size_t m = 0; m < reductions_per_wi; ++m) { + size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; + + if (arg_reduce_gid < reduction_max_gid_) { + auto inp_reduction_offset = + inp_reduced_dims_indexer_(arg_reduce_gid); + auto inp_offset = inp_iter_offset + inp_reduction_offset; + + argT val = inp_[inp_offset]; + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } 
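+                            // Note (assumed intent): on the first reduction
+                            // pass no index temporary exists yet, so the flat
+                            // position within the reduced dimensions is used
+                            // as the candidate index below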
+ else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v) { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else if constexpr (std::is_floating_point_v) { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } + } + } + } + } + } + + auto work_group = it.get_group(); + // This only works if reduction_op_ is from small set of operators + argT red_val_over_wg = su_ns::custom_reduce_over_group( + work_group, local_mem_, local_red_val, reduction_op_); + + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + // equality does not hold for NaNs, so check here + local_idx = (red_val_over_wg == local_red_val || + std::isnan(std::real(local_red_val)) || + std::isnan(std::imag(local_red_val))) + ? local_idx + : idx_identity_; + } + else if constexpr (std::is_floating_point_v) { + // equality does not hold for NaNs, so check here + local_idx = + (red_val_over_wg == local_red_val || std::isnan(local_red_val)) + ? local_idx + : idx_identity_; + } + else { + local_idx = + red_val_over_wg == local_red_val ? local_idx : idx_identity_; + } + outT idx_over_wg = sycl::reduce_over_group( + work_group, local_idx, idx_identity_, idx_reduction_op_); + if (work_group.leader()) { + // each group writes to a different memory location + if constexpr (!Last) { + // if not the final reduction, write value corresponding to + // an index to a temporary + vals_[out_iter_offset * n_reduction_groups + + reduction_batch_id] = red_val_over_wg; + } + out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = + idx_over_wg; + } + } +}; + +typedef sycl::event (*search_reduction_strided_impl_fn_ptr)( + sycl::queue, + size_t, + size_t, + const char *, + char *, + int, + const py::ssize_t *, + py::ssize_t, + py::ssize_t, + int, + const py::ssize_t *, + py::ssize_t, + const std::vector &); + +template +class search_reduction_over_group_temps_krn; + +template +class search_custom_reduction_over_group_temps_krn; + +using dpctl::tensor::sycl_utils::choose_workgroup_size; + +template +sycl::event search_reduction_over_group_temps_strided_impl( + sycl::queue exec_q, + size_t iter_nelems, // number of reductions (num. of rows in a matrix + // when reducing over rows) + size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
+ // number of columns) + const char *arg_cp, + char *res_cp, + int iter_nd, + const py::ssize_t *iter_shape_and_strides, + py::ssize_t iter_arg_offset, + py::ssize_t iter_res_offset, + int red_nd, + const py::ssize_t *reduction_shape_stride, + py::ssize_t reduction_arg_offset, + const std::vector &depends) +{ + const argTy *arg_tp = reinterpret_cast(arg_cp); + resTy *res_tp = reinterpret_cast(res_cp); + + constexpr argTy identity_val = su_ns::Identity::value; + constexpr resTy idx_identity_val = su_ns::Identity::value; + + const sycl::device &d = exec_q.get_device(); + const auto &sg_sizes = d.get_info(); + size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); + + constexpr size_t preferrered_reductions_per_wi = 4; + // max_max_wg prevents running out of resources on CPU + size_t max_wg = std::min( + size_t(2048), d.get_info()); + + size_t reductions_per_wi(preferrered_reductions_per_wi); + if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { + // reduction only requries 1 work-group, can output directly to res + sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + InputOutputIterIndexerT in_out_iter_indexer{ + iter_nd, iter_arg_offset, iter_res_offset, + iter_shape_and_strides}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, nullptr, nullptr, res_tp, ReductionOpT(), + identity_val, IndexOpT(), idx_identity_val, + in_out_iter_indexer, reduction_indexer, local_memory, + reduction_nelems, iter_nelems, reductions_per_wi)); + } + }); + return comp_ev; + } + else { + // more than one work-groups is needed, requires a temporary + size_t reduction_groups = + (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups > 1); + + size_t second_iter_reduction_groups_ = + (reduction_groups + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + + resTy *partially_reduced_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + resTy *partially_reduced_tmp2 = nullptr; + + if 
(partially_reduced_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_tmp2 = + partially_reduced_tmp + reduction_groups * iter_nelems; + } + + argTy *partially_reduced_vals_tmp = sycl::malloc_device( + iter_nelems * (reduction_groups + second_iter_reduction_groups_), + exec_q); + argTy *partially_reduced_vals_tmp2 = nullptr; + + if (partially_reduced_vals_tmp == nullptr) { + throw std::runtime_error("Unable to allocate device_memory"); + } + else { + partially_reduced_vals_tmp2 = + partially_reduced_vals_tmp + reduction_groups * iter_nelems; + } + + sycl::event first_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(depends); + + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; + + // Only 2*iter_nd entries describing shape and strides of iterated + // dimensions of input array from iter_shape_and_strides are going + // to be accessed by inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, true, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, true, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + arg_tp, partially_reduced_vals_tmp, nullptr, + partially_reduced_tmp, ReductionOpT(), identity_val, + IndexOpT(), idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + }); + + size_t remaining_reduction_nelems = reduction_groups; + + resTy *temp_arg = partially_reduced_tmp; + resTy *temp2_arg = partially_reduced_tmp2; + + argTy *vals_temp_arg = partially_reduced_vals_tmp; + argTy *vals_temp2_arg = partially_reduced_vals_tmp2; + + sycl::event dependent_ev = first_reduction_ev; + + while (remaining_reduction_nelems > + preferrered_reductions_per_wi * max_wg) { + size_t reduction_groups_ = + (remaining_reduction_nelems + + preferrered_reductions_per_wi * wg - 1) / + (preferrered_reductions_per_wi * wg); + assert(reduction_groups_ > 1); + + // keep reducing + sycl::event partial_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = + 
dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; + ResIndexerT res_iter_indexer{}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups_ * wg}; + auto localRange = sycl::range<1>{wg}; + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = + class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, + false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, + false, false>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, vals_temp2_arg, temp_arg, temp2_arg, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); + + remaining_reduction_nelems = reduction_groups_; + std::swap(temp_arg, temp2_arg); + std::swap(vals_temp_arg, vals_temp2_arg); + dependent_ev = partial_reduction_ev; + } + + // final reduction to res + sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(dependent_ev); + + using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; + using ResIndexerT = + dpctl::tensor::offset_utils::UnpackedStridedIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + + InputIndexerT inp_indexer{ + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, + /* shape */ iter_shape_and_strides, + /*s trides */ iter_shape_and_strides + + 2 * iter_nd}; + + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + res_iter_indexer}; + ReductionIndexerT reduction_indexer{}; + + wg = max_wg; + reductions_per_wi = + std::max(1, (remaining_reduction_nelems + wg - 1) / wg); + + size_t reduction_groups = + (remaining_reduction_nelems + reductions_per_wi * wg - 1) / + (reductions_per_wi * wg); + assert(reduction_groups == 1); + + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; + + if constexpr (su_ns::IsSyclOp::value) { + using KernelName = class search_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, false, true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + SearchReduction( + 
vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, remaining_reduction_nelems, + iter_nelems, reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = + class search_custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, IndexOpT, + InputOutputIterIndexerT, ReductionIndexerT, SlmT, false, + true>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomSearchReduction( + vals_temp_arg, nullptr, temp_arg, res_tp, + ReductionOpT(), identity_val, IndexOpT(), + idx_identity_val, in_out_iter_indexer, + reduction_indexer, local_memory, + remaining_reduction_nelems, iter_nelems, + reductions_per_wi)); + } + }); + + sycl::event cleanup_host_task_event = + exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(final_reduction_ev); + sycl::context ctx = exec_q.get_context(); + + cgh.host_task( + [ctx, partially_reduced_tmp, partially_reduced_vals_tmp] { + sycl::free(partially_reduced_tmp, ctx); + sycl::free(partially_reduced_vals_tmp, ctx); + }); + }); + + // FIXME: do not return host-task event + // Instead collect all host-tasks to a list + + return cleanup_host_task_event; + } +} + +template +struct TypePairSupportDataForSearchReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + // input int8_t + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + + // input float + td_ns::TypePairDefinedEntry, + + // input double + td_ns::TypePairDefinedEntry, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + td_ns::TypePairDefinedEntry, + outTy, + std::int64_t>, + + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ArgmaxOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for values + using ReductionOpT = su_ns::Maximum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + +template +struct ArgminOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSearchReductionTemps< + srcTy, dstTy>::is_defined) + { + if constexpr (std::is_integral_v && + !std::is_same_v) { + // op for values + using ReductionOpT = sycl::minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + else { + // op for 
values + using ReductionOpT = su_ns::Minimum; + // op for indices + using IndexOpT = sycl::minimum; + return dpctl::tensor::kernels:: + search_reduction_over_group_temps_strided_impl< + srcTy, dstTy, ReductionOpT, IndexOpT>; + } + } + else { + return nullptr; + } + } +}; + } // namespace kernels } // namespace tensor } // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index 4072d266d3..2339429a48 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -151,6 +151,52 @@ void populate_min_over_axis_dispatch_tables(void) } // namespace impl +// Argmax +namespace impl +{ + +using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; +static search_reduction_strided_impl_fn_ptr + argmax_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmax_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgmaxOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmax_over_axis_strided_temps_dispatch_table); +} + +} // namespace impl + +// Argmin +namespace impl +{ + +using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; +static search_reduction_strided_impl_fn_ptr + argmin_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_argmin_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::search_reduction_strided_impl_fn_ptr; + using td_ns::DispatchTableBuilder; + + using dpctl::tensor::kernels::ArgminOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(argmin_over_axis_strided_temps_dispatch_table); +} + +} // namespace impl + namespace py = pybind11; void init_reduction_functions(py::module_ m) @@ -211,6 +257,46 @@ void init_reduction_functions(py::module_ m) py::arg("trailing_dims_to_reduce"), py::arg("dst"), py::arg("sycl_queue"), py::arg("depends") = py::list()); } + + // ARGMAX + { + using dpctl::tensor::py_internal::impl:: + populate_argmax_over_axis_dispatch_tables; + populate_argmax_over_axis_dispatch_tables(); + using impl::argmax_over_axis_strided_temps_dispatch_table; + + auto argmax_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmax_over_axis_strided_temps_dispatch_table); + }; + m.def("_argmax_over_axis", argmax_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } + + // ARGMIN + { + using dpctl::tensor::py_internal::impl:: + populate_argmin_over_axis_dispatch_tables; + populate_argmin_over_axis_dispatch_tables(); + using impl::argmin_over_axis_strided_temps_dispatch_table; + + auto argmin_pyapi = [&](arrayT src, int trailing_dims_to_reduce, + arrayT dst, sycl::queue exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_search_over_axis; + return py_search_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + argmin_over_axis_strided_temps_dispatch_table); + }; + m.def("_argmin_over_axis", argmin_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), 
py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + } } } // namespace py_internal diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index 0a83f4aa92..fda41f950b 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -51,6 +51,8 @@ namespace tensor namespace py_internal { +/* ==================== Generic reductions ====================== */ + template std::pair py_reduction_over_axis( dpctl::tensor::usm_ndarray src, @@ -387,6 +389,189 @@ std::pair py_reduction_over_axis( return std::make_pair(keep_args_event, reduction_ev); } +/* ==================== Search reductions ====================== */ + +template +std::pair py_search_over_axis( + dpctl::tensor::usm_ndarray src, + int trailing_dims_to_reduce, // comp over this many trailing indexes + dpctl::tensor::usm_ndarray dst, + sycl::queue exec_q, + const std::vector &depends, + const fn_tableT &dispatch_table) +{ + int src_nd = src.get_ndim(); + int iteration_nd = src_nd - trailing_dims_to_reduce; + if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { + throw py::value_error("Trailing_dim_to_reduce must be positive, but no " + "greater than rank of the array being reduced"); + } + + int dst_nd = dst.get_ndim(); + if (dst_nd != iteration_nd) { + throw py::value_error("Destination array rank does not match input " + "array rank and number of reduced dimensions"); + } + + const py::ssize_t *src_shape_ptr = src.get_shape_raw(); + const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); + + bool same_shapes = true; + for (int i = 0; same_shapes && (i < dst_nd); ++i) { + same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); + } + + if (!same_shapes) { + throw py::value_error("Destination shape does not match unreduced " + "dimensions of the input shape"); + } + + if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { + throw py::value_error( + "Execution queue is not compatible with allocation queues"); + } + + size_t dst_nelems = dst.get_size(); + + size_t reduction_nelems(1); + for (int i = dst_nd; i < src_nd; ++i) { + reduction_nelems *= static_cast(src_shape_ptr[i]); + } + + // check that dst and src do not overlap + auto const &overlap = dpctl::tensor::overlap::MemoryOverlap(); + if (overlap(src, dst)) { + throw py::value_error("Arrays index overlapping segments of memory"); + } + + // destination must be ample enough to accommodate all elements + { + auto dst_offsets = dst.get_minmax_offsets(); + size_t range = + static_cast(dst_offsets.second - dst_offsets.first); + if (range + 1 < dst_nelems) { + throw py::value_error( + "Destination array can not accommodate all the " + "elements of source array."); + } + } + + int src_typenum = src.get_typenum(); + int dst_typenum = dst.get_typenum(); + + namespace td_ns = dpctl::tensor::type_dispatch; + const auto &array_types = td_ns::usm_ndarray_types(); + int src_typeid = array_types.typenum_to_lookup_id(src_typenum); + int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); + + using dpctl::tensor::py_internal::simplify_iteration_space; + using dpctl::tensor::py_internal::simplify_iteration_space_1; + + auto const &src_shape_vecs = src.get_shape_vector(); + auto const &src_strides_vecs = src.get_strides_vector(); + auto const &dst_strides_vecs = dst.get_strides_vector(); + + int reduction_nd = trailing_dims_to_reduce; + const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; + using shT 
= std::vector; + shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, + std::end(src_strides_vecs)); + + shT compact_reduction_shape; + shT compact_reduction_src_strides; + py::ssize_t reduction_src_offset(0); + + compact_iteration_space( + reduction_nd, reduction_shape_ptr, reduction_src_strides, + // output + compact_reduction_shape, compact_reduction_src_strides); + + const py::ssize_t *iteration_shape_ptr = src_shape_ptr; + + shT iteration_src_strides(std::begin(src_strides_vecs), + std::begin(src_strides_vecs) + iteration_nd); + shT const &iteration_dst_strides = dst_strides_vecs; + + shT simplified_iteration_shape; + shT simplified_iteration_src_strides; + shT simplified_iteration_dst_strides; + py::ssize_t iteration_src_offset(0); + py::ssize_t iteration_dst_offset(0); + + if (iteration_nd == 0) { + if (dst_nelems != 1) { + throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); + } + iteration_nd = 1; + simplified_iteration_shape.push_back(1); + simplified_iteration_src_strides.push_back(0); + simplified_iteration_dst_strides.push_back(0); + } + else { + simplify_iteration_space(iteration_nd, iteration_shape_ptr, + iteration_src_strides, iteration_dst_strides, + // output + simplified_iteration_shape, + simplified_iteration_src_strides, + simplified_iteration_dst_strides, + iteration_src_offset, iteration_dst_offset); + } + + auto fn = dispatch_table[src_typeid][dst_typeid]; + if (fn == nullptr) { + throw std::runtime_error("Datatypes are not supported"); + } + + std::vector host_task_events{}; + + using dpctl::tensor::offset_utils::device_allocate_and_pack; + + const auto &arrays_metainfo_packing_triple_ = + device_allocate_and_pack( + exec_q, host_task_events, + // iteration metadata + simplified_iteration_shape, simplified_iteration_src_strides, + simplified_iteration_dst_strides, + // reduction metadata + compact_reduction_shape, compact_reduction_src_strides); + py::ssize_t *temp_allocation_ptr = + std::get<0>(arrays_metainfo_packing_triple_); + if (temp_allocation_ptr == nullptr) { + throw std::runtime_error("Unable to allocate memory on device"); + } + const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); + + py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; + py::ssize_t *reduction_shape_stride = + temp_allocation_ptr + 3 * simplified_iteration_shape.size(); + + std::vector all_deps; + all_deps.reserve(depends.size() + 1); + all_deps.resize(depends.size()); + std::copy(depends.begin(), depends.end(), all_deps.begin()); + all_deps.push_back(copy_metadata_ev); + + auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), + dst.get_data(), iteration_nd, iter_shape_and_strides, + iteration_src_offset, iteration_dst_offset, + reduction_nd, // number dimensions being reduced + reduction_shape_stride, reduction_src_offset, all_deps); + + sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { + cgh.depends_on(comp_ev); + auto ctx = exec_q.get_context(); + cgh.host_task([ctx, temp_allocation_ptr] { + sycl::free(temp_allocation_ptr, ctx); + }); + }); + host_task_events.push_back(temp_cleanup_ev); + + sycl::event keep_args_event = + dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); + + return std::make_pair(keep_args_event, comp_ev); +} + extern void init_reduction_functions(py::module_ m); } // namespace py_internal From 7052ad1f87bb0d9ec442fbd1c332a67ba8d37db1 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 18 Sep 2023 18:24:55 -0700 Subject: [PATCH 08/26] Tests for argmin 
and argmax Also fixes argmin and argmax for scalar inputs --- dpctl/tensor/_reduction.py | 8 +- dpctl/tests/test_usm_ndarray_reductions.py | 112 +++++++++++++++++++-- 2 files changed, 109 insertions(+), 11 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 7e18a63042..05c8e4a4d9 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -241,7 +241,9 @@ def _argmax_argmin_reduction(x, axis, keepdims, func): red_nd = nd # case of a scalar if red_nd == 0: - return dpt.copy(x) + return dpt.zeros( + (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue + ) x_tmp = x res_shape = tuple() perm = list(range(nd)) @@ -253,7 +255,9 @@ def _argmax_argmin_reduction(x, axis, keepdims, func): red_nd = len(axis) # check for axis=() if red_nd == 0: - return dpt.copy(x) + return dpt.zeros( + (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue + ) perm = [i for i in range(nd) if i not in axis] + list(axis) x_tmp = dpt.permute_dims(x, perm) res_shape = x_tmp.shape[: nd - red_nd] diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 8200d05c58..e4c3a7a881 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -14,6 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +from random import randrange + +import numpy as np import pytest import dpctl.tensor as dpt @@ -64,23 +67,27 @@ def test_reduction_kernels(arg_dtype): q = get_queue_or_skip() skip_if_dtype_not_supported(arg_dtype, q) - x = dpt.reshape( - dpt.arange(24 * 1025, dtype=arg_dtype, sycl_queue=q), (24, 1025) - ) + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 3 + x[:, x.shape[1] // 2] = 3 m = dpt.max(x) - assert m == x[-1, -1] + assert m == 3 m = dpt.max(x, axis=0) - assert dpt.all(m == x[-1, :]) + assert dpt.all(m == 3) m = dpt.max(x, axis=1) - assert dpt.all(m == x[:, -1]) + assert dpt.all(m == 3) + + x = dpt.ones((24, 1025), dtype=arg_dtype, sycl_queue=q) + x[x.shape[0] // 2, :] = 0 + x[:, x.shape[1] // 2] = 0 m = dpt.min(x) - assert m == x[0, 0] + assert m == 0 m = dpt.min(x, axis=0) - assert dpt.all(m == x[0, :]) + assert dpt.all(m == 0) m = dpt.min(x, axis=1) - assert dpt.all(m == x[:, 0]) + assert dpt.all(m == 0) def test_max_min_nan_propagation(): @@ -107,3 +114,90 @@ def test_max_min_nan_propagation(): x[0] = complex(0, dpt.nan) assert dpt.isnan(dpt.max(x)) assert dpt.isnan(dpt.min(x)) + + +def test_argmax_scalar(): + get_queue_or_skip() + + x = dpt.ones(()) + m = dpt.argmax(x) + + assert m.shape == () + assert m == 0 + + +@pytest.mark.parametrize("arg_dtype", ["i4", "f4", "c8"]) +def test_search_reduction_kernels(arg_dtype): + # i4 - always uses atomics w/ sycl group reduction + # f4 - always uses atomics w/ custom group reduction + # c8 - always uses temps w/ custom group reduction + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + x = dpt.ones((24 * 1025), dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, (24, 1025)) + x[idx] = 2 + + m = dpt.argmax(x) + assert m == idx + + x = dpt.reshape(x, (24, 1025)) + + x[idx_tup[0], :] = 3 + m = dpt.argmax(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = 4 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = 5 + m = dpt.argmax(x, axis=1) + assert dpt.all(m == idx) + + x = dpt.ones((24 * 
1025), dtype=arg_dtype, sycl_queue=q) + idx = randrange(x.size) + idx_tup = np.unravel_index(idx, (24, 1025)) + x[idx] = 0 + + m = dpt.argmin(x) + assert m == idx + + x = dpt.reshape(x, (24, 1025)) + + x[idx_tup[0], :] = -1 + m = dpt.argmin(x, axis=0) + assert dpt.all(m == idx_tup[0]) + x[:, idx_tup[1]] = -2 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx_tup[1]) + + x = x[:, ::-2] + idx = randrange(x.shape[1]) + x[:, idx] = -3 + m = dpt.argmin(x, axis=1) + assert dpt.all(m == idx) + + +def test_argmax_argmin_nan_propagation(): + get_queue_or_skip() + + sz = 4 + idx = randrange(sz) + # floats + x = dpt.arange(sz, dtype="f4") + x[idx] = dpt.nan + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + # complex + x = dpt.arange(sz, dtype="c8") + x[idx] = complex(dpt.nan, 0) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx + + x[idx] = complex(0, dpt.nan) + assert dpt.argmax(x) == idx + assert dpt.argmin(x) == idx From 97efe7a635c19ab455331958ab98281dca5ca2ea Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 18 Sep 2023 23:06:47 -0700 Subject: [PATCH 09/26] Argmin and argmax now handle identities correctly Adds a test for this behavior Fixed a typo in argmin and argmax causing shared local memory variant to be used for more types than expected --- .../libtensor/include/kernels/reductions.hpp | 203 ++++++++++-------- dpctl/tests/test_usm_ndarray_reductions.py | 10 + 2 files changed, 127 insertions(+), 86 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 3e83725cd2..cafbdf929b 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1670,25 +1670,37 @@ struct SearchReduction auto inp_offset = inp_iter_offset + inp_reduction_offset; argT val = inp_[inp_offset]; - if constexpr (su_ns::IsMinimum::value) { - if (val < local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } + if (val == local_red_val) { + if constexpr (!First) { + local_idx = std::min(local_idx, inds_[inp_offset]); + } + else { + local_idx = std::min(local_idx, + static_cast(arg_reduce_gid)); } } - else if constexpr (su_ns::IsMaximum::value) { - if (val > local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; + else { + if constexpr (su_ns::IsMinimum::value) { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } } - else { - local_idx = static_cast(arg_reduce_gid); + } + else if constexpr (su_ns::IsMaximum::value) { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = static_cast(arg_reduce_gid); + } } } } @@ -1813,83 +1825,102 @@ struct CustomSearchReduction auto inp_offset = inp_iter_offset + inp_reduction_offset; argT val = inp_[inp_offset]; - if constexpr (su_ns::IsMinimum::value) { - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using dpctl::tensor::math_utils::less_complex; - // less_complex always returns false for NaNs, so check - if (less_complex(val, local_red_val) || - std::isnan(std::real(val)) || - std::isnan(std::imag(val))) - { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = 
static_cast(arg_reduce_gid); - } - } - } - else if constexpr (std::is_floating_point_v) { - if (val < local_red_val || std::isnan(val)) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } - } + if (val == local_red_val) { + if constexpr (!First) { + local_idx = std::min(local_idx, inds_[inp_offset]); } else { - if (val < local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); - } - } + local_idx = std::min(local_idx, + static_cast(arg_reduce_gid)); } } - else if constexpr (su_ns::IsMaximum::value) { - using dpctl::tensor::type_utils::is_complex; - if constexpr (is_complex::value) { - using dpctl::tensor::math_utils::greater_complex; - if (greater_complex(val, local_red_val) || - std::isnan(std::real(val)) || - std::isnan(std::imag(val))) - { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; - } - else { - local_idx = static_cast(arg_reduce_gid); + else { + if constexpr (su_ns::IsMinimum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::less_complex; + // less_complex always returns false for NaNs, so + // check + if (less_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } } - } - else if constexpr (std::is_floating_point_v) { - if (val > local_red_val || std::isnan(val)) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; + else if constexpr (std::is_floating_point_v) { + if (val < local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } - else { - local_idx = static_cast(arg_reduce_gid); + } + else { + if (val < local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } } } - else { - if (val > local_red_val) { - local_red_val = val; - if constexpr (!First) { - local_idx = inds_[inp_offset]; + else if constexpr (su_ns::IsMaximum::value) { + using dpctl::tensor::type_utils::is_complex; + if constexpr (is_complex::value) { + using dpctl::tensor::math_utils::greater_complex; + if (greater_complex(val, local_red_val) || + std::isnan(std::real(val)) || + std::isnan(std::imag(val))) + { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } - else { - local_idx = static_cast(arg_reduce_gid); + } + else if constexpr (std::is_floating_point_v) { + if (val > local_red_val || std::isnan(val)) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } + } + } + else { + if (val > local_red_val) { + local_red_val = val; + if constexpr (!First) { + local_idx = inds_[inp_offset]; + } + else { + local_idx = + static_cast(arg_reduce_gid); + } } } } @@ -2042,7 +2073,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr 
(su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, true>; @@ -2141,7 +2172,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, false>; @@ -2221,7 +2252,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( auto globalRange = sycl::range<1>{iter_nelems * reduction_groups_ * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, @@ -2304,7 +2335,7 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (su_ns::IsSyclOp::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, false, true>; diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index e4c3a7a881..e137304dc5 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -201,3 +201,13 @@ def test_argmax_argmin_nan_propagation(): x[idx] = complex(0, dpt.nan) assert dpt.argmax(x) == idx assert dpt.argmin(x) == idx + + +def test_argmax_argmin_identities(): + # make sure that identity arrays work as expected + get_queue_or_skip() + + x = dpt.full(3, dpt.iinfo(dpt.int32).min, dtype="i4") + assert dpt.argmax(x) == 0 + x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") + assert dpt.argmin(x) == 0 From 7aef816c4e10253a0a348f32e5d79020f6ce6879 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 26 Sep 2023 19:26:53 -0700 Subject: [PATCH 10/26] Replaced `std::min` with `idx_reduction_op_` --- .../libtensor/include/kernels/reductions.hpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index cafbdf929b..c42e91b812 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1672,11 +1672,12 @@ struct SearchReduction argT val = inp_[inp_offset]; if (val == local_red_val) { if constexpr (!First) { - local_idx = std::min(local_idx, inds_[inp_offset]); + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); } else { - local_idx = std::min(local_idx, - static_cast(arg_reduce_gid)); + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); } } else { @@ -1827,11 +1828,12 @@ struct CustomSearchReduction argT val = inp_[inp_offset]; if (val == local_red_val) { if constexpr (!First) { - local_idx = std::min(local_idx, inds_[inp_offset]); + local_idx = + idx_reduction_op_(local_idx, inds_[inp_offset]); } else { - local_idx = std::min(local_idx, - static_cast(arg_reduce_gid)); + local_idx = idx_reduction_op_( + local_idx, static_cast(arg_reduce_gid)); } } else { From 6c3abcc7c10f106ac17fb48623f28670449a751a Mon Sep 17 00:00:00 
2001 From: Nikita Grigorian Date: Tue, 26 Sep 2023 20:31:10 -0700 Subject: [PATCH 11/26] reductions now well-behaved for size-zero arrays - comparison and search reductions will throw an error in this case - slips in change to align sum signature with array API spec --- dpctl/tensor/_reduction.py | 116 ++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 65 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 05c8e4a4d9..f0fd40bc18 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -52,7 +52,7 @@ def _default_reduction_dtype(inp_dt, q): return res_dt -def sum(arr, axis=None, dtype=None, keepdims=False): +def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) Calculates the sum of the input array `x`. @@ -101,9 +101,9 @@ def sum(arr, axis=None, dtype=None, keepdims=False): array has the data type as described in the `dtype` parameter description above. """ - if not isinstance(arr, dpt.usm_ndarray): - raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(arr)}") - nd = arr.ndim + if not isinstance(x, dpt.usm_ndarray): + raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") + nd = x.ndim if axis is None: axis = tuple(range(nd)) if not isinstance(axis, (tuple, list)): @@ -111,18 +111,18 @@ def sum(arr, axis=None, dtype=None, keepdims=False): axis = normalize_axis_tuple(axis, nd, "axis") red_nd = len(axis) perm = [i for i in range(nd) if i not in axis] + list(axis) - arr2 = dpt.permute_dims(arr, perm) + arr2 = dpt.permute_dims(x, perm) res_shape = arr2.shape[: nd - red_nd] - q = arr.sycl_queue - inp_dt = arr.dtype + q = x.sycl_queue + inp_dt = x.dtype if dtype is None: res_dt = _default_reduction_dtype(inp_dt, q) else: res_dt = dpt.dtype(dtype) res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) - res_usm_type = arr.usm_type - if arr.size == 0: + res_usm_type = x.usm_type + if x.size == 0: if keepdims: res_shape = res_shape + (1,) * red_nd inv_perm = sorted(range(nd), key=lambda d: perm[d]) @@ -131,7 +131,7 @@ def sum(arr, axis=None, dtype=None, keepdims=False): res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) if red_nd == 0: - return dpt.astype(arr, res_dt, copy=False) + return dpt.astype(x, res_dt, copy=False) host_tasks_list = [] if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): @@ -173,43 +173,35 @@ def sum(arr, axis=None, dtype=None, keepdims=False): return res -def _same_dtype_reduction(x, axis, keepdims, func): +def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") nd = x.ndim if axis is None: - red_nd = nd - # case of a scalar - if red_nd == 0: - return dpt.copy(x) - x_tmp = x - res_shape = tuple() - perm = list(range(nd)) - else: - if not isinstance(axis, (tuple, list)): - axis = (axis,) - axis = normalize_axis_tuple(axis, nd, "axis") - - red_nd = len(axis) - # check for axis=() - if red_nd == 0: - return dpt.copy(x) - perm = [i for i in range(nd) if i not in axis] + list(axis) - x_tmp = dpt.permute_dims(x, perm) - res_shape = x_tmp.shape[: nd - red_nd] - + axis = tuple(range(nd)) + if not isinstance(axis, (tuple, list)): + axis = (axis,) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] exec_q = x.sycl_queue + res_dt = x.dtype 
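+    # comparison reductions (max, min) keep the input dtype; the result is
+    # allocated below with the same USM type and on the same SYCL queue as x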
res_usm_type = x.usm_type - res_dtype = x.dtype + if x.size == 0: + raise ValueError("reduction does not support zero-size arrays") + if red_nd == 0: + return x res = dpt.empty( res_shape, - dtype=res_dtype, + dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q, ) - hev, _ = func( + hev, _ = _reduction_fn( src=x_tmp, trailing_dims_to_reduce=red_nd, dst=res, @@ -225,54 +217,48 @@ def _same_dtype_reduction(x, axis, keepdims, func): def max(x, axis=None, keepdims=False): - return _same_dtype_reduction(x, axis, keepdims, ti._max_over_axis) + return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) def min(x, axis=None, keepdims=False): - return _same_dtype_reduction(x, axis, keepdims, ti._min_over_axis) + return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) -def _argmax_argmin_reduction(x, axis, keepdims, func): +def _search_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") nd = x.ndim if axis is None: - red_nd = nd - # case of a scalar - if red_nd == 0: - return dpt.zeros( - (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue - ) - x_tmp = x - res_shape = tuple() - perm = list(range(nd)) + axis = tuple(range(nd)) + elif isinstance(axis, int): + axis = (axis,) else: - if not isinstance(axis, (tuple, list)): - axis = (axis,) - axis = normalize_axis_tuple(axis, nd, "axis") - - red_nd = len(axis) - # check for axis=() - if red_nd == 0: - return dpt.zeros( - (), dtype="i8", usm_type=x.usm_type, sycl_queue=x.sycl_queue - ) - perm = [i for i in range(nd) if i not in axis] + list(axis) - x_tmp = dpt.permute_dims(x, perm) - res_shape = x_tmp.shape[: nd - red_nd] - + raise TypeError( + f"`axis` argument expected `int` or `None`, got {type(axis)}" + ) + axis = normalize_axis_tuple(axis, nd, "axis") + red_nd = len(axis) + perm = [i for i in range(nd) if i not in axis] + list(axis) + x_tmp = dpt.permute_dims(x, perm) + res_shape = x_tmp.shape[: nd - red_nd] exec_q = x.sycl_queue + res_dt = ti.default_device_index_type(exec_q.sycl_device) res_usm_type = x.usm_type - res_dtype = dpt.int64 + if x.size == 0: + raise ValueError("reduction does not support zero-size arrays") + if red_nd == 0: + return dpt.zeros( + res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q + ) res = dpt.empty( res_shape, - dtype=res_dtype, + dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q, ) - hev, _ = func( + hev, _ = _reduction_fn( src=x_tmp, trailing_dims_to_reduce=red_nd, dst=res, @@ -288,8 +274,8 @@ def _argmax_argmin_reduction(x, axis, keepdims, func): def argmax(x, axis=None, keepdims=False): - return _argmax_argmin_reduction(x, axis, keepdims, ti._argmax_over_axis) + return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) def argmin(x, axis=None, keepdims=False): - return _argmax_argmin_reduction(x, axis, keepdims, ti._argmin_over_axis) + return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) From a00ac58f81d7def20a211fa5768295db90ee3f35 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Tue, 26 Sep 2023 20:31:15 -0700 Subject: [PATCH 12/26] removed unnecessary copies in reduction templates --- .../tensor/libtensor/source/reduction_over_axis.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index fda41f950b..8ee3c0f352 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ 
b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -55,10 +55,10 @@ namespace py_internal template std::pair py_reduction_over_axis( - dpctl::tensor::usm_ndarray src, + const dpctl::tensor::usm_ndarray &src, int trailing_dims_to_reduce, // comp over this many trailing indexes - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, const std::vector &depends, const strided_fnT &atomic_dispatch_table, const strided_fnT &temps_dispatch_table, @@ -393,10 +393,10 @@ std::pair py_reduction_over_axis( template std::pair py_search_over_axis( - dpctl::tensor::usm_ndarray src, + const dpctl::tensor::usm_ndarray &src, int trailing_dims_to_reduce, // comp over this many trailing indexes - dpctl::tensor::usm_ndarray dst, - sycl::queue exec_q, + const dpctl::tensor::usm_ndarray &dst, + sycl::queue &exec_q, const std::vector &depends, const fn_tableT &dispatch_table) { From 2468d8a580dff626a87a1c220d657391fc9664a4 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 10:10:58 -0700 Subject: [PATCH 13/26] Refactors sum to use generic reduction templates --- dpctl/tensor/CMakeLists.txt | 1 - .../libtensor/include/kernels/reductions.hpp | 243 ++++ .../include/kernels/sum_reductions.hpp | 1172 ----------------- .../libtensor/include/utils/sycl_utils.hpp | 34 - .../libtensor/source/reduction_over_axis.cpp | 108 +- .../libtensor/source/reduction_over_axis.hpp | 115 +- .../libtensor/source/sum_reductions.cpp | 542 -------- .../libtensor/source/sum_reductions.hpp | 40 - dpctl/tensor/libtensor/source/tensor_py.cpp | 2 - 9 files changed, 451 insertions(+), 1806 deletions(-) delete mode 100644 dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp delete mode 100644 dpctl/tensor/libtensor/source/sum_reductions.cpp delete mode 100644 dpctl/tensor/libtensor/source/sum_reductions.hpp diff --git a/dpctl/tensor/CMakeLists.txt b/dpctl/tensor/CMakeLists.txt index 234626abd5..9a2493421e 100644 --- a/dpctl/tensor/CMakeLists.txt +++ b/dpctl/tensor/CMakeLists.txt @@ -49,7 +49,6 @@ pybind11_add_module(${python_module_name} MODULE ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/boolean_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/device_support_queries.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/elementwise_functions.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/sum_reductions.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/repeat.cpp ${CMAKE_CURRENT_SOURCE_DIR}/libtensor/source/reduction_over_axis.cpp ) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index c42e91b812..8a1182421a 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1586,6 +1586,249 @@ struct MinOverAxis0AtomicContigFactory } }; +// Sum + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForSumReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // 
input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForSumReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct SumOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return 
dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionTemps< + srcTy, dstTy>::is_defined) { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct SumOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForSumReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::plus; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + // Argmax and Argmin /* = Search reduction using reduce_over_group*/ diff --git a/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp deleted file mode 100644 index 0ebbd8b308..0000000000 --- a/dpctl/tensor/libtensor/include/kernels/sum_reductions.hpp +++ /dev/null @@ -1,1172 +0,0 @@ -//=== sum_reductions.hpp - Implementation of sum kernels ------- *-C++-*/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file defines kernels for summing tensors along axis. 
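[Editorial sketch, C++] For context on the Sum factories added to reductions.hpp above: each get() returns a kernel function pointer only when the (srcTy, dstTy) pair is listed in the corresponding TypePairSupportDataForSumReduction* struct, and nullptr otherwise, so the dispatch-table builder records a gap and the caller falls back to the temporaries-based implementation. Below is a minimal, self-contained sketch of that pattern under stated assumptions; the names ExampleSupported, example_impl, ExampleFactory and example_fn_ptr are illustrative stand-ins, not symbols from this patch.

#include <cstddef>
#include <type_traits>

// Function-pointer type a dispatch table might store; illustrative only.
using example_fn_ptr = void (*)(const char *src, char *dst, std::size_t n);

// Stand-in for TypePairSupportDataFor*: which (srcTy, dstTy) pairs get a kernel.
template <typename srcTy, typename dstTy> struct ExampleSupported
{
    static constexpr bool is_defined =
        std::is_same_v<srcTy, int> && std::is_same_v<dstTy, long>;
};

// Stand-in for the templated reduction implementation.
template <typename srcTy, typename dstTy>
void example_impl(const char *src, char *dst, std::size_t n)
{
    const srcTy *in = reinterpret_cast<const srcTy *>(src);
    dstTy *out = reinterpret_cast<dstTy *>(dst);
    dstTy acc{};
    for (std::size_t i = 0; i < n; ++i) {
        acc += static_cast<dstTy>(in[i]);
    }
    *out = acc;
}

// Factory in the same shape as SumOverAxisAtomicStridedFactory: yields the
// kernel when the pair is supported, and nullptr so the caller can fall back.
template <typename fnT, typename srcTy, typename dstTy> struct ExampleFactory
{
    fnT get() const
    {
        if constexpr (ExampleSupported<srcTy, dstTy>::is_defined) {
            return example_impl<srcTy, dstTy>;
        }
        else {
            return nullptr;
        }
    }
};

Instantiated as ExampleFactory<example_fn_ptr, int, long>{}.get(), this yields a callable kernel; any other type pair yields nullptr, mirroring how the populated tables signal "unsupported, use the temps path".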
-//===----------------------------------------------------------------------===// - -#pragma once -#include -#include -#include -#include -#include -#include -#include - -#include "pybind11/pybind11.h" -#include "utils/offset_utils.hpp" -#include "utils/sycl_utils.hpp" -#include "utils/type_dispatch.hpp" -#include "utils/type_utils.hpp" - -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - -namespace dpctl -{ -namespace tensor -{ -namespace kernels -{ - -template -struct SequentialReduction -{ -private: - const argT *inp_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - outT identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - -public: - SequentialReduction(const argT *inp, - outT *res, - ReductionOp reduction_op, - const outT &identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size) - : inp_(inp), out_(res), reduction_op_(reduction_op), - identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size) - { - } - - void operator()(sycl::id<1> id) const - { - - auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); - const py::ssize_t &inp_iter_offset = - inp_out_iter_offsets_.get_first_offset(); - const py::ssize_t &out_iter_offset = - inp_out_iter_offsets_.get_second_offset(); - - outT red_val(identity_); - for (size_t m = 0; m < reduction_max_gid_; ++m) { - const py::ssize_t inp_reduction_offset = - inp_reduced_dims_indexer_(m); - const py::ssize_t inp_offset = - inp_iter_offset + inp_reduction_offset; - - red_val = reduction_op_(red_val, inp_[inp_offset]); - } - - out_[out_iter_offset] = red_val; - } -}; - -/* === Reduction, using sycl::reduce_over_group, and sycl::atomic_ref === */ - -/* - This kernel only works for outT with sizeof(outT) == 4, or sizeof(outT) == 8 - if the device has aspect atomic64 and only with those supported by - sycl::atomic_ref -*/ -template -struct ReductionOverGroupWithAtomicFunctor -{ -private: - const argT *inp_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - outT identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; - -public: - ReductionOverGroupWithAtomicFunctor( - const argT *data, - outT *res, - ReductionOp reduction_op, - const outT &identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) - : inp_(data), out_(res), reduction_op_(reduction_op), - identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size), iter_gws_(iteration_size), - reductions_per_wi(reduction_size_per_wi) - { - } - - void operator()(sycl::nd_item<1> it) const - { - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - - // work-items sums over input with indices - // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg - // + reduction_lid - // for 0 <= m < reductions_per_wi - - auto 
inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); - const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); - - outT local_red_val(identity_); - size_t arg_reduce_gid0 = - reduction_lid + reduction_batch_id * wg * reductions_per_wi; - size_t arg_reduce_gid_max = std::min( - reduction_max_gid_, arg_reduce_gid0 + reductions_per_wi * wg); - - for (size_t arg_reduce_gid = arg_reduce_gid0; - arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg) - { - auto inp_reduction_offset = - inp_reduced_dims_indexer_(arg_reduce_gid); - auto inp_offset = inp_iter_offset + inp_reduction_offset; - - using dpctl::tensor::type_utils::convert_impl; - outT val = convert_impl(inp_[inp_offset]); - - local_red_val = reduction_op_(local_red_val, val); - } - - auto work_group = it.get_group(); - // This only works if reduction_op_ is from small set of operators - outT red_val_over_wg = sycl::reduce_over_group( - work_group, local_red_val, identity_, reduction_op_); - - if (work_group.leader()) { - sycl::atomic_ref - res_ref(out_[out_iter_offset]); - if constexpr (std::is_same_v> || - std::is_same_v>) - { - res_ref += red_val_over_wg; - } - else { - outT read_val = res_ref.load(); - outT new_val{}; - do { - new_val = reduction_op_(read_val, red_val_over_wg); - } while (!res_ref.compare_exchange_strong(read_val, new_val)); - } - } - } -}; - -typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)( - sycl::queue &, - size_t, - size_t, - const char *, - char *, - int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - int, - const py::ssize_t *, - py::ssize_t, - const std::vector &); - -template -class sum_reduction_over_group_with_atomics_krn; - -template -class sum_reduction_over_group_with_atomics_init_krn; - -template -class sum_reduction_seq_strided_krn; - -template -class sum_reduction_seq_contig_krn; - -template -class sum_reduction_axis0_over_group_with_atomics_contig_krn; - -template -class sum_reduction_axis1_over_group_with_atomics_contig_krn; - -using dpctl::tensor::sycl_utils::choose_workgroup_size; - -template -sycl::event sum_reduction_over_group_with_atomics_strided_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. 
- // number of columns) - const char *arg_cp, - char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp); - resTy *res_tp = reinterpret_cast(res_cp); - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - if (reduction_nelems < wg) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - cgh.parallel_for>( - sycl::range<1>(iter_nelems), - SequentialReduction( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems)); - }); - - return comp_ev; - } - else { - sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) { - using IndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; - - const py::ssize_t *const &res_shape = iter_shape_and_strides; - const py::ssize_t *const &res_strides = - iter_shape_and_strides + 2 * iter_nd; - IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, - res_strides); - using InitKernelName = - class sum_reduction_over_group_with_atomics_init_krn; - cgh.depends_on(depends); - - cgh.parallel_for( - sycl::range<1>(iter_nelems), [=](sycl::id<1> id) { - auto res_offset = res_indexer(id[0]); - res_tp[res_offset] = identity_val; - }); - }); - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); - - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - constexpr size_t preferrered_reductions_per_wi = 4; - size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) - ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_with_atomics_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } -} - -// Contig - -typedef sycl::event (*sum_reduction_contig_impl_fn_ptr)( - sycl::queue &, - size_t, - size_t, - const char *, - char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - const std::vector &); - -/* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis1_over_group_with_atomics_contig_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp) + - iter_arg_offset + reduction_arg_offset; - resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - if (reduction_nelems < wg) { - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InputIterIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIterIndexerT, NoOpIndexerT>; - using ReductionIndexerT = NoOpIndexerT; - - InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, - NoOpIndexerT{}}; - ReductionIndexerT reduction_indexer{}; - - cgh.parallel_for>( - sycl::range<1>(iter_nelems), - SequentialReduction( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems)); - }); - - return comp_ev; - } - else { - sycl::event res_init_ev = exec_q.fill( - res_tp, resTy(identity_val), iter_nelems, depends); - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); - - using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using RowsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - RowsIndexerT, NoOpIndexerT>; - using ReductionIndexerT = NoOpIndexerT; - - RowsIndexerT rows_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_nelems)}; - NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, - result_indexer}; - ReductionIndexerT reduction_indexer{}; - - constexpr size_t preferrered_reductions_per_wi = 8; - size_t reductions_per_wi = - 
(reduction_nelems < preferrered_reductions_per_wi * wg) - ? std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = - class sum_reduction_axis1_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } -} - -/* @brief Reduce rows in a matrix */ -template -sycl::event sum_reduction_axis0_over_group_with_atomics_contig_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of cols in a matrix - // when reducing over cols) - size_t reduction_nelems, // size of each reduction (length of cols, i.e. - // number of rows) - const char *arg_cp, - char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp) + - iter_arg_offset + reduction_arg_offset; - resTy *res_tp = reinterpret_cast(res_cp) + iter_res_offset; - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - { - sycl::event res_init_ev = exec_q.fill( - res_tp, resTy(identity_val), iter_nelems, depends); - - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(res_init_ev); - - using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - NoOpIndexerT, NoOpIndexerT>; - using ReductionIndexerT = ColsIndexerT; - - NoOpIndexerT columns_indexer{}; - NoOpIndexerT result_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, - result_indexer}; - ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; - - constexpr size_t preferrered_reductions_per_wi = 8; - size_t reductions_per_wi = - (reduction_nelems < preferrered_reductions_per_wi * wg) - ? 
std::max(1, (reduction_nelems + wg - 1) / wg) - : preferrered_reductions_per_wi; - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = - class sum_reduction_axis0_over_group_with_atomics_contig_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupWithAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } -} - -/* = Reduction, using sycl::reduce_over_group, but not using atomic_ref = */ - -template -struct ReductionOverGroupNoAtomicFunctor -{ -private: - const argT *inp_ = nullptr; - outT *out_ = nullptr; - ReductionOp reduction_op_; - outT identity_; - InputOutputIterIndexerT inp_out_iter_indexer_; - InputRedIndexerT inp_reduced_dims_indexer_; - size_t reduction_max_gid_ = 0; - size_t iter_gws_ = 1; - size_t reductions_per_wi = 16; - -public: - ReductionOverGroupNoAtomicFunctor( - const argT *data, - outT *res, - ReductionOp reduction_op, - const outT &identity_val, - InputOutputIterIndexerT arg_res_iter_indexer, - InputRedIndexerT arg_reduced_dims_indexer, - size_t reduction_size, - size_t iteration_size, - size_t reduction_size_per_wi) - : inp_(data), out_(res), reduction_op_(reduction_op), - identity_(identity_val), inp_out_iter_indexer_(arg_res_iter_indexer), - inp_reduced_dims_indexer_(arg_reduced_dims_indexer), - reduction_max_gid_(reduction_size), iter_gws_(iteration_size), - reductions_per_wi(reduction_size_per_wi) - { - } - - void operator()(sycl::nd_item<1> it) const - { - const size_t reduction_lid = it.get_local_id(0); - const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg - - const size_t iter_gid = it.get_group(0) % iter_gws_; - const size_t reduction_batch_id = it.get_group(0) / iter_gws_; - const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_; - - // work-items sums over input with indices - // inp_data_id = reduction_batch_id * wg * reductions_per_wi + m * wg - // + reduction_lid - // for 0 <= m < reductions_per_wi - - auto inp_out_iter_offsets_ = inp_out_iter_indexer_(iter_gid); - const auto &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const auto &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); - - outT local_red_val(identity_); - size_t arg_reduce_gid0 = - reduction_lid + reduction_batch_id * wg * reductions_per_wi; - for (size_t m = 0; m < reductions_per_wi; ++m) { - size_t arg_reduce_gid = arg_reduce_gid0 + m * wg; - - if (arg_reduce_gid < reduction_max_gid_) { - auto inp_reduction_offset = - inp_reduced_dims_indexer_(arg_reduce_gid); - auto inp_offset = inp_iter_offset + inp_reduction_offset; - - using dpctl::tensor::type_utils::convert_impl; - outT val = convert_impl(inp_[inp_offset]); - - local_red_val = reduction_op_(local_red_val, val); - } - } - - auto work_group = it.get_group(); - // This only works if reduction_op_ is from small set of operators - outT red_val_over_wg = sycl::reduce_over_group( - work_group, local_red_val, identity_, reduction_op_); - - if (work_group.leader()) { - // each group writes to a different memory location - out_[out_iter_offset * n_reduction_groups + reduction_batch_id] = - red_val_over_wg; - } - } -}; - -template -class 
sum_reduction_over_group_temps_krn; - -template -sycl::event sum_reduction_over_group_temps_strided_impl( - sycl::queue &exec_q, - size_t iter_nelems, // number of reductions (num. of rows in a matrix - // when reducing over rows) - size_t reduction_nelems, // size of each reduction (length of rows, i.e. - // number of columns) - const char *arg_cp, - char *res_cp, - int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, - const std::vector &depends) -{ - const argTy *arg_tp = reinterpret_cast(arg_cp); - resTy *res_tp = reinterpret_cast(res_cp); - - using ReductionOpT = sycl::plus; - constexpr resTy identity_val = resTy{0}; - - const sycl::device &d = exec_q.get_device(); - const auto &sg_sizes = d.get_info(); - size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes); - - constexpr size_t preferrered_reductions_per_wi = 4; - size_t max_wg = d.get_info(); - - size_t reductions_per_wi(preferrered_reductions_per_wi); - if (reduction_nelems <= preferrered_reductions_per_wi * max_wg) { - // reduction only requires 1 work-group, can output directly to res - sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); - - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - InputOutputIterIndexerT in_out_iter_indexer{ - iter_nd, iter_arg_offset, iter_res_offset, - iter_shape_and_strides}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - wg = max_wg; - reductions_per_wi = - std::max(1, (reduction_nelems + wg - 1) / wg); - - size_t reduction_groups = - (reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - assert(reduction_groups == 1); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, reductions_per_wi)); - }); - - return comp_ev; - } - else { - // more than one work-groups is needed, requires a temporary - size_t reduction_groups = - (reduction_nelems + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); - assert(reduction_groups > 1); - - size_t second_iter_reduction_groups_ = - (reduction_groups + preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); - - resTy *partially_reduced_tmp = sycl::malloc_device( - iter_nelems * (reduction_groups + second_iter_reduction_groups_), - exec_q); - resTy *partially_reduced_tmp2 = nullptr; - - if (partially_reduced_tmp == nullptr) { - throw std::runtime_error("Unabled to allocate device_memory"); - } - else { - partially_reduced_tmp2 = - partially_reduced_tmp + reduction_groups * iter_nelems; - } - - const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler - &cgh) { - cgh.depends_on(depends); - - using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - 
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - - // Only 2*iter_nd entries describing shape and strides of iterated - // dimensions of input array from iter_shape_and_strides are going - // to be accessed by inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, - reduction_shape_stride}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - arg_tp, partially_reduced_tmp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); - }); - - size_t remaining_reduction_nelems = reduction_groups; - - resTy *temp_arg = partially_reduced_tmp; - resTy *temp2_arg = partially_reduced_tmp2; - sycl::event dependent_ev = first_reduction_ev; - - while (remaining_reduction_nelems > - preferrered_reductions_per_wi * max_wg) { - size_t reduction_groups_ = - (remaining_reduction_nelems + - preferrered_reductions_per_wi * wg - 1) / - (preferrered_reductions_per_wi * wg); - assert(reduction_groups_ > 1); - - // keep reducing - sycl::event partial_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = - dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; - ResIndexerT res_iter_indexer{}; - - InputOutputIterIndexerT in_out_iter_indexer{ - inp_indexer, res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups_ * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - resTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - temp_arg, temp2_arg, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - }); - - remaining_reduction_nelems = reduction_groups_; - std::swap(temp_arg, temp2_arg); - dependent_ev = std::move(partial_reduction_ev); - } - - // final reduction to res - sycl::event final_reduction_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(dependent_ev); - - using InputIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer; - using ResIndexerT = - dpctl::tensor::offset_utils::UnpackedStridedIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; - - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; - ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, - /* shape */ iter_shape_and_strides, - /*s trides */ iter_shape_and_strides + - 2 * iter_nd}; - - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - res_iter_indexer}; - ReductionIndexerT reduction_indexer{}; - - wg = max_wg; - reductions_per_wi = - std::max(1, (remaining_reduction_nelems + wg - 1) / wg); - - size_t reduction_groups = - (remaining_reduction_nelems + reductions_per_wi * wg - 1) / - (reductions_per_wi * wg); - assert(reduction_groups == 1); - - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; - - using KernelName = class sum_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor( - temp_arg, res_tp, ReductionOpT(), identity_val, - in_out_iter_indexer, reduction_indexer, - remaining_reduction_nelems, iter_nelems, - reductions_per_wi)); - }); - - sycl::event cleanup_host_task_event = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(final_reduction_ev); - const sycl::context &ctx = exec_q.get_context(); - - cgh.host_task([ctx, partially_reduced_tmp] { - sycl::free(partially_reduced_tmp, ctx); - }); - }); - - // FIXME: do not return host-task event - // Instead collect all host-tasks to a list - - return cleanup_host_task_event; - } -} - -/* @brief Types supported by plus-reduction code based on atomic_ref */ -template -struct TypePairSupportDataForSumReductionAtomic -{ - - /* value if true a kernel for must be instantiated, false - * otherwise */ - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint8 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint16 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint32 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input int64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input uint64 - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - // input double - td_ns::TypePairDefinedEntry, - // fall-through - 
td_ns::NotDefinedEntry>::is_defined; -}; - -template -struct TypePairSupportDataForSumReductionTemps -{ - - static constexpr bool is_defined = std::disjunction< // disjunction is C++17 - // feature, supported - // by DPC++ input bool - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input uint8_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input uint16_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input uint32_t - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - - // input int64_t - td_ns::TypePairDefinedEntry, - - // input uint32_t - td_ns::TypePairDefinedEntry, - - // input half - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns:: - TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, - - // input float - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - td_ns::TypePairDefinedEntry>, - - // input double - td_ns::TypePairDefinedEntry, - td_ns::TypePairDefinedEntry>, - - // input std::complex - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - - td_ns::TypePairDefinedEntry, - outTy, - std::complex>, - - // fall-throug - td_ns::NotDefinedEntry>::is_defined; -}; - -template -struct SumOverAxisAtomicStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - return dpctl::tensor::kernels:: - sum_reduction_over_group_with_atomics_strided_impl; - } - else { - return nullptr; - } - } -}; - -template -struct SumOverAxisTempsStridedFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionTemps< - srcTy, dstTy>::is_defined) { - return dpctl::tensor::kernels:: - sum_reduction_over_group_temps_strided_impl; - } - else { - return nullptr; - } - } -}; - -template -struct SumOverAxis1AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - return dpctl::tensor::kernels:: - sum_reduction_axis1_over_group_with_atomics_contig_impl; - } - else { - return nullptr; - } - } -}; - -template -struct SumOverAxis0AtomicContigFactory -{ - fnT get() const - { - if constexpr (TypePairSupportDataForSumReductionAtomic< - srcTy, dstTy>::is_defined) - { - return dpctl::tensor::kernels:: - sum_reduction_axis0_over_group_with_atomics_contig_impl; - } - else { - return nullptr; - } - } -}; - -} // namespace kernels -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index b490c8ed14..3e501590e1 100644 --- 
a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -100,40 +100,6 @@ template struct IsSyclOp detail::IsContained>::value; }; -struct AtomicSupport -{ - bool operator()(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) const - { - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = - dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = - dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; - } -}; - /*! @brief Find the smallest multiple of supported sub-group size larger than * nelems */ template diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index 2339429a48..346efaa936 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -151,6 +151,59 @@ void populate_min_over_axis_dispatch_tables(void) } // namespace impl +// Sum +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_sum_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + // Argmax namespace impl { @@ -216,8 +269,8 @@ void init_reduction_functions(py::module_ m) using impl::max_over_axis_strided_atomic_dispatch_table; using impl::max_over_axis_strided_temps_dispatch_table; - auto max_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( @@ -242,8 +295,8 @@ void 
init_reduction_functions(py::module_ m) using impl::min_over_axis_strided_atomic_dispatch_table; using impl::min_over_axis_strided_temps_dispatch_table; - auto min_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( @@ -258,6 +311,45 @@ void init_reduction_functions(py::module_ m) py::arg("sycl_queue"), py::arg("depends") = py::list()); } + // SUM + { + using dpctl::tensor::py_internal::impl:: + populate_sum_over_axis_dispatch_tables; + populate_sum_over_axis_dispatch_tables(); + using impl::sum_over_axis0_contig_atomic_dispatch_table; + using impl::sum_over_axis1_contig_atomic_dispatch_table; + using impl::sum_over_axis_strided_atomic_dispatch_table; + using impl::sum_over_axis_strided_temps_dispatch_table; + + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table, + sum_over_axis0_contig_atomic_dispatch_table, + sum_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto sum_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + sum_over_axis_strided_atomic_dispatch_table, + sum_over_axis_strided_temps_dispatch_table); + }; + m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } + // ARGMAX { using dpctl::tensor::py_internal::impl:: @@ -265,8 +357,8 @@ void init_reduction_functions(py::module_ m) populate_argmax_over_axis_dispatch_tables(); using impl::argmax_over_axis_strided_temps_dispatch_table; - auto argmax_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto argmax_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( @@ -285,8 +377,8 @@ void init_reduction_functions(py::module_ m) populate_argmin_over_axis_dispatch_tables(); using impl::argmin_over_axis_strided_temps_dispatch_table; - auto argmin_pyapi = [&](arrayT src, int trailing_dims_to_reduce, - arrayT dst, sycl::queue exec_q, + auto argmin_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { using dpctl::tensor::py_internal::py_search_over_axis; return py_search_over_axis( diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index 8ee3c0f352..c7bbadd455 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ 
b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -41,7 +41,6 @@ #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" -#include "utils/sycl_utils.hpp" #include "utils/type_dispatch.hpp" namespace dpctl @@ -51,6 +50,112 @@ namespace tensor namespace py_internal { +inline bool check_atomic_support(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type, + bool require_atomic64 = false) +{ + bool supports_atomics = false; + + const sycl::device &dev = exec_q.get_device(); + if (require_atomic64) { + if (!dev.has(sycl::aspect::atomic64)) + return false; + } + + switch (usm_alloc_type) { + case sycl::usm::alloc::shared: + supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); + break; + case sycl::usm::alloc::host: + supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); + break; + case sycl::usm::alloc::device: + supports_atomics = true; + break; + default: + supports_atomics = false; + } + + return supports_atomics; +} + +/* ====================== dtype supported ======================== */ + +template +bool py_reduction_dtype_supported(const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table) +{ + int arg_tn = + input_dtype.num(); // NumPy type numbers are the same as in dpctl + int out_tn = + output_dtype.num(); // NumPy type numbers are the same as in dpctl + int arg_typeid = -1; + int out_typeid = -1; + + auto array_types = td_ns::usm_ndarray_types(); + + try { + arg_typeid = array_types.typenum_to_lookup_id(arg_tn); + out_typeid = array_types.typenum_to_lookup_id(out_tn); + } catch (const std::exception &e) { + throw py::value_error(e.what()); + } + + if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || + out_typeid >= td_ns::num_types) + { + throw std::runtime_error("Reduction type support check: lookup failed"); + } + + // remove_all_extents gets underlying type of table + using fn_ptrT = typename std::remove_all_extents::type; + fn_ptrT fn = nullptr; + + sycl::usm::alloc kind = sycl::usm::alloc::unknown; + + if (dst_usm_type == "device") { + kind = sycl::usm::alloc::device; + } + else if (dst_usm_type == "shared") { + kind = sycl::usm::alloc::shared; + } + else if (dst_usm_type == "host") { + kind = sycl::usm::alloc::host; + } + else { + throw py::value_error("Unrecognized `dst_usm_type` argument."); + } + + bool supports_atomics = false; + + switch (output_dtype.itemsize()) { + case sizeof(float): + { + supports_atomics = check_atomic_support(q, kind); + } break; + case sizeof(double): + { + constexpr bool require_atomic64 = true; + supports_atomics = check_atomic_support(q, kind, require_atomic64); + } break; + } + + if (supports_atomics) { + fn = atomic_dispatch_table[arg_typeid][out_typeid]; + } + + if (fn == nullptr) { + // use slower reduction implementation using temporaries + fn = temps_dispatch_table[arg_typeid][out_typeid]; + } + + return (fn != nullptr); +} + /* ==================== Generic reductions ====================== */ template @@ -138,8 +243,6 @@ std::pair py_reduction_over_axis( void *data_ptr = dst.get_data(); const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - using dpctl::tensor::sycl_utils::AtomicSupport; - const auto &check_atomic_support = AtomicSupport{}; supports_atomics = check_atomic_support(exec_q, usm_type); } break; case sizeof(double): @@ -149,8 +252,6 @@ 
std::pair py_reduction_over_axis( auto usm_type = sycl::get_pointer_type(data_ptr, ctx); constexpr bool require_atomic64 = true; - using dpctl::tensor::sycl_utils::AtomicSupport; - const auto &check_atomic_support = AtomicSupport{}; supports_atomics = check_atomic_support(exec_q, usm_type, require_atomic64); } break; @@ -376,7 +477,7 @@ std::pair py_reduction_over_axis( sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(reduction_ev); - auto ctx = exec_q.get_context(); + const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp_allocation_ptr] { sycl::free(temp_allocation_ptr, ctx); }); @@ -559,7 +660,7 @@ std::pair py_search_over_axis( sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { cgh.depends_on(comp_ev); - auto ctx = exec_q.get_context(); + const auto &ctx = exec_q.get_context(); cgh.host_task([ctx, temp_allocation_ptr] { sycl::free(temp_allocation_ptr, ctx); }); diff --git a/dpctl/tensor/libtensor/source/sum_reductions.cpp b/dpctl/tensor/libtensor/source/sum_reductions.cpp deleted file mode 100644 index e4b6595d66..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.cpp +++ /dev/null @@ -1,542 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
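[Editorial sketch, C++/SYCL] The check_atomic_support helper introduced in reduction_over_axis.hpp above gates the atomics-based fast path on the destination's USM allocation kind and item size: 4-byte results need the device to support atomics on that USM kind, and 8-byte results additionally need sycl::aspect::atomic64. A small stand-alone query showing those same aspect checks follows; the default queue construction is illustrative, and only standard SYCL 2020 calls are used.

#include <sycl/sycl.hpp>
#include <iostream>

int main()
{
    sycl::queue q; // default device selection, for illustration only
    const sycl::device &dev = q.get_device();

    // Same aspects consulted by check_atomic_support()
    const bool shared_ok =
        dev.has(sycl::aspect::usm_atomic_shared_allocations);
    const bool host_ok = dev.has(sycl::aspect::usm_atomic_host_allocations);
    const bool atomic64 = dev.has(sycl::aspect::atomic64);

    std::cout << "shared USM atomics:       " << shared_ok << '\n'
              << "host USM atomics:         " << host_ok << '\n'
              << "64-bit atomics (doubles): " << atomic64 << '\n';

    // A float result in shared USM could take the atomics path iff shared_ok;
    // a double result there would additionally require atomic64.
    return 0;
}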
-// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include - -#include "dpctl4pybind11.hpp" -#include -#include -#include - -#include "kernels/sum_reductions.hpp" -#include "sum_reductions.hpp" - -#include "simplify_iteration_space.hpp" -#include "utils/memory_overlap.hpp" -#include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) -{ - bool supports_atomics = false; - - const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { - if (!dev.has(sycl::aspect::atomic64)) - return false; - } - - switch (usm_alloc_type) { - case sycl::usm::alloc::shared: - supports_atomics = dev.has(sycl::aspect::usm_atomic_shared_allocations); - break; - case sycl::usm::alloc::host: - supports_atomics = dev.has(sycl::aspect::usm_atomic_host_allocations); - break; - case sycl::usm::alloc::device: - supports_atomics = true; - break; - default: - supports_atomics = false; - } - - return supports_atomics; -} - -using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_strided_impl_fn_ptr - sum_over_axis_strided_temps_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; -static sum_reduction_contig_impl_fn_ptr - sum_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] - [td_ns::num_types]; - -std::pair py_sum_over_axis( - const dpctl::tensor::usm_ndarray &src, - int trailing_dims_to_reduce, // sum over this many trailing indexes - const dpctl::tensor::usm_ndarray &dst, - sycl::queue &exec_q, - const std::vector &depends) -{ - int src_nd = src.get_ndim(); - int iteration_nd = src_nd - trailing_dims_to_reduce; - if (trailing_dims_to_reduce <= 0 || iteration_nd < 0) { - throw py::value_error("Trailing_dim_to_reduce must be positive, but no " - "greater than rank of the array being reduced"); - } - - int dst_nd = dst.get_ndim(); - if (dst_nd != iteration_nd) { - throw py::value_error("Destination array rank does not match input " - "array rank and number of reduced dimensions"); - } - - const py::ssize_t *src_shape_ptr = src.get_shape_raw(); - const py::ssize_t *dst_shape_ptr = dst.get_shape_raw(); - - bool same_shapes = true; - for (int i = 0; same_shapes && (i < dst_nd); ++i) { - same_shapes = same_shapes && (src_shape_ptr[i] == dst_shape_ptr[i]); - } - - if (!same_shapes) { - throw py::value_error("Destination shape does not match unreduced " - "dimensions of the input shape"); - } - - if (!dpctl::utils::queues_are_compatible(exec_q, {src, dst})) { - throw py::value_error( - "Execution queue is not compatible with allocation queues"); - } - - size_t dst_nelems = dst.get_size(); - - size_t reduction_nelems(1); - for (int i = dst_nd; i < src_nd; ++i) { - reduction_nelems *= static_cast(src_shape_ptr[i]); - } - - // check that dst and src do not overlap - auto const &overlap = 
dpctl::tensor::overlap::MemoryOverlap(); - if (overlap(src, dst)) { - throw py::value_error("Arrays index overlapping segments of memory"); - } - - // destination must be ample enough to accommodate all elements - { - auto dst_offsets = dst.get_minmax_offsets(); - size_t range = - static_cast(dst_offsets.second - dst_offsets.first); - if (range + 1 < dst_nelems) { - throw py::value_error( - "Destination array can not accommodate all the " - "elements of source array."); - } - } - - int src_typenum = src.get_typenum(); - int dst_typenum = dst.get_typenum(); - - const auto &array_types = td_ns::usm_ndarray_types(); - int src_typeid = array_types.typenum_to_lookup_id(src_typenum); - int dst_typeid = array_types.typenum_to_lookup_id(dst_typenum); - - int dst_itemsize = dst.get_elemsize(); - bool supports_atomics = false; - - switch (dst_itemsize) { - case sizeof(float): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); - } break; - case sizeof(double): - { - void *data_ptr = dst.get_data(); - const auto &ctx = exec_q.get_context(); - auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); - } break; - } - - // handle special case when both reduction and iteration are 1D contiguous - // and can be done with atomics - if (supports_atomics) { - bool is_src_c_contig = src.is_c_contiguous(); - bool is_dst_c_contig = dst.is_c_contiguous(); - bool is_src_f_contig = src.is_f_contiguous(); - - if ((is_src_c_contig && is_dst_c_contig) || - (is_src_f_contig && dst_nelems == 1)) - { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - else if (is_src_f_contig && - ((is_dst_c_contig && dst_nd == 1) || dst.is_f_contiguous())) - { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - size_t iter_nelems = dst_nelems; - - constexpr py::ssize_t zero_offset = 0; - - sycl::event sum_over_axis_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), - zero_offset, // iteration_src_offset - zero_offset, // iteration_dst_offset - zero_offset, // reduction_src_offset - depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis_contig_ev}); - - return std::make_pair(keep_args_event, sum_over_axis_contig_ev); - } - } - } - - using dpctl::tensor::py_internal::simplify_iteration_space; - using dpctl::tensor::py_internal::simplify_iteration_space_1; - - auto const &src_shape_vecs = src.get_shape_vector(); - auto const &src_strides_vecs = src.get_strides_vector(); - auto const &dst_strides_vecs = dst.get_strides_vector(); - - int reduction_nd = trailing_dims_to_reduce; - const py::ssize_t *reduction_shape_ptr = src_shape_ptr + dst_nd; - using shT = 
std::vector; - shT reduction_src_strides(std::begin(src_strides_vecs) + dst_nd, - std::end(src_strides_vecs)); - - shT simplified_reduction_shape; - shT simplified_reduction_src_strides; - py::ssize_t reduction_src_offset(0); - - simplify_iteration_space_1( - reduction_nd, reduction_shape_ptr, reduction_src_strides, - // output - simplified_reduction_shape, simplified_reduction_src_strides, - reduction_src_offset); - - const py::ssize_t *iteration_shape_ptr = src_shape_ptr; - - shT iteration_src_strides(std::begin(src_strides_vecs), - std::begin(src_strides_vecs) + iteration_nd); - shT const &iteration_dst_strides = dst_strides_vecs; - - shT simplified_iteration_shape; - shT simplified_iteration_src_strides; - shT simplified_iteration_dst_strides; - py::ssize_t iteration_src_offset(0); - py::ssize_t iteration_dst_offset(0); - - if (iteration_nd == 0) { - if (dst_nelems != 1) { - throw std::runtime_error("iteration_nd == 0, but dst_nelems != 1"); - } - iteration_nd = 1; - simplified_iteration_shape.push_back(1); - simplified_iteration_src_strides.push_back(0); - simplified_iteration_dst_strides.push_back(0); - } - else { - simplify_iteration_space(iteration_nd, iteration_shape_ptr, - iteration_src_strides, iteration_dst_strides, - // output - simplified_iteration_shape, - simplified_iteration_src_strides, - simplified_iteration_dst_strides, - iteration_src_offset, iteration_dst_offset); - } - - if (supports_atomics && (reduction_nd == 1) && (iteration_nd == 1)) { - bool mat_reduce_over_axis1 = false; - bool mat_reduce_over_axis0 = false; - bool array_reduce_all_elems = false; - size_t iter_nelems = dst_nelems; - - if (simplified_reduction_src_strides[0] == 1) { - array_reduce_all_elems = (simplified_iteration_shape[0] == 1); - mat_reduce_over_axis1 = - (simplified_iteration_dst_strides[0] == 1) && - (static_cast(simplified_iteration_src_strides[0]) == - reduction_nelems); - } - else if (static_cast(simplified_reduction_src_strides[0]) == - iter_nelems) - { - mat_reduce_over_axis0 = - (simplified_iteration_dst_strides[0] == 1) && - (simplified_iteration_src_strides[0] == 1); - } - - if (mat_reduce_over_axis1 || array_reduce_all_elems) { - auto fn = sum_over_axis1_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis1_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis1_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis1_contig_ev); - } - } - else if (mat_reduce_over_axis0) { - auto fn = sum_over_axis0_contig_atomic_dispatch_table[src_typeid] - [dst_typeid]; - if (fn != nullptr) { - sycl::event sum_over_axis0_contig_ev = - fn(exec_q, iter_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_src_offset, - iteration_dst_offset, reduction_src_offset, depends); - - sycl::event keep_args_event = dpctl::utils::keep_args_alive( - exec_q, {src, dst}, {sum_over_axis0_contig_ev}); - - return std::make_pair(keep_args_event, - sum_over_axis0_contig_ev); - } - } - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[src_typeid][dst_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = 
sum_over_axis_strided_temps_dispatch_table[src_typeid][dst_typeid]; - if (fn == nullptr) { - throw std::runtime_error("Datatypes are not supported"); - } - } - - std::vector host_task_events{}; - - using dpctl::tensor::offset_utils::device_allocate_and_pack; - - const auto &arrays_metainfo_packing_triple_ = - device_allocate_and_pack( - exec_q, host_task_events, - // iteration metadata - simplified_iteration_shape, simplified_iteration_src_strides, - simplified_iteration_dst_strides, - // reduction metadata - simplified_reduction_shape, simplified_reduction_src_strides); - py::ssize_t *temp_allocation_ptr = - std::get<0>(arrays_metainfo_packing_triple_); - if (temp_allocation_ptr == nullptr) { - throw std::runtime_error("Unable to allocate memory on device"); - } - const auto ©_metadata_ev = std::get<2>(arrays_metainfo_packing_triple_); - - py::ssize_t *iter_shape_and_strides = temp_allocation_ptr; - py::ssize_t *reduction_shape_stride = - temp_allocation_ptr + 3 * simplified_iteration_shape.size(); - - std::vector all_deps; - all_deps.reserve(depends.size() + 1); - all_deps.resize(depends.size()); - std::copy(depends.begin(), depends.end(), all_deps.begin()); - all_deps.push_back(copy_metadata_ev); - - auto comp_ev = fn(exec_q, dst_nelems, reduction_nelems, src.get_data(), - dst.get_data(), iteration_nd, iter_shape_and_strides, - iteration_src_offset, iteration_dst_offset, - reduction_nd, // number dimensions being reduced - reduction_shape_stride, reduction_src_offset, all_deps); - - sycl::event temp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(comp_ev); - const auto &ctx = exec_q.get_context(); - cgh.host_task([ctx, temp_allocation_ptr] { - sycl::free(temp_allocation_ptr, ctx); - }); - }); - host_task_events.push_back(temp_cleanup_ev); - - sycl::event keep_args_event = - dpctl::utils::keep_args_alive(exec_q, {src, dst}, host_task_events); - - return std::make_pair(keep_args_event, comp_ev); -} - -bool py_sum_over_axis_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q) -{ - int arg_tn = - input_dtype.num(); // NumPy type numbers are the same as in dpctl - int out_tn = - output_dtype.num(); // NumPy type numbers are the same as in dpctl - int arg_typeid = -1; - int out_typeid = -1; - - auto array_types = td_ns::usm_ndarray_types(); - - try { - arg_typeid = array_types.typenum_to_lookup_id(arg_tn); - out_typeid = array_types.typenum_to_lookup_id(out_tn); - } catch (const std::exception &e) { - throw py::value_error(e.what()); - } - - if (arg_typeid < 0 || arg_typeid >= td_ns::num_types || out_typeid < 0 || - out_typeid >= td_ns::num_types) - { - throw std::runtime_error("Reduction type support check: lookup failed"); - } - - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - sum_reduction_strided_impl_fn_ptr fn = nullptr; - - sycl::usm::alloc kind = sycl::usm::alloc::unknown; - - if (dst_usm_type == "device") { - kind = sycl::usm::alloc::device; - } - else if (dst_usm_type == "shared") { - kind = sycl::usm::alloc::shared; - } - else if (dst_usm_type == "host") { - kind = sycl::usm::alloc::host; - } - else { - throw py::value_error("Unrecognized `dst_usm_type` argument."); - } - - bool supports_atomics = false; - - switch (output_dtype.itemsize()) { - case sizeof(float): - { - supports_atomics = check_atomic_support(q, kind); - } break; - case sizeof(double): - { - constexpr bool require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, 
require_atomic64); - } break; - } - - if (supports_atomics) { - fn = - sum_over_axis_strided_atomic_dispatch_table[arg_typeid][out_typeid]; - } - - if (fn == nullptr) { - // use slower reduction implementation using temporaries - fn = sum_over_axis_strided_temps_dispatch_table[arg_typeid][out_typeid]; - } - - return (fn != nullptr); -} - -void populate_sum_over_axis_dispatch_table(void) -{ - using dpctl::tensor::kernels::sum_reduction_contig_impl_fn_ptr; - using dpctl::tensor::kernels::sum_reduction_strided_impl_fn_ptr; - using namespace td_ns; - - using dpctl::tensor::kernels::SumOverAxisAtomicStridedFactory; - DispatchTableBuilder - dtb1; - dtb1.populate_dispatch_table(sum_over_axis_strided_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxisTempsStridedFactory; - DispatchTableBuilder - dtb2; - dtb2.populate_dispatch_table(sum_over_axis_strided_temps_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis1AtomicContigFactory; - DispatchTableBuilder - dtb3; - dtb3.populate_dispatch_table(sum_over_axis1_contig_atomic_dispatch_table); - - using dpctl::tensor::kernels::SumOverAxis0AtomicContigFactory; - DispatchTableBuilder - dtb4; - dtb4.populate_dispatch_table(sum_over_axis0_contig_atomic_dispatch_table); -} - -namespace py = pybind11; - -void init_sum_reduction_functions(py::module_ m) -{ - populate_sum_over_axis_dispatch_table(); - - m.def("_sum_over_axis", &py_sum_over_axis, "", py::arg("src"), - py::arg("trailing_dims_to_reduce"), py::arg("dst"), - py::arg("sycl_queue"), py::arg("depends") = py::list()); - - m.def("_sum_over_axis_dtype_supported", &py_sum_over_axis_dtype_supported, - "", py::arg("arg_dtype"), py::arg("out_dtype"), - py::arg("dst_usm_type"), py::arg("sycl_queue")); -} - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/sum_reductions.hpp b/dpctl/tensor/libtensor/source/sum_reductions.hpp deleted file mode 100644 index 6c34160fb6..0000000000 --- a/dpctl/tensor/libtensor/source/sum_reductions.hpp +++ /dev/null @@ -1,40 +0,0 @@ -//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// -// -// Data Parallel Control (dpctl) -// -// Copyright 2020-2023 Intel Corporation -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -//===--------------------------------------------------------------------===// -/// -/// \file -/// This file defines functions of dpctl.tensor._tensor_impl extensions -//===--------------------------------------------------------------------===// - -#pragma once -#include -#include - -namespace dpctl -{ -namespace tensor -{ -namespace py_internal -{ - -extern void init_sum_reduction_functions(py::module_ m); - -} // namespace py_internal -} // namespace tensor -} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/tensor_py.cpp b/dpctl/tensor/libtensor/source/tensor_py.cpp index 8b687a6d1d..6bd0649c1f 100644 --- a/dpctl/tensor/libtensor/source/tensor_py.cpp +++ b/dpctl/tensor/libtensor/source/tensor_py.cpp @@ -49,7 +49,6 @@ #include "reduction_over_axis.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" -#include "sum_reductions.hpp" #include "triul_ctor.hpp" #include "utils/memory_overlap.hpp" #include "utils/strided_iters.hpp" @@ -413,6 +412,5 @@ PYBIND11_MODULE(_tensor_impl, m) dpctl::tensor::py_internal::init_elementwise_functions(m); dpctl::tensor::py_internal::init_boolean_reduction_functions(m); - dpctl::tensor::py_internal::init_sum_reduction_functions(m); dpctl::tensor::py_internal::init_reduction_functions(m); } From 78829e7a65d0c851e2ea4b4be326959ca88f67b8 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 10:32:04 -0700 Subject: [PATCH 14/26] Sum now uses a generic Python API --- dpctl/tensor/_reduction.py | 149 ++++++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 60 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index f0fd40bc18..d89e7f2465 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -52,55 +52,16 @@ def _default_reduction_dtype(inp_dt, q): return res_dt -def sum(x, axis=None, dtype=None, keepdims=False): - """sum(x, axis=None, dtype=None, keepdims=False) - - Calculates the sum of the input array `x`. - - Args: - x (usm_ndarray): - input array. - axis (Optional[int, Tuple[int,...]]): - axis or axes along which sums must be computed. If a tuple - of unique integers, sums are computed over multiple axes. - If `None`, the sum if computed over the entire array. - Default: `None`. - dtype (Optional[dtype]): - data type of the returned array. If `None`, the default data - type is inferred from the "kind" of the input array data type. - * If `x` has a real-valued floating-point data type, - the returned array will have the default real-valued - floating-point data type for the device where input - array `x` is allocated. - * If x` has signed integral data type, the returned array - will have the default signed integral type for the device - where input array `x` is allocated. - * If `x` has unsigned integral data type, the returned array - will have the default unsigned integral type for the device - where input array `x` is allocated. - * If `x` has a complex-valued floating-point data typee, - the returned array will have the default complex-valued - floating-pointer data type for the device where input - array `x` is allocated. - * If `x` has a boolean data type, the returned array will - have the default signed integral type for the device - where input array `x` is allocated. - If the data type (either specified or resolved) differs from the - data type of `x`, the input array elements are cast to the - specified data type before computing the sum. Default: `None`. 
- keepdims (Optional[bool]): - if `True`, the reduced axes (dimensions) are included in the result - as singleton dimensions, so that the returned array remains - compatible with the input arrays according to Array Broadcasting - rules. Otherwise, if `False`, the reduced axes are not included in - the returned array. Default: `False`. - Returns: - usm_ndarray: - an array containing the sums. If the sum was computed over the - entire array, a zero-dimensional array is returned. The returned - array has the data type as described in the `dtype` parameter - description above. - """ +def _reduction_over_axis( + x, + axis, + dtype, + keepdims, + _reduction_fn, + _dtype_supported, + _default_reduction_type_fn, + _identity=None, +): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") nd = x.ndim @@ -116,29 +77,36 @@ def sum(x, axis=None, dtype=None, keepdims=False): q = x.sycl_queue inp_dt = x.dtype if dtype is None: - res_dt = _default_reduction_dtype(inp_dt, q) + res_dt = _default_reduction_type_fn(inp_dt, q) else: res_dt = dpt.dtype(dtype) res_dt = _to_device_supported_dtype(res_dt, q.sycl_device) res_usm_type = x.usm_type if x.size == 0: - if keepdims: - res_shape = res_shape + (1,) * red_nd - inv_perm = sorted(range(nd), key=lambda d: perm[d]) - res_shape = tuple(res_shape[i] for i in inv_perm) - return dpt.zeros( - res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q - ) + if _identity is None: + raise ValueError("reduction does not support zero-size arrays") + else: + if keepdims: + res_shape = res_shape + (1,) * red_nd + inv_perm = sorted(range(nd), key=lambda d: perm[d]) + res_shape = tuple(res_shape[i] for i in inv_perm) + return dpt.full( + res_shape, + _identity, + dtype=res_dt, + usm_type=res_usm_type, + sycl_queue=q, + ) if red_nd == 0: return dpt.astype(x, res_dt, copy=False) host_tasks_list = [] - if ti._sum_over_axis_dtype_supported(inp_dt, res_dt, res_usm_type, q): + if _dtype_supported(inp_dt, res_dt, res_usm_type, q): res = dpt.empty( res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=q ) - ht_e, _ = ti._sum_over_axis( + ht_e, _ = _reduction_fn( src=arr2, trailing_dims_to_reduce=red_nd, dst=res, sycl_queue=q ) host_tasks_list.append(ht_e) @@ -152,7 +120,7 @@ def sum(x, axis=None, dtype=None, keepdims=False): tmp = dpt.empty( res_shape, dtype=tmp_dt, usm_type=res_usm_type, sycl_queue=q ) - ht_e_tmp, r_e = ti._sum_over_axis( + ht_e_tmp, r_e = _reduction_fn( src=arr2, trailing_dims_to_reduce=red_nd, dst=tmp, sycl_queue=q ) host_tasks_list.append(ht_e_tmp) @@ -173,6 +141,67 @@ def sum(x, axis=None, dtype=None, keepdims=False): return res +def sum(x, axis=None, dtype=None, keepdims=False): + """sum(x, axis=None, dtype=None, keepdims=False) + + Calculates the sum of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which sums must be computed. If a tuple + of unique integers, sums are computed over multiple axes. + If `None`, the sum is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. 
+ * If x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data typee, + the returned array will have the default complex-valued + floating-pointer data type for the device where input + array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the sum. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the sums. If the sum was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above. + """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._sum_over_axis, + ti._sum_over_axis_dtype_supported, + _default_reduction_dtype, + _identity=0, + ) + + def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") From f01991b14ecfbe8a32e7a9f07febdf651f7734a5 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 10:44:46 -0700 Subject: [PATCH 15/26] Docstrings added for argmax, argmin, max, and min --- dpctl/tensor/_reduction.py | 104 +++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index d89e7f2465..0bbfc262a4 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -246,10 +246,58 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn): def max(x, axis=None, keepdims=False): + """max(x, axis=None, dtype=None, keepdims=False) + + Calculates the maximum value of the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which maxima must be computed. If a tuple + of unique integers, the maxima are computed over multiple axes. + If `None`, the max is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the maxima. If the max was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. + """ return _comparison_over_axis(x, axis, keepdims, ti._max_over_axis) def min(x, axis=None, keepdims=False): + """min(x, axis=None, dtype=None, keepdims=False) + + Calculates the minimum value of the input array `x`. 
+ + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which minima must be computed. If a tuple + of unique integers, the minima are computed over multiple axes. + If `None`, the min is computed over the entire array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the minima. If the min was computed over the + entire array, a zero-dimensional array is returned. The returned + array has the same data type as `x`. + """ return _comparison_over_axis(x, axis, keepdims, ti._min_over_axis) @@ -303,8 +351,64 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn): def argmax(x, axis=None, keepdims=False): + """argmax(x, axis=None, dtype=None, keepdims=False) + + Returns the indices of the maximum values of the input array `x` along a + specified axis. + + When the maximum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + maximum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + maximum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. + """ return _search_over_axis(x, axis, keepdims, ti._argmax_over_axis) def argmin(x, axis=None, keepdims=False): + """argmin(x, axis=None, dtype=None, keepdims=False) + + Returns the indices of the minimum values of the input array `x` along a + specified axis. + + When the minimum value occurs multiple times, the indices corresponding to + the first occurrence are returned. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int]): + axis along which to search. If `None`, returns the index of the + minimum value of the flattened array. + Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the indices of the first occurrence of the + minimum values. If the entire array was searched, a + zero-dimensional array is returned. The returned array has the + default array index data type for the device of `x`. 
+ """ return _search_over_axis(x, axis, keepdims, ti._argmin_over_axis) From 8597300d1ebbfeb793b8c2436eb9ebe257f77007 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Wed, 27 Sep 2023 17:10:34 -0700 Subject: [PATCH 16/26] Small reduction clean-ups Removed unnecessary copies in custom_reduce_over_group Sequential reduction now casts before calling operator (makes behavior explicit rather than implicit) --- dpctl/tensor/libtensor/include/kernels/reductions.hpp | 4 +++- dpctl/tensor/libtensor/include/utils/sycl_utils.hpp | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 8a1182421a..abeef5d669 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -97,7 +97,9 @@ struct SequentialReduction const py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; - red_val = reduction_op_(red_val, inp_[inp_offset]); + using dpctl::tensor::type_utils::convert_impl; + outT val = convert_impl(inp_[inp_offset]); + red_val = reduction_op_(red_val, val); } out_[out_iter_offset] = red_val; diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 3e501590e1..e209f5b088 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -132,10 +132,10 @@ size_t choose_workgroup_size(const size_t nelems, } template -T custom_reduce_over_group(GroupT wg, +T custom_reduce_over_group(const GroupT &wg, LocAccT local_mem_acc, - T local_val, - OpT op) + const T &local_val, + const OpT &op) { size_t wgs = wg.get_local_linear_range(); local_mem_acc[wg.get_local_linear_id()] = local_val; From 2c186676d14ffbdf1f49c86ec503a350afe5964c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 28 Sep 2023 15:33:07 -0500 Subject: [PATCH 17/26] Added test for argmin with keepdims=True --- dpctl/tests/test_usm_ndarray_reductions.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index e137304dc5..7d328da967 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -42,11 +42,16 @@ def test_max_min_axis(): def test_reduction_keepdims(): get_queue_or_skip() - x = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + n0, n1 = 3, 6 + x = dpt.ones((n0, 4, 5, n1, 7), dtype="i4") m = dpt.max(x, axis=(1, 2, -1), keepdims=True) - assert m.shape == (3, 1, 1, 6, 1) + xx = dpt.reshape(dpt.permute_dims(x, (0, 3, 1, 2, -1)), (n0, n1, -1)) + p = dpt.argmax(xx, axis=-1, keepdims=True) + + assert m.shape == (n0, 1, 1, n1, 1) assert dpt.all(m == dpt.reshape(x[:, 0, 0, :, 0], m.shape)) + assert dpt.all(p == 0) def test_max_scalar(): From 24b54d776a2f7fc9362a7331175543e6738a8dfd Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Fri, 29 Sep 2023 12:48:07 -0700 Subject: [PATCH 18/26] Added a test for raised errors in reductions Also removed unused `_usm_types` in `test_tensor_sum` --- dpctl/tests/test_tensor_sum.py | 1 - dpctl/tests/test_usm_ndarray_reductions.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 403a823324..8f2bd45362 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -36,7 +36,6 @@ "c8", "c16", ] 
-_usm_types = ["device", "shared", "host"] @pytest.mark.parametrize("arg_dtype", _all_dtypes) diff --git a/dpctl/tests/test_usm_ndarray_reductions.py b/dpctl/tests/test_usm_ndarray_reductions.py index 7d328da967..8d66f35d71 100644 --- a/dpctl/tests/test_usm_ndarray_reductions.py +++ b/dpctl/tests/test_usm_ndarray_reductions.py @@ -216,3 +216,21 @@ def test_argmax_argmin_identities(): assert dpt.argmax(x) == 0 x = dpt.full(3, dpt.iinfo(dpt.int32).max, dtype="i4") assert dpt.argmin(x) == 0 + + +def test_reduction_arg_validation(): + get_queue_or_skip() + + x = dict() + with pytest.raises(TypeError): + dpt.sum(x) + with pytest.raises(TypeError): + dpt.max(x) + with pytest.raises(TypeError): + dpt.argmax(x) + + x = dpt.zeros((0,), dtype="i4") + with pytest.raises(ValueError): + dpt.max(x) + with pytest.raises(ValueError): + dpt.argmax(x) From df1c22f79c49e9d2dbcfb3d4bcc0048d630648b0 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Sat, 30 Sep 2023 15:52:08 -0700 Subject: [PATCH 19/26] Removed `void` overloads from reduction utilities These were unused by dpctl --- .../libtensor/include/utils/sycl_utils.hpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index e209f5b088..3ecfbe67c7 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -96,8 +96,7 @@ template struct IsSyclOp { static constexpr bool value = detail::IsContained>>::value || - detail::IsContained>>::value || - detail::IsContained>::value; + detail::IsContained>>::value; }; /*! @brief Find the smallest multiple of supported sub-group size larger than @@ -212,9 +211,7 @@ template struct GetIdentity template using IsMaximum = std::bool_constant> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v>>; + std::is_same_v>>; template struct GetIdentity::value>> @@ -244,9 +241,7 @@ struct GetIdentity using IsMinimum = std::bool_constant> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v>>; + std::is_same_v>>; template struct GetIdentity::value>> @@ -275,9 +270,8 @@ struct GetIdentity -using IsPlus = std::bool_constant< - std::is_same_v> || std::is_same_v> || - std::is_same_v> || std::is_same_v>>; +using IsPlus = std::bool_constant> || + std::is_same_v>>; // Identity From 478b30c71456061e3e6450c1e1d452bafd282f67 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 1 Oct 2023 07:51:37 -0500 Subject: [PATCH 20/26] Added missing include, Identity to use has_known_identity Implementation of Identity trait should call sycl::known_identity if trait sycl::has_known_identity is a true_type. Added IsMultiplies, and identity value for it, since sycl::known_identity for multiplies is only defined for real-valued types. 
--- .../libtensor/include/utils/sycl_utils.hpp | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp index 3ecfbe67c7..0d4240c516 100644 --- a/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/sycl_utils.hpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include "math_utils.hpp" @@ -272,6 +273,18 @@ struct GetIdentity using IsPlus = std::bool_constant> || std::is_same_v>>; +// Multiplies + +template +using IsMultiplies = + std::bool_constant> || + std::is_same_v>>; + +template +struct GetIdentity::value>> +{ + static constexpr T value = static_cast(1); +}; // Identity @@ -280,13 +293,17 @@ template struct Identity }; template -struct Identity::value>> +using UseBuiltInIdentity = + std::conjunction, sycl::has_known_identity>; + +template +struct Identity::value>> { static constexpr T value = GetIdentity::value; }; template -struct Identity::value>> +struct Identity::value>> { static constexpr T value = sycl::known_identity::value; }; From 0598416a043331910e85372e822f7b10aa8229f4 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 1 Oct 2023 07:56:28 -0500 Subject: [PATCH 21/26] Adding functor factories for product over axis --- .../libtensor/include/kernels/reductions.hpp | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index abeef5d669..28adaa1db9 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -1831,6 +1831,250 @@ struct SumOverAxis0AtomicContigFactory } }; +// Product + +/* @brief Types supported by plus-reduction code based on atomic_ref */ +template +struct TypePairSupportDataForProductReductionAtomic +{ + + /* value if true a kernel for must be instantiated, false + * otherwise */ + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint8 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint16 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint32 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input int64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input uint64 + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input half + 
td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + // input double + td_ns::TypePairDefinedEntry, + // fall-through + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct TypePairSupportDataForProductReductionTemps +{ + + static constexpr bool is_defined = std::disjunction< // disjunction is C++17 + // feature, supported + // by DPC++ input bool + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint8_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint16_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + + // input int64_t + td_ns::TypePairDefinedEntry, + + // input uint32_t + td_ns::TypePairDefinedEntry, + + // input half + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns:: + TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input float + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + td_ns::TypePairDefinedEntry>, + + // input double + td_ns::TypePairDefinedEntry, + td_ns::TypePairDefinedEntry>, + + // input std::complex + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + td_ns::TypePairDefinedEntry, + outTy, + std::complex>, + + // fall-throug + td_ns::NotDefinedEntry>::is_defined; +}; + +template +struct ProductOverAxisAtomicStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_with_atomics_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxisTempsStridedFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionTemps< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_over_group_temps_strided_impl; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis1AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis1_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + +template +struct ProductOverAxis0AtomicContigFactory +{ + fnT get() const + { + if constexpr (TypePairSupportDataForProductReductionAtomic< + srcTy, dstTy>::is_defined) + { + using 
ReductionOpT = sycl::multiplies; + return dpctl::tensor::kernels:: + reduction_axis0_over_group_with_atomics_contig_impl< + srcTy, dstTy, ReductionOpT>; + } + else { + return nullptr; + } + } +}; + // Argmax and Argmin /* = Search reduction using reduce_over_group*/ From ca0ff64378e97b819321347e793a324d05a23c41 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 1 Oct 2023 07:57:37 -0500 Subject: [PATCH 22/26] Added Python API for _prod_over_axis --- .../libtensor/source/reduction_over_axis.cpp | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index 346efaa936..a20277c241 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -204,6 +204,59 @@ void populate_sum_over_axis_dispatch_tables(void) } // namespace impl +// Product +namespace impl +{ + +using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_strided_impl_fn_ptr + prod_over_axis_strided_temps_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; +static reduction_contig_impl_fn_ptr + prod_over_axis1_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; +static reduction_contig_impl_fn_ptr + prod_over_axis0_contig_atomic_dispatch_table[td_ns::num_types] + [td_ns::num_types]; + +void populate_prod_over_axis_dispatch_tables(void) +{ + using dpctl::tensor::kernels::reduction_contig_impl_fn_ptr; + using dpctl::tensor::kernels::reduction_strided_impl_fn_ptr; + using namespace td_ns; + + using dpctl::tensor::kernels::ProductOverAxisAtomicStridedFactory; + DispatchTableBuilder + dtb1; + dtb1.populate_dispatch_table(prod_over_axis_strided_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxisTempsStridedFactory; + DispatchTableBuilder + dtb2; + dtb2.populate_dispatch_table(prod_over_axis_strided_temps_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis1AtomicContigFactory; + DispatchTableBuilder + dtb3; + dtb3.populate_dispatch_table(prod_over_axis1_contig_atomic_dispatch_table); + + using dpctl::tensor::kernels::ProductOverAxis0AtomicContigFactory; + DispatchTableBuilder + dtb4; + dtb4.populate_dispatch_table(prod_over_axis0_contig_atomic_dispatch_table); +} + +} // namespace impl + // Argmax namespace impl { @@ -350,6 +403,45 @@ void init_reduction_functions(py::module_ m) py::arg("dst_usm_type"), py::arg("sycl_queue")); } + // PROD + { + using dpctl::tensor::py_internal::impl:: + populate_prod_over_axis_dispatch_tables; + populate_prod_over_axis_dispatch_tables(); + using impl::prod_over_axis0_contig_atomic_dispatch_table; + using impl::prod_over_axis1_contig_atomic_dispatch_table; + using impl::prod_over_axis_strided_atomic_dispatch_table; + using impl::prod_over_axis_strided_temps_dispatch_table; + + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, + const arrayT &dst, sycl::queue &exec_q, + const event_vecT &depends = {}) { + using dpctl::tensor::py_internal::py_reduction_over_axis; + return py_reduction_over_axis( + src, trailing_dims_to_reduce, dst, exec_q, depends, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table, + prod_over_axis0_contig_atomic_dispatch_table, + 
prod_over_axis1_contig_atomic_dispatch_table); + }; + m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), + py::arg("trailing_dims_to_reduce"), py::arg("dst"), + py::arg("sycl_queue"), py::arg("depends") = py::list()); + + auto prod_dtype_supported = + [&](const py::dtype &input_dtype, const py::dtype &output_dtype, + const std::string &dst_usm_type, sycl::queue &q) { + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + return py_reduction_dtype_supported( + input_dtype, output_dtype, dst_usm_type, q, + prod_over_axis_strided_atomic_dispatch_table, + prod_over_axis_strided_temps_dispatch_table); + }; + m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", + py::arg("arg_dtype"), py::arg("out_dtype"), + py::arg("dst_usm_type"), py::arg("sycl_queue")); + } + // ARGMAX { using dpctl::tensor::py_internal::impl:: From ee46ae1b8d6a4854dc28e283ccc2503015a6cea7 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 2 Oct 2023 13:46:16 -0500 Subject: [PATCH 23/26] Common reduction template takes functions to test if atomics are applicable Passing these function pointers around allows to turn atomic off altogether if desired. Use custom trait to check if reduce_over_groups can be used. This allows to work-around bug, or switch to custom code for reduction over group if desired. Such custom trait type works around issue with incorrect result returned from sycl::reduce_over_group for sycl::multiplies operator for 64-bit integral types. --- .../libtensor/include/kernels/reductions.hpp | 151 ++++++++++-------- .../libtensor/source/reduction_over_axis.cpp | 50 ++++-- .../libtensor/source/reduction_over_axis.hpp | 49 +++--- 3 files changed, 152 insertions(+), 98 deletions(-) diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index 28adaa1db9..7cb97cd4f9 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -50,6 +50,14 @@ namespace tensor namespace kernels { +template struct can_use_reduce_over_group +{ + static constexpr bool value = + sycl::has_known_identity::value && + !std::is_same_v && !std::is_same_v && + !std::is_same_v>; +}; + template {iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_over_group_with_atomics_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -618,7 +627,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_axis1_over_group_with_atomics_contig_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -717,7 +727,8 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_axis0_over_group_with_atomics_contig_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, @@ -1007,10 +1018,12 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr 
(su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; + cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), ReductionOverGroupNoAtomicFunctor< @@ -1026,6 +1039,7 @@ sycl::event reduction_over_group_temps_strided_impl( using KernelName = class custom_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT, SlmT>; + cgh.parallel_for( sycl::nd_range<1>(globalRange, localRange), CustomReductionOverGroupNoAtomicFunctor< @@ -1062,68 +1076,67 @@ sycl::event reduction_over_group_temps_strided_impl( partially_reduced_tmp + reduction_groups * iter_nelems; } - const sycl::event &first_reduction_ev = - exec_q.submit([&](sycl::handler &cgh) { - cgh.depends_on(depends); + const sycl::event &first_reduction_ev = exec_q.submit([&](sycl::handler + &cgh) { + cgh.depends_on(depends); - using InputIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; - using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - using InputOutputIterIndexerT = - dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< - InputIndexerT, ResIndexerT>; - using ReductionIndexerT = - dpctl::tensor::offset_utils::StridedIndexer; + using InputIndexerT = dpctl::tensor::offset_utils::StridedIndexer; + using ResIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; + using InputOutputIterIndexerT = + dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer< + InputIndexerT, ResIndexerT>; + using ReductionIndexerT = + dpctl::tensor::offset_utils::StridedIndexer; - // Only 2*iter_nd entries describing shape and strides of - // iterated dimensions of input array from - // iter_shape_and_strides are going to be accessed by - // inp_indexer - InputIndexerT inp_indexer(iter_nd, iter_arg_offset, - iter_shape_and_strides); - ResIndexerT noop_tmp_indexer{}; + // Only 2*iter_nd entries describing shape and strides of + // iterated dimensions of input array from + // iter_shape_and_strides are going to be accessed by + // inp_indexer + InputIndexerT inp_indexer(iter_nd, iter_arg_offset, + iter_shape_and_strides); + ResIndexerT noop_tmp_indexer{}; - InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, - noop_tmp_indexer}; - ReductionIndexerT reduction_indexer{ - red_nd, reduction_arg_offset, reduction_shape_stride}; + InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, + noop_tmp_indexer}; + ReductionIndexerT reduction_indexer{red_nd, reduction_arg_offset, + reduction_shape_stride}; - auto globalRange = - sycl::range<1>{iter_nelems * reduction_groups * wg}; - auto localRange = sycl::range<1>{wg}; + auto globalRange = + sycl::range<1>{iter_nelems * reduction_groups * wg}; + auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { - using KernelName = class reduction_over_group_temps_krn< + if constexpr (can_use_reduce_over_group::value) + { + using KernelName = class reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + ReductionOverGroupNoAtomicFunctor< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - ReductionOverGroupNoAtomicFunctor< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, 
in_out_iter_indexer, - reduction_indexer, reduction_nelems, iter_nelems, - preferrered_reductions_per_wi)); - } - else { - using SlmT = sycl::local_accessor; - SlmT local_memory = SlmT(localRange, cgh); - using KernelName = - class custom_reduction_over_group_temps_krn< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>; - cgh.parallel_for( - sycl::nd_range<1>(globalRange, localRange), - CustomReductionOverGroupNoAtomicFunctor< - argTy, resTy, ReductionOpT, InputOutputIterIndexerT, - ReductionIndexerT, SlmT>( - arg_tp, partially_reduced_tmp, ReductionOpT(), - identity_val, in_out_iter_indexer, - reduction_indexer, local_memory, reduction_nelems, - iter_nelems, preferrered_reductions_per_wi)); - } - }); + ReductionIndexerT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + else { + using SlmT = sycl::local_accessor; + SlmT local_memory = SlmT(localRange, cgh); + using KernelName = class custom_reduction_over_group_temps_krn< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>; + cgh.parallel_for( + sycl::nd_range<1>(globalRange, localRange), + CustomReductionOverGroupNoAtomicFunctor< + argTy, resTy, ReductionOpT, InputOutputIterIndexerT, + ReductionIndexerT, SlmT>( + arg_tp, partially_reduced_tmp, ReductionOpT(), + identity_val, in_out_iter_indexer, reduction_indexer, + local_memory, reduction_nelems, iter_nelems, + preferrered_reductions_per_wi)); + } + }); size_t remaining_reduction_nelems = reduction_groups; @@ -1165,7 +1178,8 @@ sycl::event reduction_over_group_temps_strided_impl( auto globalRange = sycl::range<1>{iter_nelems * reduction_groups_ * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) { using KernelName = class reduction_over_group_temps_krn< resTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -1240,7 +1254,8 @@ sycl::event reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, InputOutputIterIndexerT, ReductionIndexerT>; @@ -2564,7 +2579,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, true>; @@ -2663,7 +2679,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, true, false>; @@ -2743,7 +2760,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( auto globalRange = sycl::range<1>{iter_nelems * reduction_groups_ * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr 
(can_use_reduce_over_group::value) { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, @@ -2826,7 +2844,8 @@ sycl::event search_reduction_over_group_temps_strided_impl( sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; - if constexpr (su_ns::IsSyclOp::value) { + if constexpr (can_use_reduce_over_group::value) + { using KernelName = class search_reduction_over_group_temps_krn< argTy, resTy, ReductionOpT, IndexOpT, InputOutputIterIndexerT, ReductionIndexerT, false, true>; diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp index a20277c241..c67fcd5ba3 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.cpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.cpp @@ -312,6 +312,12 @@ void init_reduction_functions(py::module_ m) namespace impl = dpctl::tensor::py_internal::impl; + using dpctl::tensor::py_internal::py_reduction_dtype_supported; + using dpctl::tensor::py_internal::py_reduction_over_axis; + + using dpctl::tensor::py_internal::check_atomic_support; + using dpctl::tensor::py_internal::fixed_decision; + // MAX { using dpctl::tensor::py_internal::impl:: @@ -322,16 +328,21 @@ void init_reduction_functions(py::module_ m) using impl::max_over_axis_strided_atomic_dispatch_table; using impl::max_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + auto max_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, max_over_axis_strided_atomic_dispatch_table, max_over_axis_strided_temps_dispatch_table, max_over_axis0_contig_atomic_dispatch_table, - max_over_axis1_contig_atomic_dispatch_table); + max_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_max_over_axis", max_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -348,16 +359,21 @@ void init_reduction_functions(py::module_ m) using impl::min_over_axis_strided_atomic_dispatch_table; using impl::min_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + auto min_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, min_over_axis_strided_atomic_dispatch_table, min_over_axis_strided_temps_dispatch_table, min_over_axis0_contig_atomic_dispatch_table, - min_over_axis1_contig_atomic_dispatch_table); + min_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_min_over_axis", min_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -374,16 +390,21 @@ void init_reduction_functions(py::module_ m) using impl::sum_over_axis_strided_atomic_dispatch_table; using impl::sum_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + 
check_atomic_support; + auto sum_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, sum_over_axis_strided_atomic_dispatch_table, sum_over_axis_strided_temps_dispatch_table, sum_over_axis0_contig_atomic_dispatch_table, - sum_over_axis1_contig_atomic_dispatch_table); + sum_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_sum_over_axis", sum_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -392,11 +413,11 @@ void init_reduction_functions(py::module_ m) auto sum_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, sum_over_axis_strided_atomic_dispatch_table, - sum_over_axis_strided_temps_dispatch_table); + sum_over_axis_strided_temps_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_sum_over_axis_dtype_supported", sum_dtype_supported, "", py::arg("arg_dtype"), py::arg("out_dtype"), @@ -413,16 +434,21 @@ void init_reduction_functions(py::module_ m) using impl::prod_over_axis_strided_atomic_dispatch_table; using impl::prod_over_axis_strided_temps_dispatch_table; + const auto &check_atomic_support_size4 = + check_atomic_support; + const auto &check_atomic_support_size8 = + check_atomic_support; + auto prod_pyapi = [&](const arrayT &src, int trailing_dims_to_reduce, const arrayT &dst, sycl::queue &exec_q, const event_vecT &depends = {}) { - using dpctl::tensor::py_internal::py_reduction_over_axis; return py_reduction_over_axis( src, trailing_dims_to_reduce, dst, exec_q, depends, prod_over_axis_strided_atomic_dispatch_table, prod_over_axis_strided_temps_dispatch_table, prod_over_axis0_contig_atomic_dispatch_table, - prod_over_axis1_contig_atomic_dispatch_table); + prod_over_axis1_contig_atomic_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_prod_over_axis", prod_pyapi, "", py::arg("src"), py::arg("trailing_dims_to_reduce"), py::arg("dst"), @@ -431,11 +457,11 @@ void init_reduction_functions(py::module_ m) auto prod_dtype_supported = [&](const py::dtype &input_dtype, const py::dtype &output_dtype, const std::string &dst_usm_type, sycl::queue &q) { - using dpctl::tensor::py_internal::py_reduction_dtype_supported; return py_reduction_dtype_supported( input_dtype, output_dtype, dst_usm_type, q, prod_over_axis_strided_atomic_dispatch_table, - prod_over_axis_strided_temps_dispatch_table); + prod_over_axis_strided_temps_dispatch_table, + check_atomic_support_size4, check_atomic_support_size8); }; m.def("_prod_over_axis_dtype_supported", prod_dtype_supported, "", py::arg("arg_dtype"), py::arg("out_dtype"), diff --git a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp index c7bbadd455..1a9cb6f5e7 100644 --- a/dpctl/tensor/libtensor/source/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reduction_over_axis.hpp @@ -50,14 +50,15 @@ namespace tensor namespace py_internal { -inline bool check_atomic_support(const sycl::queue &exec_q, - sycl::usm::alloc usm_alloc_type, - bool require_atomic64 = false) +template +bool 
check_atomic_support(const sycl::queue &exec_q, + sycl::usm::alloc usm_alloc_type) { bool supports_atomics = false; const sycl::device &dev = exec_q.get_device(); - if (require_atomic64) { + + if constexpr (require_atomic64) { if (!dev.has(sycl::aspect::atomic64)) return false; } @@ -79,15 +80,24 @@ inline bool check_atomic_support(const sycl::queue &exec_q, return supports_atomics; } +template +bool fixed_decision(const sycl::queue &, sycl::usm::alloc) +{ + return return_value; +} + /* ====================== dtype supported ======================== */ -template -bool py_reduction_dtype_supported(const py::dtype &input_dtype, - const py::dtype &output_dtype, - const std::string &dst_usm_type, - sycl::queue &q, - const fnT &atomic_dispatch_table, - const fnT &temps_dispatch_table) +template +bool py_reduction_dtype_supported( + const py::dtype &input_dtype, + const py::dtype &output_dtype, + const std::string &dst_usm_type, + sycl::queue &q, + const fnT &atomic_dispatch_table, + const fnT &temps_dispatch_table, + const CheckAtomicSupportFnT &check_atomic_support_size4, + const CheckAtomicSupportFnT &check_atomic_support_size8) { int arg_tn = input_dtype.num(); // NumPy type numbers are the same as in dpctl @@ -135,12 +145,11 @@ bool py_reduction_dtype_supported(const py::dtype &input_dtype, switch (output_dtype.itemsize()) { case sizeof(float): { - supports_atomics = check_atomic_support(q, kind); + supports_atomics = check_atomic_support_size4(q, kind); } break; case sizeof(double): { - constexpr bool require_atomic64 = true; - supports_atomics = check_atomic_support(q, kind, require_atomic64); + supports_atomics = check_atomic_support_size8(q, kind); } break; } @@ -158,7 +167,7 @@ bool py_reduction_dtype_supported(const py::dtype &input_dtype, /* ==================== Generic reductions ====================== */ -template +template std::pair py_reduction_over_axis( const dpctl::tensor::usm_ndarray &src, int trailing_dims_to_reduce, // comp over this many trailing indexes @@ -168,7 +177,9 @@ std::pair py_reduction_over_axis( const strided_fnT &atomic_dispatch_table, const strided_fnT &temps_dispatch_table, const contig_fnT &axis0_dispatch_table, - const contig_fnT &axis1_dispatch_table) + const contig_fnT &axis1_dispatch_table, + const SupportAtomicFnT &check_atomic_support_size4, + const SupportAtomicFnT &check_atomic_support_size8) { int src_nd = src.get_ndim(); int iteration_nd = src_nd - trailing_dims_to_reduce; @@ -243,7 +254,7 @@ std::pair py_reduction_over_axis( void *data_ptr = dst.get_data(); const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - supports_atomics = check_atomic_support(exec_q, usm_type); + supports_atomics = check_atomic_support_size4(exec_q, usm_type); } break; case sizeof(double): { @@ -251,9 +262,7 @@ std::pair py_reduction_over_axis( const auto &ctx = exec_q.get_context(); auto usm_type = sycl::get_pointer_type(data_ptr, ctx); - constexpr bool require_atomic64 = true; - supports_atomics = - check_atomic_support(exec_q, usm_type, require_atomic64); + supports_atomics = check_atomic_support_size8(exec_q, usm_type); } break; } From 1d9b7cecd71b127a14b806253685ab8eed96139f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 2 Oct 2023 13:56:26 -0500 Subject: [PATCH 24/26] Defined dpctl.tensor.prod Also tweaked docstring for sum. 
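Before the diff, a minimal usage sketch of the `prod` reducer this patch defines, matching the signature added below, prod(x, axis=None, dtype=None, keepdims=False); the input values and dtypes are arbitrary illustrations, not taken from the patch:

    import dpctl.tensor as dpt

    # a small 2x3 integer array; values chosen so the products are easy to verify by hand
    x = dpt.asarray([[1, 2, 3], [4, 5, 6]], dtype="i4")

    r_all = dpt.prod(x)                    # 0-d array holding 720
    r_rows = dpt.prod(x, axis=1)           # [6, 120]
    r_cols = dpt.prod(x, axis=0, keepdims=True, dtype="f4")   # shape (1, 3): [[4., 10., 18.]]

The `_identity=1` forwarded to `_reduction_over_axis` in the diff is presumably what makes a product over an empty axis come out as one, which the tests added in the following patch exercise.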
--- dpctl/tensor/__init__.py | 3 +- dpctl/tensor/_reduction.py | 63 +++++++++++++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/dpctl/tensor/__init__.py b/dpctl/tensor/__init__.py index b5f356ab30..3473d5cde5 100644 --- a/dpctl/tensor/__init__.py +++ b/dpctl/tensor/__init__.py @@ -160,7 +160,7 @@ tanh, trunc, ) -from ._reduction import argmax, argmin, max, min, sum +from ._reduction import argmax, argmin, max, min, prod, sum from ._testing import allclose __all__ = [ @@ -313,4 +313,5 @@ "min", "argmax", "argmin", + "prod", ] diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index 0bbfc262a4..f64dab39c4 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -144,7 +144,7 @@ def _reduction_over_axis( def sum(x, axis=None, dtype=None, keepdims=False): """sum(x, axis=None, dtype=None, keepdims=False) - Calculates the sum of the input array `x`. + Calculates the sum of elements in the input array `x`. Args: x (usm_ndarray): @@ -202,6 +202,67 @@ def sum(x, axis=None, dtype=None, keepdims=False): ) +def prod(x, axis=None, dtype=None, keepdims=False): + """prod(x, axis=None, dtype=None, keepdims=False) + + Calculates the product of elements in the input array `x`. + + Args: + x (usm_ndarray): + input array. + axis (Optional[int, Tuple[int,...]]): + axis or axes along which sums must be computed. If a tuple + of unique integers, sums are computed over multiple axes. + If `None`, the sum is computed over the entire array. + Default: `None`. + dtype (Optional[dtype]): + data type of the returned array. If `None`, the default data + type is inferred from the "kind" of the input array data type. + * If `x` has a real-valued floating-point data type, + the returned array will have the default real-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has signed integral data type, the returned array + will have the default signed integral type for the device + where input array `x` is allocated. + * If `x` has unsigned integral data type, the returned array + will have the default unsigned integral type for the device + where input array `x` is allocated. + * If `x` has a complex-valued floating-point data type, + the returned array will have the default complex-valued + floating-point data type for the device where input + array `x` is allocated. + * If `x` has a boolean data type, the returned array will + have the default signed integral type for the device + where input array `x` is allocated. + If the data type (either specified or resolved) differs from the + data type of `x`, the input array elements are cast to the + specified data type before computing the sum. Default: `None`. + keepdims (Optional[bool]): + if `True`, the reduced axes (dimensions) are included in the result + as singleton dimensions, so that the returned array remains + compatible with the input arrays according to Array Broadcasting + rules. Otherwise, if `False`, the reduced axes are not included in + the returned array. Default: `False`. + Returns: + usm_ndarray: + an array containing the products. If the product was computed over + the entire array, a zero-dimensional array is returned. The returned + array has the data type as described in the `dtype` parameter + description above.
+ """ + return _reduction_over_axis( + x, + axis, + dtype, + keepdims, + ti._prod_over_axis, + ti._prod_over_axis_dtype_supported, + _default_reduction_dtype, + _identity=1, + ) + + def _comparison_over_axis(x, axis, keepdims, _reduction_fn): if not isinstance(x, dpt.usm_ndarray): raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}") From 8890d21c83a1c4dab8b85d452b9c42c12603c722 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 2 Oct 2023 15:40:11 -0500 Subject: [PATCH 25/26] Added tests for dpt.prod, removed uses of numpy --- dpctl/tests/test_tensor_sum.py | 78 ++++++++++++++++++++++++++++++---- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/dpctl/tests/test_tensor_sum.py b/dpctl/tests/test_tensor_sum.py index 8f2bd45362..dc647febf7 100644 --- a/dpctl/tests/test_tensor_sum.py +++ b/dpctl/tests/test_tensor_sum.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import pytest import dpctl.tensor as dpt @@ -55,11 +54,11 @@ def test_sum_arg_dtype_default_output_dtype_matrix(arg_dtype): assert r.dtype.kind == "f" elif m.dtype.kind == "c": assert r.dtype.kind == "c" - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) m = dpt.ones(200, dtype=arg_dtype)[:1:-2] r = dpt.sum(m) - assert (dpt.asnumpy(r) == 99).all() + assert dpt.all(r == 99) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -74,7 +73,7 @@ def test_sum_arg_out_dtype_matrix(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert (dpt.asnumpy(r) == 100).all() + assert dpt.all(r == 100) def test_sum_empty(): @@ -93,7 +92,7 @@ def test_sum_axis(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 6) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype="i4")) def test_sum_keepdims(): @@ -104,7 +103,7 @@ def test_sum_keepdims(): assert isinstance(s, dpt.usm_ndarray) assert s.shape == (3, 1, 1, 6, 1) - assert (dpt.asnumpy(s) == np.full(s.shape, 4 * 5 * 7)).all() + assert dpt.all(s == dpt.asarray(4 * 5 * 7, dtype=s.dtype)) def test_sum_scalar(): @@ -116,7 +115,7 @@ def test_sum_scalar(): assert isinstance(s, dpt.usm_ndarray) assert m.sycl_queue == s.sycl_queue assert s.shape == () - assert dpt.asnumpy(s) == np.full((), 1) + assert s == dpt.full((), 1) @pytest.mark.parametrize("arg_dtype", _all_dtypes) @@ -131,7 +130,7 @@ def test_sum_arg_out_dtype_scalar(arg_dtype, out_dtype): assert isinstance(r, dpt.usm_ndarray) assert r.dtype == dpt.dtype(out_dtype) - assert dpt.asnumpy(r) == 1 + assert r == 1 def test_sum_keepdims_zero_size(): @@ -186,3 +185,66 @@ def test_axis0_bug(): expected = dpt.asarray([[0, 3], [1, 4], [2, 5]]) assert dpt.all(s == expected) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes[1:]) +def test_prod_arg_dtype_default_output_dtype_matrix(arg_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m) + + assert isinstance(r, dpt.usm_ndarray) + if m.dtype.kind == "i": + assert r.dtype.kind == "i" + elif m.dtype.kind == "u": + assert r.dtype.kind == "u" + elif m.dtype.kind == "f": + assert r.dtype.kind == "f" + elif m.dtype.kind == "c": + assert r.dtype.kind == "c" + assert dpt.all(r == 1) + + if dpt.isdtype(m.dtype, "unsigned integer"): + m = dpt.tile(dpt.arange(1, 3, dtype=arg_dtype), 10)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(512, dtype=r.dtype)) + else: + 
m = dpt.full(200, -1, dtype=arg_dtype)[:1:-2] + r = dpt.prod(m) + assert dpt.all(r == dpt.asarray(-1, dtype=r.dtype)) + + +def test_prod_empty(): + get_queue_or_skip() + x = dpt.empty((0,), dtype="u1") + y = dpt.prod(x) + assert y.shape == tuple() + assert int(y) == 1 + + +def test_prod_axis(): + get_queue_or_skip() + + m = dpt.ones((3, 4, 5, 6, 7), dtype="i4") + s = dpt.prod(m, axis=(1, 2, -1)) + + assert isinstance(s, dpt.usm_ndarray) + assert s.shape == (3, 6) + assert dpt.all(s == dpt.asarray(1, dtype="i4")) + + +@pytest.mark.parametrize("arg_dtype", _all_dtypes) +@pytest.mark.parametrize("out_dtype", _all_dtypes[1:]) +def test_prod_arg_out_dtype_matrix(arg_dtype, out_dtype): + q = get_queue_or_skip() + skip_if_dtype_not_supported(arg_dtype, q) + skip_if_dtype_not_supported(out_dtype, q) + + m = dpt.ones(100, dtype=arg_dtype) + r = dpt.prod(m, dtype=out_dtype) + + assert isinstance(r, dpt.usm_ndarray) + assert r.dtype == dpt.dtype(out_dtype) + assert dpt.all(r == 1) From 60a8ad749846f43ff458110832dce7d499867e86 Mon Sep 17 00:00:00 2001 From: Nikita Grigorian Date: Mon, 2 Oct 2023 17:33:38 -0700 Subject: [PATCH 26/26] Corrected prod docstring Small tweaks to sum, min, and max docstrings --- dpctl/tensor/_reduction.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py index f64dab39c4..aac1c84677 100644 --- a/dpctl/tensor/_reduction.py +++ b/dpctl/tensor/_reduction.py @@ -149,7 +149,7 @@ def sum(x, axis=None, dtype=None, keepdims=False): Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which sums must be computed. If a tuple of unique integers, sums are computed over multiple axes. If `None`, the sum is computed over the entire array. @@ -210,10 +210,10 @@ def prod(x, axis=None, dtype=None, keepdims=False): Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): - axis or axes along which sums must be computed. If a tuple - of unique integers, sums are computed over multiple axes. - If `None`, the sum is computed over the entire array. + axis (Optional[int, Tuple[int, ...]]): + axis or axes along which products must be computed. If a tuple + of unique integers, products are computed over multiple axes. + If `None`, the product is computed over the entire array. Default: `None`. dtype (Optional[dtype]): data type of the returned array. If `None`, the default data @@ -237,7 +237,7 @@ def prod(x, axis=None, dtype=None, keepdims=False): where input array `x` is allocated. If the data type (either specified or resolved) differs from the data type of `x`, the input array elements are cast to the - specified data type before computing the sum. Default: `None`. + specified data type before computing the product. Default: `None`. keepdims (Optional[bool]): if `True`, the reduced axes (dimensions) are included in the result as singleton dimensions, so that the returned array remains @@ -314,7 +314,7 @@ def max(x, axis=None, keepdims=False): Args: x (usm_ndarray): input array. - axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which maxima must be computed. If a tuple of unique integers, the maxima are computed over multiple axes. If `None`, the max is computed over the entire array. @@ -342,7 +342,7 @@ def min(x, axis=None, keepdims=False): Args: x (usm_ndarray): input array. 
- axis (Optional[int, Tuple[int,...]]): + axis (Optional[int, Tuple[int, ...]]): axis or axes along which minima must be computed. If a tuple of unique integers, the minima are computed over multiple axes. If `None`, the min is computed over the entire array.
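To close out the series, a short sketch exercising the axis-tuple and `keepdims` semantics the docstrings above describe; the shapes and dtype are arbitrary examples chosen for illustration:

    import dpctl.tensor as dpt

    x = dpt.ones((3, 4, 5), dtype="i4")

    assert dpt.sum(x, axis=(1, 2)).shape == (3,)                      # reduce two trailing axes
    assert dpt.prod(x, axis=(1, 2), keepdims=True).shape == (3, 1, 1)  # reduced axes kept as singletons
    assert dpt.max(x, axis=0).shape == (4, 5)
    assert int(dpt.min(x)) == 1                                       # full reduction yields a 0-d array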