From be5fb99b99720311cf8ce431d40770c15f5e100f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 7 Sep 2023 06:55:16 -0500 Subject: [PATCH 01/13] Added SyclQueue._submit_keep_args_alive method Usage: q = dpctl.SyclQueue() ... e = q.submit(krn, args, ranges) ht_e = q._submit_keep_args_alive(args, [e]) .... ht_e.wait() --- dpctl/_host_task_util.hpp | 47 ++++++++++------ dpctl/_sycl_queue.pxd | 5 ++ dpctl/_sycl_queue.pyx | 113 +++++++++++++++++++++++++++++++++----- 3 files changed, 132 insertions(+), 33 deletions(-) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index 8db17594fd..ff360eff1c 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -2,7 +2,7 @@ // // Data Parallel Control (dpctl) // -// Copyright 2020-2022 Intel Corporation +// Copyright 2020-2023 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -31,28 +31,27 @@ #include "Python.h" #include "syclinterface/dpctl_data_types.h" +#include "syclinterface/dpctl_sycl_type_casters.hpp" #include -int async_dec_ref(DPCTLSyclQueueRef QRef, - PyObject **obj_array, - size_t obj_array_size, - DPCTLSyclEventRef *ERefs, - size_t nERefs) +DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef QRef, + PyObject **obj_array, + size_t obj_array_size, + DPCTLSyclEventRef *depERefs, + size_t nDepERefs, + int *status) { + using dpctl::syclinterface::unwrap; + using dpctl::syclinterface::wrap; - sycl::queue *q = reinterpret_cast(QRef); + sycl::queue *q = unwrap(QRef); - std::vector obj_vec; - obj_vec.reserve(obj_array_size); - for (size_t obj_id = 0; obj_id < obj_array_size; ++obj_id) { - obj_vec.push_back(obj_array[obj_id]); - } + std::vector obj_vec(obj_array, obj_array + obj_array_size); try { - q->submit([&](sycl::handler &cgh) { - for (size_t ev_id = 0; ev_id < nERefs; ++ev_id) { - cgh.depends_on( - *(reinterpret_cast(ERefs[ev_id]))); + sycl::event ht_ev = q->submit([&](sycl::handler &cgh) { + for (size_t ev_id = 0; ev_id < nDepERefs; ++ev_id) { + cgh.depends_on(*(unwrap(depERefs[ev_id]))); } cgh.host_task([obj_array_size, obj_vec]() { // if the main thread has not finilized the interpreter yet @@ -66,9 +65,21 @@ int async_dec_ref(DPCTLSyclQueueRef QRef, } }); }); + + constexpr int result_ok = 0; + + *status = result_ok; + auto e_ptr = new sycl::event(ht_ev); + return wrap(e_ptr); } catch (const std::exception &e) { - return 1; + constexpr int result_std_exception = 1; + + *status = result_std_exception; + return nullptr; } - return 0; + constexpr int result_other_abnormal = 2; + + *status = result_other_abnormal; + return nullptr; } diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 8f9028fabf..0ce33b5ef3 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -70,6 +70,11 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef SyclContext get_sycl_context(self) cpdef SyclDevice get_sycl_device(self) cdef DPCTLSyclQueueRef get_queue_ref(self) + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ) cpdef SyclEvent submit( self, SyclKernel kernel, diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 361b9d5924..79adf7e014 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -72,7 +72,7 @@ import logging cdef extern from "_host_task_util.hpp": - int async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t) nogil + DPCTLSyclEventRef async_dec_ref(DPCTLSyclQueueRef, PyObject **, size_t, DPCTLSyclEventRef *, size_t, int *) nogil __all__ = [ @@ -703,6 +703,79 @@ cdef class SyclQueue(_SyclQueue): """ return self._queue_ref + + cpdef SyclEvent _submit_keep_args_alive( + self, + object args, + list dEvents + ): + """ SyclQueue._submit_keep_args_alive(args, events) + + Keeps objects in `args` alive until tasks associated with events + complete. + + Args: + args(object): Python object to keep alive. + Typically a tuple with arguments to offloaded tasks + events(Tuple[dpctl.SyclEvent]): Gating events + The list or tuple of events associated with tasks + working on Python objects collected in `args`. + Returns: + dpctl.SyclEvent + The event associated with the submission of host task. + + Increments reference count of `args` and schedules asynchronous + ``host_task`` to decrement the count once dependent events are + complete. + + N.B.: The `host_task` attempts to acquire Python GIL, and it is + known to be unsafe during interpreter shudown sequence. It is + thus strongly advised to ensure that all submitted `host_task` + complete before the end of the Python script. + """ + cdef size_t nDE = len(dEvents) + cdef DPCTLSyclEventRef *depEvents = NULL + cdef PyObject *args_raw = NULL + cdef DPCTLSyclEventRef htERef = NULL + cdef int status = -1 + + # Create the array of dependent events if any + if nDE > 0: + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if not depEvents: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + + # increment reference counts to list of arguments + Py_INCREF(args) + + # schedule decrement + args_raw = args + + htERef = async_dec_ref( + self.get_queue_ref(), + &args_raw, 1, + depEvents, nDE, &status + ) + + free(depEvents) + if (status != 0): + with nogil: DPCTLEvent_Wait(htERef) + raise RuntimeError("Could not submit keep_args_alive host_task") + + return SyclEvent._create(htERef) + + cpdef SyclEvent submit( self, SyclKernel kernel, @@ -715,13 +788,14 @@ cdef class SyclQueue(_SyclQueue): cdef _arg_data_type *kargty = NULL cdef DPCTLSyclEventRef *depEvents = NULL cdef DPCTLSyclEventRef Eref = NULL + cdef DPCTLSyclEventRef htEref = NULL cdef int ret = 0 cdef size_t gRange[3] cdef size_t lRange[3] cdef size_t nGS = len(gS) cdef size_t nLS = len(lS) if lS is not None else 0 cdef size_t nDE = len(dEvents) if dEvents is not None else 0 - cdef PyObject **arg_objects = NULL + cdef PyObject *args_raw = NULL cdef ssize_t i = 0 # Allocate the arrays to be sent to DPCTLQueue_Submit @@ -745,7 +819,15 @@ cdef class SyclQueue(_SyclQueue): raise MemoryError() else: for idx, de in enumerate(dEvents): - depEvents[idx] = (de).get_event_ref() + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(kargs) + free(kargty) + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) # populate the args and argstype arrays ret = self._populate_args(args, kargs, kargty) @@ -823,22 +905,23 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." ) - # increment reference counts to each argument - arg_objects = malloc(len(args) * sizeof(PyObject *)) - for i in range(len(args)): - arg_objects[i] = (args[i]) - Py_INCREF( arg_objects[i]) + # increment reference counts to list of arguments + Py_INCREF(args) # schedule decrement - if async_dec_ref(self.get_queue_ref(), arg_objects, len(args), &Eref, 1): + args_raw = args + + ret = -1 + htERef = async_dec_ref(self.get_queue_ref(), &args_raw, 1, &Eref, 1, &ret) + if ret: # async task submission failed, decrement ref counts and wait - for i in range(len(args)): - arg_objects[i] = (args[i]) - Py_DECREF( arg_objects[i]) - with nogil: DPCTLEvent_Wait(Eref) + Py_DECREF(args) + with nogil: + DPCTLEvent_Wait(Eref) + DPCTLEvent_Wait(htERef) - # free memory - free(arg_objects) + # we are not returning host-task event at the moment + DPCTLEvent_Delete(htERef) return SyclEvent._create(Eref) From 5b5363e95979a79a877e27604be91e28c1554389 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 7 Sep 2023 08:58:03 -0500 Subject: [PATCH 02/13] Used _submit_keep_args_alive in a test --- dpctl/tests/test_sycl_kernel_submit.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index d15f5c8e2b..31727ce2d8 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -214,6 +214,9 @@ def test_async_submit(): e3_st = e3.execution_status e2_st = e2.execution_status e1_st = e1.execution_status + ht_e = q._submit_keep_args_alive( + [first_row, second_row, third_row], [e1, e2, e3] + ) are_complete = [ e == status_complete for e in ( @@ -223,6 +226,7 @@ def test_async_submit(): ) ] e3.wait() + ht_e.wait() if not all(are_complete): async_detected = True break From 0a35e9c57e4fea1c5156a76c548fae1f30063e4e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 10 Sep 2023 16:17:18 -0500 Subject: [PATCH 03/13] Do not use async_dec_ref in submit method Instead delegated the task of Python object life-time management to the user via use of _submit_keep_args_alive method --- dpctl/_sycl_queue.pyx | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 79adf7e014..3c2945d82e 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -905,23 +905,6 @@ cdef class SyclQueue(_SyclQueue): raise SyclKernelSubmitError( "Kernel submission to Sycl queue failed." ) - # increment reference counts to list of arguments - Py_INCREF(args) - - # schedule decrement - args_raw = args - - ret = -1 - htERef = async_dec_ref(self.get_queue_ref(), &args_raw, 1, &Eref, 1, &ret) - if ret: - # async task submission failed, decrement ref counts and wait - Py_DECREF(args) - with nogil: - DPCTLEvent_Wait(Eref) - DPCTLEvent_Wait(htERef) - - # we are not returning host-task event at the moment - DPCTLEvent_Delete(htERef) return SyclEvent._create(Eref) From f30a6a2efb0189ce373d0393a5e228e6be0baa7f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 10 Sep 2023 16:18:58 -0500 Subject: [PATCH 04/13] Add memcpy_async method --- dpctl/_sycl_queue.pxd | 1 + dpctl/_sycl_queue.pyx | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 0ce33b5ef3..999fb417af 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -86,6 +86,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 3c2945d82e..d944a25e36 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -934,6 +934,29 @@ cdef class SyclQueue(_SyclQueue): with nogil: DPCTLEvent_Wait(ERef) DPCTLEvent_Delete(ERef) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count): + cdef void *c_dest + cdef void *c_src + cdef DPCTLSyclEventRef ERef = NULL + + if isinstance(dest, _Memory): + c_dest = (<_Memory>dest).memory_ptr + else: + raise TypeError("Parameter `dest` should have type _Memory.") + + if isinstance(src, _Memory): + c_src = (<_Memory>src).memory_ptr + else: + raise TypeError("Parameter `src` should have type _Memory.") + + ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) + if (ERef is NULL): + raise RuntimeError( + "SyclQueue.memcpy operation encountered an error" + ) + + return SyclEvent._create(ERef) + cpdef prefetch(self, mem, size_t count=0): cdef void *ptr cdef DPCTLSyclEventRef ERef = NULL From e7ee1d928a4dfb5a305e49af8e152523c9b7573c Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Sep 2023 02:40:59 -0500 Subject: [PATCH 05/13] Introduce dpctl.SyclQueue.submit_async The SyclQueue.submit has become synchronosing, although it still returns a SyclEvent (with exectuion_status always complete) --- dpctl/_sycl_queue.pxd | 8 ++++++++ dpctl/_sycl_queue.pyx | 14 +++++++++++++- dpctl/tests/test_sycl_kernel_submit.py | 8 ++++---- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 999fb417af..35062e0ded 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -75,6 +75,14 @@ cdef public api class SyclQueue (_SyclQueue) [ object args, list dEvents ) + cpdef SyclEvent submit_async( + self, + SyclKernel kernel, + list args, + list gS, + list lS=*, + list dEvents=* + ) cpdef SyclEvent submit( self, SyclKernel kernel, diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index d944a25e36..5fee72bc64 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -776,7 +776,7 @@ cdef class SyclQueue(_SyclQueue): return SyclEvent._create(htERef) - cpdef SyclEvent submit( + cpdef SyclEvent submit_async( self, SyclKernel kernel, list args, @@ -908,6 +908,18 @@ cdef class SyclQueue(_SyclQueue): return SyclEvent._create(Eref) + cpdef SyclEvent submit( + self, + SyclKernel kernel, + list args, + list gS, + list lS=None, + list dEvents=None + ): + cdef SyclEvent e = self.submit_async(kernel, args, gS, lS, dEvents) + e.wait() + return e + cpdef void wait(self): with nogil: DPCTLQueue_Wait(self._queue_ref) diff --git a/dpctl/tests/test_sycl_kernel_submit.py b/dpctl/tests/test_sycl_kernel_submit.py index 31727ce2d8..697af32f5c 100644 --- a/dpctl/tests/test_sycl_kernel_submit.py +++ b/dpctl/tests/test_sycl_kernel_submit.py @@ -114,7 +114,7 @@ def test_create_program_from_source(ctype_str, dtype, ctypes_ctor): ) -def test_async_submit(): +def test_submit_async(): try: q = dpctl.SyclQueue("opencl") except dpctl.SyclQueueCreationError: @@ -182,7 +182,7 @@ def test_async_submit(): async_detected = False for attempt in range(5): - e1 = q.submit( + e1 = q.submit_async( kern1Kernel, [ first_row, @@ -192,7 +192,7 @@ def test_async_submit(): n, ], ) - e2 = q.submit( + e2 = q.submit_async( kern2Kernel, [ second_row, @@ -202,7 +202,7 @@ def test_async_submit(): n, ], ) - e3 = q.submit( + e3 = q.submit_async( kern3Kernel, [third_row, first_row, second_row], [ From b6dd4d2c02405d51b533839e0d5a04e75654ae51 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Sep 2023 12:27:02 -0500 Subject: [PATCH 06/13] Use pragma once in _host_task_util.hpp --- dpctl/_host_task_util.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/_host_task_util.hpp b/dpctl/_host_task_util.hpp index ff360eff1c..cb3828a54f 100644 --- a/dpctl/_host_task_util.hpp +++ b/dpctl/_host_task_util.hpp @@ -29,6 +29,7 @@ /// //===----------------------------------------------------------------------===// +#pragma once #include "Python.h" #include "syclinterface/dpctl_data_types.h" #include "syclinterface/dpctl_sycl_type_casters.hpp" From 306ff96e1b876f76d6a5e5ce831d3f665c470688 Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Fri, 15 Sep 2023 16:29:15 -0500 Subject: [PATCH 07/13] Fixed possible memory leak on error --- dpctl/_sycl_queue.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 5fee72bc64..9850da48e4 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -771,6 +771,7 @@ cdef class SyclQueue(_SyclQueue): free(depEvents) if (status != 0): with nogil: DPCTLEvent_Wait(htERef) + DPCTLEvent_Delete(htERef) raise RuntimeError("Could not submit keep_args_alive host_task") return SyclEvent._create(htERef) From 67a325dfada9d40e180c0f132f9183de2891802d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Sun, 17 Sep 2023 23:06:43 -0500 Subject: [PATCH 08/13] Add DPCTLQueue_MemcpyWithEvents This is the copy operation where one can specify list of events the copy operation requires before start of its execution. DPCTLQueue_MemcpyWithEvents( __dpctl_keep DPCTLSyclQueueRef QRef, void *dst, const void *src, size_t nbytes, const DPCTLSyclEventRef *depEvents, size_t nDE ) Uses this function in tests. --- .../include/dpctl_sycl_queue_interface.h | 23 ++++ .../source/dpctl_sycl_queue_interface.cpp | 45 ++++++- .../tests/test_sycl_queue_interface.cpp | 127 +++++++++++------- 3 files changed, 146 insertions(+), 49 deletions(-) diff --git a/libsyclinterface/include/dpctl_sycl_queue_interface.h b/libsyclinterface/include/dpctl_sycl_queue_interface.h index 1c5e53a395..cc466fce17 100644 --- a/libsyclinterface/include/dpctl_sycl_queue_interface.h +++ b/libsyclinterface/include/dpctl_sycl_queue_interface.h @@ -294,6 +294,29 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, const void *Src, size_t Count); +/*! + * @brief C-API wrapper for ``sycl::queue::memcpy``. + * + * @param QRef An opaque pointer to the ``sycl::queue``. + * @param Dest An USM pointer to the destination memory. + * @param Src An USM pointer to the source memory. + * @param Count A number of bytes to copy. + * @param DepEvents A pointer to array of DPCTLSyclEventRef opaque + * pointers to dependent events. + * @param DepEventsCount A number of dependent events. + * @return An opaque pointer to the ``sycl::event`` returned by the + * ``sycl::queue::memcpy`` function. + * @ingroup QueueInterface + */ +DPCTL_API +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + __dpctl_keep const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount); + /*! * @brief C-API wrapper for ``sycl::queue::prefetch``. * diff --git a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp index 4903b888ff..60098ae933 100644 --- a/libsyclinterface/source/dpctl_sycl_queue_interface.cpp +++ b/libsyclinterface/source/dpctl_sycl_queue_interface.cpp @@ -410,9 +410,12 @@ DPCTLQueue_SubmitNDRange(__dpctl_keep const DPCTLSyclKernelRef KRef, try { e = Queue->submit([&](handler &cgh) { // Depend on any event that was specified by the caller. - if (NDepEvents) - for (auto i = 0ul; i < NDepEvents; ++i) - cgh.depends_on(*unwrap(DepEvents[i])); + if (DepEvents) + for (auto i = 0ul; i < NDepEvents; ++i) { + auto ei = unwrap(DepEvents[i]); + if (ei) + cgh.depends_on(*ei); + } for (auto i = 0ul; i < NArgs; ++i) { // \todo add support for Sycl buffers @@ -485,6 +488,42 @@ DPCTLQueue_Memcpy(__dpctl_keep const DPCTLSyclQueueRef QRef, } } +__dpctl_give DPCTLSyclEventRef +DPCTLQueue_MemcpyWithEvents(__dpctl_keep const DPCTLSyclQueueRef QRef, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *DepEvents, + size_t DepEventsCount) +{ + event ev; + auto Q = unwrap(QRef); + if (Q) { + try { + ev = Q->submit([&](handler &cgh) { + if (DepEvents) + for (size_t i = 0; i < DepEventsCount; ++i) { + event *ei = unwrap(DepEvents[i]); + if (ei) + cgh.depends_on(*ei); + } + + cgh.memcpy(Dest, Src, Count); + }); + } catch (const std::exception &ex) { + error_handler(ex, __FILE__, __func__, __LINE__); + return nullptr; + } + } + else { + error_handler("QRef passed to memcpy was NULL.", __FILE__, __func__, + __LINE__); + return nullptr; + } + + return wrap(new event(ev)); +} + __dpctl_give DPCTLSyclEventRef DPCTLQueue_Prefetch(__dpctl_keep DPCTLSyclQueueRef QRef, const void *Ptr, diff --git a/libsyclinterface/tests/test_sycl_queue_interface.cpp b/libsyclinterface/tests/test_sycl_queue_interface.cpp index 8d23929d39..836a87379b 100644 --- a/libsyclinterface/tests/test_sycl_queue_interface.cpp +++ b/libsyclinterface/tests/test_sycl_queue_interface.cpp @@ -340,6 +340,10 @@ TEST(TestDPCTLSyclQueueInterface, CheckMemOpsZeroQRef) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); ASSERT_FALSE(bool(ERef)); @@ -391,6 +395,10 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemOpsNullPtr) ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Memcpy(QRef, p1, p2, n_bytes)); ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE( + ERef = DPCTLQueue_MemcpyWithEvents(QRef, p1, p2, n_bytes, NULL, 0)); + ASSERT_FALSE(bool(ERef)); + ASSERT_NO_FATAL_FAILURE(ERef = DPCTLQueue_Prefetch(QRef, p1, n_bytes)); if (ERef) { ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); @@ -450,6 +458,38 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset) delete[] host_arr; } +TEST_P(TestDPCTLQueueMemberFunctions, CheckMemset2) +{ + DPCTLSyclUSMRef p = nullptr; + DPCTLSyclEventRef Memset_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; + uint8_t val = 42; + size_t nbytes = 256; + uint8_t *host_arr = new uint8_t[nbytes]; + + ASSERT_FALSE(host_arr == nullptr); + + ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); + ASSERT_FALSE(p == nullptr); + + ASSERT_NO_FATAL_FAILURE( + Memset_ERef = DPCTLQueue_Memset(QRef, (void *)p, val, nbytes)); + + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Memset_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); + + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memset_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); + + ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); + + for (size_t i = 0; i < nbytes; ++i) { + ASSERT_TRUE(host_arr[i] == val); + } + delete[] host_arr; +} + TEST(TestDPCTLSyclQueueInterface, CheckFillNullQRef) { DPCTLSyclQueueRef QRef = nullptr; @@ -481,7 +521,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8) { using T = uint8_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill8_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xB); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -492,17 +533,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill8) ASSERT_NO_FATAL_FAILURE(p = DPCTLmalloc_device(nbytes, QRef)); ASSERT_FALSE(p == nullptr); - ASSERT_NO_FATAL_FAILURE(ERef = + ASSERT_NO_FATAL_FAILURE(Fill8_ERef = DPCTLQueue_Fill8(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill8_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill8_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -517,7 +556,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16) using T = uint16_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill16_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xAB); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -529,16 +569,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill16) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill16_ERef = DPCTLQueue_Fill16(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill16_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill16_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -553,7 +591,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32) using T = uint32_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill32_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xABCD); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -565,16 +604,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill32) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill32_ERef = DPCTLQueue_Fill32(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill32_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill32_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -589,7 +626,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64) using T = uint64_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill64_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val = static_cast(0xABCDEF73); size_t nelems = 256; T *host_arr = new T[nelems]; @@ -601,16 +639,14 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill64) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill64_ERef = DPCTLQueue_Fill64(QRef, (void *)p, val, nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill64_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill64_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); @@ -639,7 +675,8 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128) using T = value128_t; DPCTLSyclUSMRef p = nullptr; - DPCTLSyclEventRef ERef = nullptr; + DPCTLSyclEventRef Fill128_ERef = nullptr; + DPCTLSyclEventRef Memcpy_ERef = nullptr; T val{static_cast(0xABCDEF73), static_cast(0x3746AF05)}; size_t nelems = 256; T *host_arr = new T[nelems]; @@ -651,17 +688,15 @@ TEST_P(TestDPCTLQueueMemberFunctions, CheckFill128) ASSERT_FALSE(p == nullptr); ASSERT_NO_FATAL_FAILURE( - ERef = DPCTLQueue_Fill128(QRef, (void *)p, - reinterpret_cast(&val), nelems)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + Fill128_ERef = DPCTLQueue_Fill128( + QRef, (void *)p, reinterpret_cast(&val), nelems)); - ERef = nullptr; + ASSERT_NO_FATAL_FAILURE(Memcpy_ERef = DPCTLQueue_MemcpyWithEvents( + QRef, host_arr, p, nbytes, &Fill128_ERef, 1)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(Memcpy_ERef)); - ASSERT_NO_FATAL_FAILURE(ERef = - DPCTLQueue_Memcpy(QRef, host_arr, p, nbytes)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Wait(ERef)); - ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Fill128_ERef)); + ASSERT_NO_FATAL_FAILURE(DPCTLEvent_Delete(Memcpy_ERef)); ASSERT_NO_FATAL_FAILURE(DPCTLfree_with_queue(p, QRef)); From 6949e690553047600aa767c4c743f474f5608b7d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Sep 2023 02:48:44 -0500 Subject: [PATCH 09/13] Adds dpctl.SyclQueue.memcpy_async Also extends `dpctl.SyclQueue.memcpy` to allow arguments to be objects that expose buffer protocol, allowing `dpctl.SyclQueue.memcpy` and `dpctl.SyclQueue.memcpy_async` to be used to copy from/to USM-allocation or host buffer. --- dpctl/_backend.pxd | 7 ++ dpctl/_sycl_queue.pxd | 2 +- dpctl/_sycl_queue.pyx | 108 ++++++++++++++++++++------ dpctl/tests/test_sycl_queue_memcpy.py | 76 +++++++++++++++++- 4 files changed, 164 insertions(+), 29 deletions(-) diff --git a/dpctl/_backend.pxd b/dpctl/_backend.pxd index 3f7ba63a55..57da77eb7d 100644 --- a/dpctl/_backend.pxd +++ b/dpctl/_backend.pxd @@ -403,6 +403,13 @@ cdef extern from "syclinterface/dpctl_sycl_queue_interface.h": void *Dest, const void *Src, size_t Count) + cdef DPCTLSyclEventRef DPCTLQueue_MemcpyWithEvents( + const DPCTLSyclQueueRef Q, + void *Dest, + const void *Src, + size_t Count, + const DPCTLSyclEventRef *depEvents, + size_t depEventsCount) cdef DPCTLSyclEventRef DPCTLQueue_Memset( const DPCTLSyclQueueRef Q, void *Dest, diff --git a/dpctl/_sycl_queue.pxd b/dpctl/_sycl_queue.pxd index 35062e0ded..0269cc4aae 100644 --- a/dpctl/_sycl_queue.pxd +++ b/dpctl/_sycl_queue.pxd @@ -94,7 +94,7 @@ cdef public api class SyclQueue (_SyclQueue) [ cpdef void wait(self) cdef DPCTLSyclQueueRef get_queue_ref(self) cpdef memcpy(self, dest, src, size_t count) - cpdef SyclEvent memcpy_async(self, dest, src, size_t count) + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=*) cpdef prefetch(self, ptr, size_t count=*) cpdef mem_advise(self, ptr, size_t count, int mem) cpdef SyclEvent submit_barrier(self, dependent_events=*) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 9850da48e4..6496b365f6 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -45,6 +45,7 @@ from ._backend cimport ( # noqa: E211 DPCTLQueue_IsInOrder, DPCTLQueue_MemAdvise, DPCTLQueue_Memcpy, + DPCTLQueue_MemcpyWithEvents, DPCTLQueue_Prefetch, DPCTLQueue_SubmitBarrierForEvents, DPCTLQueue_SubmitNDRange, @@ -64,6 +65,7 @@ import ctypes from .enum_types import backend_type from cpython cimport pycapsule +from cpython.buffer cimport PyObject_CheckBuffer from cpython.ref cimport Py_DECREF, Py_INCREF, PyObject from libc.stdlib cimport free, malloc @@ -160,6 +162,62 @@ cdef void _queue_capsule_deleter(object o) noexcept: DPCTLQueue_Delete(QRef) +cdef bint _is_buffer(object o): + return PyObject_CheckBuffer(o) + + +cdef DPCTLSyclEventRef _memcpy_impl( + SyclQueue q, + object dst, + object src, + size_t byte_count, + DPCTLSyclEventRef *dep_events, + size_t dep_events_count +): + cdef void *c_dst_ptr = NULL + cdef void *c_src_ptr = NULL + cdef DPCTLSyclEventRef ERef = NULL + cdef const unsigned char[::1] src_host_buf = None + cdef unsigned char[::1] dst_host_buf = None + + if isinstance(src, _Memory): + c_src_ptr = (<_Memory>src).memory_ptr + elif _is_buffer(src): + src_host_buf = src + c_src_ptr = &src_host_buf[0] + else: + raise TypeError( + "Parameter `src` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if isinstance(dst, _Memory): + c_dst_ptr = (<_Memory>dst).memory_ptr + elif _is_buffer(dst): + dst_host_buf = dst + c_dst_ptr = &dst_host_buf[0] + else: + raise TypeError( + "Parameter `dst` should have either type " + "`dpctl.memory._Memory` or a type that " + "supports Python buffer protocol" + ) + + if dep_events_count == 0 or dep_events is NULL: + ERef = DPCTLQueue_Memcpy(q._queue_ref, c_dst_ptr, c_src_ptr, byte_count) + else: + ERef = DPCTLQueue_MemcpyWithEvents( + q._queue_ref, + c_dst_ptr, + c_src_ptr, + byte_count, + dep_events, + dep_events_count + ) + return ERef + + cdef class _SyclQueue: """ Barebone data owner class used by SyclQueue. """ @@ -925,21 +983,10 @@ cdef class SyclQueue(_SyclQueue): with nogil: DPCTLQueue_Wait(self._queue_ref) cpdef memcpy(self, dest, src, size_t count): - cdef void *c_dest - cdef void *c_src + """Copy memory from `src` to `dst`""" cdef DPCTLSyclEventRef ERef = NULL - if isinstance(dest, _Memory): - c_dest = (<_Memory>dest).memory_ptr - else: - raise TypeError("Parameter `dest` should have type _Memory.") - - if isinstance(src, _Memory): - c_src = (<_Memory>src).memory_ptr - else: - raise TypeError("Parameter `src` should have type _Memory.") - - ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) + ERef = _memcpy_impl(self, dest, src, count, NULL, 0) if (ERef is NULL): raise RuntimeError( "SyclQueue.memcpy operation encountered an error" @@ -947,22 +994,33 @@ cdef class SyclQueue(_SyclQueue): with nogil: DPCTLEvent_Wait(ERef) DPCTLEvent_Delete(ERef) - cpdef SyclEvent memcpy_async(self, dest, src, size_t count): - cdef void *c_dest - cdef void *c_src + cpdef SyclEvent memcpy_async(self, dest, src, size_t count, list dEvents=None): + """Copy memory from `src` to `dst`""" cdef DPCTLSyclEventRef ERef = NULL + cdef DPCTLSyclEventRef *depEvents = NULL + cdef size_t nDE = 0 - if isinstance(dest, _Memory): - c_dest = (<_Memory>dest).memory_ptr - else: - raise TypeError("Parameter `dest` should have type _Memory.") - - if isinstance(src, _Memory): - c_src = (<_Memory>src).memory_ptr + if dEvents is None: + ERef = _memcpy_impl(self, dest, src, count, NULL, 0) else: - raise TypeError("Parameter `src` should have type _Memory.") + nDE = len(dEvents) + depEvents = ( + malloc(nDE*sizeof(DPCTLSyclEventRef)) + ) + if depEvents is NULL: + raise MemoryError() + else: + for idx, de in enumerate(dEvents): + if isinstance(de, SyclEvent): + depEvents[idx] = (de).get_event_ref() + else: + free(depEvents) + raise TypeError( + "A sequence of dpctl.SyclEvent is expected" + ) + ERef = _memcpy_impl(self, dest, src, count, depEvents, nDE) + free(depEvents) - ERef = DPCTLQueue_Memcpy(self._queue_ref, c_dest, c_src, count) if (ERef is NULL): raise RuntimeError( "SyclQueue.memcpy operation encountered an error" diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index 45c8e41f61..bb3c7b0376 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -44,7 +44,77 @@ def test_memcpy_copy_usm_to_usm(): q.memcpy(mobj2, mobj1, 3) - assert mv2[:3], b"123" + assert mv2[:3] == b"123" + + +def test_memcpy_copy_host_to_usm(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + + canary = bytearray(b"123456789") + host_obj = memoryview(canary) + + q.memcpy(usm_obj, host_obj, len(canary)) + + mv2 = memoryview(usm_obj) + + assert mv2[: len(canary)] == canary + + +def test_memcpy_copy_usm_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + usm_obj = _create_memory(q) + mv2 = memoryview(usm_obj) + + n = 9 + for id in range(n): + mv2[id] = ord("a") + id + + host_obj = bytearray(b" " * n) + + q.memcpy(host_obj, usm_obj, n) + + assert host_obj == b"abcdefghi" + + +def test_memcpy_copy_host_to_host(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + dst_buf = bytearray(len(src_buf)) + + q.memcpy(dst_buf, src_buf, len(src_buf)) + + assert dst_buf == src_buf + + +def test_memcpy_async(): + try: + q = dpctl.SyclQueue() + except dpctl.SyclQueueCreationError: + pytest.skip("Default constructor for SyclQueue failed") + + src_buf = b"abcdefghijklmnopqrstuvwxyz" + n = len(src_buf) + dst_buf = bytearray(n) + dst_buf2 = bytearray(n) + + e = q.memcpy_async(dst_buf, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n) + + e2.wait() + e.wait() + assert dst_buf == src_buf + assert dst_buf2 == src_buf def test_memcpy_type_error(): @@ -56,8 +126,8 @@ def test_memcpy_type_error(): with pytest.raises(TypeError) as cm: q.memcpy(None, mobj, 3) - assert "`dest`" in str(cm.value) + assert "_Memory" in str(cm.value) with pytest.raises(TypeError) as cm: q.memcpy(mobj, None, 3) - assert "`src`" in str(cm.value) + assert "_Memory" in str(cm.value) From cf9084db3efdeaa67b41af1c6e35529f46a23b3d Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Sep 2023 14:09:46 -0500 Subject: [PATCH 10/13] One of the memcpy_async calls must use events --- dpctl/tests/test_sycl_queue_memcpy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpctl/tests/test_sycl_queue_memcpy.py b/dpctl/tests/test_sycl_queue_memcpy.py index bb3c7b0376..e678b73f03 100644 --- a/dpctl/tests/test_sycl_queue_memcpy.py +++ b/dpctl/tests/test_sycl_queue_memcpy.py @@ -109,10 +109,10 @@ def test_memcpy_async(): dst_buf2 = bytearray(n) e = q.memcpy_async(dst_buf, src_buf, n) - e2 = q.memcpy_async(dst_buf2, src_buf, n) + e2 = q.memcpy_async(dst_buf2, src_buf, n, [e]) - e2.wait() e.wait() + e2.wait() assert dst_buf == src_buf assert dst_buf2 == src_buf From 48cb54d2d2d4d7a30002e164b03d08cf5af18d7f Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Mon, 18 Sep 2023 14:39:37 -0500 Subject: [PATCH 11/13] Make SyclTimer accumulative ``` In [9]: timer = dpctl.SyclTimer() In [10]: with timer(q): ...: y = dpt.linspace(1, 2, num=10**6, sycl_queue=q) ...: In [11]: timer.dt Out[11]: (0.0022024469999450957, 0.002116712) In [12]: with timer(q): ...: x = dpt.linspace(0, 1, num=10**6, sycl_queue=q) ...: In [13]: timer.dt Out[13]: (0.004531950999989931, 0.004239664000000001) ``` --- dpctl/_sycl_timer.py | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 322272df2d..33c4c2995f 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -67,10 +67,8 @@ def __init__(self, host_timer=timeit.default_timer, time_scale=1): self.timer = host_timer self.time_scale = time_scale self.queue = None - self.host_start = None - self.host_finish = None - self.event_start = None - self.event_finish = None + self.host_times = [] + self.bracketing_events = [] def __call__(self, queue=None): if isinstance(queue, SyclQueue): @@ -89,13 +87,17 @@ def __call__(self, queue=None): return self def __enter__(self): - self.event_start = self.queue.submit_barrier() - self.host_start = self.timer() + self._event_start = self.queue.submit_barrier() + self._host_start = self.timer() return self def __exit__(self, *args): - self.event_finish = self.queue.submit_barrier() - self.host_finish = self.timer() + self.host_times.append((self._host_start, self.timer())) + self.bracketing_events.append( + (self._event_start, self.queue.submit_barrier()) + ) + del self._event_start + del self._host_start @property def dt(self): @@ -103,13 +105,12 @@ def dt(self): element is the duration as measured by the host timer, while the second element is the duration as measured by the device timer and encoded in profiling events""" - self.event_start.wait() - self.event_finish.wait() - return ( - (self.host_finish - self.host_start) * self.time_scale, - ( - self.event_finish.profiling_info_start - - self.event_start.profiling_info_end - ) - * (1e-9 * self.time_scale), - ) + for es, ef in self.bracketing_events: + es.wait() + ef.wait() + host_dt = sum(tf - ts for ts, tf in self.host_times) * self.time_scale + dev_dt = sum( + ef.profiling_info_start - es.profiling_info_end + for es, ef in self.bracketing_events + ) * (1e-9 * self.time_scale) + return (host_dt, dev_dt) From 0a737644ea2d180b0fe86a034249802d816e032a Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 21 Sep 2023 19:32:27 -0500 Subject: [PATCH 12/13] Make cdef function except * to channel Python exceptions --- dpctl/_sycl_queue.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dpctl/_sycl_queue.pyx b/dpctl/_sycl_queue.pyx index 6496b365f6..a27e6f940f 100644 --- a/dpctl/_sycl_queue.pyx +++ b/dpctl/_sycl_queue.pyx @@ -173,7 +173,7 @@ cdef DPCTLSyclEventRef _memcpy_impl( size_t byte_count, DPCTLSyclEventRef *dep_events, size_t dep_events_count -): +) except *: cdef void *c_dst_ptr = NULL cdef void *c_src_ptr = NULL cdef DPCTLSyclEventRef ERef = NULL From 10722d4f359a9dbb0e07007b28d5b6b47af50ecd Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlyk Date: Thu, 5 Oct 2023 15:51:59 -0500 Subject: [PATCH 13/13] SyclTimer.dt return object with named accessors The object can unpack into a tuple, like before, but it prints with annotation of what each number means, and provides names getters. with timer(q): code dur = timer.dt print(dur) # outputs (host_dt=..., device_dt=...) dur.host_dt # get host-timer delta dur.device_dt # get device-timer delta hdt, ddt = dur # unpack into a tuple --- dpctl/_sycl_timer.py | 37 +++++++++++++++++++++++++++------- dpctl/tests/test_sycl_event.py | 7 ++++++- 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/dpctl/_sycl_timer.py b/dpctl/_sycl_timer.py index 33c4c2995f..66dd4f9340 100644 --- a/dpctl/_sycl_timer.py +++ b/dpctl/_sycl_timer.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import timeit from . import SyclQueue @@ -22,6 +21,29 @@ __doc__ = "This module implements :class:`dpctl.SyclTimer`." +class HostDeviceDuration: + def __init__(self, host_dt, device_dt): + self._host_dt = host_dt + self._device_dt = device_dt + + def __repr__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __str__(self): + return f"(host_dt={self._host_dt}, device_dt={self._device_dt})" + + def __iter__(self): + yield from [self._host_dt, self._device_dt] + + @property + def host_dt(self): + return self._host_dt + + @property + def device_dt(self): + return self._device_dt + + class SyclTimer: """ SyclTimer(host_timer=timeit.default_timer, time_scale=1) @@ -45,7 +67,7 @@ class SyclTimer: code_block # retrieve elapsed times in milliseconds - sycl_dt, wall_dt = timer.dt + wall_dt, device_dt = timer.dt Remark: The timer submits barriers to the queue at the entrance and the @@ -101,10 +123,11 @@ def __exit__(self, *args): @property def dt(self): - """Returns a tuple of elapsed times where first - element is the duration as measured by the host timer, - while the second element is the duration as measured by - the device timer and encoded in profiling events""" + """Returns a pair of elapsed times (host_dt, device_dt). + + The host_dt is the duration as measured by the host + timer, while the device_dt is the duration as measured by + the device timer and encoded in profiling events.""" for es, ef in self.bracketing_events: es.wait() ef.wait() @@ -113,4 +136,4 @@ def dt(self): ef.profiling_info_start - es.profiling_info_end for es, ef in self.bracketing_events ) * (1e-9 * self.time_scale) - return (host_dt, dev_dt) + return HostDeviceDuration(host_dt, dev_dt) diff --git a/dpctl/tests/test_sycl_event.py b/dpctl/tests/test_sycl_event.py index fa496d1bb8..7f0db07539 100644 --- a/dpctl/tests/test_sycl_event.py +++ b/dpctl/tests/test_sycl_event.py @@ -202,7 +202,12 @@ def test_sycl_timer(): m1.copy_from_device(m2) # host operation [x**2 for x in range(128 * 1024)] - host_dt, device_dt = timer.dt + elapsed = timer.dt + host_dt, device_dt = elapsed + assert isinstance(repr(elapsed), str) + assert isinstance(str(elapsed), str) + assert host_dt == elapsed.host_dt + assert device_dt == elapsed.device_dt assert host_dt > device_dt or (host_dt > 0 and device_dt >= 0) q_no_profiling = dpctl.SyclQueue() assert q_no_profiling.has_enable_profiling is False