From 6dee4c81c24f5014c3bbbbcb401da11b8e372bb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 17 Mar 2022 10:47:35 +0000 Subject: [PATCH 01/17] introduce 3 streams per queue --- sycl/plugins/cuda/pi_cuda.cpp | 237 ++++++++++++++++++++-------------- sycl/plugins/cuda/pi_cuda.hpp | 22 +++- 2 files changed, 154 insertions(+), 105 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index cf5f5abd0e9f..789eecc0b125 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -78,20 +78,20 @@ pi_result forLatestEvents(const pi_event *event_wait_list, std::sort(events.begin(), events.end(), [](pi_event e0, pi_event e1) { // Tiered sort creating sublists of streams (smallest value first) in which // the corresponding events are sorted into a sequence of newest first. - return e0->get_queue()->stream_ < e1->get_queue()->stream_ || - (e0->get_queue()->stream_ == e1->get_queue()->stream_ && + return e0->get_stream() < e1->get_stream() || + (e0->get_stream() == e1->get_stream() && e0->get_event_id() > e1->get_event_id()); }); bool first = true; CUstream lastSeenStream = 0; for (pi_event event : events) { - if (!event || (!first && event->get_queue()->stream_ == lastSeenStream)) { + if (!event || (!first && event->get_stream() == lastSeenStream)) { continue; } first = false; - lastSeenStream = event->get_queue()->stream_; + lastSeenStream = event->get_stream(); auto result = f(event); if (result != PI_SUCCESS) { @@ -270,6 +270,31 @@ void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size, } } +pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list) { + try { + ScopedContext active(command_queue->get_context()); + + if (event_wait_list) { + auto result = + forLatestEvents(event_wait_list, num_events_in_wait_list, + [stream](pi_event event) -> pi_result { + return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + }); + + if (result != PI_SUCCESS) { + return result; + } + } + return PI_SUCCESS; + } catch (pi_result err) { + return err; + } catch (...) { + return PI_ERROR_UNKNOWN; + } +} + } // anonymous namespace /// ------ Error handling, matching OpenCL plugin semantics. @@ -323,10 +348,10 @@ pi_result cuda_piEventRetain(pi_event event); /// \endcond -_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue) +_pi_event::_pi_event(pi_command_type type, pi_context context, pi_queue queue, CUstream stream) : commandType_{type}, refCount_{1}, hasBeenWaitedOn_{false}, isRecorded_{false}, isStarted_{false}, evEnd_{nullptr}, evStart_{nullptr}, - evQueued_{nullptr}, queue_{queue}, context_{context} { + evQueued_{nullptr}, queue_{queue}, stream_{stream}, context_{context} { bool profilingEnabled = queue_->properties_ & PI_QUEUE_PROFILING_ENABLE; @@ -472,9 +497,9 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the cuStreamWaitEvent call is used. // This makes all future work submitted to stream wait for all // work captured in event. - if (queue->get() != event->get_queue()->get()) { - return PI_CHECK_ERROR(cuStreamWaitEvent(queue->get(), event->get(), 0)); - } + PI_CHECK_ERROR(cuStreamWaitEvent(queue->get_h2d(), event->get(), 0)); + PI_CHECK_ERROR(cuStreamWaitEvent(queue->get_d2h(), event->get(), 0)); + PI_CHECK_ERROR(cuStreamWaitEvent(queue->get_compute(), event->get(), 0)); return PI_SUCCESS; } @@ -2205,7 +2230,7 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, ScopedContext active(context); - CUstream cuStream; + CUstream cuStream_h2d, cuStream_d2h, cuStream_compute; unsigned int flags = 0; if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { @@ -2216,13 +2241,21 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } - err = PI_CHECK_ERROR(cuStreamCreate(&cuStream, flags)); + err = PI_CHECK_ERROR(cuStreamCreate(&cuStream_h2d, flags)); + if (err != PI_SUCCESS) { + return err; + } + err = PI_CHECK_ERROR(cuStreamCreate(&cuStream_d2h, flags)); + if (err != PI_SUCCESS) { + return err; + } + err = PI_CHECK_ERROR(cuStreamCreate(&cuStream_compute, flags)); if (err != PI_SUCCESS) { return err; } queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{cuStream, context, device, properties}); + new _pi_queue{cuStream_h2d, cuStream_d2h, cuStream_compute, context, device, properties}); *queue = queueImpl.release(); @@ -2282,9 +2315,15 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { ScopedContext active(command_queue->get_context()); - auto stream = queueImpl->stream_; - PI_CHECK_ERROR(cuStreamSynchronize(stream)); - PI_CHECK_ERROR(cuStreamDestroy(stream)); + auto stream_h2d = queueImpl->stream_h2d_; + PI_CHECK_ERROR(cuStreamSynchronize(stream_h2d)); + PI_CHECK_ERROR(cuStreamDestroy(stream_h2d)); + auto stream_d2h = queueImpl->stream_d2h_; + PI_CHECK_ERROR(cuStreamSynchronize(stream_d2h)); + PI_CHECK_ERROR(cuStreamDestroy(stream_d2h)); + auto stream_compute = queueImpl->stream_compute_; + PI_CHECK_ERROR(cuStreamSynchronize(stream_compute)); + PI_CHECK_ERROR(cuStreamDestroy(stream_compute)); return PI_SUCCESS; } catch (pi_result err) { @@ -2304,7 +2343,9 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { assert(command_queue != nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code ScopedContext active(command_queue->get_context()); - result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->stream_)); + result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->get_h2d())); + result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->get_d2h())); + result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->get_compute())); } catch (pi_result err) { @@ -2368,19 +2409,18 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_h2d(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); - retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_WRITE, command_queue, cuStream)); retImplEv->start(); } @@ -2414,19 +2454,18 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_d2h(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); - - retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_READ, command_queue, cuStream)); retImplEv->start(); } @@ -2744,13 +2783,10 @@ pi_result cuda_piEnqueueKernelLaunch( std::unique_ptr<_pi_event> retImplEv{nullptr}; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_compute(); CUfunction cuFunc = kernel->get(); - if (event_wait_list) { - retError = cuda_piEnqueueEventsWait( - command_queue, num_events_in_wait_list, event_wait_list, nullptr); - } + retError = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); // Set the implicit global offset parameter if kernel has offset variant if (kernel->get_with_offset_parameter()) { @@ -2772,7 +2808,7 @@ pi_result cuda_piEnqueueKernelLaunch( if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue)); + PI_COMMAND_TYPE_NDRANGE_KERNEL, command_queue, cuStream)); retImplEv->start(); } @@ -3585,7 +3621,7 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue); + *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, command_queue->get_compute()); (*event)->start(); (*event)->record(); } @@ -3835,19 +3871,18 @@ pi_result cuda_piEnqueueMemBufferReadRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_d2h(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); - - retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_READ_RECT, command_queue, cuStream)); retImplEv->start(); } @@ -3886,19 +3921,17 @@ pi_result cuda_piEnqueueMemBufferWriteRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_h2d(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); - - retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_WRITE_RECT, command_queue, cuStream)); retImplEv->start(); } @@ -3939,21 +3972,17 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, try { ScopedContext active(command_queue->get_context()); - - if (event_wait_list) { - cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - pi_result result; + auto stream = command_queue->get_compute(); + result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); + if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_COPY, command_queue, stream)); result = retImplEv->start(); } - auto stream = command_queue->get(); auto src = src_buffer->mem_.buffer_mem_.get() + src_offset; auto dst = dst_buffer->mem_.buffer_mem_.get() + dst_offset; @@ -3985,20 +4014,18 @@ pi_result cuda_piEnqueueMemBufferCopyRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_compute(); CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); - - retErr = cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_COPY_RECT, command_queue, cuStream)); retImplEv->start(); } @@ -4046,21 +4073,18 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); - if (event_wait_list) { - cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } - + auto stream = command_queue->get_compute(); pi_result result; + result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); + if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue)); + PI_COMMAND_TYPE_MEM_BUFFER_FILL, command_queue, stream)); result = retImplEv->start(); } auto dstDevice = buffer->mem_.buffer_mem_.get() + offset; - auto stream = command_queue->get(); auto N = size / pattern_size; // pattern size in bytes @@ -4221,15 +4245,11 @@ pi_result cuda_piEnqueueMemImageRead( assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_d2h(); try { ScopedContext active(command_queue->get_context()); - - if (event_wait_list) { - cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); CUarray array = image->mem_.surface_mem_.get_array(); @@ -4260,7 +4280,7 @@ pi_result cuda_piEnqueueMemImageRead( if (event) { auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, command_queue); + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, command_queue, cuStream); new_event->record(); *event = new_event; } @@ -4294,15 +4314,11 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_h2d(); try { ScopedContext active(command_queue->get_context()); - - if (event_wait_list) { - cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); CUarray array = image->mem_.surface_mem_.get_array(); @@ -4333,7 +4349,7 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, if (event) { auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, command_queue); + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, command_queue, cuStream); new_event->record(); *event = new_event; } @@ -4359,15 +4375,11 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, dst_image->mem_.surface_mem_.get_image_type()); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_compute(); try { ScopedContext active(command_queue->get_context()); - - if (event_wait_list) { - cuda_piEnqueueEventsWait(command_queue, num_events_in_wait_list, - event_wait_list, nullptr); - } + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); CUarray srcArray = src_image->mem_.surface_mem_.get_array(); CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); @@ -4408,7 +4420,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, if (event) { auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, command_queue); + _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, command_queue, cuStream); new_event->record(); *event = new_event; } @@ -4479,7 +4491,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, - command_queue); + command_queue, command_queue->get_d2h()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4533,7 +4545,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - command_queue); + command_queue, command_queue->get_h2d()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4653,17 +4665,16 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_event *event) { assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get(); + CUstream cuStream = queue->get_compute(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; try { ScopedContext active(queue->get_context()); - result = cuda_piEnqueueEventsWait(queue, num_events_in_waitlist, - events_waitlist, nullptr); + result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue)); + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream)); event_ptr->start(); } result = PI_CHECK_ERROR(cuMemsetD8Async( @@ -4678,6 +4689,12 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, return result; } +pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, + pi_mem_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, @@ -4687,17 +4704,42 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, assert(queue != nullptr); assert(dst_ptr != nullptr); assert(src_ptr != nullptr); - CUstream cuStream = queue->get(); + _pi_usm_type src_type, dst_type; pi_result result = PI_SUCCESS; + result = cuda_piextUSMGetMemAllocInfo(queue->get_context(), dst_ptr, + PI_MEM_ALLOC_TYPE, + sizeof(_pi_usm_type), + &dst_type, + nullptr); + result = cuda_piextUSMGetMemAllocInfo(queue->get_context(), src_ptr, + PI_MEM_ALLOC_TYPE, + sizeof(_pi_usm_type), + &src_type, + nullptr); + CUstream cuStream; + if((src_type==PI_MEM_TYPE_DEVICE || src_type==PI_MEM_TYPE_SHARED) && + (dst_type == PI_MEM_TYPE_UNKNOWN || dst_type == PI_MEM_TYPE_HOST)){ + cuStream = queue->get_d2h(); + std::cout << "doing d2h"<< std::endl; + } + else if((dst_type==PI_MEM_TYPE_DEVICE || dst_type==PI_MEM_TYPE_SHARED) && + (src_type == PI_MEM_TYPE_UNKNOWN || src_type == PI_MEM_TYPE_HOST)){ + cuStream = queue->get_h2d(); + std::cout << "doing h2d"<< std::endl; + } + else{ + cuStream = queue->get_compute(); + std::cout << "doing other"<< std::endl; + } + std::unique_ptr<_pi_event> event_ptr{nullptr}; try { ScopedContext active(queue->get_context()); - result = cuda_piEnqueueEventsWait(queue, num_events_in_waitlist, - events_waitlist, nullptr); + result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue)); + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); event_ptr->start(); } result = PI_CHECK_ERROR(cuMemcpyAsync( @@ -4737,17 +4779,16 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, return PI_INVALID_VALUE; assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get(); + CUstream cuStream = queue->get_h2d(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; try { ScopedContext active(queue->get_context()); - result = cuda_piEnqueueEventsWait(queue, num_events_in_waitlist, - events_waitlist, nullptr); + result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue)); + _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); event_ptr->start(); } result = PI_CHECK_ERROR(cuMemPrefetchAsync( @@ -4778,7 +4819,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (event) { event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_USER, queue)); + _pi_event::make_native(PI_COMMAND_TYPE_USER, queue, queue->get_compute())); event_ptr->start(); } diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 0d10142caf07..bf7629e550e7 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -380,16 +380,16 @@ struct _pi_mem { struct _pi_queue { using native_type = CUstream; - native_type stream_; + native_type stream_h2d_, stream_d2h_, stream_compute_; _pi_context *context_; _pi_device *device_; pi_queue_properties properties_; std::atomic_uint32_t refCount_; std::atomic_uint32_t eventCount_; - _pi_queue(CUstream stream, _pi_context *context, _pi_device *device, + _pi_queue(CUstream stream_h2d, CUstream stream_d2h, CUstream stream_compute, _pi_context *context, _pi_device *device, pi_queue_properties properties) - : stream_{stream}, context_{context}, device_{device}, + : stream_h2d_{stream_h2d}, stream_d2h_{stream_d2h}, stream_compute_(stream_compute), context_{context}, device_{device}, properties_{properties}, refCount_{1}, eventCount_{0} { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); @@ -400,7 +400,10 @@ struct _pi_queue { cuda_piDeviceRelease(device_); } - native_type get() const noexcept { return stream_; }; + native_type get() const noexcept { return stream_compute_; }; + native_type get_h2d() const noexcept { return stream_h2d_; }; + native_type get_d2h() const noexcept { return stream_d2h_; }; + native_type get_compute() const noexcept { return stream_compute_; }; _pi_context *get_context() const { return context_; }; @@ -431,6 +434,8 @@ struct _pi_event { pi_queue get_queue() const noexcept { return queue_; } + CUstream get_stream() const noexcept { return stream_; } + pi_command_type get_command_type() const noexcept { return commandType_; } pi_uint32 get_reference_count() const noexcept { return refCount_; } @@ -474,8 +479,8 @@ struct _pi_event { pi_uint64 get_end_time() const; // construct a native CUDA. This maps closely to the underlying CUDA event. - static pi_event make_native(pi_command_type type, pi_queue queue) { - return new _pi_event(type, queue->get_context(), queue); + static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream) { + return new _pi_event(type, queue->get_context(), queue, stream); } pi_result release(); @@ -485,7 +490,7 @@ struct _pi_event { private: // This constructor is private to force programmers to use the make_native / // make_user static members in order to create a pi_event for CUDA. - _pi_event(pi_command_type type, pi_context context, pi_queue queue); + _pi_event(pi_command_type type, pi_context context, pi_queue queue, CUstream stream); pi_command_type commandType_; // The type of command associated with event. @@ -514,6 +519,9 @@ struct _pi_event { pi_queue queue_; // pi_queue associated with the event. If this is a user // event, this will be nullptr. + CUstream stream_; // CUstream associated with the event. If this is a user + // event, this will be uninitialized. + pi_context context_; // pi_context associated with the event. If this is a // native event, this will be the same context associated // with the queue_ member. From f64d3a37fe9500bda6172cb8bd439b113acccb85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 18 Mar 2022 08:12:54 +0000 Subject: [PATCH 02/17] fix stream in event member functions --- sycl/plugins/cuda/pi_cuda.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 789eecc0b125..d51832d49d7f 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -384,7 +384,7 @@ pi_result _pi_event::start() { if (queue_->properties_ & PI_QUEUE_PROFILING_ENABLE) { // NOTE: This relies on the default stream to be unused. result = PI_CHECK_ERROR(cuEventRecord(evQueued_, 0)); - result = PI_CHECK_ERROR(cuEventRecord(evStart_, queue_->get())); + result = PI_CHECK_ERROR(cuEventRecord(evStart_, stream_)); } } catch (pi_result error) { result = error; @@ -448,15 +448,13 @@ pi_result _pi_event::record() { return PI_INVALID_QUEUE; } - CUstream cuStream = queue_->get(); - try { eventId_ = queue_->get_next_event_id(); if (eventId_ == 0) { cl::sycl::detail::pi::die( "Unrecoverable program state reached in event identifier overflow"); } - result = PI_CHECK_ERROR(cuEventRecord(evEnd_, cuStream)); + result = PI_CHECK_ERROR(cuEventRecord(evEnd_, stream_)); } catch (pi_result error) { result = error; } @@ -4720,16 +4718,13 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, if((src_type==PI_MEM_TYPE_DEVICE || src_type==PI_MEM_TYPE_SHARED) && (dst_type == PI_MEM_TYPE_UNKNOWN || dst_type == PI_MEM_TYPE_HOST)){ cuStream = queue->get_d2h(); - std::cout << "doing d2h"<< std::endl; } else if((dst_type==PI_MEM_TYPE_DEVICE || dst_type==PI_MEM_TYPE_SHARED) && (src_type == PI_MEM_TYPE_UNKNOWN || src_type == PI_MEM_TYPE_HOST)){ cuStream = queue->get_h2d(); - std::cout << "doing h2d"<< std::endl; } else{ cuStream = queue->get_compute(); - std::cout << "doing other"<< std::endl; } std::unique_ptr<_pi_event> event_ptr{nullptr}; From c95ac197f349099cead085c8ae96af8dbb529c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 18 Mar 2022 09:06:39 +0000 Subject: [PATCH 03/17] introduce pool of threads --- sycl/plugins/cuda/pi_cuda.cpp | 104 ++++++++++++---------------------- sycl/plugins/cuda/pi_cuda.hpp | 44 +++++++------- 2 files changed, 59 insertions(+), 89 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index d51832d49d7f..67f6b4b3077b 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -495,9 +495,9 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the cuStreamWaitEvent call is used. // This makes all future work submitted to stream wait for all // work captured in event. - PI_CHECK_ERROR(cuStreamWaitEvent(queue->get_h2d(), event->get(), 0)); - PI_CHECK_ERROR(cuStreamWaitEvent(queue->get_d2h(), event->get(), 0)); - PI_CHECK_ERROR(cuStreamWaitEvent(queue->get_compute(), event->get(), 0)); + for(CUstream s : queue->get_all()){ + PI_CHECK_ERROR(cuStreamWaitEvent(s, event->get(), 0)); + } return PI_SUCCESS; } @@ -2228,7 +2228,7 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, ScopedContext active(context); - CUstream cuStream_h2d, cuStream_d2h, cuStream_compute; + std::vector cuStreams(300); unsigned int flags = 0; if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { @@ -2239,21 +2239,15 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } - err = PI_CHECK_ERROR(cuStreamCreate(&cuStream_h2d, flags)); - if (err != PI_SUCCESS) { - return err; - } - err = PI_CHECK_ERROR(cuStreamCreate(&cuStream_d2h, flags)); - if (err != PI_SUCCESS) { - return err; - } - err = PI_CHECK_ERROR(cuStreamCreate(&cuStream_compute, flags)); - if (err != PI_SUCCESS) { - return err; + for(CUstream& s : cuStreams){ + err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); + if (err != PI_SUCCESS) { + return err; + } } queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{cuStream_h2d, cuStream_d2h, cuStream_compute, context, device, properties}); + new _pi_queue{std::move(cuStreams), context, device, properties}); *queue = queueImpl.release(); @@ -2313,15 +2307,10 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { ScopedContext active(command_queue->get_context()); - auto stream_h2d = queueImpl->stream_h2d_; - PI_CHECK_ERROR(cuStreamSynchronize(stream_h2d)); - PI_CHECK_ERROR(cuStreamDestroy(stream_h2d)); - auto stream_d2h = queueImpl->stream_d2h_; - PI_CHECK_ERROR(cuStreamSynchronize(stream_d2h)); - PI_CHECK_ERROR(cuStreamDestroy(stream_d2h)); - auto stream_compute = queueImpl->stream_compute_; - PI_CHECK_ERROR(cuStreamSynchronize(stream_compute)); - PI_CHECK_ERROR(cuStreamDestroy(stream_compute)); + for(CUstream s : queueImpl->get_all()){ + PI_CHECK_ERROR(cuStreamSynchronize(s)); + PI_CHECK_ERROR(cuStreamDestroy(s)); + } return PI_SUCCESS; } catch (pi_result err) { @@ -2341,9 +2330,10 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { assert(command_queue != nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code ScopedContext active(command_queue->get_context()); - result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->get_h2d())); - result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->get_d2h())); - result = PI_CHECK_ERROR(cuStreamSynchronize(command_queue->get_compute())); + + for(CUstream s : command_queue->get_all()){ + result = PI_CHECK_ERROR(cuStreamSynchronize(s)); + } } catch (pi_result err) { @@ -2407,7 +2397,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_h2d(); + CUstream cuStream = command_queue->get(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -2452,7 +2442,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_d2h(); + CUstream cuStream = command_queue->get(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -2781,7 +2771,7 @@ pi_result cuda_piEnqueueKernelLaunch( std::unique_ptr<_pi_event> retImplEv{nullptr}; - CUstream cuStream = command_queue->get_compute(); + CUstream cuStream = command_queue->get(); CUfunction cuFunc = kernel->get(); retError = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -3619,7 +3609,7 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, command_queue->get_compute()); + *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, command_queue->get()); (*event)->start(); (*event)->record(); } @@ -3869,7 +3859,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_d2h(); + CUstream cuStream = command_queue->get(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -3919,7 +3909,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_h2d(); + CUstream cuStream = command_queue->get(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -3972,7 +3962,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, ScopedContext active(command_queue->get_context()); pi_result result; - auto stream = command_queue->get_compute(); + auto stream = command_queue->get(); result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); if (event) { @@ -4012,7 +4002,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_compute(); + CUstream cuStream = command_queue->get(); CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -4071,7 +4061,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); - auto stream = command_queue->get_compute(); + auto stream = command_queue->get(); pi_result result; result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); @@ -4243,7 +4233,7 @@ pi_result cuda_piEnqueueMemImageRead( assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_d2h(); + CUstream cuStream = command_queue->get(); try { ScopedContext active(command_queue->get_context()); @@ -4312,7 +4302,7 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_h2d(); + CUstream cuStream = command_queue->get(); try { ScopedContext active(command_queue->get_context()); @@ -4373,7 +4363,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, dst_image->mem_.surface_mem_.get_image_type()); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_compute(); + CUstream cuStream = command_queue->get(); try { ScopedContext active(command_queue->get_context()); @@ -4489,7 +4479,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, - command_queue, command_queue->get_d2h()); + command_queue, command_queue->get()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4543,7 +4533,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - command_queue, command_queue->get_h2d()); + command_queue, command_queue->get()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4663,7 +4653,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_event *event) { assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get_compute(); + CUstream cuStream = queue->get(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; @@ -4702,30 +4692,8 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, assert(queue != nullptr); assert(dst_ptr != nullptr); assert(src_ptr != nullptr); - _pi_usm_type src_type, dst_type; pi_result result = PI_SUCCESS; - result = cuda_piextUSMGetMemAllocInfo(queue->get_context(), dst_ptr, - PI_MEM_ALLOC_TYPE, - sizeof(_pi_usm_type), - &dst_type, - nullptr); - result = cuda_piextUSMGetMemAllocInfo(queue->get_context(), src_ptr, - PI_MEM_ALLOC_TYPE, - sizeof(_pi_usm_type), - &src_type, - nullptr); - CUstream cuStream; - if((src_type==PI_MEM_TYPE_DEVICE || src_type==PI_MEM_TYPE_SHARED) && - (dst_type == PI_MEM_TYPE_UNKNOWN || dst_type == PI_MEM_TYPE_HOST)){ - cuStream = queue->get_d2h(); - } - else if((dst_type==PI_MEM_TYPE_DEVICE || dst_type==PI_MEM_TYPE_SHARED) && - (src_type == PI_MEM_TYPE_UNKNOWN || src_type == PI_MEM_TYPE_HOST)){ - cuStream = queue->get_h2d(); - } - else{ - cuStream = queue->get_compute(); - } + CUstream cuStream = queue->get(); std::unique_ptr<_pi_event> event_ptr{nullptr}; @@ -4774,7 +4742,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, return PI_INVALID_VALUE; assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get_h2d(); + CUstream cuStream = queue->get(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; @@ -4814,7 +4782,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (event) { event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_USER, queue, queue->get_compute())); + _pi_event::make_native(PI_COMMAND_TYPE_USER, queue, queue->get())); event_ptr->start(); } diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index bf7629e550e7..a545ad629a83 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -36,12 +36,12 @@ extern "C" { /// \cond IGNORE_BLOCK_IN_DOXYGEN -pi_result cuda_piContextRetain(pi_context ); -pi_result cuda_piContextRelease(pi_context ); -pi_result cuda_piDeviceRelease(pi_device ); -pi_result cuda_piDeviceRetain(pi_device ); -pi_result cuda_piProgramRetain(pi_program ); -pi_result cuda_piProgramRelease(pi_program ); +pi_result cuda_piContextRetain(pi_context); +pi_result cuda_piContextRelease(pi_context); +pi_result cuda_piDeviceRelease(pi_device); +pi_result cuda_piDeviceRetain(pi_device); +pi_result cuda_piProgramRetain(pi_program); +pi_result cuda_piProgramRelease(pi_program); pi_result cuda_piQueueRelease(pi_queue); pi_result cuda_piQueueRetain(pi_queue); pi_result cuda_piMemRetain(pi_mem); @@ -380,17 +380,18 @@ struct _pi_mem { struct _pi_queue { using native_type = CUstream; - native_type stream_h2d_, stream_d2h_, stream_compute_; + std::vector streams_; _pi_context *context_; _pi_device *device_; pi_queue_properties properties_; std::atomic_uint32_t refCount_; std::atomic_uint32_t eventCount_; + std::atomic_uint32_t stream_idx_; - _pi_queue(CUstream stream_h2d, CUstream stream_d2h, CUstream stream_compute, _pi_context *context, _pi_device *device, - pi_queue_properties properties) - : stream_h2d_{stream_h2d}, stream_d2h_{stream_d2h}, stream_compute_(stream_compute), context_{context}, device_{device}, - properties_{properties}, refCount_{1}, eventCount_{0} { + _pi_queue(std::vector &&streams, _pi_context *context, + _pi_device *device, pi_queue_properties properties) + : streams_{std::move(streams)}, context_{context}, device_{device}, + properties_{properties}, refCount_{1}, eventCount_{0}, stream_idx_{0} { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -400,10 +401,10 @@ struct _pi_queue { cuda_piDeviceRelease(device_); } - native_type get() const noexcept { return stream_compute_; }; - native_type get_h2d() const noexcept { return stream_h2d_; }; - native_type get_d2h() const noexcept { return stream_d2h_; }; - native_type get_compute() const noexcept { return stream_compute_; }; + native_type get() noexcept { + return streams_[stream_idx_++ % streams_.size()]; + }; + const std::vector &get_all() const noexcept { return streams_; }; _pi_context *get_context() const { return context_; }; @@ -479,7 +480,8 @@ struct _pi_event { pi_uint64 get_end_time() const; // construct a native CUDA. This maps closely to the underlying CUDA event. - static pi_event make_native(pi_command_type type, pi_queue queue, CUstream stream) { + static pi_event make_native(pi_command_type type, pi_queue queue, + CUstream stream) { return new _pi_event(type, queue->get_context(), queue, stream); } @@ -490,7 +492,8 @@ struct _pi_event { private: // This constructor is private to force programmers to use the make_native / // make_user static members in order to create a pi_event for CUDA. - _pi_event(pi_command_type type, pi_context context, pi_queue queue, CUstream stream); + _pi_event(pi_command_type type, pi_context context, pi_queue queue, + CUstream stream); pi_command_type commandType_; // The type of command associated with event. @@ -520,7 +523,7 @@ struct _pi_event { // event, this will be nullptr. CUstream stream_; // CUstream associated with the event. If this is a user - // event, this will be uninitialized. + // event, this will be uninitialized. pi_context context_; // pi_context associated with the event. If this is a // native event, this will be the same context associated @@ -555,7 +558,7 @@ struct _pi_program { pi_result set_binary(const char *binary, size_t binarySizeInBytes); - pi_result build_program(const char* build_options); + pi_result build_program(const char *build_options); pi_context get_context() const { return context_; }; @@ -696,8 +699,7 @@ struct _pi_kernel { assert(retError == PI_SUCCESS); } - ~_pi_kernel() - { + ~_pi_kernel() { cuda_piProgramRelease(program_); cuda_piContextRelease(context_); } From 8ba1fb9163c40ecf64493cd1f5a512cf55ecc80c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 13 Apr 2022 08:08:06 +0100 Subject: [PATCH 04/17] use two pools --- sycl/plugins/cuda/pi_cuda.cpp | 74 ++++++++++++++++++++--------------- sycl/plugins/cuda/pi_cuda.hpp | 22 +++++++---- 2 files changed, 57 insertions(+), 39 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index a5117324c91e..37b0324327e6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -495,7 +495,10 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the cuStreamWaitEvent call is used. // This makes all future work submitted to stream wait for all // work captured in event. - for(CUstream s : queue->get_all()){ + for(CUstream s : queue->get_all_compute()){ + PI_CHECK_ERROR(cuStreamWaitEvent(s, event->get(), 0)); + } + for(CUstream s : queue->get_all_transfer()){ PI_CHECK_ERROR(cuStreamWaitEvent(s, event->get(), 0)); } return PI_SUCCESS; @@ -2243,7 +2246,8 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, ScopedContext active(context); - std::vector cuStreams(300); + std::vector computeCuStreams(128); + std::vector transferCuStreams(64); unsigned int flags = 0; if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { @@ -2254,7 +2258,13 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } - for(CUstream& s : cuStreams){ + for(CUstream& s : computeCuStreams){ + err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); + if (err != PI_SUCCESS) { + return err; + } + } + for(CUstream& s : transferCuStreams){ err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); if (err != PI_SUCCESS) { return err; @@ -2262,7 +2272,7 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, } queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{std::move(cuStreams), context, device, properties}); + new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, device, properties}); *queue = queueImpl.release(); @@ -2322,7 +2332,12 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { ScopedContext active(command_queue->get_context()); - for(CUstream s : queueImpl->get_all()){ + for(CUstream s : queueImpl->get_all_compute()){ + PI_CHECK_ERROR(cuStreamSynchronize(s)); + PI_CHECK_ERROR(cuStreamDestroy(s)); + } + + for(CUstream s : queueImpl->get_all_transfer()){ PI_CHECK_ERROR(cuStreamSynchronize(s)); PI_CHECK_ERROR(cuStreamDestroy(s)); } @@ -2346,7 +2361,10 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code ScopedContext active(command_queue->get_context()); - for(CUstream s : command_queue->get_all()){ + for(CUstream s : command_queue->get_all_compute()){ + result = PI_CHECK_ERROR(cuStreamSynchronize(s)); + } + for(CUstream s : command_queue->get_all_transfer()){ result = PI_CHECK_ERROR(cuStreamSynchronize(s)); } @@ -2378,7 +2396,7 @@ pi_result cuda_piQueueFlush(pi_queue command_queue) { /// \return PI_SUCCESS pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle) { - *nativeHandle = reinterpret_cast(queue->get()); + *nativeHandle = reinterpret_cast(queue->get_compute()); return PI_SUCCESS; } @@ -2412,7 +2430,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -2457,7 +2475,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -2786,7 +2804,7 @@ pi_result cuda_piEnqueueKernelLaunch( std::unique_ptr<_pi_event> retImplEv{nullptr}; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_compute(); CUfunction cuFunc = kernel->get(); retError = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -3624,7 +3642,7 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, command_queue->get()); + *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, command_queue->get_compute()); (*event)->start(); (*event)->record(); } @@ -3874,7 +3892,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -3924,7 +3942,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -3977,7 +3995,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, ScopedContext active(command_queue->get_context()); pi_result result; - auto stream = command_queue->get(); + auto stream = command_queue->get_transfer(); result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); if (event) { @@ -4017,7 +4035,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; @@ -4076,7 +4094,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); - auto stream = command_queue->get(); + auto stream = command_queue->get_transfer(); pi_result result; result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); @@ -4248,7 +4266,7 @@ pi_result cuda_piEnqueueMemImageRead( assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); try { ScopedContext active(command_queue->get_context()); @@ -4317,7 +4335,7 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); try { ScopedContext active(command_queue->get_context()); @@ -4378,7 +4396,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, dst_image->mem_.surface_mem_.get_image_type()); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get(); + CUstream cuStream = command_queue->get_transfer(); try { ScopedContext active(command_queue->get_context()); @@ -4494,7 +4512,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, - command_queue, command_queue->get()); + command_queue, command_queue->get_transfer()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4548,7 +4566,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - command_queue, command_queue->get()); + command_queue, command_queue->get_transfer()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4668,7 +4686,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_event *event) { assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get(); + CUstream cuStream = queue->get_compute(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; @@ -4692,12 +4710,6 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, return result; } -pi_result cuda_piextUSMGetMemAllocInfo(pi_context context, const void *ptr, - pi_mem_info param_name, - size_t param_value_size, - void *param_value, - size_t *param_value_size_ret); - pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, void *dst_ptr, const void *src_ptr, size_t size, @@ -4708,7 +4720,7 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, assert(dst_ptr != nullptr); assert(src_ptr != nullptr); pi_result result = PI_SUCCESS; - CUstream cuStream = queue->get(); + CUstream cuStream = queue->get_transfer(); std::unique_ptr<_pi_event> event_ptr{nullptr}; @@ -4757,7 +4769,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, return PI_INVALID_VALUE; assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get(); + CUstream cuStream = queue->get_transfer(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; @@ -4797,7 +4809,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (event) { event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_USER, queue, queue->get())); + _pi_event::make_native(PI_COMMAND_TYPE_USER, queue, queue->get_transfer())); event_ptr->start(); } diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index a545ad629a83..b22d96907012 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -380,18 +380,20 @@ struct _pi_mem { struct _pi_queue { using native_type = CUstream; - std::vector streams_; + std::vector compute_streams_; + std::vector transfer_streams_; _pi_context *context_; _pi_device *device_; pi_queue_properties properties_; std::atomic_uint32_t refCount_; std::atomic_uint32_t eventCount_; - std::atomic_uint32_t stream_idx_; + std::atomic_uint32_t compute_stream_idx_; + std::atomic_uint32_t transfer_stream_idx_; - _pi_queue(std::vector &&streams, _pi_context *context, + _pi_queue(std::vector &&compute_streams_, std::vector &&transfer_streams, _pi_context *context, _pi_device *device, pi_queue_properties properties) - : streams_{std::move(streams)}, context_{context}, device_{device}, - properties_{properties}, refCount_{1}, eventCount_{0}, stream_idx_{0} { + : compute_streams_{std::move(compute_streams_)}, transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, + properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0} { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -401,10 +403,14 @@ struct _pi_queue { cuda_piDeviceRelease(device_); } - native_type get() noexcept { - return streams_[stream_idx_++ % streams_.size()]; + native_type get_compute() noexcept { + return compute_streams_[compute_stream_idx_++ % compute_streams_.size()]; }; - const std::vector &get_all() const noexcept { return streams_; }; + const std::vector &get_all_compute() const noexcept { return compute_streams_; }; + native_type get_transfer() noexcept { + return transfer_streams_[transfer_stream_idx_++ % transfer_streams_.size()]; + }; + const std::vector &get_all_transfer() const noexcept { return transfer_streams_; }; _pi_context *get_context() const { return context_; }; From 6a748887e0d7095d62b9f7110cd0907b1a423743 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Wed, 13 Apr 2022 10:03:51 +0100 Subject: [PATCH 05/17] add suport for in order queue --- sycl/plugins/cuda/pi_cuda.cpp | 18 +++++++++++------- sycl/plugins/cuda/pi_cuda.hpp | 7 ++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 37b0324327e6..ac32f5ddcc23 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2246,10 +2246,7 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, ScopedContext active(context); - std::vector computeCuStreams(128); - std::vector transferCuStreams(64); unsigned int flags = 0; - if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { flags = CU_STREAM_DEFAULT; } else if (properties == __SYCL_PI_CUDA_SYNC_WITH_DEFAULT) { @@ -2258,16 +2255,23 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } + const bool is_ooo = + properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + + std::vector computeCuStreams(is_ooo ? 128 : 1); + std::vector transferCuStreams(is_ooo ? 64 : 0); for(CUstream& s : computeCuStreams){ err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); if (err != PI_SUCCESS) { return err; } } - for(CUstream& s : transferCuStreams){ - err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); - if (err != PI_SUCCESS) { - return err; + if(is_ooo){ + for(CUstream& s : transferCuStreams){ + err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); + if (err != PI_SUCCESS) { + return err; + } } } diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index b22d96907012..56bf5012a14d 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -408,9 +408,14 @@ struct _pi_queue { }; const std::vector &get_all_compute() const noexcept { return compute_streams_; }; native_type get_transfer() noexcept { + if(!(properties_ & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){ + return get_compute(); + } return transfer_streams_[transfer_stream_idx_++ % transfer_streams_.size()]; }; - const std::vector &get_all_transfer() const noexcept { return transfer_streams_; }; + const std::vector &get_all_transfer() const noexcept { + return transfer_streams_; + }; _pi_context *get_context() const { return context_; }; From 3b78a297bf279053910b7a033ea9ec2185abce07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 14 Apr 2022 11:41:11 +0100 Subject: [PATCH 06/17] use for_each_stream instead of get_all --- sycl/plugins/cuda/pi_cuda.cpp | 31 +++++++++---------------------- sycl/plugins/cuda/pi_cuda.hpp | 14 ++++++++++---- 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index ac32f5ddcc23..c79d5926e70d 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -495,12 +495,9 @@ pi_result enqueueEventWait(pi_queue queue, pi_event event) { // for native events, the cuStreamWaitEvent call is used. // This makes all future work submitted to stream wait for all // work captured in event. - for(CUstream s : queue->get_all_compute()){ - PI_CHECK_ERROR(cuStreamWaitEvent(s, event->get(), 0)); - } - for(CUstream s : queue->get_all_transfer()){ - PI_CHECK_ERROR(cuStreamWaitEvent(s, event->get(), 0)); - } + queue->for_each_stream([e=event->get()](CUstream s){ + PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0)); + }); return PI_SUCCESS; } @@ -2335,16 +2332,11 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { std::unique_ptr<_pi_queue> queueImpl(command_queue); ScopedContext active(command_queue->get_context()); - - for(CUstream s : queueImpl->get_all_compute()){ - PI_CHECK_ERROR(cuStreamSynchronize(s)); - PI_CHECK_ERROR(cuStreamDestroy(s)); - } - - for(CUstream s : queueImpl->get_all_transfer()){ + + command_queue->for_each_stream([](CUstream s){ PI_CHECK_ERROR(cuStreamSynchronize(s)); PI_CHECK_ERROR(cuStreamDestroy(s)); - } + }); return PI_SUCCESS; } catch (pi_result err) { @@ -2355,9 +2347,7 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { } pi_result cuda_piQueueFinish(pi_queue command_queue) { - - // set default result to a negative result (avoid false-positve tests) - pi_result result = PI_OUT_OF_HOST_MEMORY; + pi_result result = PI_SUCCESS; try { @@ -2365,12 +2355,9 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code ScopedContext active(command_queue->get_context()); - for(CUstream s : command_queue->get_all_compute()){ + command_queue->for_each_stream([&result](CUstream s) mutable { result = PI_CHECK_ERROR(cuStreamSynchronize(s)); - } - for(CUstream s : command_queue->get_all_transfer()){ - result = PI_CHECK_ERROR(cuStreamSynchronize(s)); - } + }); } catch (pi_result err) { diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 56bf5012a14d..f489fbac67da 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -406,16 +406,22 @@ struct _pi_queue { native_type get_compute() noexcept { return compute_streams_[compute_stream_idx_++ % compute_streams_.size()]; }; - const std::vector &get_all_compute() const noexcept { return compute_streams_; }; native_type get_transfer() noexcept { if(!(properties_ & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){ return get_compute(); } return transfer_streams_[transfer_stream_idx_++ % transfer_streams_.size()]; }; - const std::vector &get_all_transfer() const noexcept { - return transfer_streams_; - }; + + template + void for_each_stream(T&& f){ + for(unsigned int i=0;i Date: Fri, 15 Apr 2022 07:56:43 +0100 Subject: [PATCH 07/17] dynamic stream allocation --- sycl/plugins/cuda/pi_cuda.cpp | 67 ++++++++++++++++++++--------------- sycl/plugins/cuda/pi_cuda.hpp | 25 +++++++------ 2 files changed, 50 insertions(+), 42 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c79d5926e70d..89194ad616f4 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -348,6 +348,30 @@ pi_result cuda_piEventRetain(pi_event event); /// \endcond + +_pi_queue::native_type _pi_queue::get_compute() noexcept { + if(n_compute_streams_ < compute_streams_.size()){ + unsigned int idx = n_compute_streams_++; + if(idx queueImpl{nullptr}; if (context->get_device() != device) { @@ -2241,7 +2263,7 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, return PI_INVALID_DEVICE; } - ScopedContext active(context); + //ScopedContext active(context); unsigned int flags = 0; if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { @@ -2257,23 +2279,9 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, std::vector computeCuStreams(is_ooo ? 128 : 1); std::vector transferCuStreams(is_ooo ? 64 : 0); - for(CUstream& s : computeCuStreams){ - err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); - if (err != PI_SUCCESS) { - return err; - } - } - if(is_ooo){ - for(CUstream& s : transferCuStreams){ - err = PI_CHECK_ERROR(cuStreamCreate(&s, flags)); - if (err != PI_SUCCESS) { - return err; - } - } - } queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, device, properties}); + new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, device, properties, flags}); *queue = queueImpl.release(); @@ -2387,6 +2395,7 @@ pi_result cuda_piQueueFlush(pi_queue command_queue) { /// \return PI_SUCCESS pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle) { + ScopedContext active(queue->get_context()); *nativeHandle = reinterpret_cast(queue->get_compute()); return PI_SUCCESS; } @@ -2421,12 +2430,12 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -2466,12 +2475,12 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, assert(buffer != nullptr); assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -3883,12 +3892,12 @@ pi_result cuda_piEnqueueMemBufferReadRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -3933,12 +3942,12 @@ pi_result cuda_piEnqueueMemBufferWriteRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); CUdeviceptr devPtr = buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { @@ -4026,13 +4035,13 @@ pi_result cuda_piEnqueueMemBufferCopyRect( assert(command_queue != nullptr); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); CUdeviceptr srcPtr = src_buffer->mem_.buffer_mem_.get(); CUdeviceptr dstPtr = dst_buffer->mem_.buffer_mem_.get(); std::unique_ptr<_pi_event> retImplEv{nullptr}; try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); if (event) { @@ -4257,10 +4266,10 @@ pi_result cuda_piEnqueueMemImageRead( assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); CUarray array = image->mem_.surface_mem_.get_array(); @@ -4326,10 +4335,10 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, assert(image->mem_type_ == _pi_mem::mem_type::surface); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); CUarray array = image->mem_.surface_mem_.get_array(); @@ -4387,10 +4396,10 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, dst_image->mem_.surface_mem_.get_image_type()); pi_result retErr = PI_SUCCESS; - CUstream cuStream = command_queue->get_transfer(); try { ScopedContext active(command_queue->get_context()); + CUstream cuStream = command_queue->get_transfer(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); CUarray srcArray = src_image->mem_.surface_mem_.get_array(); @@ -4677,12 +4686,12 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, pi_event *event) { assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get_compute(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; try { ScopedContext active(queue->get_context()); + CUstream cuStream = queue->get_compute(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { event_ptr = std::unique_ptr<_pi_event>( @@ -4711,12 +4720,12 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, assert(dst_ptr != nullptr); assert(src_ptr != nullptr); pi_result result = PI_SUCCESS; - CUstream cuStream = queue->get_transfer(); std::unique_ptr<_pi_event> event_ptr{nullptr}; try { ScopedContext active(queue->get_context()); + CUstream cuStream = queue->get_transfer(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { event_ptr = std::unique_ptr<_pi_event>( @@ -4760,12 +4769,12 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, return PI_INVALID_VALUE; assert(queue != nullptr); assert(ptr != nullptr); - CUstream cuStream = queue->get_transfer(); pi_result result = PI_SUCCESS; std::unique_ptr<_pi_event> event_ptr{nullptr}; try { ScopedContext active(queue->get_context()); + CUstream cuStream = queue->get_transfer(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { event_ptr = std::unique_ptr<_pi_event>( diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index f489fbac67da..066fd57079d6 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -389,11 +389,15 @@ struct _pi_queue { std::atomic_uint32_t eventCount_; std::atomic_uint32_t compute_stream_idx_; std::atomic_uint32_t transfer_stream_idx_; + std::atomic_uint32_t n_compute_streams_; + std::atomic_uint32_t n_transfer_streams_; + unsigned int flags_; _pi_queue(std::vector &&compute_streams_, std::vector &&transfer_streams, _pi_context *context, - _pi_device *device, pi_queue_properties properties) + _pi_device *device, pi_queue_properties properties, unsigned int flags) : compute_streams_{std::move(compute_streams_)}, transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, - properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0} { + properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, + n_compute_streams_{0}, n_transfer_streams_{0}, flags_(flags) { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -403,22 +407,17 @@ struct _pi_queue { cuda_piDeviceRelease(device_); } - native_type get_compute() noexcept { - return compute_streams_[compute_stream_idx_++ % compute_streams_.size()]; - }; - native_type get_transfer() noexcept { - if(!(properties_ & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){ - return get_compute(); - } - return transfer_streams_[transfer_stream_idx_++ % transfer_streams_.size()]; - }; + native_type get_compute() noexcept; + native_type get_transfer() noexcept; template void for_each_stream(T&& f){ - for(unsigned int i=0;i(compute_streams_.size()), n_compute_streams_.load()); + for(unsigned int i=0;i(transfer_streams_.size()), n_transfer_streams_.load()); + for(unsigned int i=0;i Date: Wed, 20 Apr 2022 08:16:57 +0100 Subject: [PATCH 08/17] add get function --- sycl/plugins/cuda/pi_cuda.cpp | 4 ++-- sycl/plugins/cuda/pi_cuda.hpp | 7 +++++-- sycl/unittests/pi/cuda/CMakeLists.txt | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 89194ad616f4..8ae503abd617 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -349,7 +349,7 @@ pi_result cuda_piEventRetain(pi_event event); /// \endcond -_pi_queue::native_type _pi_queue::get_compute() noexcept { +_pi_queue::native_type _pi_queue::get_compute() { if(n_compute_streams_ < compute_streams_.size()){ unsigned int idx = n_compute_streams_++; if(idx void for_each_stream(T&& f){ diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt index 6afd689af130..73cacaff50f4 100644 --- a/sycl/unittests/pi/cuda/CMakeLists.txt +++ b/sycl/unittests/pi/cuda/CMakeLists.txt @@ -28,4 +28,5 @@ target_include_directories(PiCudaTests target_link_libraries(PiCudaTests PRIVATE cudadrv + pi_cuda ) From 39efc6e398a996778ce0ff8db2ce04ae8470b535 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 26 Apr 2022 13:29:02 +0100 Subject: [PATCH 09/17] removed problematic test --- sycl/unittests/pi/cuda/CMakeLists.txt | 2 - sycl/unittests/pi/cuda/test_queue.cpp | 172 -------------------------- 2 files changed, 174 deletions(-) delete mode 100644 sycl/unittests/pi/cuda/test_queue.cpp diff --git a/sycl/unittests/pi/cuda/CMakeLists.txt b/sycl/unittests/pi/cuda/CMakeLists.txt index 73cacaff50f4..df1679c8a5eb 100644 --- a/sycl/unittests/pi/cuda/CMakeLists.txt +++ b/sycl/unittests/pi/cuda/CMakeLists.txt @@ -7,7 +7,6 @@ add_sycl_unittest(PiCudaTests OBJECT test_kernels.cpp test_mem_obj.cpp test_primary_context.cpp - test_queue.cpp test_sampler_properties.cpp ) @@ -28,5 +27,4 @@ target_include_directories(PiCudaTests target_link_libraries(PiCudaTests PRIVATE cudadrv - pi_cuda ) diff --git a/sycl/unittests/pi/cuda/test_queue.cpp b/sycl/unittests/pi/cuda/test_queue.cpp deleted file mode 100644 index f5c3a7b22173..000000000000 --- a/sycl/unittests/pi/cuda/test_queue.cpp +++ /dev/null @@ -1,172 +0,0 @@ -//==---- test_queue.cpp --- PI unit tests ----------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "gtest/gtest.h" - -#include - -#include "TestGetPlatforms.hpp" -#include "TestGetPlugin.hpp" -#include -#include -#include -#include -#include - -using namespace sycl; - -struct CudaTestQueue : public ::testing::TestWithParam { - -protected: - std::optional plugin = - pi::initializeAndGet(backend::ext_oneapi_cuda); - - pi_platform platform_; - pi_device device_; - pi_context context_; - - void SetUp() override { - // skip the tests if the CUDA backend is not available - if (!plugin.has_value()) { - GTEST_SKIP(); - } - - pi_uint32 numPlatforms = 0; - ASSERT_EQ(plugin->getBackend(), backend::ext_oneapi_cuda); - - ASSERT_EQ((plugin->call_nocheck( - 0, nullptr, &numPlatforms)), - PI_SUCCESS) - << "piPlatformsGet failed.\n"; - - ASSERT_EQ((plugin->call_nocheck( - numPlatforms, &platform_, nullptr)), - PI_SUCCESS) - << "piPlatformsGet failed.\n"; - - ASSERT_EQ((plugin->call_nocheck( - platform_, PI_DEVICE_TYPE_GPU, 1, &device_, nullptr)), - PI_SUCCESS); - ASSERT_EQ((plugin->call_nocheck( - nullptr, 1, &device_, nullptr, nullptr, &context_)), - PI_SUCCESS); - EXPECT_NE(context_, nullptr); - } - - void TearDown() override { - if (plugin.has_value()) { - plugin->call(device_); - plugin->call(context_); - } - } - - CudaTestQueue() = default; - - ~CudaTestQueue() = default; -}; - -TEST_F(CudaTestQueue, PICreateQueueSimple) { - pi_queue queue; - ASSERT_EQ((plugin->call_nocheck( - context_, device_, 0, &queue)), - PI_SUCCESS); - ASSERT_NE(queue, nullptr); - EXPECT_EQ(queue->get_context(), context_); - - unsigned int flags = 0; - CUstream stream = queue->get(); - cuStreamGetFlags(stream, &flags); - ASSERT_EQ(flags, CU_STREAM_NON_BLOCKING); - - ASSERT_EQ((plugin->call_nocheck(queue)), - PI_SUCCESS); -} - -TEST_F(CudaTestQueue, PIQueueFinishSimple) { - pi_queue queue; - ASSERT_EQ((plugin->call_nocheck( - context_, device_, 0, &queue)), - PI_SUCCESS); - ASSERT_NE(queue, nullptr); - - // todo: post work on queue, ensure the results are valid and the work is - // complete after piQueueFinish? - - ASSERT_EQ((plugin->call_nocheck(queue)), - PI_SUCCESS); - - ASSERT_EQ(cuStreamQuery(queue->get()), CUDA_SUCCESS); - - ASSERT_EQ((plugin->call_nocheck(queue)), - PI_SUCCESS); -} - -TEST_F(CudaTestQueue, PICreateQueueSimpleDefault) { - pi_queue queue; - ASSERT_EQ((plugin->call_nocheck( - context_, device_, __SYCL_PI_CUDA_USE_DEFAULT_STREAM, &queue)), - PI_SUCCESS); - ASSERT_NE(queue, nullptr); - EXPECT_EQ(queue->get_context(), context_); - - unsigned int flags = 0; - CUstream stream = queue->get(); - cuStreamGetFlags(stream, &flags); - ASSERT_EQ(flags, CU_STREAM_DEFAULT); - - ASSERT_EQ((plugin->call_nocheck(queue)), - PI_SUCCESS); -} - -TEST_F(CudaTestQueue, PICreateQueueSyncWithDefault) { - pi_queue queue; - ASSERT_EQ((plugin->call_nocheck( - context_, device_, __SYCL_PI_CUDA_SYNC_WITH_DEFAULT, &queue)), - PI_SUCCESS); - ASSERT_NE(queue, nullptr); - EXPECT_EQ(queue->get_context(), context_); - - unsigned int flags = 0; - CUstream stream = queue->get(); - cuStreamGetFlags(stream, &flags); - ASSERT_NE(flags, CU_STREAM_NON_BLOCKING); - - ASSERT_EQ((plugin->call_nocheck(queue)), - PI_SUCCESS); -} - -TEST_F(CudaTestQueue, PICreateQueueInterop) { - pi_queue queue; - ASSERT_EQ((plugin->call_nocheck( - context_, device_, 0, &queue)), - PI_SUCCESS); - ASSERT_NE(queue, nullptr); - EXPECT_EQ(queue->get_context(), context_); - - CUstream cuStream = queue->get(); - - CUcontext cuCtx; - CUresult res = cuStreamGetCtx(cuStream, &cuCtx); - ASSERT_EQ(res, CUDA_SUCCESS); - EXPECT_EQ(cuCtx, context_->get()); - - ASSERT_EQ((plugin->call_nocheck(queue)), - PI_SUCCESS); -} - -TEST_P(CudaTestQueue, SYCLQueueDefaultStream) { - std::vector CudaDevices = GetParam().get_devices(); - auto deviceA_ = CudaDevices[0]; - queue Queue(deviceA_, async_handler{}, - {ext::oneapi::cuda::property::queue::use_default_stream{}}); - - CUstream CudaStream = get_native(Queue); - unsigned int flags; - cuStreamGetFlags(CudaStream, &flags); - ASSERT_EQ(flags, CU_STREAM_DEFAULT); -} From abc3dcf8c347302d67e933e4a8a68d14b1305a20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 26 Apr 2022 13:30:47 +0100 Subject: [PATCH 10/17] addressed review comments --- sycl/plugins/cuda/pi_cuda.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 8ae503abd617..50bfcdf3763c 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -360,7 +360,7 @@ _pi_queue::native_type _pi_queue::get_compute() { } _pi_queue::native_type _pi_queue::get_transfer() { - if(!(properties_ & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){ + if(transfer_streams_.empty()){ // for example in in-order queue return get_compute(); } if(n_transfer_streams_ < transfer_streams_.size()){ @@ -2263,8 +2263,6 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, return PI_INVALID_DEVICE; } - //ScopedContext active(context); - unsigned int flags = 0; if (properties == __SYCL_PI_CUDA_USE_DEFAULT_STREAM) { flags = CU_STREAM_DEFAULT; From 7180531bd5902847b4f3796acf2e1cef8260e9d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 26 Apr 2022 13:33:30 +0100 Subject: [PATCH 11/17] format --- sycl/plugins/cuda/pi_cuda.cpp | 124 +++++++++++++++++++--------------- sycl/plugins/cuda/pi_cuda.hpp | 32 +++++---- 2 files changed, 88 insertions(+), 68 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 509cc6500221..744ed4e2616b 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -271,17 +271,17 @@ void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size, } pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, - pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list) { + pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list) { try { ScopedContext active(command_queue->get_context()); if (event_wait_list) { - auto result = - forLatestEvents(event_wait_list, num_events_in_wait_list, - [stream](pi_event event) -> pi_result { - return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); - }); + auto result = forLatestEvents( + event_wait_list, num_events_in_wait_list, + [stream](pi_event event) -> pi_result { + return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + }); if (result != PI_SUCCESS) { return result; @@ -348,11 +348,10 @@ pi_result cuda_piEventRetain(pi_event event); /// \endcond - _pi_queue::native_type _pi_queue::get_compute() { - if(n_compute_streams_ < compute_streams_.size()){ + if (n_compute_streams_ < compute_streams_.size()) { unsigned int idx = n_compute_streams_++; - if(idxfor_each_stream([e=event->get()](CUstream s){ + queue->for_each_stream([e = event->get()](CUstream s) { PI_CHECK_ERROR(cuStreamWaitEvent(s, e, 0)); }); return PI_SUCCESS; @@ -2262,14 +2262,14 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } - const bool is_ooo = - properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + const bool is_ooo = properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; std::vector computeCuStreams(is_ooo ? 128 : 1); std::vector transferCuStreams(is_ooo ? 64 : 0); queueImpl = std::unique_ptr<_pi_queue>( - new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), context, device, properties, flags}); + new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), + context, device, properties, flags}); *queue = queueImpl.release(); @@ -2328,8 +2328,8 @@ pi_result cuda_piQueueRelease(pi_queue command_queue) { std::unique_ptr<_pi_queue> queueImpl(command_queue); ScopedContext active(command_queue->get_context()); - - command_queue->for_each_stream([](CUstream s){ + + command_queue->for_each_stream([](CUstream s) { PI_CHECK_ERROR(cuStreamSynchronize(s)); PI_CHECK_ERROR(cuStreamDestroy(s)); }); @@ -2425,7 +2425,8 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -2469,8 +2470,9 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -2795,7 +2797,8 @@ pi_result cuda_piEnqueueKernelLaunch( CUstream cuStream = command_queue->get_compute(); CUfunction cuFunc = kernel->get(); - retError = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retError = enqueueEventsWait(command_queue, cuStream, + num_events_in_wait_list, event_wait_list); // Set the implicit global offset parameter if kernel has offset variant if (kernel->get_with_offset_parameter()) { @@ -3630,7 +3633,8 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, } if (event) { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, command_queue->get_compute()); + *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, + command_queue->get_compute()); (*event)->start(); (*event)->record(); } @@ -3886,8 +3890,9 @@ pi_result cuda_piEnqueueMemBufferReadRect( try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -3936,7 +3941,8 @@ pi_result cuda_piEnqueueMemBufferWriteRect( try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -3984,7 +3990,8 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_result result; auto stream = command_queue->get_transfer(); - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); + result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -4030,7 +4037,8 @@ pi_result cuda_piEnqueueMemBufferCopyRect( try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -4084,8 +4092,8 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, auto stream = command_queue->get_transfer(); pi_result result; - result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); - + result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, + event_wait_list); if (event) { retImplEv = std::unique_ptr<_pi_event>(_pi_event::make_native( @@ -4258,7 +4266,8 @@ pi_result cuda_piEnqueueMemImageRead( try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); CUarray array = image->mem_.surface_mem_.get_array(); @@ -4288,8 +4297,8 @@ pi_result cuda_piEnqueueMemImageRead( } if (event) { - auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, command_queue, cuStream); + auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_READ, + command_queue, cuStream); new_event->record(); *event = new_event; } @@ -4327,7 +4336,8 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); CUarray array = image->mem_.surface_mem_.get_array(); @@ -4357,8 +4367,8 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, } if (event) { - auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, command_queue, cuStream); + auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_WRITE, + command_queue, cuStream); new_event->record(); *event = new_event; } @@ -4388,7 +4398,8 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, try { ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get_transfer(); - retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); + retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, + event_wait_list); CUarray srcArray = src_image->mem_.surface_mem_.get_array(); CUarray dstArray = dst_image->mem_.surface_mem_.get_array(); @@ -4428,8 +4439,8 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, } if (event) { - auto new_event = - _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, command_queue, cuStream); + auto new_event = _pi_event::make_native(PI_COMMAND_TYPE_IMAGE_COPY, + command_queue, cuStream); new_event->record(); *event = new_event; } @@ -4500,7 +4511,8 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, - command_queue, command_queue->get_transfer()); + command_queue, + command_queue->get_transfer()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4554,7 +4566,8 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, if (event) { try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - command_queue, command_queue->get_transfer()); + command_queue, + command_queue->get_transfer()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4680,10 +4693,11 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, try { ScopedContext active(queue->get_context()); CUstream cuStream = queue->get_compute(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); + result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, + events_waitlist); if (event) { - event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream)); + event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_FILL, queue, cuStream)); event_ptr->start(); } result = PI_CHECK_ERROR(cuMemsetD8Async( @@ -4714,10 +4728,11 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, try { ScopedContext active(queue->get_context()); CUstream cuStream = queue->get_transfer(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); + result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, + events_waitlist); if (event) { - event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); + event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); event_ptr->start(); } result = PI_CHECK_ERROR(cuMemcpyAsync( @@ -4763,10 +4778,11 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, try { ScopedContext active(queue->get_context()); CUstream cuStream = queue->get_transfer(); - result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); + result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, + events_waitlist); if (event) { - event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); + event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_COPY, queue, cuStream)); event_ptr->start(); } result = PI_CHECK_ERROR(cuMemPrefetchAsync( @@ -4796,8 +4812,8 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, ScopedContext active(queue->get_context()); if (event) { - event_ptr = std::unique_ptr<_pi_event>( - _pi_event::make_native(PI_COMMAND_TYPE_USER, queue, queue->get_transfer())); + event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( + PI_COMMAND_TYPE_USER, queue, queue->get_transfer())); event_ptr->start(); } diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index d1bdfc2ffc66..132dcba7df13 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -393,11 +393,15 @@ struct _pi_queue { std::atomic_uint32_t n_transfer_streams_; unsigned int flags_; - _pi_queue(std::vector &&compute_streams_, std::vector &&transfer_streams, _pi_context *context, - _pi_device *device, pi_queue_properties properties, unsigned int flags) - : compute_streams_{std::move(compute_streams_)}, transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, - properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, - n_compute_streams_{0}, n_transfer_streams_{0}, flags_(flags) { + _pi_queue(std::vector &&compute_streams_, + std::vector &&transfer_streams, _pi_context *context, + _pi_device *device, pi_queue_properties properties, + unsigned int flags) + : compute_streams_{std::move(compute_streams_)}, + transfer_streams_{std::move(transfer_streams)}, context_{context}, + device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, + compute_stream_idx_{0}, transfer_stream_idx_{0}, n_compute_streams_{0}, + n_transfer_streams_{0}, flags_(flags) { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -409,18 +413,18 @@ struct _pi_queue { native_type get_compute(); native_type get_transfer(); - native_type get() { - return get_compute(); - }; + native_type get() { return get_compute(); }; - template - void for_each_stream(T&& f){ - unsigned int end = std::min(static_cast(compute_streams_.size()), n_compute_streams_.load()); - for(unsigned int i=0;i void for_each_stream(T &&f) { + unsigned int end = + std::min(static_cast(compute_streams_.size()), + n_compute_streams_.load()); + for (unsigned int i = 0; i < end; i++) { f(compute_streams_[i]); } - end = std::min(static_cast(transfer_streams_.size()), n_transfer_streams_.load()); - for(unsigned int i=0;i(transfer_streams_.size()), + n_transfer_streams_.load()); + for (unsigned int i = 0; i < end; i++) { f(transfer_streams_[i]); } } From bc4ef9ce809189d51b9a9b93fa2d4ab81bb3bbb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 5 May 2022 09:22:03 +0100 Subject: [PATCH 12/17] minor refactor --- sycl/plugins/cuda/pi_cuda.cpp | 14 +++++++------- sycl/plugins/cuda/pi_cuda.hpp | 14 ++++++++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 744ed4e2616b..da0ad87e20cd 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -349,8 +349,8 @@ pi_result cuda_piEventRetain(pi_event event); /// \endcond _pi_queue::native_type _pi_queue::get_compute() { - if (n_compute_streams_ < compute_streams_.size()) { - unsigned int idx = n_compute_streams_++; + if (num_compute_streams_ < compute_streams_.size()) { + unsigned int idx = num_compute_streams_++; if (idx < compute_streams_.size()) { PI_CHECK_ERROR(cuStreamCreate(&compute_streams_[idx], flags_)); } @@ -362,8 +362,8 @@ _pi_queue::native_type _pi_queue::get_transfer() { if (transfer_streams_.empty()) { // for example in in-order queue return get_compute(); } - if (n_transfer_streams_ < transfer_streams_.size()) { - unsigned int idx = n_transfer_streams_++; + if (num_transfer_streams_ < transfer_streams_.size()) { + unsigned int idx = num_transfer_streams_++; if (idx < transfer_streams_.size()) { PI_CHECK_ERROR(cuStreamCreate(&transfer_streams_[idx], flags_)); } @@ -2262,10 +2262,10 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } - const bool is_ooo = properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + const bool is_out_of_order= properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; - std::vector computeCuStreams(is_ooo ? 128 : 1); - std::vector transferCuStreams(is_ooo ? 64 : 0); + std::vector computeCuStreams(is_out_of_order? _pi_queue::default_num_compute_streams : 1); + std::vector transferCuStreams(is_out_of_order? _pi_queue::default_num_transfer_streams : 0); queueImpl = std::unique_ptr<_pi_queue>( new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 132dcba7df13..e5a8f5926bbf 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -379,6 +379,8 @@ struct _pi_mem { /// struct _pi_queue { using native_type = CUstream; + static constexpr int default_num_compute_streams = 128; + static constexpr int default_num_transfer_streams = 64; std::vector compute_streams_; std::vector transfer_streams_; @@ -389,8 +391,8 @@ struct _pi_queue { std::atomic_uint32_t eventCount_; std::atomic_uint32_t compute_stream_idx_; std::atomic_uint32_t transfer_stream_idx_; - std::atomic_uint32_t n_compute_streams_; - std::atomic_uint32_t n_transfer_streams_; + std::atomic_uint32_t num_compute_streams_; + std::atomic_uint32_t num_transfer_streams_; unsigned int flags_; _pi_queue(std::vector &&compute_streams_, @@ -400,8 +402,8 @@ struct _pi_queue { : compute_streams_{std::move(compute_streams_)}, transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, - compute_stream_idx_{0}, transfer_stream_idx_{0}, n_compute_streams_{0}, - n_transfer_streams_{0}, flags_(flags) { + compute_stream_idx_{0}, transfer_stream_idx_{0}, num_compute_streams_{0}, + num_transfer_streams_{0}, flags_(flags) { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -418,12 +420,12 @@ struct _pi_queue { template void for_each_stream(T &&f) { unsigned int end = std::min(static_cast(compute_streams_.size()), - n_compute_streams_.load()); + num_compute_streams_.load()); for (unsigned int i = 0; i < end; i++) { f(compute_streams_[i]); } end = std::min(static_cast(transfer_streams_.size()), - n_transfer_streams_.load()); + num_transfer_streams_.load()); for (unsigned int i = 0; i < end; i++) { f(transfer_streams_[i]); } From c4ccce88a32b7025153d39a4e4c28c39a69dc948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 5 May 2022 12:08:09 +0100 Subject: [PATCH 13/17] format --- sycl/plugins/cuda/pi_cuda.cpp | 9 ++++++--- sycl/plugins/cuda/pi_cuda.hpp | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 45cb920d69bb..f17fb08ef951 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2280,10 +2280,13 @@ pi_result cuda_piQueueCreate(pi_context context, pi_device device, flags = CU_STREAM_NON_BLOCKING; } - const bool is_out_of_order= properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + const bool is_out_of_order = + properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; - std::vector computeCuStreams(is_out_of_order? _pi_queue::default_num_compute_streams : 1); - std::vector transferCuStreams(is_out_of_order? _pi_queue::default_num_transfer_streams : 0); + std::vector computeCuStreams( + is_out_of_order ? _pi_queue::default_num_compute_streams : 1); + std::vector transferCuStreams( + is_out_of_order ? _pi_queue::default_num_transfer_streams : 0); queueImpl = std::unique_ptr<_pi_queue>( new _pi_queue{std::move(computeCuStreams), std::move(transferCuStreams), diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index e5a8f5926bbf..d203f7c60d55 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -402,8 +402,8 @@ struct _pi_queue { : compute_streams_{std::move(compute_streams_)}, transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, - compute_stream_idx_{0}, transfer_stream_idx_{0}, num_compute_streams_{0}, - num_transfer_streams_{0}, flags_(flags) { + compute_stream_idx_{0}, transfer_stream_idx_{0}, + num_compute_streams_{0}, num_transfer_streams_{0}, flags_(flags) { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } From 9731cd9378280305a129f095e623be29228abb84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 5 May 2022 14:15:31 +0100 Subject: [PATCH 14/17] addressed most comments --- sycl/plugins/cuda/pi_cuda.cpp | 65 ++++++++++++++++++----------------- sycl/plugins/cuda/pi_cuda.hpp | 7 ++-- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index f17fb08ef951..c70d882b9014 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -291,19 +291,20 @@ void guessLocalWorkSize(size_t *threadsPerBlock, const size_t *global_work_size, pi_result enqueueEventsWait(pi_queue command_queue, CUstream stream, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list) { + if (!event_wait_list) { + return PI_SUCCESS; + } try { ScopedContext active(command_queue->get_context()); - if (event_wait_list) { - auto result = forLatestEvents( - event_wait_list, num_events_in_wait_list, - [stream](pi_event event) -> pi_result { - return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); - }); + auto result = forLatestEvents( + event_wait_list, num_events_in_wait_list, + [stream](pi_event event) -> pi_result { + return PI_CHECK_ERROR(cuStreamWaitEvent(stream, event->get(), 0)); + }); - if (result != PI_SUCCESS) { - return result; - } + if (result != PI_SUCCESS) { + return result; } return PI_SUCCESS; } catch (pi_result err) { @@ -366,7 +367,7 @@ pi_result cuda_piEventRetain(pi_event event); /// \endcond -_pi_queue::native_type _pi_queue::get_compute() { +CUstream _pi_queue::get_next_compute_stream() { if (num_compute_streams_ < compute_streams_.size()) { unsigned int idx = num_compute_streams_++; if (idx < compute_streams_.size()) { @@ -376,9 +377,9 @@ _pi_queue::native_type _pi_queue::get_compute() { return compute_streams_[compute_stream_idx_++ % compute_streams_.size()]; } -_pi_queue::native_type _pi_queue::get_transfer() { +CUstream _pi_queue::get_next_transfer_stream() { if (transfer_streams_.empty()) { // for example in in-order queue - return get_compute(); + return get_next_compute_stream(); } if (num_transfer_streams_ < transfer_streams_.size()) { unsigned int idx = num_transfer_streams_++; @@ -2372,7 +2373,7 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code ScopedContext active(command_queue->get_context()); - command_queue->for_each_stream([&result](CUstream s) mutable { + command_queue->for_each_stream([&result](CUstream s) { result = PI_CHECK_ERROR(cuStreamSynchronize(s)); }); @@ -2405,7 +2406,7 @@ pi_result cuda_piQueueFlush(pi_queue command_queue) { pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle) { ScopedContext active(queue->get_context()); - *nativeHandle = reinterpret_cast(queue->get_compute()); + *nativeHandle = reinterpret_cast(queue->get_next_compute_stream()); return PI_SUCCESS; } @@ -2444,7 +2445,7 @@ pi_result cuda_piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -2490,7 +2491,7 @@ pi_result cuda_piEnqueueMemBufferRead(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -2815,7 +2816,7 @@ pi_result cuda_piEnqueueKernelLaunch( std::unique_ptr<_pi_event> retImplEv{nullptr}; - CUstream cuStream = command_queue->get_compute(); + CUstream cuStream = command_queue->get_next_compute_stream(); CUfunction cuFunc = kernel->get(); retError = enqueueEventsWait(command_queue, cuStream, @@ -3655,7 +3656,7 @@ pi_result cuda_piEnqueueEventsWaitWithBarrier(pi_queue command_queue, if (event) { *event = _pi_event::make_native(PI_COMMAND_TYPE_MARKER, command_queue, - command_queue->get_compute()); + command_queue->get_next_compute_stream()); (*event)->start(); (*event)->record(); } @@ -3910,7 +3911,7 @@ pi_result cuda_piEnqueueMemBufferReadRect( try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -3961,7 +3962,7 @@ pi_result cuda_piEnqueueMemBufferWriteRect( try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4010,7 +4011,7 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, ScopedContext active(command_queue->get_context()); pi_result result; - auto stream = command_queue->get_transfer(); + auto stream = command_queue->get_next_transfer_stream(); result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); @@ -4057,7 +4058,7 @@ pi_result cuda_piEnqueueMemBufferCopyRect( try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4111,7 +4112,7 @@ pi_result cuda_piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, try { ScopedContext active(command_queue->get_context()); - auto stream = command_queue->get_transfer(); + auto stream = command_queue->get_next_transfer_stream(); pi_result result; result = enqueueEventsWait(command_queue, stream, num_events_in_wait_list, event_wait_list); @@ -4286,7 +4287,7 @@ pi_result cuda_piEnqueueMemImageRead( try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4356,7 +4357,7 @@ cuda_piEnqueueMemImageWrite(pi_queue command_queue, pi_mem image, try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4418,7 +4419,7 @@ pi_result cuda_piEnqueueMemImageCopy(pi_queue command_queue, pi_mem src_image, try { ScopedContext active(command_queue->get_context()); - CUstream cuStream = command_queue->get_transfer(); + CUstream cuStream = command_queue->get_next_transfer_stream(); retErr = enqueueEventsWait(command_queue, cuStream, num_events_in_wait_list, event_wait_list); @@ -4533,7 +4534,7 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue, - command_queue->get_transfer()); + command_queue->get_next_transfer_stream()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4588,7 +4589,7 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, try { *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue, - command_queue->get_transfer()); + command_queue->get_next_transfer_stream()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4713,7 +4714,7 @@ pi_result cuda_piextUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value, try { ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_compute(); + CUstream cuStream = queue->get_next_compute_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { @@ -4748,7 +4749,7 @@ pi_result cuda_piextUSMEnqueueMemcpy(pi_queue queue, pi_bool blocking, try { ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_transfer(); + CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { @@ -4805,7 +4806,7 @@ pi_result cuda_piextUSMEnqueuePrefetch(pi_queue queue, const void *ptr, try { ScopedContext active(queue->get_context()); - CUstream cuStream = queue->get_transfer(); + CUstream cuStream = queue->get_next_transfer_stream(); result = enqueueEventsWait(queue, cuStream, num_events_in_waitlist, events_waitlist); if (event) { @@ -4841,7 +4842,7 @@ pi_result cuda_piextUSMEnqueueMemAdvise(pi_queue queue, const void *ptr, if (event) { event_ptr = std::unique_ptr<_pi_event>(_pi_event::make_native( - PI_COMMAND_TYPE_USER, queue, queue->get_transfer())); + PI_COMMAND_TYPE_USER, queue, queue->get_next_transfer_stream())); event_ptr->start(); } diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index d203f7c60d55..7cf31146efe2 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -413,9 +413,10 @@ struct _pi_queue { cuda_piDeviceRelease(device_); } - native_type get_compute(); - native_type get_transfer(); - native_type get() { return get_compute(); }; + // get_next_compute/transfer_stream() functions return streams from appropriate pools in round-robin fashion + native_type get_next_compute_stream(); + native_type get_next_transfer_stream(); + native_type get() { return get_next_compute_stream(); }; template void for_each_stream(T &&f) { unsigned int end = From c21a58cc8bcf26d49569293e7f31118a1c51e7e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 6 May 2022 10:48:52 +0100 Subject: [PATCH 15/17] resolved race condition --- sycl/plugins/cuda/pi_cuda.cpp | 2 ++ sycl/plugins/cuda/pi_cuda.hpp | 35 ++++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index c70d882b9014..5453e1f25a8a 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -369,6 +369,7 @@ pi_result cuda_piEventRetain(pi_event event); CUstream _pi_queue::get_next_compute_stream() { if (num_compute_streams_ < compute_streams_.size()) { + std::lock_guard guard(compute_stream_mutex_); unsigned int idx = num_compute_streams_++; if (idx < compute_streams_.size()) { PI_CHECK_ERROR(cuStreamCreate(&compute_streams_[idx], flags_)); @@ -382,6 +383,7 @@ CUstream _pi_queue::get_next_transfer_stream() { return get_next_compute_stream(); } if (num_transfer_streams_ < transfer_streams_.size()) { + std::lock_guard guard(transfer_stream_mutex_); unsigned int idx = num_transfer_streams_++; if (idx < transfer_streams_.size()) { PI_CHECK_ERROR(cuStreamCreate(&transfer_streams_[idx], flags_)); diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 7cf31146efe2..56ad0fdabf64 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -391,15 +391,17 @@ struct _pi_queue { std::atomic_uint32_t eventCount_; std::atomic_uint32_t compute_stream_idx_; std::atomic_uint32_t transfer_stream_idx_; - std::atomic_uint32_t num_compute_streams_; - std::atomic_uint32_t num_transfer_streams_; + unsigned int num_compute_streams_; + unsigned int num_transfer_streams_; unsigned int flags_; + std::mutex compute_stream_mutex_; + std::mutex transfer_stream_mutex_; - _pi_queue(std::vector &&compute_streams_, + _pi_queue(std::vector &&compute_streams, std::vector &&transfer_streams, _pi_context *context, _pi_device *device, pi_queue_properties properties, unsigned int flags) - : compute_streams_{std::move(compute_streams_)}, + : compute_streams_{std::move(compute_streams)}, transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, @@ -419,16 +421,23 @@ struct _pi_queue { native_type get() { return get_next_compute_stream(); }; template void for_each_stream(T &&f) { - unsigned int end = - std::min(static_cast(compute_streams_.size()), - num_compute_streams_.load()); - for (unsigned int i = 0; i < end; i++) { - f(compute_streams_[i]); + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int end = + std::min(static_cast(compute_streams_.size()), + num_compute_streams_); + for (unsigned int i = 0; i < end; i++) { + f(compute_streams_[i]); + } } - end = std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_.load()); - for (unsigned int i = 0; i < end; i++) { - f(transfer_streams_[i]); + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int end = + std::min(static_cast(transfer_streams_.size()), + num_transfer_streams_); + for (unsigned int i = 0; i < end; i++) { + f(transfer_streams_[i]); + } } } From c6b4a863d8646fadfc0b4d2b3d92c428af251466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 6 May 2022 10:51:42 +0100 Subject: [PATCH 16/17] format --- sycl/plugins/cuda/pi_cuda.cpp | 15 ++++++++------- sycl/plugins/cuda/pi_cuda.hpp | 9 +++++---- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 5453e1f25a8a..9a11e9d76c5e 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2408,7 +2408,8 @@ pi_result cuda_piQueueFlush(pi_queue command_queue) { pi_result cuda_piextQueueGetNativeHandle(pi_queue queue, pi_native_handle *nativeHandle) { ScopedContext active(queue->get_context()); - *nativeHandle = reinterpret_cast(queue->get_next_compute_stream()); + *nativeHandle = + reinterpret_cast(queue->get_next_compute_stream()); return PI_SUCCESS; } @@ -4534,9 +4535,9 @@ pi_result cuda_piEnqueueMemBufferMap(pi_queue command_queue, pi_mem buffer, if (event) { try { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_MAP, - command_queue, - command_queue->get_next_transfer_stream()); + *event = _pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_MAP, command_queue, + command_queue->get_next_transfer_stream()); (*event)->start(); (*event)->record(); } catch (pi_result error) { @@ -4589,9 +4590,9 @@ pi_result cuda_piEnqueueMemUnmap(pi_queue command_queue, pi_mem memobj, if (event) { try { - *event = _pi_event::make_native(PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, - command_queue, - command_queue->get_next_transfer_stream()); + *event = _pi_event::make_native( + PI_COMMAND_TYPE_MEM_BUFFER_UNMAP, command_queue, + command_queue->get_next_transfer_stream()); (*event)->start(); (*event)->record(); } catch (pi_result error) { diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 56ad0fdabf64..108852ea5c08 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -415,7 +415,8 @@ struct _pi_queue { cuda_piDeviceRelease(device_); } - // get_next_compute/transfer_stream() functions return streams from appropriate pools in round-robin fashion + // get_next_compute/transfer_stream() functions return streams from + // appropriate pools in round-robin fashion native_type get_next_compute_stream(); native_type get_next_transfer_stream(); native_type get() { return get_next_compute_stream(); }; @@ -425,16 +426,16 @@ struct _pi_queue { std::lock_guard compute_guard(compute_stream_mutex_); unsigned int end = std::min(static_cast(compute_streams_.size()), - num_compute_streams_); + num_compute_streams_); for (unsigned int i = 0; i < end; i++) { f(compute_streams_[i]); } } { std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int end = + unsigned int end = std::min(static_cast(transfer_streams_.size()), - num_transfer_streams_); + num_transfer_streams_); for (unsigned int i = 0; i < end; i++) { f(transfer_streams_[i]); } From 55adb7c15a302a62aad9276a812932587a8e78b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Tue, 10 May 2022 08:30:50 +0100 Subject: [PATCH 17/17] addressed review comment --- sycl/plugins/cuda/pi_cuda.cpp | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 9a11e9d76c5e..0c962048262b 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -369,10 +369,13 @@ pi_result cuda_piEventRetain(pi_event event); CUstream _pi_queue::get_next_compute_stream() { if (num_compute_streams_ < compute_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time std::lock_guard guard(compute_stream_mutex_); - unsigned int idx = num_compute_streams_++; - if (idx < compute_streams_.size()) { - PI_CHECK_ERROR(cuStreamCreate(&compute_streams_[idx], flags_)); + // The second check is done after mutex is locked so other threads can not + // change num_compute_streams_ after that + if (num_compute_streams_ < compute_streams_.size()) { + PI_CHECK_ERROR( + cuStreamCreate(&compute_streams_[num_compute_streams_++], flags_)); } } return compute_streams_[compute_stream_idx_++ % compute_streams_.size()]; @@ -383,10 +386,13 @@ CUstream _pi_queue::get_next_transfer_stream() { return get_next_compute_stream(); } if (num_transfer_streams_ < transfer_streams_.size()) { + // the check above is for performance - so as not to lock mutex every time std::lock_guard guard(transfer_stream_mutex_); - unsigned int idx = num_transfer_streams_++; - if (idx < transfer_streams_.size()) { - PI_CHECK_ERROR(cuStreamCreate(&transfer_streams_[idx], flags_)); + // The second check is done after mutex is locked so other threads can not + // change num_transfer_streams_ after that + if (num_transfer_streams_ < transfer_streams_.size()) { + PI_CHECK_ERROR( + cuStreamCreate(&transfer_streams_[num_transfer_streams_++], flags_)); } } return transfer_streams_[transfer_stream_idx_++ % transfer_streams_.size()];