From 0cbc7af1409f86d3b2474bcbfcf804e9b7720ff6 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Mon, 20 Sep 2021 17:09:25 +0100 Subject: [PATCH 1/2] [SYCL][CUDA] Fix context scope in kernel launch The `guessLocalWorkSize` function uses the CUDA API so it needs an active context, and there was no active `ScopedContext` when it was called which may cause issue. This fixes #2777 --- sycl/plugins/cuda/pi_cuda.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index d750969239995..f069c7a8ba6b6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2578,6 +2578,8 @@ pi_result cuda_piEnqueueKernelLaunch( bool providedLocalWorkGroupSize = (local_work_size != nullptr); pi_uint32 local_size = kernel->get_local_size(); + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext active(command_queue->get_context()); { size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); @@ -2631,7 +2633,6 @@ pi_result cuda_piEnqueueKernelLaunch( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get(); CUfunction cuFunc = kernel->get(); From 55f2f4fc5e17a1f626df2a350da0cdead1a08244 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 21 Sep 2021 10:06:42 +0100 Subject: [PATCH 2/2] [SYCL][CUDA] Fix try/catch block --- sycl/plugins/cuda/pi_cuda.cpp | 96 +++++++++++++++++------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index f069c7a8ba6b6..4c3aee067e483 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2577,62 +2577,62 @@ pi_result cuda_piEnqueueKernelLaunch( size_t maxThreadsPerBlock[3] = {}; bool providedLocalWorkGroupSize = (local_work_size != nullptr); 
pi_uint32 local_size = kernel->get_local_size(); + pi_result retError = PI_SUCCESS; - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); - { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; - maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); - command_queue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - local_work_size[dim] != reqdThreadsPerBlock[dim]) - return PI_INVALID_WORK_GROUP_SIZE; - - if (local_work_size[dim] > maxThreadsPerBlock[dim]) - return PI_INVALID_WORK_ITEM_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than the - // global work sizes and not 0. - if (0u == local_work_size[dim]) - return PI_INVALID_WORK_GROUP_SIZE; - if (0u != (global_work_size[dim] % local_work_size[dim])) - return PI_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = static_cast<size_t>(local_work_size[dim]); - return PI_SUCCESS; - }; - - for (size_t dim = 0; dim < work_dim; dim++) { - auto err = isValid(dim); - if (err != PI_SUCCESS) - return err; + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext active(command_queue->get_context()); + { + size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; + maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); + command_queue->device_->get_max_work_item_sizes( + sizeof(maxThreadsPerBlock), maxThreadsPerBlock); + + if (providedLocalWorkGroupSize) { + auto isValid = [&](int dim) { + if (reqdThreadsPerBlock[dim] != 0 && + local_work_size[dim] != reqdThreadsPerBlock[dim]) + return PI_INVALID_WORK_GROUP_SIZE; + + if (local_work_size[dim] > maxThreadsPerBlock[dim]) + return PI_INVALID_WORK_ITEM_SIZE; + // Checks that local work 
sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == local_work_size[dim]) + return PI_INVALID_WORK_GROUP_SIZE; + if (0u != (global_work_size[dim] % local_work_size[dim])) + return PI_INVALID_WORK_GROUP_SIZE; + threadsPerBlock[dim] = static_cast<size_t>(local_work_size[dim]); + return PI_SUCCESS; + }; + + for (size_t dim = 0; dim < work_dim; dim++) { + auto err = isValid(dim); + if (err != PI_SUCCESS) + return err; + } + } else { + guessLocalWorkSize(threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel, local_size); } - } else { - guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, - kernel, local_size); } - } - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_INVALID_WORK_GROUP_SIZE; - } + if (maxWorkGroupSize < + size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + return PI_INVALID_WORK_GROUP_SIZE; + } - int blocksPerGrid[3] = {1, 1, 1}; + int blocksPerGrid[3] = {1, 1, 1}; - for (size_t i = 0; i < work_dim; i++) { - blocksPerGrid[i] = - static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) / - threadsPerBlock[i]; - } + for (size_t i = 0; i < work_dim; i++) { + blocksPerGrid[i] = + static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) / + threadsPerBlock[i]; + } - pi_result retError = PI_SUCCESS; - std::unique_ptr<_pi_event> retImplEv{nullptr}; + std::unique_ptr<_pi_event> retImplEv{nullptr}; - try { CUstream cuStream = command_queue->get(); CUfunction cuFunc = kernel->get();