From 0cbc7af1409f86d3b2474bcbfcf804e9b7720ff6 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Mon, 20 Sep 2021 17:09:25 +0100 Subject: [PATCH 1/2] [SYCL][CUDA] Fix context scope in kernel launch The `guessLocalWorkSize` function uses the CUDA API so it needs an active context, and there was no active `ScopedContext` when it was called which may cause issue. This fixes #2777 --- sycl/plugins/cuda/pi_cuda.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index d750969239995..f069c7a8ba6b6 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2578,6 +2578,8 @@ pi_result cuda_piEnqueueKernelLaunch( bool providedLocalWorkGroupSize = (local_work_size != nullptr); pi_uint32 local_size = kernel->get_local_size(); + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext active(command_queue->get_context()); { size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); @@ -2631,7 +2633,6 @@ pi_result cuda_piEnqueueKernelLaunch( std::unique_ptr<_pi_event> retImplEv{nullptr}; try { - ScopedContext active(command_queue->get_context()); CUstream cuStream = command_queue->get(); CUfunction cuFunc = kernel->get(); From 55f2f4fc5e17a1f626df2a350da0cdead1a08244 Mon Sep 17 00:00:00 2001 From: Nicolas Miller Date: Tue, 21 Sep 2021 10:06:42 +0100 Subject: [PATCH 2/2] [SYCL][CUDA] Fix try/catch block --- sycl/plugins/cuda/pi_cuda.cpp | 96 +++++++++++++++++------------------ 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index f069c7a8ba6b6..4c3aee067e483 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2577,62 +2577,62 @@ pi_result cuda_piEnqueueKernelLaunch( size_t maxThreadsPerBlock[3] = {}; bool providedLocalWorkGroupSize = (local_work_size != nullptr); 
pi_uint32 local_size = kernel->get_local_size(); + pi_result retError = PI_SUCCESS; - // Set the active context here as guessLocalWorkSize needs an active context - ScopedContext active(command_queue->get_context()); - { - size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; - maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); - command_queue->device_->get_max_work_item_sizes(sizeof(maxThreadsPerBlock), - maxThreadsPerBlock); - - if (providedLocalWorkGroupSize) { - auto isValid = [&](int dim) { - if (reqdThreadsPerBlock[dim] != 0 && - local_work_size[dim] != reqdThreadsPerBlock[dim]) - return PI_INVALID_WORK_GROUP_SIZE; - - if (local_work_size[dim] > maxThreadsPerBlock[dim]) - return PI_INVALID_WORK_ITEM_SIZE; - // Checks that local work sizes are a divisor of the global work sizes - // which includes that the local work sizes are neither larger than the - // global work sizes and not 0. - if (0u == local_work_size[dim]) - return PI_INVALID_WORK_GROUP_SIZE; - if (0u != (global_work_size[dim] % local_work_size[dim])) - return PI_INVALID_WORK_GROUP_SIZE; - threadsPerBlock[dim] = static_cast<size_t>(local_work_size[dim]); - return PI_SUCCESS; - }; - - for (size_t dim = 0; dim < work_dim; dim++) { - auto err = isValid(dim); - if (err != PI_SUCCESS) - return err; + try { + // Set the active context here as guessLocalWorkSize needs an active context + ScopedContext active(command_queue->get_context()); + { + size_t *reqdThreadsPerBlock = kernel->reqdThreadsPerBlock_; + maxWorkGroupSize = command_queue->device_->get_max_work_group_size(); + command_queue->device_->get_max_work_item_sizes( + sizeof(maxThreadsPerBlock), maxThreadsPerBlock); + + if (providedLocalWorkGroupSize) { + auto isValid = [&](int dim) { + if (reqdThreadsPerBlock[dim] != 0 && + local_work_size[dim] != reqdThreadsPerBlock[dim]) + return PI_INVALID_WORK_GROUP_SIZE; + + if (local_work_size[dim] > maxThreadsPerBlock[dim]) + return PI_INVALID_WORK_ITEM_SIZE; + // Checks that local work 
sizes are a divisor of the global work sizes + // which includes that the local work sizes are neither larger than + // the global work sizes and not 0. + if (0u == local_work_size[dim]) + return PI_INVALID_WORK_GROUP_SIZE; + if (0u != (global_work_size[dim] % local_work_size[dim])) + return PI_INVALID_WORK_GROUP_SIZE; + threadsPerBlock[dim] = static_cast<size_t>(local_work_size[dim]); + return PI_SUCCESS; + }; + + for (size_t dim = 0; dim < work_dim; dim++) { + auto err = isValid(dim); + if (err != PI_SUCCESS) + return err; + } + } else { + guessLocalWorkSize(threadsPerBlock, global_work_size, + maxThreadsPerBlock, kernel, local_size); } - } else { - guessLocalWorkSize(threadsPerBlock, global_work_size, maxThreadsPerBlock, - kernel, local_size); } - } - if (maxWorkGroupSize < - size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { - return PI_INVALID_WORK_GROUP_SIZE; - } + if (maxWorkGroupSize < + size_t(threadsPerBlock[0] * threadsPerBlock[1] * threadsPerBlock[2])) { + return PI_INVALID_WORK_GROUP_SIZE; + } - int blocksPerGrid[3] = {1, 1, 1}; + int blocksPerGrid[3] = {1, 1, 1}; - for (size_t i = 0; i < work_dim; i++) { - blocksPerGrid[i] = - static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) / - threadsPerBlock[i]; - } + for (size_t i = 0; i < work_dim; i++) { + blocksPerGrid[i] = + static_cast<int>(global_work_size[i] + threadsPerBlock[i] - 1) / + threadsPerBlock[i]; + } - pi_result retError = PI_SUCCESS; - std::unique_ptr<_pi_event> retImplEv{nullptr}; + std::unique_ptr<_pi_event> retImplEv{nullptr}; - try { CUstream cuStream = command_queue->get(); CUfunction cuFunc = kernel->get();