Skip to content

Commit d7eba00

Browse files
author
iclsrc
committed
Merge from 'sycl' to 'sycl-web'
2 parents 0df8a77 + 5e7ea06 commit d7eba00

30 files changed

+846
-941
lines changed

buildbot/testlist.cfg

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
:test_exception_handling
1212
:test_group
1313
:test_h_item
14-
:test_handler
14+
#:test_handler
1515
:test_header
1616
:test_hierarchical
1717
:test_id
@@ -20,7 +20,7 @@
2020
:test_item
2121
:test_kernel
2222
:test_kernel_args
23-
:test_math_builtin_api
23+
#:test_math_builtin_api
2424
:test_multi_ptr
2525
:test_nd_item
2626
:test_nd_range
@@ -38,8 +38,7 @@
3838
:test_vector_api
3939
:test_vector_constructors
4040
:test_vector_load_store
41-
# Disable test to speedup testing until JIT is optimized
4241
#:test_vector_operators
4342
:test_vector_swizzle_assignment
44-
:test_vector_swizzles
45-
:test_vector_swizzles_opencl
43+
#:test_vector_swizzles
44+
#:test_vector_swizzles_opencl

sycl/include/CL/sycl/backend/cuda.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ namespace cuda {
1818

1919
// Mem Object info: Retrieve the raw CUDA pointer from a cl_mem
2020
#define PI_CUDA_RAW_POINTER (0xFF01)
21-
// Context creation: Use the primary context instead of a custom one
21+
// Context creation: Use a primary CUDA context instead of a custom one by
22+
// providing a property value of PI_TRUE for the following
23+
// property ID.
2224
#define PI_CONTEXT_PROPERTIES_CUDA_PRIMARY (0xFF02)
2325

2426
// PI Command Queue using Default stream

sycl/include/CL/sycl/context.hpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ class context {
4949
/// @param AsyncHandler is an instance of async_handler.
5050
/// @param UseCUDAPrimaryContext is a bool determining whether to use the
5151
/// primary context in the CUDA backend.
52-
context(const device &Device, async_handler AsyncHandler = {},
53-
bool UseCUDAPrimaryContext = false);
52+
explicit context(const device &Device, async_handler AsyncHandler = {},
53+
bool UseCUDAPrimaryContext = false);
5454

5555
/// Constructs a SYCL context instance using the provided platform.
5656
///
@@ -63,8 +63,8 @@ class context {
6363
/// @param AsyncHandler is an instance of async_handler.
6464
/// @param UseCUDAPrimaryContext is a bool determining whether to use the
6565
/// primary context in the CUDA backend.
66-
context(const platform &Platform, async_handler AsyncHandler = {},
67-
bool UseCUDAPrimaryContext = false);
66+
explicit context(const platform &Platform, async_handler AsyncHandler = {},
67+
bool UseCUDAPrimaryContext = false);
6868

6969
/// Constructs a SYCL context instance using list of devices.
7070
///
@@ -78,8 +78,9 @@ class context {
7878
/// @param AsyncHandler is an instance of async_handler.
7979
/// @param UseCUDAPrimaryContext is a bool determining whether to use the
8080
/// primary context in the CUDA backend.
81-
context(const vector_class<device> &DeviceList,
82-
async_handler AsyncHandler = {}, bool UseCUDAPrimaryContext = false);
81+
explicit context(const vector_class<device> &DeviceList,
82+
async_handler AsyncHandler = {},
83+
bool UseCUDAPrimaryContext = false);
8384

8485
/// Constructs a SYCL context instance from OpenCL cl_context.
8586
///

sycl/include/CL/sycl/detail/pi.h

Lines changed: 579 additions & 772 deletions
Large diffs are not rendered by default.

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 103 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -528,43 +528,57 @@ pi_result cuda_piPlatformsGet(pi_uint32 num_entries, pi_platform *platforms,
528528
pi_uint32 *num_platforms) {
529529

530530
try {
531-
static constexpr pi_uint32 numPlatforms = 1;
531+
static std::once_flag initFlag;
532+
static pi_uint32 numPlatforms = 1;
533+
static _pi_platform platformId;
532534

533-
if (num_platforms != nullptr) {
534-
*num_platforms = numPlatforms;
535+
if (num_entries == 0 and platforms != nullptr) {
536+
return PI_INVALID_VALUE;
537+
}
538+
if (platforms == nullptr and num_platforms == nullptr) {
539+
return PI_INVALID_VALUE;
535540
}
536541

537542
pi_result err = PI_SUCCESS;
538543

539-
if (platforms != nullptr) {
540-
541-
assert(num_entries != 0);
542-
543-
static std::once_flag initFlag;
544-
static _pi_platform platformId;
545-
std::call_once(
546-
initFlag,
547-
[](pi_result &err) {
548-
err = PI_CHECK_ERROR(cuInit(0));
549-
550-
int numDevices = 0;
551-
err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
544+
std::call_once(
545+
initFlag,
546+
[](pi_result &err) {
547+
if (cuInit(0) != CUDA_SUCCESS) {
548+
numPlatforms = 0;
549+
return;
550+
}
551+
int numDevices = 0;
552+
err = PI_CHECK_ERROR(cuDeviceGetCount(&numDevices));
553+
if (numDevices == 0) {
554+
numPlatforms = 0;
555+
return;
556+
}
557+
try {
552558
platformId.devices_.reserve(numDevices);
553-
try {
554-
for (int i = 0; i < numDevices; ++i) {
555-
CUdevice device;
556-
err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
557-
platformId.devices_.emplace_back(
558-
new _pi_device{device, &platformId});
559-
}
560-
} catch (...) {
561-
// Clear and rethrow to allow retry
562-
platformId.devices_.clear();
563-
throw;
559+
for (int i = 0; i < numDevices; ++i) {
560+
CUdevice device;
561+
err = PI_CHECK_ERROR(cuDeviceGet(&device, i));
562+
platformId.devices_.emplace_back(
563+
new _pi_device{device, &platformId});
564564
}
565-
},
566-
err);
565+
} catch (const std::bad_alloc &) {
566+
// Signal out-of-memory situation
567+
platformId.devices_.clear();
568+
err = PI_OUT_OF_HOST_MEMORY;
569+
} catch (...) {
570+
// Clear and rethrow to allow retry
571+
platformId.devices_.clear();
572+
throw;
573+
}
574+
},
575+
err);
567576

577+
if (num_platforms != nullptr) {
578+
*num_platforms = numPlatforms;
579+
}
580+
581+
if (platforms != nullptr) {
568582
*platforms = &platformId;
569583
}
570584

@@ -1110,12 +1124,30 @@ pi_result cuda_piDeviceGetInfo(pi_device device, pi_device_info param_name,
11101124
}
11111125

11121126
/* Context APIs */
1113-
pi_result cuda_piContextCreate(const cl_context_properties *properties,
1114-
pi_uint32 num_devices, const pi_device *devices,
1115-
void (*pfn_notify)(const char *errinfo,
1116-
const void *private_info,
1117-
size_t cb, void *user_data),
1118-
void *user_data, pi_context *retcontext) {
1127+
1128+
/// Create a PI CUDA context.
1129+
///
1130+
/// By default creates a scoped context and keeps the last active CUDA context
1131+
/// on top of the CUDA context stack.
1132+
/// With the PI_CONTEXT_PROPERTIES_CUDA_PRIMARY key/id and a value of PI_TRUE
1133+
/// creates a primary CUDA context and activates it on the CUDA context stack.
1134+
///
1135+
/// @param[in] properties 0 terminated array of key/id-value combinations. Can
1136+
/// be nullptr. Only accepts property key/id PI_CONTEXT_PROPERTIES_CUDA_PRIMARY
1137+
/// with a pi_bool value.
1138+
/// @param[in] num_devices Number of devices to create the context for.
1139+
/// @param[in] devices Devices to create the context for.
1140+
/// @param[in] pfn_notify Callback, currently unused.
1141+
/// @param[in] user_data User data for callback.
1142+
/// @param[out] retcontext Set to created context on success.
1143+
///
1144+
/// @return PI_SUCCESS on success, otherwise an error return code.
1145+
pi_result cuda_piContextCreate(const pi_context_properties *properties,
1146+
pi_uint32 num_devices, const pi_device *devices,
1147+
void (*pfn_notify)(const char *errinfo,
1148+
const void *private_info,
1149+
size_t cb, void *user_data),
1150+
void *user_data, pi_context *retcontext) {
11191151

11201152
assert(devices != nullptr);
11211153
// TODO: How to implement context callback?
@@ -1127,31 +1159,51 @@ pi_result cuda_piContextCreate(const cl_context_properties *properties,
11271159
assert(retcontext != nullptr);
11281160
pi_result errcode_ret = PI_SUCCESS;
11291161

1162+
// Parse properties.
1163+
bool property_cuda_primary = false;
1164+
while (properties && (0 != *properties)) {
1165+
// Consume property ID.
1166+
pi_context_properties id = *properties;
1167+
++properties;
1168+
// Consume property value.
1169+
pi_context_properties value = *properties;
1170+
++properties;
1171+
switch (id) {
1172+
case PI_CONTEXT_PROPERTIES_CUDA_PRIMARY:
1173+
assert(value == PI_FALSE || value == PI_TRUE);
1174+
property_cuda_primary = static_cast<bool>(value);
1175+
break;
1176+
default:
1177+
// Unknown property.
1178+
assert(!"Unknown piContextCreate property in property list");
1179+
return PI_INVALID_VALUE;
1180+
}
1181+
}
1182+
11301183
std::unique_ptr<_pi_context> piContextPtr{nullptr};
11311184
try {
1132-
if (properties && *properties != PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) {
1133-
throw pi_result(CL_INVALID_VALUE);
1134-
} else if (!properties) {
1185+
if (property_cuda_primary) {
1186+
// Use the CUDA primary context and assume that we want to use it
1187+
// immediately as we want to forge context switches.
1188+
CUcontext Ctxt;
1189+
errcode_ret = PI_CHECK_ERROR(
1190+
cuDevicePrimaryCtxRetain(&Ctxt, devices[0]->cuDevice_));
1191+
piContextPtr = std::unique_ptr<_pi_context>(
1192+
new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
1193+
errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
1194+
} else {
1195+
// Create a scoped context.
11351196
CUcontext newContext, current;
11361197
PI_CHECK_ERROR(cuCtxGetCurrent(&current));
1137-
errcode_ret = PI_CHECK_ERROR(cuCtxCreate(&newContext, CU_CTX_MAP_HOST,
1138-
(*devices)->cuDevice_));
1198+
errcode_ret = PI_CHECK_ERROR(
1199+
cuCtxCreate(&newContext, CU_CTX_MAP_HOST, devices[0]->cuDevice_));
11391200
piContextPtr = std::unique_ptr<_pi_context>(new _pi_context{
11401201
_pi_context::kind::user_defined, newContext, *devices});
1202+
// For scoped contexts keep the last active CUDA one on top of the stack
1203+
// as `cuCtxCreate` replaces it implicitly otherwise.
11411204
if (current != nullptr) {
1142-
// If there was an existing context on the thread we recover it
11431205
PI_CHECK_ERROR(cuCtxSetCurrent(current));
11441206
}
1145-
} else if (properties
1146-
&& *properties == PI_CONTEXT_PROPERTIES_CUDA_PRIMARY) {
1147-
CUcontext Ctxt;
1148-
errcode_ret = PI_CHECK_ERROR(cuDevicePrimaryCtxRetain(
1149-
&Ctxt, (*devices)->cuDevice_));
1150-
piContextPtr = std::unique_ptr<_pi_context>(
1151-
new _pi_context{_pi_context::kind::primary, Ctxt, *devices});
1152-
errcode_ret = PI_CHECK_ERROR(cuCtxPushCurrent(Ctxt));
1153-
} else {
1154-
throw pi_result(CL_INVALID_VALUE);
11551207
}
11561208

11571209
*retcontext = piContextPtr.release();

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -451,12 +451,12 @@ pi_result OCL(piextGetDeviceFunctionPointer)(pi_device device,
451451
function_pointer_ret));
452452
}
453453

454-
pi_result OCL(piContextCreate)(
455-
const cl_context_properties *properties, // TODO: untie from OpenCL
456-
pi_uint32 num_devices, const pi_device *devices,
457-
void (*pfn_notify)(const char *errinfo, const void *private_info, size_t cb,
458-
void *user_data1),
459-
void *user_data, pi_context *retcontext) {
454+
pi_result OCL(piContextCreate)(const pi_context_properties *properties,
455+
pi_uint32 num_devices, const pi_device *devices,
456+
void (*pfn_notify)(const char *errinfo,
457+
const void *private_info,
458+
size_t cb, void *user_data1),
459+
void *user_data, pi_context *retcontext) {
460460
pi_result ret = PI_INVALID_OPERATION;
461461
*retcontext = cast<pi_context>(
462462
clCreateContext(properties, cast<cl_uint>(num_devices),

sycl/source/detail/context_impl.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,8 @@ context_impl::context_impl(const vector_class<cl::sycl::device> Devices,
4444

4545
if (MPlatform->is_cuda()) {
4646
#if USE_PI_CUDA
47-
const cl_context_properties props[] = {
48-
PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
49-
0};
47+
const pi_context_properties props[] = {PI_CONTEXT_PROPERTIES_CUDA_PRIMARY,
48+
UseCUDAPrimaryContext, 0};
5049

5150
getPlugin().call<PiApiKind::piContextCreate>(props, DeviceIds.size(),
5251
DeviceIds.data(), nullptr, nullptr, &MContext);

sycl/source/detail/event_impl.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,16 +95,14 @@ event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) {
9595

9696
void event_impl::wait(
9797
std::shared_ptr<cl::sycl::detail::event_impl> Self) const {
98-
9998
if (MEvent)
10099
// presence of MEvent means the command has been enqueued, so no need to
101100
// go via the slow path event waiting in the scheduler
102101
waitInternal();
103102
else if (MCommand)
104-
detail::Scheduler::getInstance().waitForEvent(std::move(Self));
103+
detail::Scheduler::getInstance().waitForEvent(Self);
105104
if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
106-
detail::Scheduler::getInstance().cleanupFinishedCommands(
107-
static_cast<Command *>(MCommand));
105+
detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self));
108106
}
109107

110108
void event_impl::wait_and_throw(

sycl/source/detail/scheduler/scheduler.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,13 @@ void Scheduler::waitForEvent(EventImplPtr Event) {
123123
GraphProcessor::waitForEvent(std::move(Event));
124124
}
125125

126-
void Scheduler::cleanupFinishedCommands(Command *FinishedCmd) {
126+
void Scheduler::cleanupFinishedCommands(EventImplPtr FinishedEvent) {
127127
std::lock_guard<std::mutex> lock(MGraphLock);
128-
MGraphBuilder.cleanupFinishedCommands(FinishedCmd);
128+
Command *FinishedCmd = static_cast<Command *>(FinishedEvent->getCommand());
129+
// The command might have been cleaned up (and set to nullptr) by another
130+
// thread
131+
if (FinishedCmd)
132+
MGraphBuilder.cleanupFinishedCommands(FinishedCmd);
129133
}
130134

131135
void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) {

sycl/source/detail/scheduler/scheduler.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class Scheduler {
7979

8080
// Removes finished non-leaf non-alloca commands from the subgraph (assuming
8181
// that all its commands have been waited for).
82-
void cleanupFinishedCommands(Command *FinishedCmd);
82+
void cleanupFinishedCommands(EventImplPtr FinishedEvent);
8383

8484
// Creates nodes in the graph, that update Req with the pointer to the host
8585
// memory which contains the latest data of the memory object. New

sycl/test/.clang-format

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
BasedOnStyle: LLVM
2+
ColumnLimit: 0

0 commit comments

Comments
 (0)