diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index 43af7fc852f00..7d1fa27a6e905 100644
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -34,6 +34,7 @@ subject to change. Do not rely on these variables in production code.
 | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE` | Any(\*) | This environment variable enables users to control use of copy engines for copy operations. If the value is an integer, it will allow the use of copy engines, if available in the device, in Level Zero plugin to transfer SYCL buffer or image data between the host and/or device(s) and to fill SYCL buffer or image data in device or shared memory. The value of this environment variable can also be a pair of the form "lower_index:upper_index" where the indices point to copy engines in a list of all available copy engines. The default is 1. |
 | `SYCL_PI_LEVEL_ZERO_USE_COPY_ENGINE_FOR_D2D_COPY` (experimental) | Integer | Allows the use of copy engine, if available in the device, in Level Zero plugin for device to device copy operations. The default is 0. This option is experimental and will be removed once heuristics are added to make a decision about use of copy engine for device to device copy operations. |
 | `SYCL_PI_LEVEL_ZERO_TRACK_INDIRECT_ACCESS_MEMORY` | Any(\*) | Enable support of the kernels with indirect access and corresponding deferred release of memory allocations in the Level Zero plugin. |
+| `SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS` | Any(\*) | Enable support of device-scope events whose state is not visible to the host. If enabled, the Level Zero plugin creates all events with device scope only and creates proxy host-visible events for them when their status is needed (wait/query) on the host. The default is 0, meaning all events are host-visible. |
 | `SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE` | Any(\*) | Enables tracing of `parallel_for` invocations with rounded-up ranges. |
 | `SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING` | Any(\*) | Disables automatic rounding-up of `parallel_for` invocation ranges. |
 | `SYCL_ENABLE_PCI` | Integer | When set to 1, enables obtaining the GPU PCI address when using the Level Zero backend. The default is 0. |
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index 338449afcda52..b7b70a2ded165 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -181,6 +181,14 @@ static void zePrint(const char *Format, ...) {
   }
 }
 
+// Controls whether device-scope events are used.
+static const bool ZeAllHostVisibleEvents = [] {
+  const auto DeviceEventsStr =
+      std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");
+  bool result = (DeviceEventsStr ? (std::atoi(DeviceEventsStr) == 0) : true);
+  return result;
+}();
+
 // Helper function to implement zeHostSynchronize.
 // The behavior is to avoid infinite wait during host sync under ZE_DEBUG.
 // This allows for a much more responsive debugging of hangs.
@@ -379,8 +387,8 @@ pi_result _pi_mem::removeMapping(void *MappedTo, Mapping &MapInfo) {
 }
 
 pi_result
-_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
-                                            size_t &Index) {
+_pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
+                                            size_t &Index, bool HostVisible) {
   // Maximum number of events that can be present in an event ZePool is captured
   // here. Setting it to 256 gave best possible performance for several
   // benchmarks.
@@ -396,10 +404,23 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
     return PI_INVALID_VALUE;
   }
 
+  // Setup for host-visible pool as needed.
+  ze_event_pool_flag_t ZePoolFlag = {};
+  ze_event_pool_handle_t *ZePool = [&] {
+    if (ZeAllHostVisibleEvents) {
+      ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+      return &ZeEventPool;
+    } else if (HostVisible) {
+      ZePoolFlag = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+      return &ZeHostVisibleEventPool;
+    } else {
+      return &ZeEventPool;
+    }
+  }();
+
   Index = 0;
   // Create one event ZePool per MaxNumEventsPerPool events
-  if ((ZeEventPool == nullptr) ||
-      (NumEventsAvailableInEventPool[ZeEventPool] == 0)) {
+  if ((*ZePool == nullptr) || (NumEventsAvailableInEventPool[*ZePool] == 0)) {
     // Creation of the new ZePool with record in NumEventsAvailableInEventPool
     // and initialization of the record in NumEventsUnreleasedInEventPool must
     // be done atomically. Otherwise it is possible that
@@ -414,34 +435,28 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &ZePool,
 
     ZeStruct<ze_event_pool_desc_t> ZeEventPoolDesc;
     ZeEventPoolDesc.count = MaxNumEventsPerPool;
-
-    // Make all events visible on the host.
-    // TODO: events that are used only on device side APIs can be optimized
-    // to not be from the host-visible pool.
-    //
-    ZeEventPoolDesc.flags =
-        ZE_EVENT_POOL_FLAG_HOST_VISIBLE | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
+    ZeEventPoolDesc.flags = ZePoolFlag | ZE_EVENT_POOL_FLAG_KERNEL_TIMESTAMP;
 
     std::vector<ze_device_handle_t> ZeDevices;
     std::for_each(Devices.begin(), Devices.end(),
                   [&](pi_device &D) { ZeDevices.push_back(D->ZeDevice); });
 
     ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(),
-                                &ZeDevices[0], &ZeEventPool));
-    NumEventsAvailableInEventPool[ZeEventPool] = MaxNumEventsPerPool - 1;
-    NumEventsUnreleasedInEventPool[ZeEventPool] = MaxNumEventsPerPool;
+                                &ZeDevices[0], ZePool));
+    NumEventsAvailableInEventPool[*ZePool] = MaxNumEventsPerPool - 1;
+    NumEventsUnreleasedInEventPool[*ZePool] = MaxNumEventsPerPool;
   } else {
     std::lock_guard<std::mutex> NumEventsAvailableInEventPoolGuard(
         NumEventsAvailableInEventPoolMutex);
-    Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[ZeEventPool];
-    --NumEventsAvailableInEventPool[ZeEventPool];
+    Index = MaxNumEventsPerPool - NumEventsAvailableInEventPool[*ZePool];
+    --NumEventsAvailableInEventPool[*ZePool];
   }
-  ZePool = ZeEventPool;
+  Pool = *ZePool;
   return PI_SUCCESS;
 }
 
-pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
-  ze_event_pool_handle_t ZePool = Event->ZeEventPool;
+pi_result
+_pi_context::decrementUnreleasedEventsInPool(ze_event_pool_handle_t &ZePool) {
   if (!ZePool) {
     // This must be an interop event created on a users's pool.
     // Do nothing.
@@ -460,9 +475,9 @@ pi_result _pi_context::decrementUnreleasedEventsInPool(pi_event Event) {
     // multiple pi_context::ZeEventPool can be created if all slots in the pool
     // are already used up. So nullifying pi_context::ZeEventPool may point
     // a different EventPool than Event->ZeEventPool.
-    if (ZeEventPool == Event->ZeEventPool)
+    if (ZeEventPool == ZePool)
       ZeEventPool = nullptr;
-    Event->ZeEventPool = nullptr;
+    ZePool = nullptr;
   }
   return PI_SUCCESS;
 }
 
@@ -761,6 +776,8 @@ pi_result _pi_context::finalize() {
       NumEventsUnreleasedInEventPoolMutex);
   if (ZeEventPool)
     ZE_CALL(zeEventPoolDestroy, (ZeEventPool));
+  if (ZeHostVisibleEventPool)
+    ZE_CALL(zeEventPoolDestroy, (ZeHostVisibleEventPool));
 
   // Destroy the command list used for initializations
   ZE_CALL(zeCommandListDestroy, (ZeCommandListInit));
@@ -1050,7 +1067,10 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
   // therefore that this Queue is idle.
   bool CurrentlyEmpty = this->LastCommandEvent == nullptr;
 
-  this->LastCommandEvent = CommandList->second.EventList.back();
+  // The list can be empty if command-list only contains signals of proxy
+  // events.
+  if (!CommandList->second.EventList.empty())
+    this->LastCommandEvent = CommandList->second.EventList.back();
 
   // Batch if allowed to, but don't batch if we know there are no kernels
   // from this queue that are currently executing. This is intended to get
@@ -1244,7 +1264,9 @@ pi_result _pi_ze_event_list_t::createAndRetainPiZeEventList(
         PI_ASSERT(EventList[I] != nullptr, PI_INVALID_VALUE);
         auto ZeEvent = EventList[I]->ZeEvent;
 
-        if (FilterEventWaitList) {
+        // Avoid polling of the device-scope events.
+        // TODO: be more fine-grained and check individual events.
+        if (FilterEventWaitList && ZeAllHostVisibleEvents) {
           auto Res = ZE_CALL_NOCHECK(zeEventQueryStatus, (ZeEvent));
           if (Res == ZE_RESULT_SUCCESS) {
             // Event has already completed, don't put it into the list
@@ -1539,6 +1561,8 @@ pi_result piPlatformsGet(pi_uint32 NumEntries, pi_platform *Platforms,
   if (NumPlatforms)
     *NumPlatforms = PiPlatformsCache->size();
 
+  zePrint("Using %s events\n",
+          ZeAllHostVisibleEvents ? "all host-visible" : "device-only");
   return PI_SUCCESS;
 }
 
@@ -4371,6 +4395,74 @@ pi_result piextKernelGetNativeHandle(pi_kernel Kernel,
 //
 // Events
 //
+ze_event_handle_t _pi_event::getHostVisibleEvent() const {
+  if (ZeAllHostVisibleEvents) {
+    return ZeEvent;
+  } else if (ZeHostVisibleEvent) {
+    return ZeHostVisibleEvent;
+  } else {
+    die("The host-visible proxy event is missing");
+  }
+}
+
+pi_result
+_pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent) {
+
+  if (ZeAllHostVisibleEvents) {
+    HostVisibleEvent = ZeEvent;
+  } else if (ZeHostVisibleEvent) {
+    HostVisibleEvent = ZeHostVisibleEvent;
+  } else {
+    size_t Index;
+    ze_event_pool_handle_t ZeEventPool = {};
+    if (auto Res =
+            Context->getFreeSlotInExistingOrNewPool(ZeEventPool, Index, true))
+      return Res;
+
+    // Create a "proxy" host-visible event.
+    //
+    // TODO: consider creating just a single host-visible proxy event to
+    // represent multiple device-scope events. E.g. have a host-visible
+    // event at the end of each command-list to represent device-scope
+    // events from every command in that command-list.
+    //
+    ZeStruct<ze_event_desc_t> ZeEventDesc;
+    ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+    ZeEventDesc.wait = 0;
+    ZeEventDesc.index = Index;
+
+    ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeHostVisibleEvent));
+    ZeHostVisibleEventPool = ZeEventPool;
+    HostVisibleEvent = ZeHostVisibleEvent;
+
+    // Submit the command(s) signalling the proxy event to the queue.
+    // We have to first submit a wait for the device-only event for which this
+    // proxy is created.
+    //
+    // Get a new command list to be used on this call
+    {
+      std::lock_guard<std::mutex> Lock(Queue->PiQueueMutex);
+
+      // We want to batch these commands to avoid extra submissions (costly)
+      bool OkToBatch = true;
+
+      pi_command_list_ptr_t CommandList{};
+      if (auto Res = Queue->Context->getAvailableCommandList(
+              Queue, CommandList, false, OkToBatch))
+        return Res;
+
+      ZE_CALL(zeCommandListAppendWaitOnEvents,
+              (CommandList->first, 1, &ZeEvent));
+      ZE_CALL(zeCommandListAppendSignalEvent,
+              (CommandList->first, ZeHostVisibleEvent));
+
+      if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch))
+        return Res;
+    }
+  }
+  return PI_SUCCESS;
+}
+
 pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
   size_t Index = 0;
   ze_event_pool_handle_t ZeEventPool = {};
@@ -4379,12 +4471,21 @@ pi_result piEventCreate(pi_context Context, pi_event *RetEvent) {
 
   ze_event_handle_t ZeEvent;
   ZeStruct<ze_event_desc_t> ZeEventDesc;
-  // We have to set the SIGNAL flag as HOST scope because the
-  // Level-Zero plugin implementation waits for the events to complete
-  // on the host.
-  ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
-  ZeEventDesc.wait = 0;
   ZeEventDesc.index = Index;
+  ZeEventDesc.wait = 0;
+  //
+  // Set the scope to "device" for every event. This is sufficient for global
+  // device access and peer device access. If an event needs to be waited on
+  // by the host, we do special handling; see piEventsWait.
+  //
+  // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be
+  // used in some circumstances.
+  //
+  if (ZeAllHostVisibleEvents) {
+    ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+  } else {
+    ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE;
+  }
 
   ZE_CALL(zeEventCreate, (ZeEventPool, &ZeEventDesc, &ZeEvent));
 
@@ -4435,13 +4536,18 @@ pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName,
     }
   }
 
+  // Make sure that we query the host-visible event.
+  ze_event_handle_t ZeHostVisibleEvent;
+  if (auto Res = Event->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
+    return Res;
+
   ze_result_t ZeResult;
-  ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus, (Event->ZeEvent));
+  ZeResult = ZE_CALL_NOCHECK(zeEventQueryStatus, (ZeHostVisibleEvent));
   if (ZeResult == ZE_RESULT_SUCCESS) {
     return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet,
                    pi_int32{CL_COMPLETE}); // Untie from OpenCL
   }
-  // TODO: We don't know if the status is queueed, submitted or running.
+  // TODO: We don't know if the status is queued, submitted or running.
   //       For now return "running", as others are unlikely to be of
   //       interest.
   return getInfo(ParamValueSize, ParamValue, ParamValueSizeRet,
@@ -4644,6 +4750,17 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
     return PI_INVALID_EVENT;
   }
 
+  // Make sure to add all host-visible "proxy" event signals if needed.
+  // This ensures that all signalling commands are submitted below and
+  // thus proxy events can be waited on without a deadlock.
+  //
+  for (uint32_t I = 0; I < NumEvents; I++) {
+    ze_event_handle_t ZeHostVisibleEvent;
+    if (auto Res =
+            EventList[I]->getOrCreateHostVisibleEvent(ZeHostVisibleEvent))
+      return Res;
+  }
+
   // Submit dependent open command lists for execution, if any
   for (uint32_t I = 0; I < NumEvents; I++) {
     auto Queue = EventList[I]->Queue;
@@ -4659,7 +4776,7 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
   }
 
   for (uint32_t I = 0; I < NumEvents; I++) {
-    ze_event_handle_t ZeEvent = EventList[I]->ZeEvent;
+    ze_event_handle_t ZeEvent = EventList[I]->getHostVisibleEvent();
     zePrint("ZeEvent = %#lx\n", pi_cast<std::uintptr_t>(ZeEvent));
 
     ZE_CALL(zeHostSynchronize, (ZeEvent));
@@ -4725,11 +4842,20 @@ static pi_result EventRelease(pi_event Event, pi_queue LockedQueue) {
     if (Event->OwnZeEvent) {
      ZE_CALL(zeEventDestroy, (Event->ZeEvent));
     }
+    if (Event->ZeHostVisibleEvent) {
+      ZE_CALL(zeEventDestroy, (Event->ZeHostVisibleEvent));
+    }
 
     auto Context = Event->Context;
-    if (auto Res = Context->decrementUnreleasedEventsInPool(Event))
+    if (auto Res = Context->decrementUnreleasedEventsInPool(Event->ZeEventPool))
      return Res;
 
+    if (Event->ZeHostVisibleEvent) {
+      if (auto Res = Context->decrementUnreleasedEventsInPool(
+              Event->ZeHostVisibleEventPool))
+        return Res;
+    }
+
     // We intentionally incremented the reference counter when an event is
     // created so that we can avoid pi_queue is released before the associated
     // pi_event is released. Here we have to decrement it so pi_queue
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index 748fb331d9025..0e65b46bf9d84 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -536,12 +536,14 @@ struct _pi_context : _pi_object {
                                      bool AllowBatching = false);
 
   // Get index of the free slot in the available pool. If there is no available
-  // pool then create new one.
-  pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &);
+  // pool then create new one. The HostVisible parameter tells if we need a
+  // slot for a host-visible event.
+  pi_result getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &, size_t &,
+                                           bool HostVisible = false);
 
   // If event is destroyed then decrement number of events living in the pool
   // and destroy the pool if there are no unreleased events.
-  pi_result decrementUnreleasedEventsInPool(pi_event Event);
+  pi_result decrementUnreleasedEventsInPool(ze_event_pool_handle_t &ZePool);
 
   // Store USM allocator context(internal allocator structures)
   // for USM shared and device allocations. There is 1 allocator context
@@ -561,11 +563,17 @@ struct _pi_context : _pi_object {
 
 private:
   // Following member variables are used to manage assignment of events
  // to event pools.
-  // TODO: These variables may be moved to pi_device and pi_platform
-  // if appropriate.
+  //
+  // TODO: Create pi_event_pool class to encapsulate working with pools.
+  //       This will avoid needing the use of maps below, and cleanup the
+  //       pi_context overall.
+  //
   // Event pool to which events are being added to.
-  ze_event_pool_handle_t ZeEventPool;
+  ze_event_pool_handle_t ZeEventPool = {nullptr};
+  // Event pool to which host-visible events are added.
+  ze_event_pool_handle_t ZeHostVisibleEventPool = {nullptr};
+
   // This map will be used to determine if a pool is full or not
   // by storing number of empty slots available in the pool.
   std::unordered_map<ze_event_pool_handle_t, pi_uint32>
@@ -902,6 +910,19 @@ struct _pi_event : _pi_object {
   // Level Zero event pool handle.
   ze_event_pool_handle_t ZeEventPool;
 
+  // In case we use device-only events/pools these are their host-visible
+  // counterparts. The idea is that two Level-Zero events co-exist:
+  // - one is always created with device-scope and used for GPU book-keeping.
+  // - the other host-visible proxy event is created on demand when we need
+  //   to query/wait on a device-scope event from the host.
+  //
+  ze_event_handle_t ZeHostVisibleEvent = {nullptr};
+  ze_event_pool_handle_t ZeHostVisibleEventPool = {nullptr};
+  // Get the host-visible event or create one and enqueue its signal.
+  pi_result getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
+  // Get the host-visible event ensuring that one was already created before.
+  ze_event_handle_t getHostVisibleEvent() const;
+
   // Level Zero command list where the command signaling this event was appended
   // to. This is currently used to remember/destroy the command list after all
   // commands in it are completed, i.e. this event signaled.
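
For reference, a minimal standalone sketch (not part of the patch) of the scope selection this patch performs in piEventCreate: leaving SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS unset or setting it to "0" keeps every event host-visible, while any other integer switches new events to device scope. The helper name AllEventsHostVisible and the assumption that a valid Level Zero context and device handle already exist are illustrative only.

    // Illustrative sketch: pick the event scope from the environment variable,
    // mirroring the ZeAllHostVisibleEvents logic added by the patch.
    #include <level_zero/ze_api.h>

    #include <cstdlib>

    // Hypothetical helper: unset or "0" means all events stay host-visible.
    static bool AllEventsHostVisible() {
      const char *Env = std::getenv("SYCL_PI_LEVEL_ZERO_DEVICE_SCOPE_EVENTS");
      return Env ? (std::atoi(Env) == 0) : true;
    }

    ze_result_t CreateEvent(ze_context_handle_t Context,
                            ze_device_handle_t Device,
                            ze_event_pool_handle_t &Pool,
                            ze_event_handle_t &Event) {
      ze_event_pool_desc_t PoolDesc = {};
      PoolDesc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
      // Request a host-visible pool only when the host must observe the events.
      PoolDesc.flags = AllEventsHostVisible() ? ZE_EVENT_POOL_FLAG_HOST_VISIBLE : 0;
      PoolDesc.count = 1;
      if (auto Res = zeEventPoolCreate(Context, &PoolDesc, 1, &Device, &Pool))
        return Res;

      ze_event_desc_t EventDesc = {};
      EventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
      EventDesc.index = 0;
      EventDesc.wait = 0;
      // Device scope suffices for device/peer access; host scope is needed
      // only if the host will query or wait on this event directly.
      EventDesc.signal = AllEventsHostVisible() ? ZE_EVENT_SCOPE_FLAG_HOST
                                                : ZE_EVENT_SCOPE_FLAG_DEVICE;
      return zeEventCreate(Pool, &EventDesc, &Event);
    }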
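Likewise, a sketch of the host-visible "proxy" pattern that getOrCreateHostVisibleEvent implements: the device keeps signalling only device-scope events, and a separate host-visible event is signalled after them on the same queue so the host has something it may legally wait on. The function name WaitOnDeviceScopeEvent, the use of an already-recorded regular command list and queue, and the collapsed error handling are assumptions made for brevity.

    // Illustrative sketch of the proxy host-visible event mechanism. All handles
    // are assumed to be created elsewhere; error checking is collapsed.
    #include <level_zero/ze_api.h>

    #include <cstdint>

    ze_result_t WaitOnDeviceScopeEvent(ze_context_handle_t Context,
                                       ze_device_handle_t Device,
                                       ze_command_list_handle_t CmdList,
                                       ze_command_queue_handle_t Queue,
                                       ze_event_handle_t DeviceScopeEvent) {
      // A small host-visible pool just for the proxy event.
      ze_event_pool_desc_t PoolDesc = {};
      PoolDesc.stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC;
      PoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
      PoolDesc.count = 1;
      ze_event_pool_handle_t ProxyPool = nullptr;
      zeEventPoolCreate(Context, &PoolDesc, 1, &Device, &ProxyPool);

      ze_event_desc_t EventDesc = {};
      EventDesc.stype = ZE_STRUCTURE_TYPE_EVENT_DESC;
      EventDesc.index = 0;
      EventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; // visible to the host
      EventDesc.wait = 0;
      ze_event_handle_t Proxy = nullptr;
      zeEventCreate(ProxyPool, &EventDesc, &Proxy);

      // Device-side ordering: wait for the device-scope event, then signal the
      // host-visible proxy on the same queue.
      zeCommandListAppendWaitOnEvents(CmdList, 1, &DeviceScopeEvent);
      zeCommandListAppendSignalEvent(CmdList, Proxy);
      zeCommandListClose(CmdList);
      zeCommandQueueExecuteCommandLists(Queue, 1, &CmdList, nullptr);

      // The host blocks on the proxy only; the device-scope event itself is
      // never queried or waited on from the host.
      ze_result_t Res = zeEventHostSynchronize(Proxy, UINT64_MAX);

      zeEventDestroy(Proxy);
      zeEventPoolDestroy(ProxyPool);
      return Res;
    }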
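Finally, the non-blocking status query done in the reworked piEventGetInfo reduces to polling the host-visible counterpart; querying a device-only event from the host is not valid. A small sketch, assuming Proxy is the host-visible event produced as above:

    // Illustrative sketch: poll the host-visible counterpart instead of the
    // device-scope event, as the reworked piEventGetInfo does.
    #include <level_zero/ze_api.h>

    bool IsComplete(ze_event_handle_t Proxy) {
      // ZE_RESULT_SUCCESS means the event has signalled; ZE_RESULT_NOT_READY
      // means it has not yet.
      return zeEventQueryStatus(Proxy) == ZE_RESULT_SUCCESS;
    }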