diff --git a/sycl/doc/SYCLEnvironmentVariables.md b/sycl/doc/SYCLEnvironmentVariables.md index f3ef61a34896a..f7c538b27d856 100644 --- a/sycl/doc/SYCLEnvironmentVariables.md +++ b/sycl/doc/SYCLEnvironmentVariables.md @@ -18,6 +18,7 @@ subject to change. Do not rely on these variables in production code. | SYCL_USE_KERNEL_SPV | Path to the SPIR-V binary | Load device image from the specified file. If runtime is unable to read the file, `cl::sycl::runtime_error` exception is thrown.| | SYCL_DUMP_IMAGES | Any(*) | Dump device image binaries to file. Control has no effect if SYCL_USE_KERNEL_SPV is set. | | SYCL_PRINT_EXECUTION_GRAPH | Described [below](#sycl_print_execution_graph-options) | Print execution graph to DOT text file. | +| SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP | Any(*) | Disable cleanup of finished command nodes at host-device synchronization points. | | SYCL_THROW_ON_BLOCK | Any(*) | Throw an exception on attempt to wait for a blocked command. | | SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. | | SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the SYCL RT will select only those devices which satisfy the regex. | diff --git a/sycl/include/CL/sycl/detail/scheduler/commands.hpp b/sycl/include/CL/sycl/detail/scheduler/commands.hpp index 85023f165c5ea..66d1567d70b1e 100644 --- a/sycl/include/CL/sycl/detail/scheduler/commands.hpp +++ b/sycl/include/CL/sycl/detail/scheduler/commands.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -98,7 +99,7 @@ class Command { void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); } - void addUser(Command *NewUser) { MUsers.push_back(NewUser); } + void addUser(Command *NewUser) { MUsers.insert(NewUser); } // Return type of the command, e.g. Allocate, MemoryCopy. CommandType getType() const { return MType; } @@ -149,11 +150,13 @@ class Command { // Contains list of dependencies(edges) std::vector MDeps; // Contains list of commands that depend on the command - std::vector MUsers; + std::unordered_set MUsers; // Indicates whether the command can be blocked from enqueueing bool MIsBlockable = false; // Indicates whether the command is blocked from enqueueing std::atomic MCanEnqueue; + // Counts the number of memory objects this command is a leaf for + unsigned MLeafCounter = 0; const char *MBlockReason = "Unknown"; }; diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp b/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp index 9fff10ee6b80c..29e061fb0a8bb 100644 --- a/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp @@ -77,6 +77,10 @@ class Scheduler { // sycl::image destructors. void removeMemoryObject(detail::SYCLMemObjI *MemObj); + // Removes finished non-leaf non-alloca commands from the subgraph (assuming + // that all its commands have been waited for). + void cleanupFinishedCommands(Command *FinishedCmd); + // Creates nodes in the graph, that update Req with the pointer to the host // memory which contains the latest data of the memory object. New operations // with the same memory object that have side effects are blocked until @@ -125,8 +129,9 @@ class Scheduler { // Event passed and its dependencies. void optimize(EventImplPtr Event); - // Removes unneeded commands from the graph. - void cleanupCommands(bool CleanupReleaseCommands = false); + // Removes finished non-leaf non-alloca commands from the subgraph (assuming + // that all its commands have been waited for). + void cleanupFinishedCommands(Command *FinishedCmd); // Reschedules command passed using Queue provided. this can lead to // rescheduling of all dependent commands. This can be used when user @@ -140,6 +145,9 @@ class Scheduler { MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req); + // Decrements leaf counters for all leaves of the record. + void decrementLeafCountersForRecord(MemObjRecord *Record); + // Removes commands that use given MemObjRecord from the graph. void cleanupCommandsForRecord(MemObjRecord *Record); diff --git a/sycl/source/detail/config.def b/sycl/source/detail/config.def index 9ffacb4e1b113..2f0af4d345c8d 100644 --- a/sycl/source/detail/config.def +++ b/sycl/source/detail/config.def @@ -11,5 +11,5 @@ // underscore(__). CONFIG(SYCL_PRINT_EXECUTION_GRAPH, 32, __SYCL_PRINT_EXECUTION_GRAPH) +CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP) CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST) - diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 6bd4f49f2ffdb..2e3a7bc61b6f9 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -13,6 +13,8 @@ #include #include +#include "detail/config.hpp" + #include __SYCL_INLINE namespace cl { @@ -100,6 +102,9 @@ void event_impl::wait( waitInternal(); else if (MCommand) detail::Scheduler::getInstance().waitForEvent(std::move(Self)); + if (MCommand && !SYCLConfig::get()) + detail::Scheduler::getInstance().cleanupFinishedCommands( + static_cast(MCommand)); } void event_impl::wait_and_throw( diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index e4478d394e131..cc43aa8e81375 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -138,13 +138,15 @@ void Scheduler::GraphBuilder::UpdateLeaves(const std::set &Cmds, if (ReadOnlyReq) return; - for (const Command *Cmd : Cmds) { + for (Command *Cmd : Cmds) { auto NewEnd = std::remove(Record->MReadLeaves.begin(), Record->MReadLeaves.end(), Cmd); + Cmd->MLeafCounter -= std::distance(NewEnd, Record->MReadLeaves.end()); Record->MReadLeaves.erase(NewEnd, Record->MReadLeaves.end()); NewEnd = std::remove(Record->MWriteLeaves.begin(), Record->MWriteLeaves.end(), Cmd); + Cmd->MLeafCounter -= std::distance(NewEnd, Record->MWriteLeaves.end()); Record->MWriteLeaves.erase(NewEnd, Record->MWriteLeaves.end()); } } @@ -166,8 +168,10 @@ void Scheduler::GraphBuilder::AddNodeToLeaves(MemObjRecord *Record, Dep.MDepCommand = OldLeaf; Cmd->addDep(Dep); OldLeaf->addUser(Cmd); + --(OldLeaf->MLeafCounter); } Leaves.push_back(Cmd); + ++(Cmd->MLeafCounter); } UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( @@ -560,6 +564,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MAllocaCommands.push_back(AllocaCmd); Record->MWriteLeaves.push_back(AllocaCmd); + ++(AllocaCmd->MLeafCounter); } return AllocaCmd; } @@ -633,6 +638,16 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr CommandGroup, return NewCmd.release(); } +void Scheduler::GraphBuilder::decrementLeafCountersForRecord( + MemObjRecord *Record) { + for (Command *Cmd : Record->MReadLeaves) { + --(Cmd->MLeafCounter); + } + for (Command *Cmd : Record->MWriteLeaves) { + --(Cmd->MLeafCounter); + } +} + void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) { std::vector &AllocaCommands = Record->MAllocaCommands; if (AllocaCommands.empty()) @@ -683,9 +698,7 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) { for (auto DepCmdIt : ShouldBeUpdated) { if (!DepCmdIt.second) continue; - std::vector &DepUsers = DepCmdIt.first->MUsers; - DepUsers.erase(std::remove(DepUsers.begin(), DepUsers.end(), Cmd), - DepUsers.end()); + DepCmdIt.first->MUsers.erase(Cmd); } // If all dependencies have been removed this way, mark the command for @@ -702,8 +715,48 @@ void Scheduler::GraphBuilder::cleanupCommandsForRecord(MemObjRecord *Record) { } } -void Scheduler::GraphBuilder::cleanupCommands(bool CleanupReleaseCommands) { - // TODO: Implement. +void Scheduler::GraphBuilder::cleanupFinishedCommands(Command *FinishedCmd) { + std::queue CmdsToVisit({FinishedCmd}); + std::set Visited; + + // Traverse the graph using BFS + while (!CmdsToVisit.empty()) { + Command *Cmd = CmdsToVisit.front(); + CmdsToVisit.pop(); + + if (!Visited.insert(Cmd).second) + continue; + + for (const DepDesc &Dep : Cmd->MDeps) { + if (Dep.MDepCommand) + CmdsToVisit.push(Dep.MDepCommand); + } + + // Do not clean up the node if it is a leaf for any memory object + if (Cmd->MLeafCounter > 0) + continue; + // Do not clean up allocation commands + Command::CommandType CmdT = Cmd->getType(); + if (CmdT == Command::ALLOCA || CmdT == Command::ALLOCA_SUB_BUF) + continue; + + for (Command *UserCmd : Cmd->MUsers) { + for (DepDesc &Dep : UserCmd->MDeps) { + // Link the users of the command to the alloca command(s) instead + if (Dep.MDepCommand == Cmd) { + Dep.MDepCommand = Dep.MAllocaCmd; + Dep.MDepCommand->MUsers.insert(UserCmd); + } + } + } + // Update dependency users + for (DepDesc &Dep : Cmd->MDeps) { + Command *DepCmd = Dep.MDepCommand; + DepCmd->MUsers.erase(Cmd); + } + Cmd->getEvent()->setCommand(nullptr); + delete Cmd; + } } void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjI *MemObject) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index c3352907948fa..ea254cee5cd84 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -117,6 +117,11 @@ void Scheduler::waitForEvent(EventImplPtr Event) { GraphProcessor::waitForEvent(std::move(Event)); } +void Scheduler::cleanupFinishedCommands(Command *FinishedCmd) { + std::lock_guard lock(MGraphLock); + MGraphBuilder.cleanupFinishedCommands(FinishedCmd); +} + void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { std::lock_guard lock(MGraphLock); @@ -125,6 +130,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { // No operations were performed on the mem object return; waitForRecordToFinish(Record); + MGraphBuilder.decrementLeafCountersForRecord(Record); MGraphBuilder.cleanupCommandsForRecord(Record); MGraphBuilder.removeRecordForMemObj(MemObj); } diff --git a/sycl/test/scheduler/FakeCommand.hpp b/sycl/test/scheduler/FakeCommand.hpp deleted file mode 100644 index 70202c53f44e2..0000000000000 --- a/sycl/test/scheduler/FakeCommand.hpp +++ /dev/null @@ -1,23 +0,0 @@ -#include - -// A fake command class used for testing -class FakeCommand : public cl::sycl::detail::Command { -public: - FakeCommand(cl::sycl::detail::QueueImplPtr Queue, - cl::sycl::detail::Requirement Req) - : Command{cl::sycl::detail::Command::ALLOCA, Queue}, - MRequirement{std::move(Req)} {} - - void printDot(std::ostream &Stream) const override {} - - const cl::sycl::detail::Requirement *getRequirement() const final { - return &MRequirement; - }; - - cl_int enqueueImp() override { return MRetVal; } - - cl_int MRetVal = CL_SUCCESS; - -protected: - cl::sycl::detail::Requirement MRequirement; -}; diff --git a/sycl/test/scheduler/FinishedCmdCleanup.cpp b/sycl/test/scheduler/FinishedCmdCleanup.cpp new file mode 100644 index 0000000000000..ed8e3c2670e39 --- /dev/null +++ b/sycl/test/scheduler/FinishedCmdCleanup.cpp @@ -0,0 +1,92 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %t.out +#include + +#include +#include + +#include "SchedulerTestUtils.hpp" + +using namespace cl::sycl; + +// This test checks regular execution graph cleanup at host-device +// synchronization points +int main() { + TestScheduler TS; + queue Queue; + buffer BufA(range<1>(1)); + buffer BufB(range<1>(1)); + buffer BufC(range<1>(1)); + detail::Requirement FakeReqA = getFakeRequirement(BufA); + detail::Requirement FakeReqB = getFakeRequirement(BufB); + detail::Requirement FakeReqC = getFakeRequirement(BufC); + detail::MemObjRecord *RecC = + TS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Queue), &FakeReqC); + + // Create a graph and check that all inner nodes have been deleted and + // their users have had the corresponding dependency replaced with a + // dependency on the alloca. The graph should undergo the following + // transformation: + // +---------+ +---------+ +---------++---------+ + // | LeafA | <-- | InnerA | | LeafA || LeafB | + // +---------+ +---------+ +---------++---------+ + // | | | | + // | | ===> | | + // v v v v + // +---------+ +---------+ +---------++---------+ + // | InnerC | | InnerB | | AllocaA || AllocaB | + // +---------+ +---------+ +---------++---------+ + // | | + // | | + // v v + // +---------+ +---------+ + // | AllocaA | | LeafB | + // +---------+ +---------+ + // | + // | + // v + // +---------+ + // | AllocaB | + // +---------+ + detail::AllocaCommand AllocaA{detail::getSyclObjImpl(Queue), FakeReqA}; + detail::AllocaCommand AllocaB{detail::getSyclObjImpl(Queue), FakeReqB}; + + int NInnerCommandsAlive = 3; + std::function Callback = [&]() { --NInnerCommandsAlive; }; + + FakeCommand *InnerC = new FakeCommandWithCallback( + detail::getSyclObjImpl(Queue), FakeReqA, Callback); + addEdge(InnerC, &AllocaA, &AllocaA); + + FakeCommand LeafB{detail::getSyclObjImpl(Queue), FakeReqB}; + addEdge(&LeafB, &AllocaB, &AllocaB); + TS.AddNodeToLeaves(RecC, &LeafB); + + FakeCommand LeafA{detail::getSyclObjImpl(Queue), FakeReqA}; + addEdge(&LeafA, InnerC, &AllocaA); + TS.AddNodeToLeaves(RecC, &LeafA); + + FakeCommand *InnerB = new FakeCommandWithCallback( + detail::getSyclObjImpl(Queue), FakeReqB, Callback); + addEdge(InnerB, &LeafB, &AllocaB); + + FakeCommand *InnerA = new FakeCommandWithCallback( + detail::getSyclObjImpl(Queue), FakeReqA, Callback); + addEdge(InnerA, &LeafA, &AllocaA); + addEdge(InnerA, InnerB, &AllocaB); + + TS.cleanupFinishedCommands(InnerA); + TS.removeRecordForMemObj(detail::getSyclObjImpl(BufC).get()); + + assert(NInnerCommandsAlive == 0); + + assert(LeafA.MDeps.size() == 1); + assert(LeafA.MDeps[0].MDepCommand == &AllocaA); + assert(AllocaA.MUsers.size() == 1); + assert(*AllocaA.MUsers.begin() == &LeafA); + + assert(LeafB.MDeps.size() == 1); + assert(LeafB.MDeps[0].MDepCommand == &AllocaB); + assert(AllocaB.MUsers.size() == 1); + assert(*AllocaB.MUsers.begin() == &LeafB); +} diff --git a/sycl/test/scheduler/LeafLimit.cpp b/sycl/test/scheduler/LeafLimit.cpp index b04028ec850bf..0489c56f2a690 100644 --- a/sycl/test/scheduler/LeafLimit.cpp +++ b/sycl/test/scheduler/LeafLimit.cpp @@ -2,30 +2,17 @@ // RUN: %t.out #include +#include #include #include #include -#include "FakeCommand.hpp" +#include "SchedulerTestUtils.hpp" // This test checks the leaf limit imposed on the execution graph using namespace cl::sycl; -class TestScheduler : public detail::Scheduler { -public: - void AddNodeToLeaves(detail::MemObjRecord *Rec, detail::Command *Cmd, - access::mode Mode) { - return MGraphBuilder.AddNodeToLeaves(Rec, Cmd, Mode); - } - - detail::MemObjRecord * - getOrInsertMemObjRecord(const detail::QueueImplPtr &Queue, - detail::Requirement *Req) { - return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req); - } -}; - int main() { TestScheduler TS; queue Queue; @@ -56,7 +43,7 @@ int main() { } // Add edges as leaves and exceed the leaf limit for (auto LeafPtr : LeavesToAdd) { - TS.AddNodeToLeaves(Rec, LeafPtr, access::mode::read_write); + TS.AddNodeToLeaves(Rec, LeafPtr); } // Check that the oldest leaf has been removed from the leaf list // and added as a dependency of the newest one instead @@ -70,7 +57,9 @@ int main() { FakeCommand *OldestLeaf = LeavesToAdd.front(); FakeCommand *NewestLeaf = LeavesToAdd.back(); assert(OldestLeaf->MUsers.size() == 1); - assert(OldestLeaf->MUsers[0] == NewestLeaf); + assert(OldestLeaf->MUsers.count(NewestLeaf)); assert(NewestLeaf->MDeps.size() == 2); - assert(NewestLeaf->MDeps[1].MDepCommand == OldestLeaf); + assert(std::any_of( + NewestLeaf->MDeps.begin(), NewestLeaf->MDeps.end(), + [&](const detail::DepDesc &DD) { return DD.MDepCommand == OldestLeaf; })); } diff --git a/sycl/test/scheduler/MemObjCommandCleanup.cpp b/sycl/test/scheduler/MemObjCommandCleanup.cpp index bede449360353..e96c771f95c79 100644 --- a/sycl/test/scheduler/MemObjCommandCleanup.cpp +++ b/sycl/test/scheduler/MemObjCommandCleanup.cpp @@ -2,11 +2,10 @@ // RUN: %t.out #include -#include #include #include -#include "FakeCommand.hpp" +#include "SchedulerTestUtils.hpp" // This test checks that the execution graph cleanup on memory object // destruction traverses the entire graph, rather than only the immediate users @@ -14,53 +13,6 @@ using namespace cl::sycl; -class TestScheduler : public detail::Scheduler { -public: - void cleanupCommandsForRecord(detail::MemObjRecord *Rec) { - MGraphBuilder.cleanupCommandsForRecord(Rec); - } - - void removeRecordForMemObj(detail::SYCLMemObjI *MemObj) { - MGraphBuilder.removeRecordForMemObj(MemObj); - } - - detail::MemObjRecord * - getOrInsertMemObjRecord(const detail::QueueImplPtr &Queue, - detail::Requirement *Req) { - return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req); - } -}; - -class FakeCommandWithCallback : public FakeCommand { -public: - FakeCommandWithCallback(detail::QueueImplPtr Queue, detail::Requirement Req, - std::function Callback) - : FakeCommand(Queue, Req), MCallback(std::move(Callback)) {} - - ~FakeCommandWithCallback() override { MCallback(); } - -protected: - std::function MCallback; -}; - -template -detail::Requirement getFakeRequirement(const MemObjT &MemObj) { - return {{0, 0, 0}, - {0, 0, 0}, - {0, 0, 0}, - access::mode::read_write, - detail::getSyclObjImpl(MemObj).get(), - 0, - 0, - 0}; -} - -void addEdge(detail::Command *User, detail::Command *Dep, - detail::AllocaCommandBase *Alloca) { - User->addDep(detail::DepDesc{Dep, User->getRequirement(), Alloca}); - Dep->addUser(User); -} - int main() { TestScheduler TS; queue Queue; diff --git a/sycl/test/scheduler/SchedulerTestUtils.hpp b/sycl/test/scheduler/SchedulerTestUtils.hpp new file mode 100644 index 0000000000000..856b113985128 --- /dev/null +++ b/sycl/test/scheduler/SchedulerTestUtils.hpp @@ -0,0 +1,81 @@ +#pragma once +#include + +#include +// This header contains a few common classes/methods used in +// execution graph testing. + +class FakeCommand : public cl::sycl::detail::Command { +public: + FakeCommand(cl::sycl::detail::QueueImplPtr Queue, + cl::sycl::detail::Requirement Req) + : Command{cl::sycl::detail::Command::EMPTY_TASK, Queue}, + MRequirement{std::move(Req)} {} + + void printDot(std::ostream &Stream) const override {} + + const cl::sycl::detail::Requirement *getRequirement() const final { + return &MRequirement; + }; + + cl_int enqueueImp() override { return MRetVal; } + + cl_int MRetVal = CL_SUCCESS; + +protected: + cl::sycl::detail::Requirement MRequirement; +}; + +class FakeCommandWithCallback : public FakeCommand { +public: + FakeCommandWithCallback(cl::sycl::detail::QueueImplPtr Queue, + cl::sycl::detail::Requirement Req, + std::function Callback) + : FakeCommand(Queue, Req), MCallback(std::move(Callback)) {} + + ~FakeCommandWithCallback() override { MCallback(); } + +protected: + std::function MCallback; +}; + +class TestScheduler : public cl::sycl::detail::Scheduler { +public: + cl::sycl::detail::MemObjRecord * + getOrInsertMemObjRecord(const cl::sycl::detail::QueueImplPtr &Queue, + cl::sycl::detail::Requirement *Req) { + return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req); + } + + void removeRecordForMemObj(cl::sycl::detail::SYCLMemObjI *MemObj) { + MGraphBuilder.removeRecordForMemObj(MemObj); + } + + void cleanupCommandsForRecord(cl::sycl::detail::MemObjRecord *Rec) { + MGraphBuilder.cleanupCommandsForRecord(Rec); + } + + void AddNodeToLeaves( + cl::sycl::detail::MemObjRecord *Rec, cl::sycl::detail::Command *Cmd, + cl::sycl::access::mode Mode = cl::sycl::access::mode::read_write) { + return MGraphBuilder.AddNodeToLeaves(Rec, Cmd, Mode); + } +}; + +void addEdge(cl::sycl::detail::Command *User, cl::sycl::detail::Command *Dep, + cl::sycl::detail::AllocaCommandBase *Alloca) { + User->addDep(cl::sycl::detail::DepDesc{Dep, User->getRequirement(), Alloca}); + Dep->addUser(User); +} + +template +cl::sycl::detail::Requirement getFakeRequirement(const MemObjT &MemObj) { + return {{0, 0, 0}, + {0, 0, 0}, + {0, 0, 0}, + cl::sycl::access::mode::read_write, + cl::sycl::detail::getSyclObjImpl(MemObj).get(), + 0, + 0, + 0}; +}