From 3f710b784c06cf1762f54f4e5237e34b47fad736 Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Thu, 2 Apr 2020 15:38:41 +0300 Subject: [PATCH 1/7] [SYCL][Doc] Add Graph Scheduler design documentation Co-authored-by: Vlad Romanov Signed-off-by: Alexander Batashev --- sycl/include/CL/sycl/detail/cg.hpp | 54 +- sycl/source/detail/scheduler/commands.hpp | 156 ++--- .../source/detail/scheduler/graph_builder.cpp | 39 +- sycl/source/detail/scheduler/scheduler.hpp | 560 ++++++++++++++++-- 4 files changed, 624 insertions(+), 185 deletions(-) diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index a87daa3e8e154..adb2a8134d7c2 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -35,11 +35,12 @@ class interop_handler { template friend class accessor; + public: - using ReqToMem = std::pair; + using ReqToMem = std::pair; - interop_handler(std::vector MemObjs, cl_command_queue PiQueue) : - MQueue(PiQueue), MMemObjs(MemObjs) {} + interop_handler(std::vector MemObjs, cl_command_queue PiQueue) + : MQueue(PiQueue), MMemObjs(MemObjs) {} cl_command_queue get_queue() const noexcept { return MQueue; }; @@ -52,10 +53,11 @@ class interop_handler { detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc; return getMemImpl(detail::getSyclObjImpl(*AccBase).get()); } + private: cl_command_queue MQueue; std::vector MMemObjs; - cl_mem getMemImpl(detail::Requirement* Req) const; + cl_mem getMemImpl(detail::Requirement *Req) const; }; namespace detail { @@ -274,8 +276,7 @@ class HostKernel : public HostKernelBase { template typename std::enable_if>::value>::type runOnHost(const NDRDescT &NDRDesc) { - sycl::range GroupSize( - InitializedVal::template get<0>()); + sycl::range GroupSize(InitializedVal::template get<0>()); for (int I = 0; I < Dims; ++I) { if (NDRDesc.LocalSize[I] == 0 || NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0) @@ -284,8 +285,7 @@ class HostKernel : public HostKernelBase { GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } - sycl::range LocalSize( - InitializedVal::template get<0>()); + sycl::range LocalSize(InitializedVal::template get<0>()); sycl::range GlobalSize( InitializedVal::template get<0>()); sycl::id GlobalOffset; @@ -326,10 +326,9 @@ class HostKernel : public HostKernelBase { NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; } - sycl::range LocalSize( - InitializedVal::template get<0>()); + sycl::range LocalSize(InitializedVal::template get<0>()); sycl::range GlobalSize( - InitializedVal::template get<0>()); + InitializedVal::template get<0>()); for (int I = 0; I < Dims; ++I) { LocalSize[I] = NDRDesc.LocalSize[I]; GlobalSize[I] = NDRDesc.GlobalSize[I]; @@ -345,10 +344,10 @@ class HostKernel : public HostKernelBase { }; class stream_impl; -// The base class for all types of command groups. +/// Base class for all types of command groups. class CG { public: - // Type of the command group. + /// Type of the command group. enum CGTYPE { NONE, KERNEL, @@ -392,20 +391,20 @@ class CG { private: CGTYPE MType; - // The following storages needed to ensure that arguments won't die while + // The following storages are needed to ensure that arguments won't die while // we are using them. - // Storage for standard layout arguments. + /// Storage for standard layout arguments. vector_class> MArgsStorage; - // Storage for accessors. + /// Storage for accessors. vector_class MAccStorage; - // Storage for shared_ptrs. + /// Storage for shared_ptrs. vector_class> MSharedPtrStorage; public: - // List of requirements that specify which memory is needed for the command - // group to be executed. + /// List of requirements that specify which memory is needed for the command + /// group to be executed. vector_class MRequirements; - // List of events that order the execution of this CG + /// List of events that order the execution of this CG vector_class MEvents; // Member variables to capture the user code-location // information from Q.submit(), Q.parallel_for() etc @@ -415,9 +414,10 @@ class CG { int32_t MLine, MColumn; }; -// The class which represents "execute kernel" command group. +/// "Execute kernel" command group class. class CGExecKernel : public CG { public: + /// Stores ND-range description. NDRDescT MNDRDesc; unique_ptr_class MHostKernel; shared_ptr_class MSyclKernel; @@ -455,7 +455,7 @@ class CGExecKernel : public CG { } }; -// The class which represents "copy" command group. +/// "Copy memory" command group class. class CGCopy : public CG { void *MSrc; void *MDst; @@ -476,7 +476,7 @@ class CGCopy : public CG { void *getDst() { return MDst; } }; -// The class which represents "fill" command group. +/// "Fill memory" command group class. class CGFill : public CG { public: vector_class MPattern; @@ -496,7 +496,7 @@ class CGFill : public CG { Requirement *getReqToFill() { return MPtr; } }; -// The class which represents "update host" command group. +/// "Update host" command group class. class CGUpdateHost : public CG { Requirement *MPtr; @@ -515,7 +515,7 @@ class CGUpdateHost : public CG { Requirement *getReqToUpdate() { return MPtr; } }; -// The class which represents "copy" command group for USM pointers. +/// "Copy USM" command group class. class CGCopyUSM : public CG { void *MSrc; void *MDst; @@ -539,7 +539,7 @@ class CGCopyUSM : public CG { size_t getLength() { return MLength; } }; -// The class which represents "fill" command group for USM pointers. +/// "Fill USM" command group class. class CGFillUSM : public CG { vector_class MPattern; void *MDst; @@ -562,7 +562,7 @@ class CGFillUSM : public CG { int getFill() { return MPattern[0]; } }; -// The class which represents "prefetch" command group for USM pointers. +/// "Prefetch USM" command group class. class CGPrefetchUSM : public CG { void *MDst; size_t MLength; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 004045db3e8e2..cf69d24bbbf4b 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -38,7 +38,7 @@ class ReleaseCommand; enum BlockingT { NON_BLOCKING = 0, BLOCKING }; -// The struct represents the result of command enqueueing +/// Result of command enqueueing. struct EnqueueResultT { enum ResultT { SyclEnqueueReady, @@ -49,15 +49,15 @@ struct EnqueueResultT { EnqueueResultT(ResultT Result = SyclEnqueueSuccess, Command *Cmd = nullptr, cl_int ErrCode = CL_SUCCESS) : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {} - // Indicates result of enqueueing + /// Indicates result of enqueueing. ResultT MResult; - // Pointer to the command failed to enqueue + /// Pointer to the command failed to enqueue. Command *MCmd; - // Error code which is set when enqueueing fails + /// Error code which is set when enqueueing fails. cl_int MErrCode; }; -// DepDesc represents dependency between two commands +/// Dependency between two commands. struct DepDesc { DepDesc(Command *DepCommand, const Requirement *Req, AllocaCommandBase *AllocaCmd) @@ -68,20 +68,22 @@ struct DepDesc { std::tie(Rhs.MDepRequirement, Rhs.MDepCommand); } - // The actual dependency command. + /// The actual dependency command. Command *MDepCommand = nullptr; - // Requirement for the dependency. + /// Requirement for the dependency. const Requirement *MDepRequirement = nullptr; - // Allocation command for the memory object we have requirement for. - // Used to simplify searching for memory handle. + /// Allocation command for the memory object we have requirement for. + /// Used to simplify searching for memory handle. AllocaCommandBase *MAllocaCmd = nullptr; }; -// The Command represents some action that needs to be performed on one or -// more memory objects. The command has vector of Depdesc objects that -// represent dependencies of the command. It has vector of pointer to commands -// that depend on the command. It has pointer to sycl::queue object. And has -// event that is associated with the command. +/// The Command represents some action that needs to be performed on one or +/// more memory objects. The command has vector of DepDesc objects that +/// represent dependencies of the command. It has vector of pointer to commands +/// that depend on the command. It has a pointer to \ref queue object. And has +/// event that is associated with the command. +/// +/// \ingroup sycl_graph class Command { public: enum CommandType { @@ -104,13 +106,15 @@ class Command { void addUser(Command *NewUser) { MUsers.insert(NewUser); } - // Return type of the command, e.g. Allocate, MemoryCopy. + /// \return type of the command, e.g. Allocate, MemoryCopy. CommandType getType() const { return MType; } - // The method checks if the command is enqueued, waits for it to be - // unblocked if "Blocking" argument is true, then calls enqueueImp. Returns - // true if the command is enqueued. Sets EnqueueResult to the specific - // status otherwise. + /// Checks if the command is enqueued, and calls enqueueImp. + /// + /// \param EnqueueResult is set to the specific status if enqueue failed. + /// \param Blocking if this argument is true, function will wait for command + /// to be unblocked before calling enqueueImp. + /// \return true if the command is enqueued. bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking); bool isFinished(); @@ -124,34 +128,33 @@ class Command { std::shared_ptr getEvent() const { return MEvent; } // Methods needed to support SYCL instrumentation - // - // Proxy method which calls emitInstrumentationData. + + /// Proxy method which calls emitInstrumentationData. void emitInstrumentationDataProxy(); - // Instrumentation method which emits telemetry data. + /// Instrumentation method which emits telemetry data. virtual void emitInstrumentationData() = 0; - // This function looks at all the dependencies for - // the release command and enables instrumentation - // to report these dependencies as edges + /// Looks at all the dependencies for the release command and enables + /// instrumentation to report these dependencies as edges. void resolveReleaseDependencies(std::set &list); - // Creates an edge event when the dependency is a command + /// Creates an edge event when the dependency is a command void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, const string_class &Prefix, bool IsCommand); - // Creates an edge event when the dependency is an event + /// Creates an edge event when the dependency is an event void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr); - // Creates a signal event with the enqueued kernel event handle + /// Creates a signal event with the enqueued kernel event handle void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr); /// Create a trace event of node_create type; this must be guarded by a /// check for xptiTraceEnabled() /// Post Condition: MTraceEvent will be set to the event created - /// @param MAddress The address to use to create the payload + /// \param MAddress The address to use to create the payload uint64_t makeTraceEventProlog(void *MAddress); - // If prolog has been run, run epilog; this must be guarded by a check for - // xptiTraceEnabled() + /// If prolog has been run, run epilog; this must be guarded by a check for + /// xptiTraceEnabled() void makeTraceEventEpilog(); - // Emits an event of Type + /// Emits an event of Type void emitInstrumentation(uint16_t Type, const char *Txt = nullptr); - // + // End Methods needed to support SYCL instrumentation virtual void printDot(std::ostream &Stream) const = 0; @@ -172,58 +175,58 @@ class Command { RT::PiEvent &Event); std::vector prepareEvents(ContextImplPtr Context); - // Private interface. Derived classes should implement this method. + /// Private interface. Derived classes should implement this method. virtual cl_int enqueueImp() = 0; - // The type of the command + /// The type of the command CommandType MType; - // Mutex used to protect enqueueing from race conditions + /// Mutex used to protect enqueueing from race conditions std::mutex MEnqueueMtx; public: - // Contains list of dependencies(edges) + /// Contains list of dependencies(edges) std::vector MDeps; - // Contains list of commands that depend on the command + /// Contains list of commands that depend on the command std::unordered_set MUsers; - // Indicates whether the command can be blocked from enqueueing + /// Indicates whether the command can be blocked from enqueueing bool MIsBlockable = false; - // Counts the number of memory objects this command is a leaf for + /// Counts the number of memory objects this command is a leaf for unsigned MLeafCounter = 0; const char *MBlockReason = "Unknown"; - // Describes the status of a command + /// Describes the status of a command std::atomic MEnqueueStatus; // All member variable defined here are needed for the SYCL instrumentation // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION // to ensure we have the same object layout when the macro in the library and // SYCL app are not the same. - // - // The event for node_create and task_begin + + /// The event for node_create and task_begin void *MTraceEvent = nullptr; - // The stream under which the traces are emitted; stream ids are - // positive integers and we set it to an invalid value + /// The stream under which the traces are emitted; stream ids are + /// positive integers and we set it to an invalid value int32_t MStreamID = -1; - // Reserved for storing the object address such as SPIRV or memory object - // address + /// Reserved for storing the object address such as SPIRV or memory object + /// address void *MAddress = nullptr; - // Buffer to build the address string + /// Buffer to build the address string string_class MAddressString; - // Buffer to build the command node type + /// Buffer to build the command node type string_class MCommandNodeType; - // Buffer to build the command end-user understandable name + /// Buffer to build the command end-user understandable name string_class MCommandName; - // Flag to indicate if makeTraceEventProlog() has been run + /// Flag to indicate if makeTraceEventProlog() has been run bool MTraceEventPrologComplete = false; - // Flag to indicate if this is the first time we are seeing this payload + /// Flag to indicate if this is the first time we are seeing this payload bool MFirstInstance = false; - // Instance ID tracked for the command + /// Instance ID tracked for the command uint64_t MInstanceID = 0; }; -// The command does nothing during enqueue. The task can be used to implement -// lock in the graph, or to merge several nodes into one. +/// The empty command does nothing during enqueue. The task can be used to +/// implement lock in the graph, or to merge several nodes into one. class EmptyCommand : public Command { public: EmptyCommand(QueueImplPtr Queue, Requirement Req); @@ -239,8 +242,8 @@ class EmptyCommand : public Command { Requirement MRequirement; }; -// The command enqueues release instance of memory allocated on Host or -// underlying framework. +/// The release command enqueues release instance of memory allocated on Host or +/// underlying framework. class ReleaseCommand : public Command { public: ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd); @@ -251,10 +254,11 @@ class ReleaseCommand : public Command { private: cl_int enqueueImp() final; - // Command which allocates memory release command should dealocate + /// Command which allocates memory release command should dealocate. AllocaCommandBase *MAllocaCmd = nullptr; }; +/// Base class for memory allocation commands. class AllocaCommandBase : public Command { public: AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, @@ -272,17 +276,17 @@ class AllocaCommandBase : public Command { void *MMemAllocation = nullptr; - // Alloca command linked with current command. - // Device and host alloca commands can be linked, so they may share the same - // memory. Only one allocation from a pair can be accessed at a time. Alloca - // commands associated with such allocation is "active". In order to switch - // "active" status between alloca commands map/unmap operations are used. + /// Alloca command linked with current command. + /// Device and host alloca commands can be linked, so they may share the same + /// memory. Only one allocation from a pair can be accessed at a time. Alloca + /// commands associated with such allocation is "active". In order to switch + /// "active" status between alloca commands map/unmap operations are used. AllocaCommandBase *MLinkedAllocaCmd = nullptr; - // Indicates that current alloca is active one. + /// Indicates that current alloca is active one. bool MIsActive = true; - // Indicates that the command owns memory allocation in case of connected - // alloca command + /// Indicates that the command owns memory allocation in case of connected + /// alloca command bool MIsLeaderAlloca = true; protected: @@ -290,8 +294,8 @@ class AllocaCommandBase : public Command { ReleaseCommand MReleaseCmd; }; -// The command enqueues allocation of instance of memory object on Host or -// underlying framework. +/// The alloca command enqueues allocation of instance of memory object on Host +/// or underlying framework. class AllocaCommand : public AllocaCommandBase { public: AllocaCommand(QueueImplPtr Queue, Requirement Req, @@ -304,11 +308,12 @@ class AllocaCommand : public AllocaCommandBase { private: cl_int enqueueImp() final; - // The flag indicates that alloca should try to reuse pointer provided by - // the user during memory object construction + /// The flag indicates that alloca should try to reuse pointer provided by + /// the user during memory object construction bool MInitFromUserData = false; }; +/// The AllocaSubBuf command enqueues creation of sub-buffer of memory object. class AllocaSubBufCommand : public AllocaCommandBase { public: AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, @@ -324,6 +329,7 @@ class AllocaSubBufCommand : public AllocaCommandBase { AllocaCommandBase *MParentAlloca = nullptr; }; +/// The map command enqueues mapping of host memory onto device memory. class MapMemObject : public Command { public: MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, @@ -342,6 +348,7 @@ class MapMemObject : public Command { access::mode MMapMode; }; +/// The unmap command removes mapping of host memory onto device memory. class UnMapMemObject : public Command { public: UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, @@ -359,7 +366,8 @@ class UnMapMemObject : public Command { void **MSrcPtr = nullptr; }; -// The command enqueues memory copy between two instances of memory object. +/// The mem copy command enqueues memory copy between two instances of memory +/// object. class MemCpyCommand : public Command { public: MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, @@ -380,7 +388,8 @@ class MemCpyCommand : public Command { AllocaCommandBase *MDstAllocaCmd = nullptr; }; -// The command enqueues memory copy between two instances of memory object. +/// The mem copy host command enqueues memory copy between two instances of +/// memory object. class MemCpyCommandHost : public Command { public: MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd, @@ -401,7 +410,8 @@ class MemCpyCommandHost : public Command { void **MDstPtr = nullptr; }; -// The command enqueues execution of kernel or explicit memory operation. +/// The exec CG command enqueues execution of kernel or explicit memory +/// operation. class ExecCGCommand : public Command { public: ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2105c22cf6af7..02cb5bd4c4969 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -27,9 +27,10 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { -// The function checks whether two requirements overlaps or not. This -// information can be used to prove that executing two kernels that -// work on different parts of the memory object in parallel is legal. +/// Checks whether two requirements overlaps or not. +/// +/// This information can be used to prove that executing two kernels that +/// work on different parts of the memory object in parallel is legal. static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >= RHS->MOffsetInBytes) || @@ -43,12 +44,12 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { return LHS == RHS || (LHS->is_host() && RHS->is_host()); } -// The function checks if current requirement is requirement for sub buffer +/// Checks if current requirement is requirement for sub buffer static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -// Checks if the required access mode is allowed under the current one +/// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { case access::mode::read: @@ -104,7 +105,6 @@ static void printDotRecursive(std::fstream &Stream, void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) { static size_t Counter = 0; - std::string ModeNameStr(ModeName); std::string FileName = "graph_" + std::to_string(Counter) + ModeNameStr + ".dot"; @@ -123,13 +123,10 @@ void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) { Stream << "}" << std::endl; } -// Returns record for the memory objects passed, nullptr if doesn't exist. MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) { return MemObject->MRecord.get(); } -// Returns record for the memory object requirement refers to, if doesn't -// exist, creates new one. MemObjRecord * Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req) { @@ -147,7 +144,6 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue, return MemObject->MRecord.get(); } -// Helper function which removes all values in Cmds from Leaves void Scheduler::GraphBuilder::updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode) { @@ -398,7 +394,7 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) { // The function implements SYCL host accessor logic: host accessor // should provide access to the buffer in user space. Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, - const bool destructor) { + const bool destructor) { const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue(); @@ -450,15 +446,14 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( return insertMemoryMove(Record, Req, HostQueue); } -// The functions finds dependencies for the requirement. It starts searching -// from list of "leaf" commands for the record and check if the examining -// command can be executed in parallel with new one with regard to the memory -// object. If can, then continue searching through dependencies of that -// command. There are several rules used: -// -// 1. New and examined commands only read -> can bypass -// 2. New and examined commands has non-overlapping requirements -> can bypass -// 3. New and examined commands has different contexts -> cannot bypass +/// Start searching from list of "leaf" commands for the record and check if the +/// examining command can be executed in parallel with new one with regard to +/// the memory object. If can, then continue searching through dependencies of +/// that command. There are several rules used: +/// +/// 1. New and examined commands only read -> can bypass +/// 2. New and examined commands has non-overlapping requirements -> can bypass +/// 3. New and examined commands has different contexts -> cannot bypass std::set Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) { @@ -606,8 +601,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // To ensure that the leader allocation is removed first AllocaCmd->getReleaseCmd()->addDep( - DepDesc(LinkedAllocaCmd->getReleaseCmd(), AllocaCmd->getRequirement(), - LinkedAllocaCmd)); + DepDesc(LinkedAllocaCmd->getReleaseCmd(), + AllocaCmd->getRequirement(), LinkedAllocaCmd)); // Device allocation takes ownership of the host ptr during // construction, host allocation doesn't. So, device allocation should diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 19c79ea5e4e18..1f8681a0b4df9 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -19,6 +19,152 @@ #include #include +/// \defgroup sycl_graph DPC++ Execution Graph +/// +/// DPC++, unlike OpenCL, provides a programming model in which user doesn't +/// need to manage dependencies between kernels and memory explicitly. The DPC++ +/// Runtime must ensure correct execution with respect to order commands are +/// submitted. +/// +/// This document describes the part of the DPC++ Runtime that is responsible +/// for building and processing dependency graph. +/// +/// ## A couple of words about DPC++ and SYCL execution and memory model +/// +/// The SYCL framework defines command group (\ref CG) as an entity that +/// represents minimal execution block. The command group is submitted to SYCL +/// queue and consists of a kernel and its requirements. The SYCL queue defines +/// device and context using which the kernel should be executed. +/// +/// There are also command groups that consist of memory requirements and +/// explicit memory operation, such as copy, fill, update_host. In this case +/// it's up to an implementation how to implement these operations. +/// +/// The relative order of command groups submission defines order in which +/// kernels must be executed if their memory requirements intersect. For +/// example, if a command group A writes to a buffer X, command group B reads +/// from X, then the scheduled execution order of A and B will be the same as +/// their dynamic submission order (matches program order if submitted from the +/// same host thread). +/// +/// Memory requirements are requests to SYCL memory objects, such as buffer and +/// image. SYCL memory objects are not bound to any specific context or device, +/// it's SYCL responsibility to allocate and/or copy memory to the target +/// context to achieve correct execution. +/// +/// Refer to SYCL Specification 1.2.2 sections 3.4 and 3.5 to find more +/// information about SYCL execution and memory model. +/// +/// ### Example of DPC++ application +/// +/// \code{.cpp} +/// { +/// // Creating DPC++ CPU and GPU queues +/// cl::sycl::queue CPU_Queue = ...; +/// cl::sycl::queue GPU_Queue = ...; +/// +/// // Creating 3 DPC++ buffers +/// auto BufferA = ...; // Buffer is initialized with host memory. +/// auto BufferB = ...; +/// auto BufferC = ...; +/// +/// // "Copy command group" section +/// // Request processing copy "explicit" operation on CPU +/// // The copy operation reads from BufferA and writes to BufferB +/// +/// CPU_Queue.submit([&](handler &CGH) { +/// auto A = BufferA.get_access(CGH); +/// auto B = BufferB.get_access(CGH); +/// CGH.copy(A, B); +/// }); +/// +/// // "Multi command group" section +/// // Request processing multi kernel on GPU +/// // The kernel reads from BufferB, multiplies by 4 and writes result to +/// // BufferC +/// +/// GPU_Queue.submit([&](handler &CGH) { +/// auto B = BufferB.get_access(CGH); +/// auto C = BufferC.get_access(CGH); +/// CGH.parallel_for(range<1>{N}, [=](id<1> Index) { +/// C[Index] = B[Index] * 4; +/// }); +/// }); +/// +/// // "Host accessor creation" section +/// // Request the latest data of BufferC for the moment +/// // This is synchronization point what means SYCL RT blocks on creation of +/// // the accessor until requested data is available. +/// auto C = BufferC.get_access(); +/// } +/// \endcode +/// +/// In the example above DPC++ RT does: +/// +/// 1. **Copy command group**. +/// DPC++ RT allocates memory for BufferA and BufferB on CPU then execute +/// "copy" explicit memory operation on CPU. +/// +/// 2. **Multi command group** +/// DPC++ RT allocates memory for BufferC and BufferB on GPU and copy +/// content of BufferB from CPU to GPU, then execute "multi" kernel on GPU. +/// +/// 3. **Host accessor creation** +/// DPC++ RT allocates(it's possible to reuse already allocated memory) +/// memory available for user for BufferC then copy content of BufferC from +/// GPU to this memory. +/// +/// So, the example above will be converted to the following OpenCL pseudo code +/// (for both eager and lazy execution): +/// \code{.cpp} +/// // Initialization(not related to the Scheduler) +/// Platform = clGetPlatforms(...); +/// DeviceCPU = clGetDevices(CL_DEVICE_TYPE_CPU, ...); +/// DeviceGPU = clGetDevices(CL_DEVICE_TYPE_GPU, ...); +/// ContextCPU = clCreateContext(DeviceCPU, ...) +/// ContextGPU = clCreateContext(DeviceGPU, ...) +/// QueueCPU = clCreateCommandQueue(ContextCPU, DeviceCPU, ...); +/// QueueGPU = clCreateCommandQueue(ContextGPU, DeviceGPU, ...); +/// +/// // Copy command group: +/// BufferACPU = clCreateBuffer(ContextCPU, CL_MEM_USE_HOST_PTR, ...); +/// BufferBCPU = clCreateBuffer(ContextCPU, ...); +/// CopyEvent = clEnqueueCopyBuffer(QueueCPU, BufferACPU, BufferBCPU, ...) +/// +/// // Multi command group: +/// ReadBufferEvent = +/// clEnqueueReadBuffer(QueueCPU, BufferBCPU, HostPtr, CopyEvent, ...); +/// BufferBGPU = clCreateBuffer(ContextGPU, ...); +/// +/// UserEvent = clCreateUserEvent(ContextCPU); +/// clSetEventCallback(ReadBufferEvent, event_completion_callback, +/// /*data=*/UserEvent); +/// +/// WriteBufferEvent = clEnqueueWriteBuffer(QueueGPU, BufferBGPU, HostPtr, +/// UserEvent, ...); BufferCGPU = clCreateBuffer(ContextGPU, ...); ProgramGPU = +/// clCreateProgramWithIL(ContextGPU, ...); clBuildProgram(ProgramGPU); +/// MultiKernel = clCreateKernel("multi"); +/// clSetKernelArg(MultiKernel, BufferBGPU, ...); +/// clSetKernelArg(MultiKernel, BufferCGPU, ...); +/// MultiEvent = +/// clEnqueueNDRangeKernel(QueueGPU, MultiKernel, WriteBufferEvent, ...); +/// +/// // Host accessor creation: +/// clEnqueueMapBuffer(QueueGPU, BufferCGPU, BLOCKING_MAP, MultiEvent, ...); +/// +/// // Releasing mem objects during SYCL buffers destruction. +/// clReleaseBuffer(BufferACPU); +/// clReleaseBuffer(BufferBCPU); +/// clReleaseBuffer(BufferBGPU); +/// clReleaseBuffer(BufferCGPU); +/// +/// // Release(not related to the Scheduler) +/// clReleaseKernel(MultiKernel); +/// clReleaseProgram(ProgramGPU); +/// clReleaseContext(ContextGPU); +/// clReleaseContext(ContextCPU); +/// \endcode + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { @@ -31,8 +177,12 @@ using QueueImplPtr = std::shared_ptr; using EventImplPtr = std::shared_ptr; using ContextImplPtr = std::shared_ptr; -// The MemObjRecord is created for each memory object used in command -// groups. There should be only one MemObjRecord for SYCL memory object. +/// Memory Object Record +/// +/// The MemObjRecord is used in command groups (todo better desc). +/// There must be a single MemObjRecord for each SYCL memory object. +/// +/// \ingroup sycl_graph struct MemObjRecord { MemObjRecord(ContextImplPtr CurContext, std::size_t LeafLimit) : MReadLeaves{LeafLimit}, MWriteLeaves{LeafLimit}, MCurContext{ @@ -59,46 +209,223 @@ struct MemObjRecord { bool MMemModified = false; }; +/// DPC++ graph scheduler class. +/// +/// \section sched_overview Scheduler Overview +/// +/// The Scheduler is a part of DPC++ RT which ensures correct execution of +/// command groups. To achieve this Scheduler manages acyclic dependency graph +/// (which can have independent sub-graphs) that consists of several types of +/// nodes that represent specific commands: + +/// 1. Allocate memory. +/// The command represents memory allocation operation. There can be +/// multiple allocations for single SYCL memory object. +/// 2. Release memory. +/// The command represents memory release operation. +/// 3. Execute command group. +/// The command represents \ref CG "Command Group" (kernel) execution +/// operation. +/// 4. Copy memory. +/// The command represents memory copy operation between two memory +/// allocations of a single memory object. +/// +/// As a main input Scheduler takes command group and returns an event +/// representing the command group, so it can be waited on later. When a new +/// command group comes Scheduler adds one or more nodes to the graph +/// depending on the command groups' requirements. For example, if a new +/// command group is submitted to the SYCL context which has the latest data +/// for all the requirements, Scheduler adds a new "Execute command group" +/// command making it dependent on all commands affecting new command group's +/// requirements. But if one of the requirements has no up-to-date instance in +/// the context which the command group is submitted to, Scheduler +/// additionally inserts copy memory command (together with allocate memory +/// command if needed). +/// +/// A simple graph looks like: +// +// +----------+ +----------+ +----------+ +// | | | | | | +// | Allocate |<----| Execute |<----| Execute | +// | | | | | | +// +----------+ +----------+ +----------+ +// +/// \dot +/// digraph G { +/// rankdir="LR"; +/// Execute1 [label = "Execute"]; +/// Execute2 [label = "Execute"]; +/// Allocate; +/// Allocate -> Execute2 [dir = back]; +/// Execute2 -> Execute1 [dir = back]; +/// } +/// \enddot +/// +/// Where nodes represent commands and edges represent dependencies between +/// them. There are three commands connected by arrows which mean that before +/// executing second command group the first one must be executed. Also before +/// executing the first command group memory allocation must be performed. +/// +/// At some point Scheduler enqueues commands to the underlying devices. To do +/// this Scheduler makes topological sort to get order in which commands are +/// need to be enqueued. For example, the following graph (D depends on B and C, +/// B and C depends on A) will be enqueued in the following order: +/// \code{.cpp} +/// EventA = Enqueue(A, /*Deps=*/{}); +/// EventB = Enqueue(B, /*Deps=*/{EventA}); +/// EventC = Enqueue(C, /*Deps=*/{EventA}); +/// EventD = Enqueue(D, /*Deps=*/{EventB, EventC}); +/// \endcode +/// +// +----------+ +// | | +// | D | +// | | +// +----------+ +// / \ +// / \ +// v v +// +----------+ +----------+ +// | | | | +// | B | | C | +// | | | | +// +----------+ +----------+ +// \ / +// \ / +// v v +// +----------+ +// | | +// | A | +// | | +// +----------+ +/// \dot +/// digraph G { +/// D -> B; +/// D -> C; +/// C -> A; +/// B -> A; +/// } +/// \enddot +/// +/// \section sched_impl Implementation details +/// +/// The Scheduler is split up into two parts: graph builder and graph +/// processor. +/// +/// To build dependencies Scheduler needs to memorize memory object and +/// commands that modify it. +/// +/// To detect that two command groups access the same memory object and create +/// a dependency between them the scheduler needs to store information about +/// the memory object. +/// +/// \subsection sched_thread_safety Thread safety +/// +/// To ensure thread safe execution of methods Scheduler provides access to the +/// graph should be guarded by read-write mutex(analog of shared mutex from +/// C++17). +/// +/// An read-write mutex allows concurrent access to read-only operations, while +/// write operations require exclusive access. +/// +/// All the methods of GraphBuilder lock the mutex in write mode because these +/// methods can modify the graph. +/// Methods of GraphProcessor lock the mutex in read mode as they are not +/// modifying the graph. +/// +/// \subsection shced_err_handling Error handling +/// +/// There are two sources of erros that needs to be handled in Scheduler: +/// 1. the error that happens during command enqueue process +/// 2. the error that happend during command execution. +/// +/// If error occurs during commands enqueue process Command::enqueue method +/// return faulty command. The Scheduler then reschedules the command and all +/// dependent commands (if any). +/// +/// An error with command processing can happen in underlying runtime, in this +/// case Scheduler is notified asynchronously(using callback mechanism) what +/// triggers rescheduling. +/// +/// \ingroup sycl_graph class Scheduler { public: - // Registers command group, adds it to the dependency graph and returns an - // event object that can be used for waiting later. It's called by SYCL's - // queue.submit. + /// Registers a command group, and adds it to the dependency graph. + /// + /// It's called by SYCL's queue.submit. + /// + /// \param CommandGroup is a unique_ptr to a command group to be added. + /// \return an event object to wait on for command group completetion. EventImplPtr addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue); + /// Registers a command group, that copies most recent memory to the memory + /// pointed by the requirement. + /// + /// \param Req is a requirement that points to the memory where data is + /// needed. + /// \return an event object to wait on for copy finish. EventImplPtr addCopyBack(Requirement *Req); - // Blocking call that waits for the event passed. For the eager execution - // mode this method invokes corresponding function of device API. In the - // lazy execution mode the method may enqueue the command associated with - // the event passed and its dependency before calling device API. + /// Waits for the event. + /// + /// This operation is blocking. For eager execution mode this method invokes + /// corresponding function of device API. In lazy execution mode the method + /// may enqueue the command, associated with the event, and its dependency + /// before calling device API. + /// + /// \param Event is a pointer to event to wait on. void waitForEvent(EventImplPtr Event); - // Removes buffer pointed by MemObj from the graph: ensures all commands - // accessing the memory objects are executed and triggers deallocation of - // all memory assigned to the memory object. It's called from the - // sycl::buffer and sycl::image destructors. + /// Removes buffer from the graph. + /// + /// The lifetime of memory object descriptor begins when first command group + /// that uses memory object comes and ends when "removeMemoryObject(...)" + /// method is called which means there will be no command group that uses the + /// memory object. When removeMemoryObject is called Scheduler will enqueue + /// and wait on all ReleseCommand's associated with the memory object, what + /// effectively guarantees that all commands accessing the memory object is + /// complete and then resources for the memory object is freed. Then all the + /// commands affecting the memory object are removed. + /// + /// On destruction Scheduler triggers destruction of all memory object + /// descriptors in order to wait on all commands not yet executed and all + /// memory it manages. + /// + /// This member function is used by \ref buffer and \ref image. + /// + /// \param MemObj is a memory object that points to buffer being removed. void removeMemoryObject(detail::SYCLMemObjI *MemObj); - // Removes finished non-leaf non-alloca commands from the subgraph (assuming - // that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph + /// (assuming that all its commands have been waited for). + /// \sa GraphBuilder::cleanupFinishedCommands + /// + /// \param FinishedEvent is a cleanup candidate event. void cleanupFinishedCommands(EventImplPtr FinishedEvent); - // Creates nodes in the graph, that update Req with the pointer to the host - // memory which contains the latest data of the memory object. New - // operations with the same memory object that have side effects are blocked - // until releaseHostAccessor is called. Returns an event which indicates - // when these nodes are completed and host accessor is ready for using. + /// Adds nodes to the graph, that update the requirement with the pointer + /// to the host memory. + /// + /// Assumes the host pointer contains the latest data. New operations with + /// the same memory object that have side effects are blocked until + /// releaseHostAccessor(Requirement *Req) is callled. + /// + /// \param Req is the requirement to be updated. + /// \return an event which indicates when these nodes are completed + /// and host accessor is ready for use. EventImplPtr addHostAccessor(Requirement *Req, const bool Destructor = false); - // Unblocks operations with the memory object. + /// Unblocks operations with the memory object. + /// + /// \param Req is a requirement that points to the memory object being + /// unblocked. void releaseHostAccessor(Requirement *Req); - // Returns an instance of the scheduler object. + /// \return an instance of the scheduler object. static Scheduler &getInstance(); - // Returns list of "immediate" dependencies for the Event given. + /// \return a vector of "immediate" dependencies for the Event given. std::vector getWaitList(EventImplPtr Event); QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; } @@ -107,71 +434,97 @@ class Scheduler { Scheduler(); static Scheduler instance; - // The graph builder provides interfaces that can change already existing - // graph (e.g. add/remove edges/nodes). + /// Graph builder class. + /// + /// The graph builder provides means to change an existing graph (e.g. add + /// or remove edges/nodes). + /// + /// \ingroup sycl_graph class GraphBuilder { public: GraphBuilder(); - // Registers command group, adds it to the dependency graph and returns an - // command that represents command group execution. It's called by SYCL's - // queue::submit. + /// Registers \ref CG "command group" and adds it to the dependency graph. + /// + /// \sa queue::submit, Scheduler::addCG + /// + /// \return a command that represents command group execution. Command *addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue); + /// Registers a \ref CG "command group" that updates host memory to the + /// latest state. + /// + /// \return a command that represents command group execution. Command *addCGUpdateHost(std::unique_ptr CommandGroup, QueueImplPtr HostQueue); + /// Registers a \ref CG "command group" to update memory to the latest + /// state. + /// + /// \param Req is a requirement, that describes memory object. Command *addCopyBack(Requirement *Req); + + /// Registers a \ref CG "command group" to create a host accessor. + /// + /// \param Req points to memory being accessed. Command *addHostAccessor(Requirement *Req, const bool destructor = false); - // [Provisional] Optimizes the whole graph. + /// [Provisional] Optimizes the whole graph. void optimize(); - // [Provisional] Optimizes subgraph that consists of command associated - // with Event passed and its dependencies. + /// [Provisional] Optimizes subgraph that consists of command associated + /// with Event passed and its dependencies. void optimize(EventImplPtr Event); - // Removes finished non-leaf non-alloca commands from the subgraph (assuming - // that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph + /// (assuming that all its commands have been waited for). void cleanupFinishedCommands(Command *FinishedCmd); - // Reschedules command passed using Queue provided. this can lead to - // rescheduling of all dependent commands. This can be used when user - // provides "secondary" queue to submit method which may be used when - // command fails to enqueue/execute in primary queue. + /// Reschedules command passed using Queue provided. t + /// + /// This can lead to rescheduling of all dependent commands. This can be + /// used when user provides "secondary" queue to submit method which may + /// be used when command fails to enqueue/execute in primary queue. void rescheduleCommand(Command *Cmd, QueueImplPtr Queue); + /// \return a pointer to the corresponding memory object record for the + /// SYCL memory object provided, or nullptr if it does not exist. MemObjRecord *getMemObjRecord(SYCLMemObjI *MemObject); - // Returns pointer to MemObjRecord for pointer to memory object. - // Return nullptr if there the record is not found. + + /// \return a pointer to MemObjRecord for pointer to memory object. If the + /// record is not found, nullptr is returned. MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue, Requirement *Req); - // Decrements leaf counters for all leaves of the record. + /// Decrements leaf counters for all leaves of the record. void decrementLeafCountersForRecord(MemObjRecord *Record); - // Removes commands that use given MemObjRecord from the graph. + /// Removes commands that use given MemObjRecord from the graph. void cleanupCommandsForRecord(MemObjRecord *Record); - // Removes MemObjRecord for memory object passed. + /// Removes MemObjRecord for memory object passed. void removeRecordForMemObj(SYCLMemObjI *MemObject); - // Add new command to leaves if needed. + /// Add new command to leaves if needed. void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode); - // Removes commands from leaves. + /// Removes commands from leaves. void updateLeaves(const std::set &Cmds, MemObjRecord *Record, access::mode AccessMode); std::vector MMemObjs; private: - // The method inserts required command to make so the latest state for the - // memory object Record refers to resides in the context which is bound to - // the Queue. Can insert copy/map/unmap operations depending on the source - // and destination. + /// Inserts required command to update memory object state in the context. + /// + /// Copy/map/unmap operations can be inserted depending on the source and + /// destination. + /// + /// \param Record is a memory object that needs to be updated. + /// \param Req is a Requirement describing destination. + /// \param Queue is a queue that is bound to target context. Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue); @@ -184,24 +537,30 @@ class Scheduler { insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue); + /// Finds dependencies for the requirement. std::set findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context); - // Finds a command dependency corresponding to the record + /// Finds a command dependency corresponding to the record. DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record); - // Searches for suitable alloca in memory record. + /// Searches for suitable alloca in memory record. AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context); - // Searches for suitable alloca in memory record. - // If none found, creates new one. + + /// Searches for suitable alloca in memory record. + /// + /// If none found, creates new one. AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record, Requirement *Req, QueueImplPtr Queue); void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req); - // Print contents of graph to text file in DOT format + /// Prints contents of graph to text file in DOT format + /// + /// \param ModeName is a stringified printing mode name to be used + /// in the result file name. void printGraphAsDot(const char *ModeName); enum PrintOptions { BeforeAddCG = 0, @@ -215,21 +574,96 @@ class Scheduler { std::array MPrintOptionsArray; }; - // The class that provides interfaces for enqueueing command and its - // dependencies to the underlying runtime. Methods of this class must not - // modify the graph. + /// Graph Processor provided interfaces for enqueueing commands and their + /// dependencies to the underlying runtime. + /// + /// Member functions of this class do not modify the graph. + /// + /// \section sched_enqueue Command enqueueing + /// \todo lazy mode is not implemented. + /// + /// The Scheduler can work in two modes of enqueueing commands: eager(default) + /// and lazy. In eager mode commands are enqueued whenever they come to the + /// Scheduler. In lazy mode they are not enqueued until content of the buffer + /// they are accessing is requested by user. + /// + /// Each command has enqueue method which takes vector of events that + /// represents dependencies and returns event which represents the command. + /// GraphProcessor makes topological sort to get order in which commands are + /// need to be enqueued. Then enqueue each command passing vector of events + /// that this command needs to wait on. If error happens during command + /// enqueue, the whole process is stopped, faulty command is propagated back + /// to the Scheduler. + /// + /// The command with dependencies that belong to different context from + /// command's one can't be enqueued directly(limitation of OpenCL runtime). + /// Instead for each dependency a proxy event in the target context is created + /// and linked using OpenCL callback mechanism with original one. For example, + /// the following SYCL code: + /// + /// \code{.cpp} + /// // The ContextA and ContextB are different OpenCL contexts + /// sycl::queue Q1(ContextA); + /// sycl::queue Q2(ContextB); + /// + /// Q1.submit(Task1); + /// + /// Q2.submit(Task2); + /// \endcode + /// + /// is translated to the following OCL API calls: + /// + /// \code{.cpp} + /// void event_completion_callback(void *data) { + /// // Change status of event to complete. + /// clSetEventStatus((cl_event *)data, CL_COMPLETE); // Scope of Context2 + /// } + /// + /// // Enqueue TASK1 + /// EventTask1 = clEnqueueNDRangeKernel(Q1, TASK1, ..); // Scope of Context1 + /// // Read memory to host + /// ReadMem = clEnqueueReadBuffer(A, .., /*Deps=*/EventTask1); // Scope of + /// // Context1 + /// + /// // Create user event with initial status "not completed". + /// UserEvent = clCreateUserEvent(Context2); // Scope of Context2 + /// // Ask OpenCL to call callback with UserEvent as data when "read memory + /// // to host" operation is completed + /// clSetEventCallback(ReadMem, event_completion_callback, + /// /*data=*/UserEvent); // Scope of Context1 + /// + /// // Enqueue write memory from host, block it on user event + /// // It will be unblocked when we change UserEvent status to completed in + /// // callback. + /// WriteMem = + /// clEnqueueWriteBuffer(A, .., /*Dep=*/UserEvent); // Scope of Context2 + /// // Enqueue TASK2 + /// EventTask2 = + /// clEnqueueNDRangeKernel(TASK, .., /*Dep=*/WriteMem); // Scope of + /// // Context2 + /// \endcode + /// + /// The alternative approach that has been considered is to have separate + /// dispatcher thread that would wait for all events from the Context other + /// then target Context to complete and then enqueue command with dependencies + /// from target Context only. Alternative approach makes code significantly + /// more complex and can hurt performance on CPU device vs chosen approach + /// with callbacks. + /// + /// \ingroup sycl_graph class GraphProcessor { public: - // Returns a list of events that represent immediate dependencies of the - // command associated with Event passed. + /// \return a list of events that represent immediate dependencies of the + /// command associated with Event passed. static std::vector getWaitList(EventImplPtr Event); - // Wait for the command, associated with Event passed, is completed. + /// Waits for the command, associated with Event passed, is completed. static void waitForEvent(EventImplPtr Event); - // Enqueue the command passed and all it's dependencies to the underlying - // device. Returns true is the command is successfully enqueued. Sets - // EnqueueResult to the specific status otherwise. + /// Enqueues the command and all its dependencies. + /// + /// \param EnqueueResult is set to specific status if enqueue failed. + /// \return true if the command is successfully enqueued. static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult, BlockingT Blocking = NON_BLOCKING); }; @@ -237,7 +671,7 @@ class Scheduler { void waitForRecordToFinish(MemObjRecord *Record); GraphBuilder MGraphBuilder; - // Use read-write mutex in future. + // TODO Use read-write mutex in future. std::mutex MGraphLock; QueueImplPtr DefaultHostQueue; From 5011e3dafdeae8ddf2cca21443d91ac557bd7862 Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Thu, 2 Apr 2020 21:36:07 +0300 Subject: [PATCH 2/7] Apply suggestions from code review Signed-off-by: Alexander Batashev Co-Authored-By: Artem Gindinson --- sycl/source/detail/scheduler/commands.hpp | 6 +++--- sycl/source/detail/scheduler/graph_builder.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index cf69d24bbbf4b..88f1d0cf21e9a 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -51,7 +51,7 @@ struct EnqueueResultT { : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {} /// Indicates result of enqueueing. ResultT MResult; - /// Pointer to the command failed to enqueue. + /// Pointer to the command which failed to enqueue. Command *MCmd; /// Error code which is set when enqueueing fails. cl_int MErrCode; @@ -78,9 +78,9 @@ struct DepDesc { }; /// The Command represents some action that needs to be performed on one or -/// more memory objects. The command has vector of DepDesc objects that +/// more memory objects. The Command has a vector of DepDesc objects that /// represent dependencies of the command. It has vector of pointer to commands -/// that depend on the command. It has a pointer to \ref queue object. And has +/// that depend on the command. It has a pointer to \ref queue object and an /// event that is associated with the command. /// /// \ingroup sycl_graph diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 02cb5bd4c4969..4785e16e32177 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -446,14 +446,14 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( return insertMemoryMove(Record, Req, HostQueue); } -/// Start searching from list of "leaf" commands for the record and check if the -/// examining command can be executed in parallel with new one with regard to -/// the memory object. If can, then continue searching through dependencies of +/// Start the search for the record from list of "leaf" commands and check if the +/// examining command can be executed in parallel with the new one with regard to +/// the memory object. If it can, then continue searching through dependencies of /// that command. There are several rules used: /// /// 1. New and examined commands only read -> can bypass /// 2. New and examined commands has non-overlapping requirements -> can bypass -/// 3. New and examined commands has different contexts -> cannot bypass +/// 3. New and examined commands have different contexts -> cannot bypass std::set Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req, const ContextImplPtr &Context) { From d0b265ed4b1b3621299d67a1ee83f0b3c6598669 Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Thu, 9 Apr 2020 11:11:39 +0300 Subject: [PATCH 3/7] Apply suggestions from code review Signed-off-by: Alexander Batashev Co-Authored-By: Romanov Vlad <17316488+romanovvlad@users.noreply.github.com> Co-Authored-By: Sergey Semenov <43845535+sergey-semenov@users.noreply.github.com> --- sycl/source/detail/scheduler/commands.hpp | 4 +- .../source/detail/scheduler/graph_builder.cpp | 6 +- sycl/source/detail/scheduler/scheduler.hpp | 77 +++++++++---------- 3 files changed, 43 insertions(+), 44 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 88f1d0cf21e9a..a9affcc1796f7 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -79,7 +79,7 @@ struct DepDesc { /// The Command represents some action that needs to be performed on one or /// more memory objects. The Command has a vector of DepDesc objects that -/// represent dependencies of the command. It has vector of pointer to commands +/// represent dependencies of the command. It has a vector of pointers to commands /// that depend on the command. It has a pointer to \ref queue object and an /// event that is associated with the command. /// @@ -242,7 +242,7 @@ class EmptyCommand : public Command { Requirement MRequirement; }; -/// The release command enqueues release instance of memory allocated on Host or +/// The release command enqueues release of a memory object instance allocated on Host or /// underlying framework. class ReleaseCommand : public Command { public: diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 4785e16e32177..a16d382fbb9af 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -27,7 +27,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { -/// Checks whether two requirements overlaps or not. +/// Checks whether two requirements overlap or not. /// /// This information can be used to prove that executing two kernels that /// work on different parts of the memory object in parallel is legal. @@ -44,7 +44,7 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { return LHS == RHS || (LHS->is_host() && RHS->is_host()); } -/// Checks if current requirement is requirement for sub buffer +/// Checks if current requirement is requirement for sub buffer. static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } @@ -447,7 +447,7 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( } /// Start the search for the record from list of "leaf" commands and check if the -/// examining command can be executed in parallel with the new one with regard to +/// examined command can be executed in parallel with the new one with regard to /// the memory object. If it can, then continue searching through dependencies of /// that command. There are several rules used: /// diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 1f8681a0b4df9..a94860e5bfaa9 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -21,9 +21,9 @@ /// \defgroup sycl_graph DPC++ Execution Graph /// -/// DPC++, unlike OpenCL, provides a programming model in which user doesn't +/// DPC++, unlike OpenCL, provides a programming model in which the user doesn't /// need to manage dependencies between kernels and memory explicitly. The DPC++ -/// Runtime must ensure correct execution with respect to order commands are +/// Runtime must ensure correct execution with respect to the order commands are /// submitted. /// /// This document describes the part of the DPC++ Runtime that is responsible @@ -34,13 +34,13 @@ /// The SYCL framework defines command group (\ref CG) as an entity that /// represents minimal execution block. The command group is submitted to SYCL /// queue and consists of a kernel and its requirements. The SYCL queue defines -/// device and context using which the kernel should be executed. +/// the device and context using which the kernel should be executed. /// /// There are also command groups that consist of memory requirements and -/// explicit memory operation, such as copy, fill, update_host. In this case +/// an explicit memory operation, such as copy, fill, update_host. In this case /// it's up to an implementation how to implement these operations. /// -/// The relative order of command groups submission defines order in which +/// The relative order of command groups submission defines the order in which /// kernels must be executed if their memory requirements intersect. For /// example, if a command group A writes to a buffer X, command group B reads /// from X, then the scheduled execution order of A and B will be the same as @@ -52,7 +52,7 @@ /// it's SYCL responsibility to allocate and/or copy memory to the target /// context to achieve correct execution. /// -/// Refer to SYCL Specification 1.2.2 sections 3.4 and 3.5 to find more +/// Refer to SYCL Specification 1.2.1 sections 3.4 and 3.5 to find more /// information about SYCL execution and memory model. /// /// ### Example of DPC++ application @@ -69,7 +69,7 @@ /// auto BufferC = ...; /// /// // "Copy command group" section -/// // Request processing copy "explicit" operation on CPU +/// // Request processing explicit copy operation on CPU /// // The copy operation reads from BufferA and writes to BufferB /// /// CPU_Queue.submit([&](handler &CGH) { @@ -93,17 +93,17 @@ /// /// // "Host accessor creation" section /// // Request the latest data of BufferC for the moment -/// // This is synchronization point what means SYCL RT blocks on creation of +/// // This is a synchronization point, which means that the DPC++ RT blocks on creation of /// // the accessor until requested data is available. /// auto C = BufferC.get_access(); /// } /// \endcode /// -/// In the example above DPC++ RT does: +/// In the example above the DPC++ RT does the following: /// /// 1. **Copy command group**. -/// DPC++ RT allocates memory for BufferA and BufferB on CPU then execute -/// "copy" explicit memory operation on CPU. +/// The DPC++ RT allocates memory for BufferA and BufferB on CPU then executes +/// an explicit copy operation on CPU. /// /// 2. **Multi command group** /// DPC++ RT allocates memory for BufferC and BufferB on GPU and copy @@ -115,7 +115,6 @@ /// GPU to this memory. /// /// So, the example above will be converted to the following OpenCL pseudo code -/// (for both eager and lazy execution): /// \code{.cpp} /// // Initialization(not related to the Scheduler) /// Platform = clGetPlatforms(...); @@ -220,7 +219,7 @@ struct MemObjRecord { /// 1. Allocate memory. /// The command represents memory allocation operation. There can be -/// multiple allocations for single SYCL memory object. +/// multiple allocations for a single SYCL memory object. /// 2. Release memory. /// The command represents memory release operation. /// 3. Execute command group. @@ -232,7 +231,7 @@ struct MemObjRecord { /// /// As a main input Scheduler takes command group and returns an event /// representing the command group, so it can be waited on later. When a new -/// command group comes Scheduler adds one or more nodes to the graph +/// command group comes, Scheduler adds one or more nodes to the graph /// depending on the command groups' requirements. For example, if a new /// command group is submitted to the SYCL context which has the latest data /// for all the requirements, Scheduler adds a new "Execute command group" @@ -316,16 +315,16 @@ struct MemObjRecord { /// commands that modify it. /// /// To detect that two command groups access the same memory object and create -/// a dependency between them the scheduler needs to store information about +/// a dependency between them, Scheduler needs to store information about /// the memory object. /// /// \subsection sched_thread_safety Thread safety /// -/// To ensure thread safe execution of methods Scheduler provides access to the -/// graph should be guarded by read-write mutex(analog of shared mutex from +/// To ensure thread safe execution of methods, Scheduler provides access to the +/// graph that's guarded by a read-write mutex (analog of shared mutex from /// C++17). /// -/// An read-write mutex allows concurrent access to read-only operations, while +/// A read-write mutex allows concurrent access to read-only operations, while /// write operations require exclusive access. /// /// All the methods of GraphBuilder lock the mutex in write mode because these @@ -335,8 +334,8 @@ struct MemObjRecord { /// /// \subsection shced_err_handling Error handling /// -/// There are two sources of erros that needs to be handled in Scheduler: -/// 1. the error that happens during command enqueue process +/// There are two sources of errors that needs to be handled in Scheduler: +/// 1. errors that happen during command enqueue process /// 2. the error that happend during command execution. /// /// If error occurs during commands enqueue process Command::enqueue method @@ -344,7 +343,7 @@ struct MemObjRecord { /// dependent commands (if any). /// /// An error with command processing can happen in underlying runtime, in this -/// case Scheduler is notified asynchronously(using callback mechanism) what +/// case Scheduler is notified asynchronously (using callback mechanism) what /// triggers rescheduling. /// /// \ingroup sycl_graph @@ -355,7 +354,7 @@ class Scheduler { /// It's called by SYCL's queue.submit. /// /// \param CommandGroup is a unique_ptr to a command group to be added. - /// \return an event object to wait on for command group completetion. + /// \return an event object to wait on for command group completion. EventImplPtr addCG(std::unique_ptr CommandGroup, QueueImplPtr Queue); @@ -379,13 +378,13 @@ class Scheduler { /// Removes buffer from the graph. /// - /// The lifetime of memory object descriptor begins when first command group - /// that uses memory object comes and ends when "removeMemoryObject(...)" + /// The lifetime of memory object descriptor begins when the first command group + /// that uses the memory object is submitted and ends when "removeMemoryObject(...)" /// method is called which means there will be no command group that uses the /// memory object. When removeMemoryObject is called Scheduler will enqueue - /// and wait on all ReleseCommand's associated with the memory object, what - /// effectively guarantees that all commands accessing the memory object is - /// complete and then resources for the memory object is freed. Then all the + /// and wait on all release commands associated with the memory object, which + /// effectively guarantees that all commands accessing the memory object are + /// complete and then the resources allocated for the memory object are freed. Then all the /// commands affecting the memory object are removed. /// /// On destruction Scheduler triggers destruction of all memory object @@ -394,7 +393,7 @@ class Scheduler { /// /// This member function is used by \ref buffer and \ref image. /// - /// \param MemObj is a memory object that points to buffer being removed. + /// \param MemObj is a memory object that points to the buffer being removed. void removeMemoryObject(detail::SYCLMemObjI *MemObj); /// Removes finished non-leaf non-alloca commands from the subgraph @@ -481,10 +480,10 @@ class Scheduler { /// (assuming that all its commands have been waited for). void cleanupFinishedCommands(Command *FinishedCmd); - /// Reschedules command passed using Queue provided. t + /// Reschedules the command passed using Queue provided. /// /// This can lead to rescheduling of all dependent commands. This can be - /// used when user provides "secondary" queue to submit method which may + /// used when the user provides a "secondary" queue to the submit method which may /// be used when command fails to enqueue/execute in primary queue. void rescheduleCommand(Command *Cmd, QueueImplPtr Queue); @@ -500,13 +499,13 @@ class Scheduler { /// Decrements leaf counters for all leaves of the record. void decrementLeafCountersForRecord(MemObjRecord *Record); - /// Removes commands that use given MemObjRecord from the graph. + /// Removes commands that use the given MemObjRecord from the graph. void cleanupCommandsForRecord(MemObjRecord *Record); - /// Removes MemObjRecord for memory object passed. + /// Removes the MemObjRecord for the memory object passed. void removeRecordForMemObj(SYCLMemObjI *MemObject); - /// Add new command to leaves if needed. + /// Adds new command to leaves if needed. void addNodeToLeaves(MemObjRecord *Record, Command *Cmd, access::mode AccessMode); @@ -517,7 +516,7 @@ class Scheduler { std::vector MMemObjs; private: - /// Inserts required command to update memory object state in the context. + /// Inserts the command required to update the memory object state in the context. /// /// Copy/map/unmap operations can be inserted depending on the source and /// destination. @@ -574,7 +573,7 @@ class Scheduler { std::array MPrintOptionsArray; }; - /// Graph Processor provided interfaces for enqueueing commands and their + /// Graph Processor provides interfaces for enqueueing commands and their /// dependencies to the underlying runtime. /// /// Member functions of this class do not modify the graph. @@ -582,17 +581,17 @@ class Scheduler { /// \section sched_enqueue Command enqueueing /// \todo lazy mode is not implemented. /// - /// The Scheduler can work in two modes of enqueueing commands: eager(default) + /// The Scheduler can work in two modes of enqueueing commands: eager (default) /// and lazy. In eager mode commands are enqueued whenever they come to the - /// Scheduler. In lazy mode they are not enqueued until content of the buffer + /// Scheduler. In lazy mode they are not enqueued until the content of the buffer /// they are accessing is requested by user. /// /// Each command has enqueue method which takes vector of events that /// represents dependencies and returns event which represents the command. /// GraphProcessor makes topological sort to get order in which commands are /// need to be enqueued. Then enqueue each command passing vector of events - /// that this command needs to wait on. If error happens during command - /// enqueue, the whole process is stopped, faulty command is propagated back + /// that this command needs to wait on. If an error happens during command + /// enqueue, the whole process is stopped, the faulty command is propagated back /// to the Scheduler. /// /// The command with dependencies that belong to different context from From f6eb8ff95e2c18109e4b5c953ef245e8b111273f Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Thu, 9 Apr 2020 11:15:00 +0300 Subject: [PATCH 4/7] Apply suggestions from code review Signed-off-by: Alexander Batashev Co-Authored-By: Sergey Semenov <43845535+sergey-semenov@users.noreply.github.com> --- sycl/source/detail/scheduler/scheduler.hpp | 28 +++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index a94860e5bfaa9..57feb83036c3a 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -229,8 +229,8 @@ struct MemObjRecord { /// The command represents memory copy operation between two memory /// allocations of a single memory object. /// -/// As a main input Scheduler takes command group and returns an event -/// representing the command group, so it can be waited on later. When a new +/// As the main input Scheduler takes a command group and returns an event +/// representing it, so it can be waited on later. When a new /// command group comes, Scheduler adds one or more nodes to the graph /// depending on the command groups' requirements. For example, if a new /// command group is submitted to the SYCL context which has the latest data @@ -266,8 +266,8 @@ struct MemObjRecord { /// executing the first command group memory allocation must be performed. /// /// At some point Scheduler enqueues commands to the underlying devices. To do -/// this Scheduler makes topological sort to get order in which commands are -/// need to be enqueued. For example, the following graph (D depends on B and C, +/// this, Scheduler performs topological sort to get the order in which commands should +/// be enqueued. For example, the following graph (D depends on B and C, /// B and C depends on A) will be enqueued in the following order: /// \code{.cpp} /// EventA = Enqueue(A, /*Deps=*/{}); @@ -311,8 +311,8 @@ struct MemObjRecord { /// The Scheduler is split up into two parts: graph builder and graph /// processor. /// -/// To build dependencies Scheduler needs to memorize memory object and -/// commands that modify it. +/// To build dependencies, Scheduler needs to memorize memory objects and +/// commands that modify them. /// /// To detect that two command groups access the same memory object and create /// a dependency between them, Scheduler needs to store information about @@ -338,8 +338,8 @@ struct MemObjRecord { /// 1. errors that happen during command enqueue process /// 2. the error that happend during command execution. /// -/// If error occurs during commands enqueue process Command::enqueue method -/// return faulty command. The Scheduler then reschedules the command and all +/// If an error occurs during command enqueue process, the Command::enqueue method +/// returns the faulty command. Scheduler then reschedules the command and all /// dependent commands (if any). /// /// An error with command processing can happen in underlying runtime, in this @@ -484,7 +484,7 @@ class Scheduler { /// /// This can lead to rescheduling of all dependent commands. This can be /// used when the user provides a "secondary" queue to the submit method which may - /// be used when command fails to enqueue/execute in primary queue. + /// be used when the command fails to enqueue/execute in the primary queue. void rescheduleCommand(Command *Cmd, QueueImplPtr Queue); /// \return a pointer to the corresponding memory object record for the @@ -588,15 +588,15 @@ class Scheduler { /// /// Each command has enqueue method which takes vector of events that /// represents dependencies and returns event which represents the command. - /// GraphProcessor makes topological sort to get order in which commands are - /// need to be enqueued. Then enqueue each command passing vector of events + /// GraphProcessor performs topological sort to get the order in which commands have to + /// be enqueued. Then it enqueues each command, passing a vector of events /// that this command needs to wait on. If an error happens during command /// enqueue, the whole process is stopped, the faulty command is propagated back /// to the Scheduler. /// - /// The command with dependencies that belong to different context from - /// command's one can't be enqueued directly(limitation of OpenCL runtime). - /// Instead for each dependency a proxy event in the target context is created + /// The command with dependencies that belong to a context different from its own + /// can't be enqueued directly (limitation of OpenCL runtime). + /// Instead, for each dependency, a proxy event is created in the target context /// and linked using OpenCL callback mechanism with original one. For example, /// the following SYCL code: /// From f9e6934278d976b8f51416b1a71142c24a39a97f Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Thu, 9 Apr 2020 13:34:07 +0300 Subject: [PATCH 5/7] Address more comments Signed-off-by: Alexander Batashev --- sycl/source/detail/scheduler/commands.hpp | 73 +++++++-------- sycl/source/detail/scheduler/scheduler.hpp | 101 ++++++++++----------- 2 files changed, 84 insertions(+), 90 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index a9affcc1796f7..c3a3edcdbaf33 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -49,7 +49,7 @@ struct EnqueueResultT { EnqueueResultT(ResultT Result = SyclEnqueueSuccess, Command *Cmd = nullptr, cl_int ErrCode = CL_SUCCESS) : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {} - /// Indicates result of enqueueing. + /// Indicates the result of enqueueing. ResultT MResult; /// Pointer to the command which failed to enqueue. Command *MCmd; @@ -77,11 +77,11 @@ struct DepDesc { AllocaCommandBase *MAllocaCmd = nullptr; }; -/// The Command represents some action that needs to be performed on one or -/// more memory objects. The Command has a vector of DepDesc objects that -/// represent dependencies of the command. It has a vector of pointers to commands -/// that depend on the command. It has a pointer to \ref queue object and an -/// event that is associated with the command. +/// The Command class represents some action that needs to be performed on one +/// or more memory objects. The Command has a vector of DepDesc objects that +/// represent dependencies of the command. It has a vector of pointers to +/// commands that depend on the command. It has a pointer to a \ref queue object +/// and an event that is associated with the command. /// /// \ingroup sycl_graph class Command { @@ -112,8 +112,8 @@ class Command { /// Checks if the command is enqueued, and calls enqueueImp. /// /// \param EnqueueResult is set to the specific status if enqueue failed. - /// \param Blocking if this argument is true, function will wait for command - /// to be unblocked before calling enqueueImp. + /// \param Blocking if this argument is true, function will wait for the + /// command to be unblocked before calling enqueueImp. /// \return true if the command is enqueued. bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking); @@ -136,23 +136,23 @@ class Command { /// Looks at all the dependencies for the release command and enables /// instrumentation to report these dependencies as edges. void resolveReleaseDependencies(std::set &list); - /// Creates an edge event when the dependency is a command + /// Creates an edge event when the dependency is a command. void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, const string_class &Prefix, bool IsCommand); - /// Creates an edge event when the dependency is an event + /// Creates an edge event when the dependency is an event. void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr); - /// Creates a signal event with the enqueued kernel event handle + /// Creates a signal event with the enqueued kernel event handle. void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr); /// Create a trace event of node_create type; this must be guarded by a - /// check for xptiTraceEnabled() - /// Post Condition: MTraceEvent will be set to the event created - /// \param MAddress The address to use to create the payload + /// check for xptiTraceEnabled(). + /// Post Condition: MTraceEvent will be set to the event created. + /// \param MAddress The address to use to create the payload. uint64_t makeTraceEventProlog(void *MAddress); /// If prolog has been run, run epilog; this must be guarded by a check for - /// xptiTraceEnabled() + /// xptiTraceEnabled(). void makeTraceEventEpilog(); - /// Emits an event of Type + /// Emits an event of Type. void emitInstrumentation(uint16_t Type, const char *Txt = nullptr); // End Methods needed to support SYCL instrumentation @@ -178,7 +178,7 @@ class Command { /// Private interface. Derived classes should implement this method. virtual cl_int enqueueImp() = 0; - /// The type of the command + /// The type of the command. CommandType MType; /// Mutex used to protect enqueueing from race conditions std::mutex MEnqueueMtx; @@ -186,16 +186,16 @@ class Command { public: /// Contains list of dependencies(edges) std::vector MDeps; - /// Contains list of commands that depend on the command + /// Contains list of commands that depend on the command. std::unordered_set MUsers; - /// Indicates whether the command can be blocked from enqueueing + /// Indicates whether the command can be blocked from enqueueing. bool MIsBlockable = false; - /// Counts the number of memory objects this command is a leaf for + /// Counts the number of memory objects this command is a leaf for. unsigned MLeafCounter = 0; const char *MBlockReason = "Unknown"; - /// Describes the status of a command + /// Describes the status of the command. std::atomic MEnqueueStatus; // All member variable defined here are needed for the SYCL instrumentation @@ -203,25 +203,26 @@ class Command { // to ensure we have the same object layout when the macro in the library and // SYCL app are not the same. - /// The event for node_create and task_begin + /// The event for node_create and task_begin. void *MTraceEvent = nullptr; - /// The stream under which the traces are emitted; stream ids are - /// positive integers and we set it to an invalid value + /// The stream under which the traces are emitted. + /// + /// Stream ids are positive integers and we set it to an invalid value. int32_t MStreamID = -1; /// Reserved for storing the object address such as SPIRV or memory object - /// address + /// address. void *MAddress = nullptr; - /// Buffer to build the address string + /// Buffer to build the address string. string_class MAddressString; - /// Buffer to build the command node type + /// Buffer to build the command node type. string_class MCommandNodeType; - /// Buffer to build the command end-user understandable name + /// Buffer to build the command end-user understandable name. string_class MCommandName; - /// Flag to indicate if makeTraceEventProlog() has been run + /// Flag to indicate if makeTraceEventProlog() has been run. bool MTraceEventPrologComplete = false; - /// Flag to indicate if this is the first time we are seeing this payload + /// Flag to indicate if this is the first time we are seeing this payload. bool MFirstInstance = false; - /// Instance ID tracked for the command + /// Instance ID tracked for the command. uint64_t MInstanceID = 0; }; @@ -242,8 +243,8 @@ class EmptyCommand : public Command { Requirement MRequirement; }; -/// The release command enqueues release of a memory object instance allocated on Host or -/// underlying framework. +/// The release command enqueues release of a memory object instance allocated +/// on Host or underlying framework. class ReleaseCommand : public Command { public: ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd); @@ -286,7 +287,7 @@ class AllocaCommandBase : public Command { bool MIsActive = true; /// Indicates that the command owns memory allocation in case of connected - /// alloca command + /// alloca command. bool MIsLeaderAlloca = true; protected: @@ -309,7 +310,7 @@ class AllocaCommand : public AllocaCommandBase { cl_int enqueueImp() final; /// The flag indicates that alloca should try to reuse pointer provided by - /// the user during memory object construction + /// the user during memory object construction. bool MInitFromUserData = false; }; @@ -329,7 +330,7 @@ class AllocaSubBufCommand : public AllocaCommandBase { AllocaCommandBase *MParentAlloca = nullptr; }; -/// The map command enqueues mapping of host memory onto device memory. +/// The map command enqueues mapping of device memory onto host memory. class MapMemObject : public Command { public: MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 57feb83036c3a..410939f9426f1 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -33,12 +33,13 @@ /// /// The SYCL framework defines command group (\ref CG) as an entity that /// represents minimal execution block. The command group is submitted to SYCL -/// queue and consists of a kernel and its requirements. The SYCL queue defines -/// the device and context using which the kernel should be executed. +/// queue and consists of a kernel or an explicit memory operation, and their +/// requirements. The SYCL queue defines the device and context using which the +/// kernel should be executed. /// -/// There are also command groups that consist of memory requirements and -/// an explicit memory operation, such as copy, fill, update_host. In this case -/// it's up to an implementation how to implement these operations. +/// The commands that contain explicit memory operations include copy, fill, +/// update_host and other operations. It's up to implementation how to define +/// these operations. /// /// The relative order of command groups submission defines the order in which /// kernels must be executed if their memory requirements intersect. For @@ -93,8 +94,8 @@ /// /// // "Host accessor creation" section /// // Request the latest data of BufferC for the moment -/// // This is a synchronization point, which means that the DPC++ RT blocks on creation of -/// // the accessor until requested data is available. +/// // This is a synchronization point, which means that the DPC++ RT blocks +/// // on creation of the accessor until requested data is available. /// auto C = BufferC.get_access(); /// } /// \endcode @@ -102,8 +103,8 @@ /// In the example above the DPC++ RT does the following: /// /// 1. **Copy command group**. -/// The DPC++ RT allocates memory for BufferA and BufferB on CPU then executes -/// an explicit copy operation on CPU. +/// The DPC++ RT allocates memory for BufferA and BufferB on CPU then +/// executes an explicit copy operation on CPU. /// /// 2. **Multi command group** /// DPC++ RT allocates memory for BufferC and BufferB on GPU and copy @@ -266,8 +267,8 @@ struct MemObjRecord { /// executing the first command group memory allocation must be performed. /// /// At some point Scheduler enqueues commands to the underlying devices. To do -/// this, Scheduler performs topological sort to get the order in which commands should -/// be enqueued. For example, the following graph (D depends on B and C, +/// this, Scheduler performs topological sort to get the order in which commands +/// should be enqueued. For example, the following graph (D depends on B and C, /// B and C depends on A) will be enqueued in the following order: /// \code{.cpp} /// EventA = Enqueue(A, /*Deps=*/{}); @@ -308,8 +309,7 @@ struct MemObjRecord { /// /// \section sched_impl Implementation details /// -/// The Scheduler is split up into two parts: graph builder and graph -/// processor. +/// The Scheduler is split up into two parts: graph builder and graph processor. /// /// To build dependencies, Scheduler needs to memorize memory objects and /// commands that modify them. @@ -338,9 +338,9 @@ struct MemObjRecord { /// 1. errors that happen during command enqueue process /// 2. the error that happend during command execution. /// -/// If an error occurs during command enqueue process, the Command::enqueue method -/// returns the faulty command. Scheduler then reschedules the command and all -/// dependent commands (if any). +/// If an error occurs during command enqueue process, the Command::enqueue +/// method returns the faulty command. Scheduler then reschedules the command +/// and all dependent commands (if any). /// /// An error with command processing can happen in underlying runtime, in this /// case Scheduler is notified asynchronously (using callback mechanism) what @@ -378,26 +378,23 @@ class Scheduler { /// Removes buffer from the graph. /// - /// The lifetime of memory object descriptor begins when the first command group - /// that uses the memory object is submitted and ends when "removeMemoryObject(...)" - /// method is called which means there will be no command group that uses the - /// memory object. When removeMemoryObject is called Scheduler will enqueue - /// and wait on all release commands associated with the memory object, which - /// effectively guarantees that all commands accessing the memory object are - /// complete and then the resources allocated for the memory object are freed. Then all the - /// commands affecting the memory object are removed. - /// - /// On destruction Scheduler triggers destruction of all memory object - /// descriptors in order to wait on all commands not yet executed and all - /// memory it manages. + /// The lifetime of memory object descriptor begins when the first command + /// group that uses the memory object is submitted and ends when + /// "removeMemoryObject(...)" method is called which means there will be no + /// command group that uses the memory object. When removeMemoryObject is + /// called Scheduler will enqueue and wait on all release commands associated + /// with the memory object, which effectively guarantees that all commands + /// accessing the memory object are complete and then the resources allocated + /// for the memory object are freed. Then all the commands affecting the + /// memory object are removed. /// /// This member function is used by \ref buffer and \ref image. /// /// \param MemObj is a memory object that points to the buffer being removed. void removeMemoryObject(detail::SYCLMemObjI *MemObj); - /// Removes finished non-leaf non-alloca commands from the subgraph - /// (assuming that all its commands have been waited for). + /// Removes finished non-leaf non-alloca commands from the subgraph (assuming + /// that all its commands have been waited for). /// \sa GraphBuilder::cleanupFinishedCommands /// /// \param FinishedEvent is a cleanup candidate event. @@ -458,13 +455,12 @@ class Scheduler { Command *addCGUpdateHost(std::unique_ptr CommandGroup, QueueImplPtr HostQueue); - /// Registers a \ref CG "command group" to update memory to the latest - /// state. + /// Enqueues a command to update memory to the latest state. /// /// \param Req is a requirement, that describes memory object. Command *addCopyBack(Requirement *Req); - /// Registers a \ref CG "command group" to create a host accessor. + /// Enqueues a command to create a host accessor. /// /// \param Req points to memory being accessed. Command *addHostAccessor(Requirement *Req, const bool destructor = false); @@ -483,8 +479,9 @@ class Scheduler { /// Reschedules the command passed using Queue provided. /// /// This can lead to rescheduling of all dependent commands. This can be - /// used when the user provides a "secondary" queue to the submit method which may - /// be used when the command fails to enqueue/execute in the primary queue. + /// used when the user provides a "secondary" queue to the submit method + /// which may be used when the command fails to enqueue/execute in the + /// primary queue. void rescheduleCommand(Command *Cmd, QueueImplPtr Queue); /// \return a pointer to the corresponding memory object record for the @@ -516,7 +513,8 @@ class Scheduler { std::vector MMemObjs; private: - /// Inserts the command required to update the memory object state in the context. + /// Inserts the command required to update the memory object state in the + /// context. /// /// Copy/map/unmap operations can be inserted depending on the source and /// destination. @@ -579,26 +577,21 @@ class Scheduler { /// Member functions of this class do not modify the graph. /// /// \section sched_enqueue Command enqueueing - /// \todo lazy mode is not implemented. - /// - /// The Scheduler can work in two modes of enqueueing commands: eager (default) - /// and lazy. In eager mode commands are enqueued whenever they come to the - /// Scheduler. In lazy mode they are not enqueued until the content of the buffer - /// they are accessing is requested by user. /// - /// Each command has enqueue method which takes vector of events that - /// represents dependencies and returns event which represents the command. - /// GraphProcessor performs topological sort to get the order in which commands have to - /// be enqueued. Then it enqueues each command, passing a vector of events - /// that this command needs to wait on. If an error happens during command - /// enqueue, the whole process is stopped, the faulty command is propagated back - /// to the Scheduler. + /// Commands are enqueued whenever they come to the Scheduler. Each command + /// has enqueue method which takes vector of events that represents + /// dependencies and returns event which represents the command. + /// GraphProcessor performs topological sort to get the order in which + /// commands have to be enqueued. Then it enqueues each command, passing a + /// vector of events that this command needs to wait on. If an error happens + /// during command enqueue, the whole process is stopped, the faulty command + /// is propagated back to the Scheduler. /// - /// The command with dependencies that belong to a context different from its own - /// can't be enqueued directly (limitation of OpenCL runtime). - /// Instead, for each dependency, a proxy event is created in the target context - /// and linked using OpenCL callback mechanism with original one. For example, - /// the following SYCL code: + /// The command with dependencies that belong to a context different from its + /// own can't be enqueued directly (limitation of OpenCL runtime). + /// Instead, for each dependency, a proxy event is created in the target + /// context and linked using OpenCL callback mechanism with original one. + /// For example, the following SYCL code: /// /// \code{.cpp} /// // The ContextA and ContextB are different OpenCL contexts From 93c08b00b3d3adc8bc7b9ece0926ddcb8414daef Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Thu, 9 Apr 2020 14:16:08 +0300 Subject: [PATCH 6/7] format Signed-off-by: Alexander Batashev --- sycl/source/detail/scheduler/graph_builder.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index a16d382fbb9af..b4a7a7273be91 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -446,10 +446,10 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost( return insertMemoryMove(Record, Req, HostQueue); } -/// Start the search for the record from list of "leaf" commands and check if the -/// examined command can be executed in parallel with the new one with regard to -/// the memory object. If it can, then continue searching through dependencies of -/// that command. There are several rules used: +/// Start the search for the record from list of "leaf" commands and check if +/// the examined command can be executed in parallel with the new one with +/// regard to the memory object. If it can, then continue searching through +/// dependencies of that command. There are several rules used: /// /// 1. New and examined commands only read -> can bypass /// 2. New and examined commands has non-overlapping requirements -> can bypass From 8bf29c706813763306ed336d25ba583759a02405 Mon Sep 17 00:00:00 2001 From: Alexander Batashev Date: Tue, 14 Apr 2020 09:17:16 +0300 Subject: [PATCH 7/7] Fix more comments Signed-off-by: Alexander Batashev --- sycl/source/detail/scheduler/scheduler.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 410939f9426f1..5affe5bde4977 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -21,7 +21,7 @@ /// \defgroup sycl_graph DPC++ Execution Graph /// -/// DPC++, unlike OpenCL, provides a programming model in which the user doesn't +/// SYCL, unlike OpenCL, provides a programming model in which the user doesn't /// need to manage dependencies between kernels and memory explicitly. The DPC++ /// Runtime must ensure correct execution with respect to the order commands are /// submitted. @@ -60,11 +60,11 @@ /// /// \code{.cpp} /// { -/// // Creating DPC++ CPU and GPU queues +/// // Creating SYCL CPU and GPU queues /// cl::sycl::queue CPU_Queue = ...; /// cl::sycl::queue GPU_Queue = ...; /// -/// // Creating 3 DPC++ buffers +/// // Creating 3 SYCL buffers /// auto BufferA = ...; // Buffer is initialized with host memory. /// auto BufferB = ...; /// auto BufferC = ...; @@ -369,9 +369,7 @@ class Scheduler { /// Waits for the event. /// /// This operation is blocking. For eager execution mode this method invokes - /// corresponding function of device API. In lazy execution mode the method - /// may enqueue the command, associated with the event, and its dependency - /// before calling device API. + /// corresponding function of device API. /// /// \param Event is a pointer to event to wait on. void waitForEvent(EventImplPtr Event);