From 3f710b784c06cf1762f54f4e5237e34b47fad736 Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexander.batashev@intel.com>
Date: Thu, 2 Apr 2020 15:38:41 +0300
Subject: [PATCH 1/7] [SYCL][Doc] Add Graph Scheduler design documentation

Co-authored-by: Vlad Romanov <vlad.romanov@intel.com>
Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>
---
 sycl/include/CL/sycl/detail/cg.hpp            |  54 +-
 sycl/source/detail/scheduler/commands.hpp     | 156 ++---
 .../source/detail/scheduler/graph_builder.cpp |  39 +-
 sycl/source/detail/scheduler/scheduler.hpp    | 560 ++++++++++++++++--
 4 files changed, 624 insertions(+), 185 deletions(-)
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index a87daa3e8e154..adb2a8134d7c2 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -35,11 +35,12 @@ class interop_handler {
   template <typename DataT, int Dims, access::mode AccMode,
             access::target AccTarget, access::placeholder isPlaceholder>
   friend class accessor;
+
 public:
-  using ReqToMem = std::pair<detail::Requirement*, pi_mem>;
+  using ReqToMem = std::pair<detail::Requirement *, pi_mem>;
 
-  interop_handler(std::vector<ReqToMem> MemObjs, cl_command_queue PiQueue) :
-    MQueue(PiQueue), MMemObjs(MemObjs) {}
+  interop_handler(std::vector<ReqToMem> MemObjs, cl_command_queue PiQueue)
+      : MQueue(PiQueue), MMemObjs(MemObjs) {}
 
   cl_command_queue get_queue() const noexcept { return MQueue; };
 
@@ -52,10 +53,11 @@ class interop_handler {
     detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
     return getMemImpl(detail::getSyclObjImpl(*AccBase).get());
   }
+
 private:
   cl_command_queue MQueue;
   std::vector<ReqToMem> MMemObjs;
-  cl_mem getMemImpl(detail::Requirement* Req) const;
+  cl_mem getMemImpl(detail::Requirement *Req) const;
 };
 
 namespace detail {
@@ -274,8 +276,7 @@ class HostKernel : public HostKernelBase {
   template <class ArgT = KernelArgType>
   typename std::enable_if<std::is_same<ArgT, nd_item<Dims>>::value>::type
   runOnHost(const NDRDescT &NDRDesc) {
-    sycl::range<Dims> GroupSize(
-        InitializedVal<Dims, range>::template get<0>());
+    sycl::range<Dims> GroupSize(InitializedVal<Dims, range>::template get<0>());
     for (int I = 0; I < Dims; ++I) {
       if (NDRDesc.LocalSize[I] == 0 ||
           NDRDesc.GlobalSize[I] % NDRDesc.LocalSize[I] != 0)
@@ -284,8 +285,7 @@ class HostKernel : public HostKernelBase {
       GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
     }
 
-    sycl::range<Dims> LocalSize(
-        InitializedVal<Dims, range>::template get<0>());
+    sycl::range<Dims> LocalSize(InitializedVal<Dims, range>::template get<0>());
     sycl::range<Dims> GlobalSize(
         InitializedVal<Dims, range>::template get<0>());
     sycl::id<Dims> GlobalOffset;
@@ -326,10 +326,9 @@ class HostKernel : public HostKernelBase {
       NGroups[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I];
     }
 
-    sycl::range<Dims> LocalSize(
-      InitializedVal<Dims, range>::template get<0>());
+    sycl::range<Dims> LocalSize(InitializedVal<Dims, range>::template get<0>());
     sycl::range<Dims> GlobalSize(
-      InitializedVal<Dims, range>::template get<0>());
+        InitializedVal<Dims, range>::template get<0>());
     for (int I = 0; I < Dims; ++I) {
       LocalSize[I] = NDRDesc.LocalSize[I];
       GlobalSize[I] = NDRDesc.GlobalSize[I];
@@ -345,10 +344,10 @@ class HostKernel : public HostKernelBase {
 };
 
 class stream_impl;
-// The base class for all types of command groups.
+/// Base class for all types of command groups.
 class CG {
 public:
-  // Type of the command group.
+  /// Type of the command group.
   enum CGTYPE {
     NONE,
     KERNEL,
@@ -392,20 +391,20 @@ class CG {
 
 private:
   CGTYPE MType;
-  // The following storages needed to ensure that arguments won't die while
+  // The following storages are needed to ensure that arguments won't die while
   // we are using them.
-  // Storage for standard layout arguments.
+  /// Storage for standard layout arguments.
   vector_class<vector_class<char>> MArgsStorage;
-  // Storage for accessors.
+  /// Storage for accessors.
   vector_class<detail::AccessorImplPtr> MAccStorage;
-  // Storage for shared_ptrs.
+  /// Storage for shared_ptrs.
   vector_class<shared_ptr_class<const void>> MSharedPtrStorage;
 
 public:
-  // List of requirements that specify which memory is needed for the command
-  // group to be executed.
+  /// List of requirements that specify which memory is needed for the command
+  /// group to be executed.
   vector_class<Requirement *> MRequirements;
-  // List of events that order the execution of this CG
+  /// List of events that order the execution of this CG
   vector_class<detail::EventImplPtr> MEvents;
   // Member variables to capture the user code-location
   // information from Q.submit(), Q.parallel_for() etc
@@ -415,9 +414,10 @@ class CG {
   int32_t MLine, MColumn;
 };
 
-// The class which represents "execute kernel" command group.
+/// "Execute kernel" command group class.
 class CGExecKernel : public CG {
 public:
+  /// Stores ND-range description.
   NDRDescT MNDRDesc;
   unique_ptr_class<HostKernelBase> MHostKernel;
   shared_ptr_class<detail::kernel_impl> MSyclKernel;
@@ -455,7 +455,7 @@ class CGExecKernel : public CG {
   }
 };
 
-// The class which represents "copy" command group.
+/// "Copy memory" command group class.
 class CGCopy : public CG {
   void *MSrc;
   void *MDst;
@@ -476,7 +476,7 @@ class CGCopy : public CG {
   void *getDst() { return MDst; }
 };
 
-// The class which represents "fill" command group.
+/// "Fill memory" command group class.
 class CGFill : public CG {
 public:
   vector_class<char> MPattern;
@@ -496,7 +496,7 @@ class CGFill : public CG {
   Requirement *getReqToFill() { return MPtr; }
 };
 
-// The class which represents "update host" command group.
+/// "Update host" command group class.
 class CGUpdateHost : public CG {
   Requirement *MPtr;
 
@@ -515,7 +515,7 @@ class CGUpdateHost : public CG {
   Requirement *getReqToUpdate() { return MPtr; }
 };
 
-// The class which represents "copy" command group for USM pointers.
+/// "Copy USM" command group class.
 class CGCopyUSM : public CG {
   void *MSrc;
   void *MDst;
@@ -539,7 +539,7 @@ class CGCopyUSM : public CG {
   size_t getLength() { return MLength; }
 };
 
-// The class which represents "fill" command group for USM pointers.
+/// "Fill USM" command group class.
 class CGFillUSM : public CG {
   vector_class<char> MPattern;
   void *MDst;
@@ -562,7 +562,7 @@ class CGFillUSM : public CG {
   int getFill() { return MPattern[0]; }
 };
 
-// The class which represents "prefetch" command group for USM pointers.
+/// "Prefetch USM" command group class.
 class CGPrefetchUSM : public CG {
   void *MDst;
   size_t MLength;
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 004045db3e8e2..cf69d24bbbf4b 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -38,7 +38,7 @@ class ReleaseCommand;
 
 enum BlockingT { NON_BLOCKING = 0, BLOCKING };
 
-// The struct represents the result of command enqueueing
+/// Result of command enqueueing.
 struct EnqueueResultT {
   enum ResultT {
     SyclEnqueueReady,
@@ -49,15 +49,15 @@ struct EnqueueResultT {
   EnqueueResultT(ResultT Result = SyclEnqueueSuccess, Command *Cmd = nullptr,
                  cl_int ErrCode = CL_SUCCESS)
       : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {}
-  // Indicates result of enqueueing
+  /// Indicates result of enqueueing.
   ResultT MResult;
-  // Pointer to the command failed to enqueue
+  /// Pointer to the command failed to enqueue.
   Command *MCmd;
-  // Error code which is set when enqueueing fails
+  /// Error code which is set when enqueueing fails.
   cl_int MErrCode;
 };
 
-// DepDesc represents dependency between two commands
+/// Dependency between two commands.
 struct DepDesc {
   DepDesc(Command *DepCommand, const Requirement *Req,
           AllocaCommandBase *AllocaCmd)
@@ -68,20 +68,22 @@ struct DepDesc {
            std::tie(Rhs.MDepRequirement, Rhs.MDepCommand);
   }
 
-  // The actual dependency command.
+  /// The actual dependency command.
   Command *MDepCommand = nullptr;
-  // Requirement for the dependency.
+  /// Requirement for the dependency.
   const Requirement *MDepRequirement = nullptr;
-  // Allocation command for the memory object we have requirement for.
-  // Used to simplify searching for memory handle.
+  /// Allocation command for the memory object we have requirement for.
+  /// Used to simplify searching for memory handle.
   AllocaCommandBase *MAllocaCmd = nullptr;
 };
 
-// The Command represents some action that needs to be performed on one or
-// more memory objects. The command has vector of Depdesc objects that
-// represent dependencies of the command. It has vector of pointer to commands
-// that depend on the command. It has pointer to sycl::queue object. And has
-// event that is associated with the command.
+/// The Command represents some action that needs to be performed on one or
+/// more memory objects. The command has vector of DepDesc objects that
+/// represent dependencies of the command. It has vector of pointer to commands
+/// that depend on the command. It has a pointer to \ref queue object. And has
+/// event that is associated with the command.
+///
+/// \ingroup sycl_graph
 class Command {
 public:
   enum CommandType {
@@ -104,13 +106,15 @@ class Command {
 
   void addUser(Command *NewUser) { MUsers.insert(NewUser); }
 
-  // Return type of the command, e.g. Allocate, MemoryCopy.
+  /// \return type of the command, e.g. Allocate, MemoryCopy.
   CommandType getType() const { return MType; }
 
-  // The method checks if the command is enqueued, waits for it to be
-  // unblocked if "Blocking" argument is true, then calls enqueueImp. Returns
-  // true if the command is enqueued. Sets EnqueueResult to the specific
-  // status otherwise.
+  /// Checks if the command is enqueued, and calls enqueueImp.
+  ///
+  /// \param EnqueueResult is set to the specific status if enqueue failed.
+  /// \param Blocking if this argument is true, function will wait for command
+  /// to be unblocked before calling enqueueImp.
+  /// \return true if the command is enqueued.
   bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking);
 
   bool isFinished();
@@ -124,34 +128,33 @@ class Command {
   std::shared_ptr<event_impl> getEvent() const { return MEvent; }
 
   // Methods needed to support SYCL instrumentation
-  //
-  // Proxy method which calls emitInstrumentationData.
+
+  /// Proxy method which calls emitInstrumentationData.
   void emitInstrumentationDataProxy();
-  // Instrumentation method which emits telemetry data.
+  /// Instrumentation method which emits telemetry data.
   virtual void emitInstrumentationData() = 0;
-  // This function looks at all the dependencies for
-  // the release command and enables instrumentation
-  // to report these dependencies as edges
+  /// Looks at all the dependencies for the release command and enables
+  /// instrumentation to report these dependencies as edges.
   void resolveReleaseDependencies(std::set<Command *> &list);
-  // Creates an edge event when the dependency is a command
+  /// Creates an edge event when the dependency is a command
   void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr,
                                          const string_class &Prefix,
                                          bool IsCommand);
-  // Creates an edge event when the dependency is an event
+  /// Creates an edge event when the dependency is an event
   void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr);
-  // Creates a signal event with the enqueued kernel event handle
+  /// Creates a signal event with the enqueued kernel event handle
   void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr);
   /// Create a trace event of node_create type; this must be guarded by a
   /// check for xptiTraceEnabled()
   /// Post Condition: MTraceEvent will be set to the event created
-  /// @param MAddress  The address to use to create the payload
+  /// \param MAddress  The address to use to create the payload
   uint64_t makeTraceEventProlog(void *MAddress);
-  // If prolog has been run, run epilog; this must be guarded by a check for
-  // xptiTraceEnabled()
+  /// If prolog has been run, run epilog; this must be guarded by a check for
+  /// xptiTraceEnabled()
   void makeTraceEventEpilog();
-  // Emits an event of Type
+  /// Emits an event of Type
   void emitInstrumentation(uint16_t Type, const char *Txt = nullptr);
-  //
+
   // End Methods needed to support SYCL instrumentation
 
   virtual void printDot(std::ostream &Stream) const = 0;
@@ -172,58 +175,58 @@ class Command {
                      RT::PiEvent &Event);
   std::vector<EventImplPtr> prepareEvents(ContextImplPtr Context);
 
-  // Private interface. Derived classes should implement this method.
+  /// Private interface. Derived classes should implement this method.
   virtual cl_int enqueueImp() = 0;
 
-  // The type of the command
+  /// The type of the command
   CommandType MType;
-  // Mutex used to protect enqueueing from race conditions
+  /// Mutex used to protect enqueueing from race conditions
   std::mutex MEnqueueMtx;
 
 public:
-  // Contains list of dependencies(edges)
+  /// Contains list of dependencies(edges)
   std::vector<DepDesc> MDeps;
-  // Contains list of commands that depend on the command
+  /// Contains list of commands that depend on the command
   std::unordered_set<Command *> MUsers;
-  // Indicates whether the command can be blocked from enqueueing
+  /// Indicates whether the command can be blocked from enqueueing
   bool MIsBlockable = false;
-  // Counts the number of memory objects this command is a leaf for
+  /// Counts the number of memory objects this command is a leaf for
   unsigned MLeafCounter = 0;
 
   const char *MBlockReason = "Unknown";
 
-  // Describes the status of a command
+  /// Describes the status of a command
   std::atomic<EnqueueResultT::ResultT> MEnqueueStatus;
 
   // All member variable defined here  are needed for the SYCL instrumentation
   // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
   // to ensure we have the same object layout when the macro in the library and
   // SYCL app are not the same.
-  //
-  // The event for node_create and task_begin
+
+  /// The event for node_create and task_begin
   void *MTraceEvent = nullptr;
-  // The stream under which the traces are emitted; stream ids are
-  // positive integers and we set it to an invalid value
+  /// The stream under which the traces are emitted; stream ids are
+  /// positive integers and we set it to an invalid value
   int32_t MStreamID = -1;
-  // Reserved for storing the object address such as SPIRV or memory object
-  // address
+  /// Reserved for storing the object address such as SPIRV or memory object
+  /// address
   void *MAddress = nullptr;
-  // Buffer to build the address string
+  /// Buffer to build the address string
   string_class MAddressString;
-  // Buffer to build the command node type
+  /// Buffer to build the command node type
   string_class MCommandNodeType;
-  // Buffer to build the command end-user understandable name
+  /// Buffer to build the command end-user understandable name
   string_class MCommandName;
-  // Flag to indicate if makeTraceEventProlog() has been run
+  /// Flag to indicate if makeTraceEventProlog() has been run
   bool MTraceEventPrologComplete = false;
-  // Flag to indicate if this is the first time we are seeing this payload
+  /// Flag to indicate if this is the first time we are seeing this payload
   bool MFirstInstance = false;
-  // Instance ID tracked for the command
+  /// Instance ID tracked for the command
   uint64_t MInstanceID = 0;
 };
 
-// The command does nothing during enqueue. The task can be used to implement
-// lock in the graph, or to merge several nodes into one.
+/// The empty command does nothing during enqueue. The task can be used to
+/// implement lock in the graph, or to merge several nodes into one.
 class EmptyCommand : public Command {
 public:
   EmptyCommand(QueueImplPtr Queue, Requirement Req);
@@ -239,8 +242,8 @@ class EmptyCommand : public Command {
   Requirement MRequirement;
 };
 
-// The command enqueues release instance of memory allocated on Host or
-// underlying framework.
+/// The release command enqueues release instance of memory allocated on Host or
+/// underlying framework.
 class ReleaseCommand : public Command {
 public:
   ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd);
@@ -251,10 +254,11 @@ class ReleaseCommand : public Command {
 private:
   cl_int enqueueImp() final;
 
-  // Command which allocates memory release command should dealocate
+  /// Command which allocates memory release command should dealocate.
   AllocaCommandBase *MAllocaCmd = nullptr;
 };
 
+/// Base class for memory allocation commands.
 class AllocaCommandBase : public Command {
 public:
   AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req,
@@ -272,17 +276,17 @@ class AllocaCommandBase : public Command {
 
   void *MMemAllocation = nullptr;
 
-  // Alloca command linked with current command.
-  // Device and host alloca commands can be linked, so they may share the same
-  // memory. Only one allocation from a pair can be accessed at a time. Alloca
-  // commands associated with such allocation is "active". In order to switch
-  // "active" status between alloca commands map/unmap operations are used.
+  /// Alloca command linked with current command.
+  /// Device and host alloca commands can be linked, so they may share the same
+  /// memory. Only one allocation from a pair can be accessed at a time. Alloca
+  /// commands associated with such allocation is "active". In order to switch
+  /// "active" status between alloca commands map/unmap operations are used.
   AllocaCommandBase *MLinkedAllocaCmd = nullptr;
-  // Indicates that current alloca is active one.
+  /// Indicates that current alloca is active one.
   bool MIsActive = true;
 
-  // Indicates that the command owns memory allocation in case of connected
-  // alloca command
+  /// Indicates that the command owns memory allocation in case of connected
+  /// alloca command
   bool MIsLeaderAlloca = true;
 
 protected:
@@ -290,8 +294,8 @@ class AllocaCommandBase : public Command {
   ReleaseCommand MReleaseCmd;
 };
 
-// The command enqueues allocation of instance of memory object on Host or
-// underlying framework.
+/// The alloca command enqueues allocation of instance of memory object on Host
+/// or underlying framework.
 class AllocaCommand : public AllocaCommandBase {
 public:
   AllocaCommand(QueueImplPtr Queue, Requirement Req,
@@ -304,11 +308,12 @@ class AllocaCommand : public AllocaCommandBase {
 private:
   cl_int enqueueImp() final;
 
-  // The flag indicates that alloca should try to reuse pointer provided by
-  // the user during memory object construction
+  /// The flag indicates that alloca should try to reuse pointer provided by
+  /// the user during memory object construction
   bool MInitFromUserData = false;
 };
 
+/// The AllocaSubBuf command enqueues creation of sub-buffer of memory object.
 class AllocaSubBufCommand : public AllocaCommandBase {
 public:
   AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
@@ -324,6 +329,7 @@ class AllocaSubBufCommand : public AllocaCommandBase {
   AllocaCommandBase *MParentAlloca = nullptr;
 };
 
+/// The map command enqueues mapping of host memory onto device memory.
 class MapMemObject : public Command {
 public:
   MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr,
@@ -342,6 +348,7 @@ class MapMemObject : public Command {
   access::mode MMapMode;
 };
 
+/// The unmap command removes mapping of host memory onto device memory.
 class UnMapMemObject : public Command {
 public:
   UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req,
@@ -359,7 +366,8 @@ class UnMapMemObject : public Command {
   void **MSrcPtr = nullptr;
 };
 
-// The command enqueues memory copy between two instances of memory object.
+/// The mem copy command enqueues memory copy between two instances of memory
+/// object.
 class MemCpyCommand : public Command {
 public:
   MemCpyCommand(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd,
@@ -380,7 +388,8 @@ class MemCpyCommand : public Command {
   AllocaCommandBase *MDstAllocaCmd = nullptr;
 };
 
-// The command enqueues memory copy between two instances of memory object.
+/// The mem copy host command enqueues memory copy between two instances of
+/// memory object.
 class MemCpyCommandHost : public Command {
 public:
   MemCpyCommandHost(Requirement SrcReq, AllocaCommandBase *SrcAllocaCmd,
@@ -401,7 +410,8 @@ class MemCpyCommandHost : public Command {
   void **MDstPtr = nullptr;
 };
 
-// The command enqueues execution of kernel or explicit memory operation.
+/// The exec CG command enqueues execution of kernel or explicit memory
+/// operation.
 class ExecCGCommand : public Command {
 public:
   ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue);
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 2105c22cf6af7..02cb5bd4c4969 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -27,9 +27,10 @@ __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
 
-// The function checks whether two requirements overlaps or not. This
-// information can be used to prove that executing two kernels that
-// work on different parts of the memory object in parallel is legal.
+/// Checks whether two requirements overlaps or not.
+///
+/// This information can be used to prove that executing two kernels that
+/// work on different parts of the memory object in parallel is legal.
 static bool doOverlap(const Requirement *LHS, const Requirement *RHS) {
   return (LHS->MOffsetInBytes + LHS->MAccessRange.size() * LHS->MElemSize >=
           RHS->MOffsetInBytes) ||
@@ -43,12 +44,12 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) {
   return LHS == RHS || (LHS->is_host() && RHS->is_host());
 }
 
-// The function checks if current requirement is requirement for sub buffer
+/// Checks if current requirement is requirement for sub buffer
 static bool IsSuitableSubReq(const Requirement *Req) {
   return Req->MIsSubBuffer;
 }
 
-// Checks if the required access mode is allowed under the current one
+/// Checks if the required access mode is allowed under the current one.
 static bool isAccessModeAllowed(access::mode Required, access::mode Current) {
   switch (Current) {
   case access::mode::read:
@@ -104,7 +105,6 @@ static void printDotRecursive(std::fstream &Stream,
 
 void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
   static size_t Counter = 0;
-
   std::string ModeNameStr(ModeName);
   std::string FileName =
       "graph_" + std::to_string(Counter) + ModeNameStr + ".dot";
@@ -123,13 +123,10 @@ void Scheduler::GraphBuilder::printGraphAsDot(const char *ModeName) {
   Stream << "}" << std::endl;
 }
 
-// Returns record for the memory objects passed, nullptr if doesn't exist.
 MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
   return MemObject->MRecord.get();
 }
 
-// Returns record for the memory object requirement refers to, if doesn't
-// exist, creates new one.
 MemObjRecord *
 Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
                                                  Requirement *Req) {
@@ -147,7 +144,6 @@ Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
   return MemObject->MRecord.get();
 }
 
-// Helper function which removes all values in Cmds from Leaves
 void Scheduler::GraphBuilder::updateLeaves(const std::set<Command *> &Cmds,
                                            MemObjRecord *Record,
                                            access::mode AccessMode) {
@@ -398,7 +394,7 @@ Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) {
 // The function implements SYCL host accessor logic: host accessor
 // should provide access to the buffer in user space.
 Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
-                                                const bool destructor) {
+                                                  const bool destructor) {
 
   const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();
 
@@ -450,15 +446,14 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost(
   return insertMemoryMove(Record, Req, HostQueue);
 }
 
-// The functions finds dependencies for the requirement. It starts searching
-// from list of "leaf" commands for the record and check if the examining
-// command can be executed in parallel with new one with regard to the memory
-// object. If can, then continue searching through dependencies of that
-// command. There are several rules used:
-//
-// 1. New and examined commands only read -> can bypass
-// 2. New and examined commands has non-overlapping requirements -> can bypass
-// 3. New and examined commands has different contexts -> cannot bypass
+/// Start searching from list of "leaf" commands for the record and check if the
+/// examining command can be executed in parallel with new one with regard to
+/// the memory object. If can, then continue searching through dependencies of
+/// that command. There are several rules used:
+///
+/// 1. New and examined commands only read -> can bypass
+/// 2. New and examined commands has non-overlapping requirements -> can bypass
+/// 3. New and examined commands has different contexts -> cannot bypass
 std::set<Command *>
 Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req,
                                         const ContextImplPtr &Context) {
@@ -606,8 +601,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq(
 
         // To ensure that the leader allocation is removed first
         AllocaCmd->getReleaseCmd()->addDep(
-            DepDesc(LinkedAllocaCmd->getReleaseCmd(), AllocaCmd->getRequirement(),
-                    LinkedAllocaCmd));
+            DepDesc(LinkedAllocaCmd->getReleaseCmd(),
+                    AllocaCmd->getRequirement(), LinkedAllocaCmd));
 
         // Device allocation takes ownership of the host ptr during
         // construction, host allocation doesn't. So, device allocation should
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 19c79ea5e4e18..1f8681a0b4df9 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -19,6 +19,152 @@
 #include <set>
 #include <vector>
 
+/// \defgroup sycl_graph DPC++ Execution Graph
+///
+/// DPC++, unlike OpenCL, provides a programming model in which user doesn't
+/// need to manage dependencies between kernels and memory explicitly. The DPC++
+/// Runtime must ensure correct execution with respect to order commands are
+/// submitted.
+///
+/// This document describes the part of the DPC++ Runtime that is responsible
+/// for building and processing dependency graph.
+///
+/// ## A couple of words about DPC++ and SYCL execution and memory model
+///
+/// The SYCL framework defines command group (\ref CG) as an entity that
+/// represents minimal execution block. The command group is submitted to SYCL
+/// queue and consists of a kernel and its requirements. The SYCL queue defines
+/// device and context using which the kernel should be executed.
+///
+/// There are also command groups that consist of memory requirements and
+/// explicit memory operation, such as copy, fill, update_host. In this case
+/// it's up to an implementation how to implement these operations.
+///
+/// The relative order of command groups submission defines order in which
+/// kernels must be executed if their memory requirements intersect. For
+/// example, if a command group A writes to a buffer X, command group B reads
+/// from X, then the scheduled execution order of A and B will be the same as
+/// their dynamic submission order (matches program order if submitted from the
+/// same host thread).
+///
+/// Memory requirements are requests to SYCL memory objects, such as buffer and
+/// image. SYCL memory objects are not bound to any specific context or device,
+/// it's SYCL responsibility to allocate and/or copy memory to the target
+/// context to achieve correct execution.
+///
+/// Refer to SYCL Specification 1.2.2 sections 3.4 and 3.5 to find more
+/// information about SYCL execution and memory model.
+///
+/// ### Example of DPC++ application
+///
+/// \code{.cpp}
+/// {
+///   // Creating DPC++ CPU and GPU queues
+///   cl::sycl::queue CPU_Queue = ...;
+///   cl::sycl::queue GPU_Queue = ...;
+///
+///   // Creating 3 DPC++ buffers
+///   auto BufferA = ...; // Buffer is initialized with host memory.
+///   auto BufferB = ...;
+///   auto BufferC = ...;
+///
+///   // "Copy command group" section
+///   // Request processing copy "explicit" operation on CPU
+///   // The copy operation reads from BufferA and writes to BufferB
+///
+///   CPU_Queue.submit([&](handler &CGH) {
+///     auto A = BufferA.get_access<read>(CGH);
+///     auto B = BufferB.get_access<write>(CGH);
+///     CGH.copy(A, B);
+///   });
+///
+///   // "Multi command group" section
+///   // Request processing multi kernel on GPU
+///   // The kernel reads from BufferB, multiplies by 4 and writes result to
+///   // BufferC
+///
+///   GPU_Queue.submit([&](handler &CGH) {
+///     auto B = BufferB.get_access<read>(CGH);
+///     auto C = BufferC.get_access<write>(CGH);
+///     CGH.parallel_for<class multi>(range<1>{N}, [=](id<1> Index) {
+///       C[Index] = B[Index] * 4;
+///     });
+///   });
+///
+///   // "Host accessor creation" section
+///   // Request the latest data of BufferC for the moment
+///   // This is synchronization point what means SYCL RT blocks on creation of
+///   // the accessor until requested data is available.
+///   auto C = BufferC.get_access<read>();
+/// }
+/// \endcode
+///
+/// In the example above DPC++ RT does:
+///
+/// 1. **Copy command group**.
+///    DPC++ RT allocates memory for BufferA and BufferB on CPU then execute
+///    "copy" explicit memory operation on CPU.
+///
+/// 2. **Multi command group**
+///    DPC++ RT allocates memory for BufferC and BufferB on GPU and copy
+///    content of BufferB from CPU to GPU, then execute "multi" kernel on GPU.
+///
+/// 3. **Host accessor creation**
+///    DPC++ RT allocates(it's possible to reuse already allocated memory)
+///    memory available for user for BufferC then copy content of BufferC from
+///    GPU to this memory.
+///
+/// So, the example above will be converted to the following OpenCL pseudo code
+/// (for both eager and lazy execution):
+/// \code{.cpp}
+/// // Initialization(not related to the Scheduler)
+/// Platform = clGetPlatforms(...);
+/// DeviceCPU = clGetDevices(CL_DEVICE_TYPE_CPU, ...);
+/// DeviceGPU = clGetDevices(CL_DEVICE_TYPE_GPU, ...);
+/// ContextCPU = clCreateContext(DeviceCPU, ...)
+/// ContextGPU = clCreateContext(DeviceGPU, ...)
+/// QueueCPU = clCreateCommandQueue(ContextCPU, DeviceCPU, ...);
+/// QueueGPU = clCreateCommandQueue(ContextGPU, DeviceGPU, ...);
+///
+/// // Copy command group:
+/// BufferACPU = clCreateBuffer(ContextCPU, CL_MEM_USE_HOST_PTR, ...);
+/// BufferBCPU = clCreateBuffer(ContextCPU, ...);
+/// CopyEvent = clEnqueueCopyBuffer(QueueCPU, BufferACPU, BufferBCPU, ...)
+///
+/// // Multi command group:
+/// ReadBufferEvent =
+///    clEnqueueReadBuffer(QueueCPU, BufferBCPU, HostPtr, CopyEvent, ...);
+/// BufferBGPU = clCreateBuffer(ContextGPU, ...);
+///
+/// UserEvent = clCreateUserEvent(ContextCPU);
+/// clSetEventCallback(ReadBufferEvent, event_completion_callback,
+/// /*data=*/UserEvent);
+///
+/// WriteBufferEvent = clEnqueueWriteBuffer(QueueGPU, BufferBGPU, HostPtr,
+/// UserEvent, ...); BufferCGPU = clCreateBuffer(ContextGPU, ...); ProgramGPU =
+/// clCreateProgramWithIL(ContextGPU, ...); clBuildProgram(ProgramGPU);
+/// MultiKernel = clCreateKernel("multi");
+/// clSetKernelArg(MultiKernel, BufferBGPU, ...);
+/// clSetKernelArg(MultiKernel, BufferCGPU, ...);
+/// MultiEvent =
+///    clEnqueueNDRangeKernel(QueueGPU, MultiKernel, WriteBufferEvent, ...);
+///
+/// // Host accessor creation:
+/// clEnqueueMapBuffer(QueueGPU, BufferCGPU, BLOCKING_MAP, MultiEvent, ...);
+///
+/// // Releasing mem objects during SYCL buffers destruction.
+/// clReleaseBuffer(BufferACPU);
+/// clReleaseBuffer(BufferBCPU);
+/// clReleaseBuffer(BufferBGPU);
+/// clReleaseBuffer(BufferCGPU);
+///
+/// // Release(not related to the Scheduler)
+/// clReleaseKernel(MultiKernel);
+/// clReleaseProgram(ProgramGPU);
+/// clReleaseContext(ContextGPU);
+/// clReleaseContext(ContextCPU);
+/// \endcode
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
@@ -31,8 +177,12 @@ using QueueImplPtr = std::shared_ptr<detail::queue_impl>;
 using EventImplPtr = std::shared_ptr<detail::event_impl>;
 using ContextImplPtr = std::shared_ptr<detail::context_impl>;
 
-// The MemObjRecord is created for each memory object used in command
-// groups. There should be only one MemObjRecord for SYCL memory object.
+/// Memory Object Record
+///
+/// The MemObjRecord is used in command groups (todo better desc).
+/// There must be a single MemObjRecord for each SYCL memory object.
+///
+/// \ingroup sycl_graph
 struct MemObjRecord {
   MemObjRecord(ContextImplPtr CurContext, std::size_t LeafLimit)
       : MReadLeaves{LeafLimit}, MWriteLeaves{LeafLimit}, MCurContext{
@@ -59,46 +209,223 @@ struct MemObjRecord {
   bool MMemModified = false;
 };
 
+/// DPC++ graph scheduler class.
+///
+/// \section sched_overview Scheduler Overview
+///
+/// The Scheduler is a part of DPC++ RT which ensures correct execution of
+/// command groups. To achieve this Scheduler manages acyclic dependency graph
+/// (which can have independent sub-graphs) that consists of several types of
+/// nodes that represent specific commands:
+
+/// 1. Allocate memory.
+///    The command represents memory allocation operation. There can be
+///    multiple allocations for single SYCL memory object.
+/// 2. Release memory.
+///    The command represents memory release operation.
+/// 3. Execute command group.
+///    The command represents \ref CG "Command Group" (kernel) execution
+///    operation.
+/// 4. Copy memory.
+///    The command represents memory copy operation between two memory
+///    allocations of a single memory object.
+///
+/// As a main input Scheduler takes command group and returns an event
+/// representing the command group, so it can be waited on later. When a new
+/// command group comes Scheduler adds one or more nodes to the graph
+/// depending on the command groups' requirements. For example, if a new
+/// command group is submitted to the SYCL context which has the latest data
+/// for all the requirements, Scheduler adds a new "Execute command group"
+/// command making it dependent on all commands affecting new command group's
+/// requirements. But if one of the requirements has no up-to-date instance in
+/// the context which the command group is submitted to, Scheduler
+/// additionally inserts copy memory command (together with allocate memory
+/// command if needed).
+///
+/// A simple graph looks like:
+//
+//    +----------+     +----------+     +----------+
+//    |          |     |          |     |          |
+//    | Allocate |<----| Execute  |<----| Execute  |
+//    |          |     |          |     |          |
+//    +----------+     +----------+     +----------+
+//
+/// \dot
+/// digraph G {
+///   rankdir="LR";
+///   Execute1 [label = "Execute"];
+///   Execute2 [label = "Execute"];
+///   Allocate;
+///   Allocate -> Execute2 [dir = back];
+///   Execute2 -> Execute1 [dir = back];
+/// }
+/// \enddot
+///
+/// Where nodes represent commands and edges represent dependencies between
+/// them. There are three commands connected by arrows which mean that before
+/// executing second command group the first one must be executed. Also before
+/// executing the first command group memory allocation must be performed.
+///
+/// At some point Scheduler enqueues commands to the underlying devices. To do
+/// this Scheduler makes topological sort to get order in which commands are
+/// need to be enqueued. For example, the following graph (D depends on B and C,
+/// B and C depends on A) will be enqueued in the following order:
+/// \code{.cpp}
+///   EventA = Enqueue(A, /*Deps=*/{});
+///   EventB = Enqueue(B, /*Deps=*/{EventA});
+///   EventC = Enqueue(C, /*Deps=*/{EventA});
+///   EventD = Enqueue(D, /*Deps=*/{EventB, EventC});
+/// \endcode
+///
+//                             +----------+
+//                             |          |
+//                             |    D     |
+//                             |          |
+//                             +----------+
+//                            /            \
+//                           /              \
+//                          v                v
+//                      +----------+     +----------+
+//                      |          |     |          |
+//                      |     B    |     |    C     |
+//                      |          |     |          |
+//                      +----------+     +----------+
+//                             \            /
+//                              \          /
+//                               v        v
+//                              +----------+
+//                              |          |
+//                              |    A     |
+//                              |          |
+//                              +----------+
+/// \dot
+/// digraph G {
+///   D -> B;
+///   D -> C;
+///   C -> A;
+///   B -> A;
+/// }
+/// \enddot
+///
+/// \section sched_impl Implementation details
+///
+/// The Scheduler is split up into two parts: graph builder and graph
+/// processor.
+///
+/// To build dependencies Scheduler needs to memorize memory object and
+/// commands that modify it.
+///
+/// To detect that two command groups access the same memory object and create
+/// a dependency between them the scheduler needs to store information about
+/// the memory object.
+///
+/// \subsection sched_thread_safety Thread safety
+///
+/// To ensure thread safe execution of methods Scheduler provides access to the
+/// graph should be guarded by read-write mutex(analog of shared mutex from
+/// C++17).
+///
+/// An read-write mutex allows concurrent access to read-only operations, while
+/// write operations require exclusive access.
+///
+/// All the methods of GraphBuilder lock the mutex in write mode because these
+/// methods can modify the graph.
+/// Methods of GraphProcessor lock the mutex in read mode as they are not
+/// modifying the graph.
+///
+/// \subsection shced_err_handling Error handling
+///
+/// There are two sources of erros that needs to be handled in Scheduler:
+/// 1. the error that happens during command enqueue process
+/// 2. the error that happend during command execution.
+///
+/// If error occurs during commands enqueue process Command::enqueue method
+/// return faulty command. The Scheduler then reschedules the command and all
+/// dependent commands (if any).
+///
+/// An error with command processing can happen in underlying runtime, in this
+/// case Scheduler is notified asynchronously(using callback mechanism) what
+/// triggers rescheduling.
+///
+/// \ingroup sycl_graph
 class Scheduler {
 public:
-  // Registers command group, adds it to the dependency graph and returns an
-  // event object that can be used for waiting later. It's called by SYCL's
-  // queue.submit.
+  /// Registers a command group, and adds it to the dependency graph.
+  ///
+  /// It's called by SYCL's queue.submit.
+  ///
+  /// \param CommandGroup is a unique_ptr to a command group to be added.
+  /// \return an event object to wait on for command group completetion.
   EventImplPtr addCG(std::unique_ptr<detail::CG> CommandGroup,
                      QueueImplPtr Queue);
 
+  /// Registers a command group, that copies most recent memory to the memory
+  /// pointed by the requirement.
+  ///
+  /// \param Req is a requirement that points to the memory where data is
+  /// needed.
+  /// \return an event object to wait on for copy finish.
   EventImplPtr addCopyBack(Requirement *Req);
 
-  // Blocking call that waits for the event passed. For the eager execution
-  // mode this method invokes corresponding function of device API. In the
-  // lazy execution mode the method may enqueue the command associated with
-  // the event passed and its dependency before calling device API.
+  /// Waits for the event.
+  ///
+  /// This operation is blocking. For eager execution mode this method invokes
+  /// corresponding function of device API. In lazy execution mode the method
+  /// may enqueue the command, associated with the event, and its dependency
+  /// before calling device API.
+  ///
+  /// \param Event is a pointer to event to wait on.
   void waitForEvent(EventImplPtr Event);
 
-  // Removes buffer pointed by MemObj from the graph: ensures all commands
-  // accessing the memory objects are executed and triggers deallocation of
-  // all memory assigned to the memory object. It's called from the
-  // sycl::buffer and sycl::image destructors.
+  /// Removes buffer from the graph.
+  ///
+  /// The lifetime of memory object descriptor begins when first command group
+  /// that uses memory object comes and ends when "removeMemoryObject(...)"
+  /// method is called which means there will be no command group that uses the
+  /// memory object. When removeMemoryObject is called Scheduler will enqueue
+  /// and wait on all ReleseCommand's associated with the memory object, what
+  /// effectively guarantees that all commands accessing the memory object is
+  /// complete and then resources for the memory object is freed. Then all the
+  /// commands affecting the memory object are removed.
+  ///
+  /// On destruction Scheduler triggers destruction of all memory object
+  /// descriptors in order to wait on all commands not yet executed and all
+  /// memory it manages.
+  ///
+  /// This member function is used by \ref buffer and \ref image.
+  ///
+  /// \param MemObj is a memory object that points to buffer being removed.
   void removeMemoryObject(detail::SYCLMemObjI *MemObj);
 
-  // Removes finished non-leaf non-alloca commands from the subgraph (assuming
-  // that all its commands have been waited for).
+  /// Removes finished non-leaf non-alloca commands from the subgraph
+  /// (assuming that all its commands have been waited for).
+  /// \sa GraphBuilder::cleanupFinishedCommands
+  ///
+  /// \param FinishedEvent is a cleanup candidate event.
   void cleanupFinishedCommands(EventImplPtr FinishedEvent);
 
-  // Creates nodes in the graph, that update Req with the pointer to the host
-  // memory which contains the latest data of the memory object. New
-  // operations with the same memory object that have side effects are blocked
-  // until releaseHostAccessor is called. Returns an event which indicates
-  // when these nodes are completed and host accessor is ready for using.
+  /// Adds nodes to the graph, that update the requirement with the pointer
+  /// to the host memory.
+  ///
+  /// Assumes the host pointer contains the latest data. New operations with
+  /// the same memory object that have side effects are blocked until
+  /// releaseHostAccessor(Requirement *Req) is callled.
+  ///
+  /// \param Req is the requirement to be updated.
+  /// \return an event which indicates when these nodes are completed
+  /// and host accessor is ready for use.
   EventImplPtr addHostAccessor(Requirement *Req, const bool Destructor = false);
 
-  // Unblocks operations with the memory object.
+  /// Unblocks operations with the memory object.
+  ///
+  /// \param Req is a requirement that points to the memory object being
+  /// unblocked.
   void releaseHostAccessor(Requirement *Req);
 
-  // Returns an instance of the scheduler object.
+  /// \return an instance of the scheduler object.
   static Scheduler &getInstance();
 
-  // Returns list of "immediate" dependencies for the Event given.
+  /// \return a vector of "immediate" dependencies for the Event given.
   std::vector<EventImplPtr> getWaitList(EventImplPtr Event);
 
   QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; }
@@ -107,71 +434,97 @@ class Scheduler {
   Scheduler();
   static Scheduler instance;
 
-  // The graph builder provides interfaces that can change already existing
-  // graph (e.g. add/remove edges/nodes).
+  /// Graph builder class.
+  ///
+  /// The graph builder provides means to change an existing graph (e.g. add
+  /// or remove edges/nodes).
+  ///
+  /// \ingroup sycl_graph
   class GraphBuilder {
   public:
     GraphBuilder();
 
-    // Registers command group, adds it to the dependency graph and returns an
-    // command that represents command group execution. It's called by SYCL's
-    // queue::submit.
+    /// Registers \ref CG "command group" and adds it to the dependency graph.
+    ///
+    /// \sa queue::submit, Scheduler::addCG
+    ///
+    /// \return a command that represents command group execution.
     Command *addCG(std::unique_ptr<detail::CG> CommandGroup,
                    QueueImplPtr Queue);
 
+    /// Registers a \ref CG "command group" that updates host memory to the
+    /// latest state.
+    ///
+    /// \return a command that represents command group execution.
     Command *addCGUpdateHost(std::unique_ptr<detail::CG> CommandGroup,
                              QueueImplPtr HostQueue);
 
+    /// Registers a \ref CG "command group" to update memory to the latest
+    /// state.
+    ///
+    /// \param Req is a requirement, that describes memory object.
     Command *addCopyBack(Requirement *Req);
+
+    /// Registers a \ref CG "command group" to create a host accessor.
+    ///
+    /// \param Req points to memory being accessed.
     Command *addHostAccessor(Requirement *Req, const bool destructor = false);
 
-    // [Provisional] Optimizes the whole graph.
+    /// [Provisional] Optimizes the whole graph.
     void optimize();
 
-    // [Provisional] Optimizes subgraph that consists of command associated
-    // with Event passed and its dependencies.
+    /// [Provisional] Optimizes subgraph that consists of command associated
+    /// with Event passed and its dependencies.
     void optimize(EventImplPtr Event);
 
-    // Removes finished non-leaf non-alloca commands from the subgraph (assuming
-    // that all its commands have been waited for).
+    /// Removes finished non-leaf non-alloca commands from the subgraph
+    /// (assuming that all its commands have been waited for).
     void cleanupFinishedCommands(Command *FinishedCmd);
 
-    // Reschedules command passed using Queue provided. this can lead to
-    // rescheduling of all dependent commands. This can be used when user
-    // provides "secondary" queue to submit method which may be used when
-    // command fails to enqueue/execute in primary queue.
+    /// Reschedules command passed using Queue provided. t
+    ///
+    /// This can lead to rescheduling of all dependent commands. This can be
+    /// used when user provides "secondary" queue to submit method which may
+    /// be used when command fails to enqueue/execute in primary queue.
     void rescheduleCommand(Command *Cmd, QueueImplPtr Queue);
 
+    /// \return a pointer to the corresponding memory object record for the
+    /// SYCL memory object provided, or nullptr if it does not exist.
     MemObjRecord *getMemObjRecord(SYCLMemObjI *MemObject);
-    // Returns pointer to MemObjRecord for pointer to memory object.
-    // Return nullptr if there the record is not found.
+
+    /// \return a pointer to MemObjRecord for pointer to memory object. If the
+    /// record is not found, nullptr is returned.
     MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
                                           Requirement *Req);
 
-    // Decrements leaf counters for all leaves of the record.
+    /// Decrements leaf counters for all leaves of the record.
     void decrementLeafCountersForRecord(MemObjRecord *Record);
 
-    // Removes commands that use given MemObjRecord from the graph.
+    /// Removes commands that use given MemObjRecord from the graph.
     void cleanupCommandsForRecord(MemObjRecord *Record);
 
-    // Removes MemObjRecord for memory object passed.
+    /// Removes MemObjRecord for memory object passed.
     void removeRecordForMemObj(SYCLMemObjI *MemObject);
 
-    // Add new command to leaves if needed.
+    /// Add new command to leaves if needed.
     void addNodeToLeaves(MemObjRecord *Record, Command *Cmd,
                          access::mode AccessMode);
 
-    // Removes commands from leaves.
+    /// Removes commands from leaves.
     void updateLeaves(const std::set<Command *> &Cmds, MemObjRecord *Record,
                       access::mode AccessMode);
 
     std::vector<SYCLMemObjI *> MMemObjs;
 
   private:
-    // The method inserts required command to make so the latest state for the
-    // memory object Record refers to resides in the context which is bound to
-    // the Queue. Can insert copy/map/unmap operations depending on the source
-    // and destination.
+    /// Inserts required command to update memory object state in the context.
+    ///
+    /// Copy/map/unmap operations can be inserted depending on the source and
+    /// destination.
+    ///
+    /// \param Record is a memory object that needs to be updated.
+    /// \param Req is a Requirement describing destination.
+    /// \param Queue is a queue that is bound to target context.
     Command *insertMemoryMove(MemObjRecord *Record, Requirement *Req,
                               const QueueImplPtr &Queue);
 
@@ -184,24 +537,30 @@ class Scheduler {
     insertUpdateHostReqCmd(MemObjRecord *Record, Requirement *Req,
                            const QueueImplPtr &Queue);
 
+    /// Finds dependencies for the requirement.
     std::set<Command *> findDepsForReq(MemObjRecord *Record, Requirement *Req,
                                        const ContextImplPtr &Context);
 
-    // Finds a command dependency corresponding to the record
+    /// Finds a command dependency corresponding to the record.
     DepDesc findDepForRecord(Command *Cmd, MemObjRecord *Record);
 
-    // Searches for suitable alloca in memory record.
+    /// Searches for suitable alloca in memory record.
     AllocaCommandBase *findAllocaForReq(MemObjRecord *Record, Requirement *Req,
                                         const ContextImplPtr &Context);
-    // Searches for suitable alloca in memory record.
-    // If none found, creates new one.
+
+    /// Searches for suitable alloca in memory record.
+    ///
+    /// If none found, creates new one.
     AllocaCommandBase *getOrCreateAllocaForReq(MemObjRecord *Record,
                                                Requirement *Req,
                                                QueueImplPtr Queue);
 
     void markModifiedIfWrite(MemObjRecord *Record, Requirement *Req);
 
-    // Print contents of graph to text file in DOT format
+    /// Prints contents of graph to text file in DOT format
+    ///
+    /// \param ModeName is a stringified printing mode name to be used
+    /// in the result file name.
     void printGraphAsDot(const char *ModeName);
     enum PrintOptions {
       BeforeAddCG = 0,
@@ -215,21 +574,96 @@ class Scheduler {
     std::array<bool, PrintOptions::Size> MPrintOptionsArray;
   };
 
-  // The class that provides interfaces for enqueueing command and its
-  // dependencies to the underlying runtime. Methods of this class must not
-  // modify the graph.
+  /// Graph Processor provided interfaces for enqueueing commands and their
+  /// dependencies to the underlying runtime.
+  ///
+  /// Member functions of this class do not modify the graph.
+  ///
+  /// \section sched_enqueue Command enqueueing
+  /// \todo lazy mode is not implemented.
+  ///
+  /// The Scheduler can work in two modes of enqueueing commands: eager(default)
+  /// and lazy. In eager mode commands are enqueued whenever they come to the
+  /// Scheduler. In lazy mode they are not enqueued until content of the buffer
+  /// they are accessing is requested by user.
+  ///
+  /// Each command has enqueue method which takes vector of events that
+  /// represents dependencies and returns event which represents the command.
+  /// GraphProcessor makes topological sort to get order in which commands are
+  /// need to be enqueued. Then enqueue each command passing vector of events
+  /// that this command needs to wait on. If error happens during command
+  /// enqueue, the whole process is stopped, faulty command is propagated back
+  /// to the Scheduler.
+  ///
+  /// The command with dependencies that belong to different context from
+  /// command's one can't be enqueued directly(limitation of OpenCL runtime).
+  /// Instead for each dependency a proxy event in the target context is created
+  /// and linked using OpenCL callback mechanism with original one. For example,
+  /// the following SYCL code:
+  ///
+  /// \code{.cpp}
+  ///   // The ContextA and ContextB are different OpenCL contexts
+  ///   sycl::queue Q1(ContextA);
+  ///   sycl::queue Q2(ContextB);
+  ///
+  ///   Q1.submit(Task1);
+  ///
+  ///   Q2.submit(Task2);
+  /// \endcode
+  ///
+  /// is translated to the following OCL API calls:
+  ///
+  /// \code{.cpp}
+  ///   void event_completion_callback(void *data) {
+  ///     // Change status of event to complete.
+  ///     clSetEventStatus((cl_event *)data, CL_COMPLETE); // Scope of Context2
+  ///   }
+  ///
+  ///  // Enqueue TASK1
+  ///  EventTask1 = clEnqueueNDRangeKernel(Q1, TASK1, ..); // Scope of Context1
+  ///  // Read memory to host
+  ///  ReadMem = clEnqueueReadBuffer(A, .., /*Deps=*/EventTask1); // Scope of
+  ///  // Context1
+  ///
+  ///  // Create user event with initial status "not completed".
+  ///  UserEvent = clCreateUserEvent(Context2); // Scope of Context2
+  ///  // Ask OpenCL to call callback with UserEvent as data when "read memory
+  ///  // to host" operation is completed
+  ///  clSetEventCallback(ReadMem, event_completion_callback,
+  ///                     /*data=*/UserEvent); // Scope of Context1
+  ///
+  ///  // Enqueue write memory from host, block it on user event
+  ///  // It will be unblocked when we change UserEvent status to completed in
+  ///  // callback.
+  ///  WriteMem =
+  ///     clEnqueueWriteBuffer(A, .., /*Dep=*/UserEvent); // Scope of Context2
+  ///  // Enqueue TASK2
+  ///  EventTask2 =
+  ///     clEnqueueNDRangeKernel(TASK, .., /*Dep=*/WriteMem); // Scope of
+  ///     // Context2
+  /// \endcode
+  ///
+  /// The alternative approach that has been considered is to have separate
+  /// dispatcher thread that would wait for all events from the Context other
+  /// then target Context to complete and then enqueue command with dependencies
+  /// from target Context only. Alternative approach makes code significantly
+  /// more complex and can hurt performance on CPU device vs chosen approach
+  /// with callbacks.
+  ///
+  /// \ingroup sycl_graph
   class GraphProcessor {
   public:
-    // Returns a list of events that represent immediate dependencies of the
-    // command associated with Event passed.
+    /// \return a list of events that represent immediate dependencies of the
+    /// command associated with Event passed.
     static std::vector<EventImplPtr> getWaitList(EventImplPtr Event);
 
-    // Wait for the command, associated with Event passed, is completed.
+    /// Waits for the command, associated with Event passed, is completed.
     static void waitForEvent(EventImplPtr Event);
 
-    // Enqueue the command passed and all it's dependencies to the underlying
-    // device. Returns true is the command is successfully enqueued. Sets
-    // EnqueueResult to the specific status otherwise.
+    /// Enqueues the command and all its dependencies.
+    ///
+    /// \param EnqueueResult is set to specific status if enqueue failed.
+    /// \return true if the command is successfully enqueued.
     static bool enqueueCommand(Command *Cmd, EnqueueResultT &EnqueueResult,
                                BlockingT Blocking = NON_BLOCKING);
   };
@@ -237,7 +671,7 @@ class Scheduler {
   void waitForRecordToFinish(MemObjRecord *Record);
 
   GraphBuilder MGraphBuilder;
-  // Use read-write mutex in future.
+  // TODO Use read-write mutex in future.
   std::mutex MGraphLock;
 
   QueueImplPtr DefaultHostQueue;

From 5011e3dafdeae8ddf2cca21443d91ac557bd7862 Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexbatashev@outlook.com>
Date: Thu, 2 Apr 2020 21:36:07 +0300
Subject: [PATCH 2/7] Apply suggestions from code review

Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>

Co-Authored-By: Artem Gindinson <artem.gindinson@intel.com>
---
 sycl/source/detail/scheduler/commands.hpp      | 6 +++---
 sycl/source/detail/scheduler/graph_builder.cpp | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index cf69d24bbbf4b..88f1d0cf21e9a 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -51,7 +51,7 @@ struct EnqueueResultT {
       : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {}
   /// Indicates result of enqueueing.
   ResultT MResult;
-  /// Pointer to the command failed to enqueue.
+  /// Pointer to the command which failed to enqueue.
   Command *MCmd;
   /// Error code which is set when enqueueing fails.
   cl_int MErrCode;
@@ -78,9 +78,9 @@ struct DepDesc {
 };
 
 /// The Command represents some action that needs to be performed on one or
-/// more memory objects. The command has vector of DepDesc objects that
+/// more memory objects. The Command has a vector of DepDesc objects that
 /// represent dependencies of the command. It has vector of pointer to commands
-/// that depend on the command. It has a pointer to \ref queue object. And has
+/// that depend on the command. It has a pointer to \ref queue object and an
 /// event that is associated with the command.
 ///
 /// \ingroup sycl_graph
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 02cb5bd4c4969..4785e16e32177 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -446,14 +446,14 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost(
   return insertMemoryMove(Record, Req, HostQueue);
 }
 
-/// Start searching from list of "leaf" commands for the record and check if the
-/// examining command can be executed in parallel with new one with regard to
-/// the memory object. If can, then continue searching through dependencies of
+/// Start the search for the record from list of "leaf" commands and check if the
+/// examining command can be executed in parallel with the new one with regard to
+/// the memory object. If it can, then continue searching through dependencies of
 /// that command. There are several rules used:
 ///
 /// 1. New and examined commands only read -> can bypass
 /// 2. New and examined commands has non-overlapping requirements -> can bypass
-/// 3. New and examined commands has different contexts -> cannot bypass
+/// 3. New and examined commands have different contexts -> cannot bypass
 std::set<Command *>
 Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req,
                                         const ContextImplPtr &Context) {

From d0b265ed4b1b3621299d67a1ee83f0b3c6598669 Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexbatashev@outlook.com>
Date: Thu, 9 Apr 2020 11:11:39 +0300
Subject: [PATCH 3/7] Apply suggestions from code review

Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>

Co-Authored-By: Romanov Vlad <17316488+romanovvlad@users.noreply.github.com>
Co-Authored-By: Sergey Semenov <43845535+sergey-semenov@users.noreply.github.com>
---
 sycl/source/detail/scheduler/commands.hpp     |  4 +-
 .../source/detail/scheduler/graph_builder.cpp |  6 +-
 sycl/source/detail/scheduler/scheduler.hpp    | 77 +++++++++----------
 3 files changed, 43 insertions(+), 44 deletions(-)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 88f1d0cf21e9a..a9affcc1796f7 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -79,7 +79,7 @@ struct DepDesc {
 
 /// The Command represents some action that needs to be performed on one or
 /// more memory objects. The Command has a vector of DepDesc objects that
-/// represent dependencies of the command. It has vector of pointer to commands
+/// represent dependencies of the command. It has a vector of pointers to commands
 /// that depend on the command. It has a pointer to \ref queue object and an
 /// event that is associated with the command.
 ///
@@ -242,7 +242,7 @@ class EmptyCommand : public Command {
   Requirement MRequirement;
 };
 
-/// The release command enqueues release instance of memory allocated on Host or
+/// The release command enqueues release of a memory object instance allocated on Host or
 /// underlying framework.
 class ReleaseCommand : public Command {
 public:
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index 4785e16e32177..a16d382fbb9af 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -27,7 +27,7 @@ __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
 
-/// Checks whether two requirements overlaps or not.
+/// Checks whether two requirements overlap or not.
 ///
 /// This information can be used to prove that executing two kernels that
 /// work on different parts of the memory object in parallel is legal.
@@ -44,7 +44,7 @@ static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) {
   return LHS == RHS || (LHS->is_host() && RHS->is_host());
 }
 
-/// Checks if current requirement is requirement for sub buffer
+/// Checks if current requirement is requirement for sub buffer.
 static bool IsSuitableSubReq(const Requirement *Req) {
   return Req->MIsSubBuffer;
 }
@@ -447,7 +447,7 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost(
 }
 
 /// Start the search for the record from list of "leaf" commands and check if the
-/// examining command can be executed in parallel with the new one with regard to
+/// examined command can be executed in parallel with the new one with regard to
 /// the memory object. If it can, then continue searching through dependencies of
 /// that command. There are several rules used:
 ///
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 1f8681a0b4df9..a94860e5bfaa9 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -21,9 +21,9 @@
 
 /// \defgroup sycl_graph DPC++ Execution Graph
 ///
-/// DPC++, unlike OpenCL, provides a programming model in which user doesn't
+/// DPC++, unlike OpenCL, provides a programming model in which the user doesn't
 /// need to manage dependencies between kernels and memory explicitly. The DPC++
-/// Runtime must ensure correct execution with respect to order commands are
+/// Runtime must ensure correct execution with respect to the order commands are
 /// submitted.
 ///
 /// This document describes the part of the DPC++ Runtime that is responsible
@@ -34,13 +34,13 @@
 /// The SYCL framework defines command group (\ref CG) as an entity that
 /// represents minimal execution block. The command group is submitted to SYCL
 /// queue and consists of a kernel and its requirements. The SYCL queue defines
-/// device and context using which the kernel should be executed.
+/// the device and context using which the kernel should be executed.
 ///
 /// There are also command groups that consist of memory requirements and
-/// explicit memory operation, such as copy, fill, update_host. In this case
+/// an explicit memory operation, such as copy, fill, update_host. In this case
 /// it's up to an implementation how to implement these operations.
 ///
-/// The relative order of command groups submission defines order in which
+/// The relative order of command groups submission defines the order in which
 /// kernels must be executed if their memory requirements intersect. For
 /// example, if a command group A writes to a buffer X, command group B reads
 /// from X, then the scheduled execution order of A and B will be the same as
@@ -52,7 +52,7 @@
 /// it's SYCL responsibility to allocate and/or copy memory to the target
 /// context to achieve correct execution.
 ///
-/// Refer to SYCL Specification 1.2.2 sections 3.4 and 3.5 to find more
+/// Refer to SYCL Specification 1.2.1 sections 3.4 and 3.5 to find more
 /// information about SYCL execution and memory model.
 ///
 /// ### Example of DPC++ application
@@ -69,7 +69,7 @@
 ///   auto BufferC = ...;
 ///
 ///   // "Copy command group" section
-///   // Request processing copy "explicit" operation on CPU
+///   // Request processing explicit copy operation on CPU
 ///   // The copy operation reads from BufferA and writes to BufferB
 ///
 ///   CPU_Queue.submit([&](handler &CGH) {
@@ -93,17 +93,17 @@
 ///
 ///   // "Host accessor creation" section
 ///   // Request the latest data of BufferC for the moment
-///   // This is synchronization point what means SYCL RT blocks on creation of
+///   // This is a synchronization point, which means that the DPC++ RT blocks on creation of
 ///   // the accessor until requested data is available.
 ///   auto C = BufferC.get_access<read>();
 /// }
 /// \endcode
 ///
-/// In the example above DPC++ RT does:
+/// In the example above the DPC++ RT does the following:
 ///
 /// 1. **Copy command group**.
-///    DPC++ RT allocates memory for BufferA and BufferB on CPU then execute
-///    "copy" explicit memory operation on CPU.
+///    The DPC++ RT allocates memory for BufferA and BufferB on CPU then executes
+///   an explicit copy operation on CPU.
 ///
 /// 2. **Multi command group**
 ///    DPC++ RT allocates memory for BufferC and BufferB on GPU and copy
@@ -115,7 +115,6 @@
 ///    GPU to this memory.
 ///
 /// So, the example above will be converted to the following OpenCL pseudo code
-/// (for both eager and lazy execution):
 /// \code{.cpp}
 /// // Initialization(not related to the Scheduler)
 /// Platform = clGetPlatforms(...);
@@ -220,7 +219,7 @@ struct MemObjRecord {
 
 /// 1. Allocate memory.
 ///    The command represents memory allocation operation. There can be
-///    multiple allocations for single SYCL memory object.
+///    multiple allocations for a single SYCL memory object.
 /// 2. Release memory.
 ///    The command represents memory release operation.
 /// 3. Execute command group.
@@ -232,7 +231,7 @@ struct MemObjRecord {
 ///
 /// As a main input Scheduler takes command group and returns an event
 /// representing the command group, so it can be waited on later. When a new
-/// command group comes Scheduler adds one or more nodes to the graph
+/// command group comes, Scheduler adds one or more nodes to the graph
 /// depending on the command groups' requirements. For example, if a new
 /// command group is submitted to the SYCL context which has the latest data
 /// for all the requirements, Scheduler adds a new "Execute command group"
@@ -316,16 +315,16 @@ struct MemObjRecord {
 /// commands that modify it.
 ///
 /// To detect that two command groups access the same memory object and create
-/// a dependency between them the scheduler needs to store information about
+/// a dependency between them, Scheduler needs to store information about
 /// the memory object.
 ///
 /// \subsection sched_thread_safety Thread safety
 ///
-/// To ensure thread safe execution of methods Scheduler provides access to the
-/// graph should be guarded by read-write mutex(analog of shared mutex from
+/// To ensure thread safe execution of methods, Scheduler provides access to the
+/// graph that's guarded by a read-write mutex (analog of shared mutex from
 /// C++17).
 ///
-/// An read-write mutex allows concurrent access to read-only operations, while
+/// A read-write mutex allows concurrent access to read-only operations, while
 /// write operations require exclusive access.
 ///
 /// All the methods of GraphBuilder lock the mutex in write mode because these
@@ -335,8 +334,8 @@ struct MemObjRecord {
 ///
 /// \subsection shced_err_handling Error handling
 ///
-/// There are two sources of erros that needs to be handled in Scheduler:
-/// 1. the error that happens during command enqueue process
+/// There are two sources of errors that needs to be handled in Scheduler:
+/// 1. errors that happen during command enqueue process
 /// 2. the error that happend during command execution.
 ///
 /// If error occurs during commands enqueue process Command::enqueue method
@@ -344,7 +343,7 @@ struct MemObjRecord {
 /// dependent commands (if any).
 ///
 /// An error with command processing can happen in underlying runtime, in this
-/// case Scheduler is notified asynchronously(using callback mechanism) what
+/// case Scheduler is notified asynchronously (using callback mechanism) what
 /// triggers rescheduling.
 ///
 /// \ingroup sycl_graph
@@ -355,7 +354,7 @@ class Scheduler {
   /// It's called by SYCL's queue.submit.
   ///
   /// \param CommandGroup is a unique_ptr to a command group to be added.
-  /// \return an event object to wait on for command group completetion.
+  /// \return an event object to wait on for command group completion.
   EventImplPtr addCG(std::unique_ptr<detail::CG> CommandGroup,
                      QueueImplPtr Queue);
 
@@ -379,13 +378,13 @@ class Scheduler {
 
   /// Removes buffer from the graph.
   ///
-  /// The lifetime of memory object descriptor begins when first command group
-  /// that uses memory object comes and ends when "removeMemoryObject(...)"
+  /// The lifetime of memory object descriptor begins when the first command group
+  /// that uses the memory object is submitted and ends when "removeMemoryObject(...)"
   /// method is called which means there will be no command group that uses the
   /// memory object. When removeMemoryObject is called Scheduler will enqueue
-  /// and wait on all ReleseCommand's associated with the memory object, what
-  /// effectively guarantees that all commands accessing the memory object is
-  /// complete and then resources for the memory object is freed. Then all the
+  /// and wait on all release commands associated with the memory object, which
+  /// effectively guarantees that all commands accessing the memory object are
+  /// complete and then the resources allocated for the memory object are freed. Then all the
   /// commands affecting the memory object are removed.
   ///
   /// On destruction Scheduler triggers destruction of all memory object
@@ -394,7 +393,7 @@ class Scheduler {
   ///
   /// This member function is used by \ref buffer and \ref image.
   ///
-  /// \param MemObj is a memory object that points to buffer being removed.
+  /// \param MemObj is a memory object that points to the buffer being removed.
   void removeMemoryObject(detail::SYCLMemObjI *MemObj);
 
   /// Removes finished non-leaf non-alloca commands from the subgraph
@@ -481,10 +480,10 @@ class Scheduler {
     /// (assuming that all its commands have been waited for).
     void cleanupFinishedCommands(Command *FinishedCmd);
 
-    /// Reschedules command passed using Queue provided. t
+    /// Reschedules the command passed using Queue provided.
     ///
     /// This can lead to rescheduling of all dependent commands. This can be
-    /// used when user provides "secondary" queue to submit method which may
+    /// used when the user provides a "secondary" queue to the submit method which may
     /// be used when command fails to enqueue/execute in primary queue.
     void rescheduleCommand(Command *Cmd, QueueImplPtr Queue);
 
@@ -500,13 +499,13 @@ class Scheduler {
     /// Decrements leaf counters for all leaves of the record.
     void decrementLeafCountersForRecord(MemObjRecord *Record);
 
-    /// Removes commands that use given MemObjRecord from the graph.
+    /// Removes commands that use the given MemObjRecord from the graph.
     void cleanupCommandsForRecord(MemObjRecord *Record);
 
-    /// Removes MemObjRecord for memory object passed.
+    /// Removes the MemObjRecord for the memory object passed.
     void removeRecordForMemObj(SYCLMemObjI *MemObject);
 
-    /// Add new command to leaves if needed.
+    /// Adds new command to leaves if needed.
     void addNodeToLeaves(MemObjRecord *Record, Command *Cmd,
                          access::mode AccessMode);
 
@@ -517,7 +516,7 @@ class Scheduler {
     std::vector<SYCLMemObjI *> MMemObjs;
 
   private:
-    /// Inserts required command to update memory object state in the context.
+    /// Inserts the command required to update the memory object state in the context.
     ///
     /// Copy/map/unmap operations can be inserted depending on the source and
     /// destination.
@@ -574,7 +573,7 @@ class Scheduler {
     std::array<bool, PrintOptions::Size> MPrintOptionsArray;
   };
 
-  /// Graph Processor provided interfaces for enqueueing commands and their
+  /// Graph Processor provides interfaces for enqueueing commands and their
   /// dependencies to the underlying runtime.
   ///
   /// Member functions of this class do not modify the graph.
@@ -582,17 +581,17 @@ class Scheduler {
   /// \section sched_enqueue Command enqueueing
   /// \todo lazy mode is not implemented.
   ///
-  /// The Scheduler can work in two modes of enqueueing commands: eager(default)
+  /// The Scheduler can work in two modes of enqueueing commands: eager (default)
   /// and lazy. In eager mode commands are enqueued whenever they come to the
-  /// Scheduler. In lazy mode they are not enqueued until content of the buffer
+  /// Scheduler. In lazy mode they are not enqueued until the content of the buffer
   /// they are accessing is requested by user.
   ///
   /// Each command has enqueue method which takes vector of events that
   /// represents dependencies and returns event which represents the command.
   /// GraphProcessor makes topological sort to get order in which commands are
   /// need to be enqueued. Then enqueue each command passing vector of events
-  /// that this command needs to wait on. If error happens during command
-  /// enqueue, the whole process is stopped, faulty command is propagated back
+  /// that this command needs to wait on. If an error happens during command
+  /// enqueue, the whole process is stopped, the faulty command is propagated back
   /// to the Scheduler.
   ///
   /// The command with dependencies that belong to different context from

From f6eb8ff95e2c18109e4b5c953ef245e8b111273f Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexbatashev@outlook.com>
Date: Thu, 9 Apr 2020 11:15:00 +0300
Subject: [PATCH 4/7] Apply suggestions from code review

Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>

Co-Authored-By: Sergey Semenov <43845535+sergey-semenov@users.noreply.github.com>
---
 sycl/source/detail/scheduler/scheduler.hpp | 28 +++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index a94860e5bfaa9..57feb83036c3a 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -229,8 +229,8 @@ struct MemObjRecord {
 ///    The command represents memory copy operation between two memory
 ///    allocations of a single memory object.
 ///
-/// As a main input Scheduler takes command group and returns an event
-/// representing the command group, so it can be waited on later. When a new
+/// As the main input Scheduler takes a command group and returns an event
+/// representing it, so it can be waited on later. When a new
 /// command group comes, Scheduler adds one or more nodes to the graph
 /// depending on the command groups' requirements. For example, if a new
 /// command group is submitted to the SYCL context which has the latest data
@@ -266,8 +266,8 @@ struct MemObjRecord {
 /// executing the first command group memory allocation must be performed.
 ///
 /// At some point Scheduler enqueues commands to the underlying devices. To do
-/// this Scheduler makes topological sort to get order in which commands are
-/// need to be enqueued. For example, the following graph (D depends on B and C,
+/// this, Scheduler performs topological sort to get the order in which commands should
+/// be enqueued. For example, the following graph (D depends on B and C,
 /// B and C depends on A) will be enqueued in the following order:
 /// \code{.cpp}
 ///   EventA = Enqueue(A, /*Deps=*/{});
@@ -311,8 +311,8 @@ struct MemObjRecord {
 /// The Scheduler is split up into two parts: graph builder and graph
 /// processor.
 ///
-/// To build dependencies Scheduler needs to memorize memory object and
-/// commands that modify it.
+/// To build dependencies, Scheduler needs to memorize memory objects and
+/// commands that modify them.
 ///
 /// To detect that two command groups access the same memory object and create
 /// a dependency between them, Scheduler needs to store information about
@@ -338,8 +338,8 @@ struct MemObjRecord {
 /// 1. errors that happen during command enqueue process
 /// 2. the error that happend during command execution.
 ///
-/// If error occurs during commands enqueue process Command::enqueue method
-/// return faulty command. The Scheduler then reschedules the command and all
+/// If an error occurs during command enqueue process, the Command::enqueue method
+/// returns the faulty command. Scheduler then reschedules the command and all
 /// dependent commands (if any).
 ///
 /// An error with command processing can happen in underlying runtime, in this
@@ -484,7 +484,7 @@ class Scheduler {
     ///
     /// This can lead to rescheduling of all dependent commands. This can be
     /// used when the user provides a "secondary" queue to the submit method which may
-    /// be used when command fails to enqueue/execute in primary queue.
+    /// be used when the command fails to enqueue/execute in the primary queue.
     void rescheduleCommand(Command *Cmd, QueueImplPtr Queue);
 
     /// \return a pointer to the corresponding memory object record for the
@@ -588,15 +588,15 @@ class Scheduler {
   ///
   /// Each command has enqueue method which takes vector of events that
   /// represents dependencies and returns event which represents the command.
-  /// GraphProcessor makes topological sort to get order in which commands are
-  /// need to be enqueued. Then enqueue each command passing vector of events
+  /// GraphProcessor performs topological sort to get the order in which commands have to
+  /// be enqueued. Then it enqueues each command, passing a vector of events
   /// that this command needs to wait on. If an error happens during command
   /// enqueue, the whole process is stopped, the faulty command is propagated back
   /// to the Scheduler.
   ///
-  /// The command with dependencies that belong to different context from
-  /// command's one can't be enqueued directly(limitation of OpenCL runtime).
-  /// Instead for each dependency a proxy event in the target context is created
+  /// The command with dependencies that belong to a context different from its own
+  /// can't be enqueued directly (limitation of OpenCL runtime).
+  /// Instead, for each dependency, a proxy event is created in the target context
   /// and linked using OpenCL callback mechanism with original one. For example,
   /// the following SYCL code:
   ///

From f9e6934278d976b8f51416b1a71142c24a39a97f Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexander.batashev@intel.com>
Date: Thu, 9 Apr 2020 13:34:07 +0300
Subject: [PATCH 5/7] Address more comments

Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>
---
 sycl/source/detail/scheduler/commands.hpp  |  73 +++++++--------
 sycl/source/detail/scheduler/scheduler.hpp | 101 ++++++++++-----------
 2 files changed, 84 insertions(+), 90 deletions(-)

diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index a9affcc1796f7..c3a3edcdbaf33 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -49,7 +49,7 @@ struct EnqueueResultT {
   EnqueueResultT(ResultT Result = SyclEnqueueSuccess, Command *Cmd = nullptr,
                  cl_int ErrCode = CL_SUCCESS)
       : MResult(Result), MCmd(Cmd), MErrCode(ErrCode) {}
-  /// Indicates result of enqueueing.
+  /// Indicates the result of enqueueing.
   ResultT MResult;
   /// Pointer to the command which failed to enqueue.
   Command *MCmd;
@@ -77,11 +77,11 @@ struct DepDesc {
   AllocaCommandBase *MAllocaCmd = nullptr;
 };
 
-/// The Command represents some action that needs to be performed on one or
-/// more memory objects. The Command has a vector of DepDesc objects that
-/// represent dependencies of the command. It has a vector of pointers to commands
-/// that depend on the command. It has a pointer to \ref queue object and an
-/// event that is associated with the command.
+/// The Command class represents some action that needs to be performed on one
+/// or more memory objects. The Command has a vector of DepDesc objects that
+/// represent dependencies of the command. It has a vector of pointers to
+/// commands that depend on the command. It has a pointer to a \ref queue object
+/// and an event that is associated with the command.
 ///
 /// \ingroup sycl_graph
 class Command {
@@ -112,8 +112,8 @@ class Command {
   /// Checks if the command is enqueued, and calls enqueueImp.
   ///
   /// \param EnqueueResult is set to the specific status if enqueue failed.
-  /// \param Blocking if this argument is true, function will wait for command
-  /// to be unblocked before calling enqueueImp.
+  /// \param Blocking if this argument is true, function will wait for the
+  ///        command to be unblocked before calling enqueueImp.
   /// \return true if the command is enqueued.
   bool enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking);
 
@@ -136,23 +136,23 @@ class Command {
   /// Looks at all the dependencies for the release command and enables
   /// instrumentation to report these dependencies as edges.
   void resolveReleaseDependencies(std::set<Command *> &list);
-  /// Creates an edge event when the dependency is a command
+  /// Creates an edge event when the dependency is a command.
   void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr,
                                          const string_class &Prefix,
                                          bool IsCommand);
-  /// Creates an edge event when the dependency is an event
+  /// Creates an edge event when the dependency is an event.
   void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr);
-  /// Creates a signal event with the enqueued kernel event handle
+  /// Creates a signal event with the enqueued kernel event handle.
   void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr);
   /// Create a trace event of node_create type; this must be guarded by a
-  /// check for xptiTraceEnabled()
-  /// Post Condition: MTraceEvent will be set to the event created
-  /// \param MAddress  The address to use to create the payload
+  /// check for xptiTraceEnabled().
+  /// Post Condition: MTraceEvent will be set to the event created.
+  /// \param MAddress  The address to use to create the payload.
   uint64_t makeTraceEventProlog(void *MAddress);
   /// If prolog has been run, run epilog; this must be guarded by a check for
-  /// xptiTraceEnabled()
+  /// xptiTraceEnabled().
   void makeTraceEventEpilog();
-  /// Emits an event of Type
+  /// Emits an event of Type.
   void emitInstrumentation(uint16_t Type, const char *Txt = nullptr);
 
   // End Methods needed to support SYCL instrumentation
@@ -178,7 +178,7 @@ class Command {
   /// Private interface. Derived classes should implement this method.
   virtual cl_int enqueueImp() = 0;
 
-  /// The type of the command
+  /// The type of the command.
   CommandType MType;
   /// Mutex used to protect enqueueing from race conditions
   std::mutex MEnqueueMtx;
@@ -186,16 +186,16 @@ class Command {
 public:
   /// Contains list of dependencies(edges)
   std::vector<DepDesc> MDeps;
-  /// Contains list of commands that depend on the command
+  /// Contains list of commands that depend on the command.
   std::unordered_set<Command *> MUsers;
-  /// Indicates whether the command can be blocked from enqueueing
+  /// Indicates whether the command can be blocked from enqueueing.
   bool MIsBlockable = false;
-  /// Counts the number of memory objects this command is a leaf for
+  /// Counts the number of memory objects this command is a leaf for.
   unsigned MLeafCounter = 0;
 
   const char *MBlockReason = "Unknown";
 
-  /// Describes the status of a command
+  /// Describes the status of the command.
   std::atomic<EnqueueResultT::ResultT> MEnqueueStatus;
 
   // All member variable defined here  are needed for the SYCL instrumentation
@@ -203,25 +203,26 @@ class Command {
   // to ensure we have the same object layout when the macro in the library and
   // SYCL app are not the same.
 
-  /// The event for node_create and task_begin
+  /// The event for node_create and task_begin.
   void *MTraceEvent = nullptr;
-  /// The stream under which the traces are emitted; stream ids are
-  /// positive integers and we set it to an invalid value
+  /// The stream under which the traces are emitted.
+  ///
+  /// Stream ids are positive integers and we set it to an invalid value.
   int32_t MStreamID = -1;
   /// Reserved for storing the object address such as SPIRV or memory object
-  /// address
+  /// address.
   void *MAddress = nullptr;
-  /// Buffer to build the address string
+  /// Buffer to build the address string.
   string_class MAddressString;
-  /// Buffer to build the command node type
+  /// Buffer to build the command node type.
   string_class MCommandNodeType;
-  /// Buffer to build the command end-user understandable name
+  /// Buffer to build the command end-user understandable name.
   string_class MCommandName;
-  /// Flag to indicate if makeTraceEventProlog() has been run
+  /// Flag to indicate if makeTraceEventProlog() has been run.
   bool MTraceEventPrologComplete = false;
-  /// Flag to indicate if this is the first time we are seeing this payload
+  /// Flag to indicate if this is the first time we are seeing this payload.
   bool MFirstInstance = false;
-  /// Instance ID tracked for the command
+  /// Instance ID tracked for the command.
   uint64_t MInstanceID = 0;
 };
 
@@ -242,8 +243,8 @@ class EmptyCommand : public Command {
   Requirement MRequirement;
 };
 
-/// The release command enqueues release of a memory object instance allocated on Host or
-/// underlying framework.
+/// The release command enqueues release of a memory object instance allocated
+/// on Host or underlying framework.
 class ReleaseCommand : public Command {
 public:
   ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd);
@@ -286,7 +287,7 @@ class AllocaCommandBase : public Command {
   bool MIsActive = true;
 
   /// Indicates that the command owns memory allocation in case of connected
-  /// alloca command
+  /// alloca command.
   bool MIsLeaderAlloca = true;
 
 protected:
@@ -309,7 +310,7 @@ class AllocaCommand : public AllocaCommandBase {
   cl_int enqueueImp() final;
 
   /// The flag indicates that alloca should try to reuse pointer provided by
-  /// the user during memory object construction
+  /// the user during memory object construction.
   bool MInitFromUserData = false;
 };
 
@@ -329,7 +330,7 @@ class AllocaSubBufCommand : public AllocaCommandBase {
   AllocaCommandBase *MParentAlloca = nullptr;
 };
 
-/// The map command enqueues mapping of host memory onto device memory.
+/// The map command enqueues mapping of device memory onto host memory.
 class MapMemObject : public Command {
 public:
   MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr,
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 57feb83036c3a..410939f9426f1 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -33,12 +33,13 @@
 ///
 /// The SYCL framework defines command group (\ref CG) as an entity that
 /// represents minimal execution block. The command group is submitted to SYCL
-/// queue and consists of a kernel and its requirements. The SYCL queue defines
-/// the device and context using which the kernel should be executed.
+/// queue and consists of a kernel or an explicit memory operation, and their
+/// requirements. The SYCL queue defines the device and context using which the
+/// kernel should be executed.
 ///
-/// There are also command groups that consist of memory requirements and
-/// an explicit memory operation, such as copy, fill, update_host. In this case
-/// it's up to an implementation how to implement these operations.
+/// The commands that contain explicit memory operations include copy, fill,
+/// update_host and other operations. It's up to implementation how to define
+/// these operations.
 ///
 /// The relative order of command groups submission defines the order in which
 /// kernels must be executed if their memory requirements intersect. For
@@ -93,8 +94,8 @@
 ///
 ///   // "Host accessor creation" section
 ///   // Request the latest data of BufferC for the moment
-///   // This is a synchronization point, which means that the DPC++ RT blocks on creation of
-///   // the accessor until requested data is available.
+///   // This is a synchronization point, which means that the DPC++ RT blocks
+///   // on creation of the accessor until requested data is available.
 ///   auto C = BufferC.get_access<read>();
 /// }
 /// \endcode
@@ -102,8 +103,8 @@
 /// In the example above the DPC++ RT does the following:
 ///
 /// 1. **Copy command group**.
-///    The DPC++ RT allocates memory for BufferA and BufferB on CPU then executes
-///   an explicit copy operation on CPU.
+///    The DPC++ RT allocates memory for BufferA and BufferB on CPU then
+///    executes an explicit copy operation on CPU.
 ///
 /// 2. **Multi command group**
 ///    DPC++ RT allocates memory for BufferC and BufferB on GPU and copy
@@ -266,8 +267,8 @@ struct MemObjRecord {
 /// executing the first command group memory allocation must be performed.
 ///
 /// At some point Scheduler enqueues commands to the underlying devices. To do
-/// this, Scheduler performs topological sort to get the order in which commands should
-/// be enqueued. For example, the following graph (D depends on B and C,
+/// this, Scheduler performs topological sort to get the order in which commands
+/// should be enqueued. For example, the following graph (D depends on B and C,
 /// B and C depends on A) will be enqueued in the following order:
 /// \code{.cpp}
 ///   EventA = Enqueue(A, /*Deps=*/{});
@@ -308,8 +309,7 @@ struct MemObjRecord {
 ///
 /// \section sched_impl Implementation details
 ///
-/// The Scheduler is split up into two parts: graph builder and graph
-/// processor.
+/// The Scheduler is split up into two parts: graph builder and graph processor.
 ///
 /// To build dependencies, Scheduler needs to memorize memory objects and
 /// commands that modify them.
@@ -338,9 +338,9 @@ struct MemObjRecord {
 /// 1. errors that happen during command enqueue process
 /// 2. the error that happend during command execution.
 ///
-/// If an error occurs during command enqueue process, the Command::enqueue method
-/// returns the faulty command. Scheduler then reschedules the command and all
-/// dependent commands (if any).
+/// If an error occurs during command enqueue process, the Command::enqueue
+/// method returns the faulty command. Scheduler then reschedules the command
+/// and all dependent commands (if any).
 ///
 /// An error with command processing can happen in underlying runtime, in this
 /// case Scheduler is notified asynchronously (using callback mechanism) what
@@ -378,26 +378,23 @@ class Scheduler {
 
   /// Removes buffer from the graph.
   ///
-  /// The lifetime of memory object descriptor begins when the first command group
-  /// that uses the memory object is submitted and ends when "removeMemoryObject(...)"
-  /// method is called which means there will be no command group that uses the
-  /// memory object. When removeMemoryObject is called Scheduler will enqueue
-  /// and wait on all release commands associated with the memory object, which
-  /// effectively guarantees that all commands accessing the memory object are
-  /// complete and then the resources allocated for the memory object are freed. Then all the
-  /// commands affecting the memory object are removed.
-  ///
-  /// On destruction Scheduler triggers destruction of all memory object
-  /// descriptors in order to wait on all commands not yet executed and all
-  /// memory it manages.
+  /// The lifetime of memory object descriptor begins when the first command
+  /// group that uses the memory object is submitted and ends when
+  /// "removeMemoryObject(...)" method is called which means there will be no
+  /// command group that uses the memory object. When removeMemoryObject is
+  /// called Scheduler will enqueue and wait on all release commands associated
+  /// with the memory object, which effectively guarantees that all commands
+  /// accessing the memory object are complete and then the resources allocated
+  /// for the memory object are freed. Then all the commands affecting the
+  /// memory object are removed.
   ///
   /// This member function is used by \ref buffer and \ref image.
   ///
   /// \param MemObj is a memory object that points to the buffer being removed.
   void removeMemoryObject(detail::SYCLMemObjI *MemObj);
 
-  /// Removes finished non-leaf non-alloca commands from the subgraph
-  /// (assuming that all its commands have been waited for).
+  /// Removes finished non-leaf non-alloca commands from the subgraph (assuming
+  /// that all its commands have been waited for).
   /// \sa GraphBuilder::cleanupFinishedCommands
   ///
   /// \param FinishedEvent is a cleanup candidate event.
@@ -458,13 +455,12 @@ class Scheduler {
     Command *addCGUpdateHost(std::unique_ptr<detail::CG> CommandGroup,
                              QueueImplPtr HostQueue);
 
-    /// Registers a \ref CG "command group" to update memory to the latest
-    /// state.
+    /// Enqueues a command to update memory to the latest state.
     ///
     /// \param Req is a requirement, that describes memory object.
     Command *addCopyBack(Requirement *Req);
 
-    /// Registers a \ref CG "command group" to create a host accessor.
+    /// Enqueues a command to create a host accessor.
     ///
     /// \param Req points to memory being accessed.
     Command *addHostAccessor(Requirement *Req, const bool destructor = false);
@@ -483,8 +479,9 @@ class Scheduler {
     /// Reschedules the command passed using Queue provided.
     ///
     /// This can lead to rescheduling of all dependent commands. This can be
-    /// used when the user provides a "secondary" queue to the submit method which may
-    /// be used when the command fails to enqueue/execute in the primary queue.
+    /// used when the user provides a "secondary" queue to the submit method
+    /// which may be used when the command fails to enqueue/execute in the
+    /// primary queue.
     void rescheduleCommand(Command *Cmd, QueueImplPtr Queue);
 
     /// \return a pointer to the corresponding memory object record for the
@@ -516,7 +513,8 @@ class Scheduler {
     std::vector<SYCLMemObjI *> MMemObjs;
 
   private:
-    /// Inserts the command required to update the memory object state in the context.
+    /// Inserts the command required to update the memory object state in the
+    /// context.
     ///
     /// Copy/map/unmap operations can be inserted depending on the source and
     /// destination.
@@ -579,26 +577,21 @@ class Scheduler {
   /// Member functions of this class do not modify the graph.
   ///
   /// \section sched_enqueue Command enqueueing
-  /// \todo lazy mode is not implemented.
-  ///
-  /// The Scheduler can work in two modes of enqueueing commands: eager (default)
-  /// and lazy. In eager mode commands are enqueued whenever they come to the
-  /// Scheduler. In lazy mode they are not enqueued until the content of the buffer
-  /// they are accessing is requested by user.
   ///
-  /// Each command has enqueue method which takes vector of events that
-  /// represents dependencies and returns event which represents the command.
-  /// GraphProcessor performs topological sort to get the order in which commands have to
-  /// be enqueued. Then it enqueues each command, passing a vector of events
-  /// that this command needs to wait on. If an error happens during command
-  /// enqueue, the whole process is stopped, the faulty command is propagated back
-  /// to the Scheduler.
+  /// Commands are enqueued whenever they come to the Scheduler. Each command
+  /// has enqueue method which takes vector of events that represents
+  /// dependencies and returns event which represents the command.
+  /// GraphProcessor performs topological sort to get the order in which
+  /// commands have to be enqueued. Then it enqueues each command, passing a
+  /// vector of events that this command needs to wait on. If an error happens
+  /// during command enqueue, the whole process is stopped, the faulty command
+  /// is propagated back to the Scheduler.
   ///
-  /// The command with dependencies that belong to a context different from its own
-  /// can't be enqueued directly (limitation of OpenCL runtime).
-  /// Instead, for each dependency, a proxy event is created in the target context
-  /// and linked using OpenCL callback mechanism with original one. For example,
-  /// the following SYCL code:
+  /// The command with dependencies that belong to a context different from its
+  /// own can't be enqueued directly (limitation of OpenCL runtime).
+  /// Instead, for each dependency, a proxy event is created in the target
+  /// context and linked using OpenCL callback mechanism with original one.
+  /// For example, the following SYCL code:
   ///
   /// \code{.cpp}
   ///   // The ContextA and ContextB are different OpenCL contexts

From 93c08b00b3d3adc8bc7b9ece0926ddcb8414daef Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexander.batashev@intel.com>
Date: Thu, 9 Apr 2020 14:16:08 +0300
Subject: [PATCH 6/7] format

Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>
---
 sycl/source/detail/scheduler/graph_builder.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index a16d382fbb9af..b4a7a7273be91 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -446,10 +446,10 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost(
   return insertMemoryMove(Record, Req, HostQueue);
 }
 
-/// Start the search for the record from list of "leaf" commands and check if the
-/// examined command can be executed in parallel with the new one with regard to
-/// the memory object. If it can, then continue searching through dependencies of
-/// that command. There are several rules used:
+/// Start the search for the record from list of "leaf" commands and check if
+/// the examined command can be executed in parallel with the new one with
+/// regard to the memory object. If it can, then continue searching through
+/// dependencies of that command. There are several rules used:
 ///
 /// 1. New and examined commands only read -> can bypass
 /// 2. New and examined commands has non-overlapping requirements -> can bypass

From 8bf29c706813763306ed336d25ba583759a02405 Mon Sep 17 00:00:00 2001
From: Alexander Batashev <alexander.batashev@intel.com>
Date: Tue, 14 Apr 2020 09:17:16 +0300
Subject: [PATCH 7/7] Fix more comments

Signed-off-by: Alexander Batashev <alexander.batashev@intel.com>
---
 sycl/source/detail/scheduler/scheduler.hpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 410939f9426f1..5affe5bde4977 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -21,7 +21,7 @@
 
 /// \defgroup sycl_graph DPC++ Execution Graph
 ///
-/// DPC++, unlike OpenCL, provides a programming model in which the user doesn't
+/// SYCL, unlike OpenCL, provides a programming model in which the user doesn't
 /// need to manage dependencies between kernels and memory explicitly. The DPC++
 /// Runtime must ensure correct execution with respect to the order commands are
 /// submitted.
@@ -60,11 +60,11 @@
 ///
 /// \code{.cpp}
 /// {
-///   // Creating DPC++ CPU and GPU queues
+///   // Creating SYCL CPU and GPU queues
 ///   cl::sycl::queue CPU_Queue = ...;
 ///   cl::sycl::queue GPU_Queue = ...;
 ///
-///   // Creating 3 DPC++ buffers
+///   // Creating 3 SYCL buffers
 ///   auto BufferA = ...; // Buffer is initialized with host memory.
 ///   auto BufferB = ...;
 ///   auto BufferC = ...;
@@ -369,9 +369,7 @@ class Scheduler {
   /// Waits for the event.
   ///
   /// This operation is blocking. For eager execution mode this method invokes
-  /// corresponding function of device API. In lazy execution mode the method
-  /// may enqueue the command, associated with the event, and its dependency
-  /// before calling device API.
+  /// corresponding function of device API.
   ///
   /// \param Event is a pointer to event to wait on.
   void waitForEvent(EventImplPtr Event);