From 8d7eed9d77f766d377c0e89bf8f8062d2aafe9a4 Mon Sep 17 00:00:00 2001
From: Vasanth Tovinkere <vasanth.tovinkere@intel.com>
Date: Tue, 10 Mar 2020 07:23:25 -0700
Subject: [PATCH] [SYCL][XPTI] Instrumentation of SYCL runtime with XPTI

+ XPTI proxy library to provide entry points into the
  xpti instrumentation framework
+ Header specification of the XPTI framework API calls
+ Documentation describing the changes to SYCL runtime
  available under xpti/doc
+ SYCL runtime instrumentation using XPTI proxy library
  to monitor the creation of the asynchronous task graph
  (nodes and dependencies)
+ Instrumentation of entry points in queue to capture
  the end-user source code locations of calls to submit,
  parallel_for etc.
+ Updates to the CMakeLists.txt to soft enable the XPTI
  instrumentation and linking of the SYCL library with
  the XPTI proxy/stub library
+ Updates to the CI scripts to include XPTI proxy library
  in building along with enabling of the instrumentation
  in the SYCL library

Signed-off-by: Vasanth Tovinkere <vasanth.tovinkere@intel.com>
---
 buildbot/configure.py                         |   7 +-
 sycl/CMakeLists.txt                           |  11 +
 sycl/include/CL/sycl/detail/cg.hpp            |  59 +-
 sycl/include/CL/sycl/detail/common.hpp        |  68 +-
 sycl/include/CL/sycl/detail/pi.hpp            |   7 +
 sycl/include/CL/sycl/handler.hpp              |   5 +-
 sycl/include/CL/sycl/ordered_queue.hpp        | 146 +++-
 sycl/include/CL/sycl/queue.hpp                | 356 ++++++--
 sycl/source/CMakeLists.txt                    |  45 +-
 sycl/source/detail/event_impl.cpp             |  70 ++
 sycl/source/detail/event_impl.hpp             |  20 +-
 sycl/source/detail/pi.cpp                     |  67 ++
 sycl/source/detail/queue_impl.cpp             |  99 +++
 sycl/source/detail/queue_impl.hpp             |  42 +-
 sycl/source/detail/scheduler/commands.cpp     | 761 +++++++++++++++++-
 sycl/source/detail/scheduler/commands.hpp     | 120 ++-
 sycl/source/detail/scheduler/scheduler.cpp    |  16 +
 sycl/source/handler.cpp                       |  25 +-
 sycl/source/ordered_queue.cpp                 |  22 +-
 sycl/source/queue.cpp                         |  22 +-
 sycl/unittests/scheduler/BlockedCommands.cpp  |   2 +
 .../scheduler/SchedulerTestUtils.hpp          |   1 +
 xpti/CMakeLists.txt                           |  55 ++
 xpti/README.md                                |  22 +
 xpti/doc/SYCL_Tracing_Implementation.md       | 236 ++++++
 xpti/include/xpti_data_types.h                | 543 +++++++++++++
 xpti/include/xpti_trace_framework.h           | 414 ++++++++++
 xpti/include/xpti_trace_framework.hpp         | 321 ++++++++
 xpti/src/CMakeLists.txt                       |  18 +
 xpti/src/xpti_proxy.cpp                       | 329 ++++++++
 30 files changed, 3650 insertions(+), 259 deletions(-)
 create mode 100644 xpti/CMakeLists.txt
 create mode 100644 xpti/README.md
 create mode 100644 xpti/doc/SYCL_Tracing_Implementation.md
 create mode 100644 xpti/include/xpti_data_types.h
 create mode 100644 xpti/include/xpti_trace_framework.h
 create mode 100644 xpti/include/xpti_trace_framework.hpp
 create mode 100644 xpti/src/CMakeLists.txt
 create mode 100644 xpti/src/xpti_proxy.cpp

diff --git a/buildbot/configure.py b/buildbot/configure.py
index c0e4dc3eb6d31..3d6c95393cae2 100644
--- a/buildbot/configure.py
+++ b/buildbot/configure.py
@@ -10,10 +10,11 @@ def do_configure(args):
     llvm_dir = os.path.join(args.src_dir, "llvm")
     sycl_dir = os.path.join(args.src_dir, "sycl")
     spirv_dir = os.path.join(args.src_dir, "llvm-spirv")
+    xpti_dir = os.path.join(args.src_dir, "xpti")
     ocl_header_dir = os.path.join(args.obj_dir, "OpenCL-Headers")
     icd_loader_lib = os.path.join(args.obj_dir, "OpenCL-ICD-Loader", "build")
     llvm_targets_to_build = 'X86'
-    llvm_enable_projects = 'clang;llvm-spirv;sycl;opencl-aot'
+    llvm_enable_projects = 'clang;llvm-spirv;sycl;opencl-aot;xpti'
     libclc_targets_to_build = ''
     sycl_build_pi_cuda = 'OFF'
     llvm_enable_assertions = 'ON'
@@ -44,9 +45,10 @@ def do_configure(args):
         "-DCMAKE_BUILD_TYPE={}".format(args.build_type),
         "-DLLVM_ENABLE_ASSERTIONS={}".format(llvm_enable_assertions),
         "-DLLVM_TARGETS_TO_BUILD={}".format(llvm_targets_to_build),
-        "-DLLVM_EXTERNAL_PROJECTS=sycl;llvm-spirv;opencl-aot",
+        "-DLLVM_EXTERNAL_PROJECTS=sycl;llvm-spirv;opencl-aot;xpti",
         "-DLLVM_EXTERNAL_SYCL_SOURCE_DIR={}".format(sycl_dir),
         "-DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR={}".format(spirv_dir),
+        "-DLLVM_EXTERNAL_XPTI_SOURCE_DIR={}".format(xpti_dir),
         "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects),
         "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build),
         "-DOpenCL_INCLUDE_DIR={}".format(ocl_header_dir),
@@ -57,6 +59,7 @@ def do_configure(args):
         "-DCMAKE_INSTALL_PREFIX={}".format(install_dir),
         "-DSYCL_INCLUDE_TESTS=ON", # Explicitly include all kinds of SYCL tests.
         "-DLLVM_ENABLE_DOXYGEN={}".format(llvm_enable_doxygen),
+        "-DSYCL_ENABLE_XPTI_TRACING=ON", # Explicitly turn on XPTI tracing
         llvm_dir
     ]
 
diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index 8b2883ceca745..85c67129fe4e2 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -26,6 +26,10 @@ if(SYCL_ENABLE_WERROR)
   endif()
 endif()
 
+# Create a soft option for enabling or disabling the instrumentation
+# of the SYCL runtime and expect enabling
+option(SYCL_ENABLE_XPTI_TRACING "Enable tracing of SYCL constructs" OFF)
+
 if(MSVC)
   set_property(GLOBAL PROPERTY USE_FOLDERS ON)
   # Skip asynchronous C++ exceptions catching and assume "extern C" functions
@@ -218,6 +222,13 @@ add_custom_target( sycl-toolchain
   COMMENT "Building SYCL compiler toolchain..."
 )
 
+if (SYCL_ENABLE_XPTI_TRACING)
+  add_dependencies( sycl-toolchain xpti)
+  if (MSVC)
+    add_dependencies( sycl-toolchain xptid)
+  endif()
+endif()
+
 if (NOT DEFINED LLVM_INCLUDE_TESTS)
   set(LLVM_INCLUDE_TESTS ON)
 endif()
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
index e4d0a0d30967c..a87daa3e8e154 100644
--- a/sycl/include/CL/sycl/detail/cg.hpp
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -368,11 +368,21 @@ class CG {
      vector_class<detail::AccessorImplPtr> AccStorage,
      vector_class<shared_ptr_class<const void>> SharedPtrStorage,
      vector_class<Requirement *> Requirements,
-     vector_class<detail::EventImplPtr> Events)
+     vector_class<detail::EventImplPtr> Events, detail::code_location loc = {})
       : MType(Type), MArgsStorage(std::move(ArgsStorage)),
         MAccStorage(std::move(AccStorage)),
         MSharedPtrStorage(std::move(SharedPtrStorage)),
-        MRequirements(std::move(Requirements)), MEvents(std::move(Events)) {}
+        MRequirements(std::move(Requirements)), MEvents(std::move(Events)) {
+    // Capture the user code-location from Q.submit(), Q.parallel_for()
+    // etc for later use; if code location information is not available,
+    // the file name and function name members will be empty strings
+    if (loc.functionName())
+      MFunctionName = loc.functionName();
+    if (loc.fileName())
+      MFileName = loc.fileName();
+    MLine = loc.lineNumber();
+    MColumn = loc.columnNumber();
+  }
 
   CG(CG &&CommandGroup) = default;
 
@@ -397,6 +407,12 @@ class CG {
   vector_class<Requirement *> MRequirements;
   // List of events that order the execution of this CG
   vector_class<detail::EventImplPtr> MEvents;
+  // Member variables to capture the user code-location
+  // information from Q.submit(), Q.parallel_for() etc
+  // Storage for function name and source file name
+  string_class MFunctionName, MFileName;
+  // Storage for line and column of code location
+  int32_t MLine, MColumn;
 };
 
 // The class which represents "execute kernel" command group.
@@ -420,10 +436,10 @@ class CGExecKernel : public CG {
                vector_class<ArgDesc> Args, string_class KernelName,
                detail::OSModuleHandle OSModuleHandle,
                vector_class<shared_ptr_class<detail::stream_impl>> Streams,
-               CGTYPE Type)
+               CGTYPE Type, detail::code_location loc = {})
       : CG(Type, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
         MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)),
         MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle),
@@ -450,10 +466,11 @@ class CGCopy : public CG {
          vector_class<detail::AccessorImplPtr> AccStorage,
          vector_class<shared_ptr_class<const void>> SharedPtrStorage,
          vector_class<Requirement *> Requirements,
-         vector_class<detail::EventImplPtr> Events)
+         vector_class<detail::EventImplPtr> Events,
+         detail::code_location loc = {})
       : CG(CopyType, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MSrc(Src), MDst(Dst) {}
   void *getSrc() { return MSrc; }
   void *getDst() { return MDst; }
@@ -470,10 +487,11 @@ class CGFill : public CG {
          vector_class<detail::AccessorImplPtr> AccStorage,
          vector_class<shared_ptr_class<const void>> SharedPtrStorage,
          vector_class<Requirement *> Requirements,
-         vector_class<detail::EventImplPtr> Events)
+         vector_class<detail::EventImplPtr> Events,
+         detail::code_location loc = {})
       : CG(FILL, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MPattern(std::move(Pattern)), MPtr((Requirement *)Ptr) {}
   Requirement *getReqToFill() { return MPtr; }
 };
@@ -487,10 +505,11 @@ class CGUpdateHost : public CG {
                vector_class<detail::AccessorImplPtr> AccStorage,
                vector_class<shared_ptr_class<const void>> SharedPtrStorage,
                vector_class<Requirement *> Requirements,
-               vector_class<detail::EventImplPtr> Events)
+               vector_class<detail::EventImplPtr> Events,
+               detail::code_location loc = {})
       : CG(UPDATE_HOST, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MPtr((Requirement *)Ptr) {}
 
   Requirement *getReqToUpdate() { return MPtr; }
@@ -508,10 +527,11 @@ class CGCopyUSM : public CG {
             vector_class<detail::AccessorImplPtr> AccStorage,
             vector_class<shared_ptr_class<const void>> SharedPtrStorage,
             vector_class<Requirement *> Requirements,
-            vector_class<detail::EventImplPtr> Events)
+            vector_class<detail::EventImplPtr> Events,
+            detail::code_location loc = {})
       : CG(COPY_USM, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MSrc(Src), MDst(Dst), MLength(Length) {}
 
   void *getSrc() { return MSrc; }
@@ -531,10 +551,11 @@ class CGFillUSM : public CG {
             vector_class<detail::AccessorImplPtr> AccStorage,
             vector_class<shared_ptr_class<const void>> SharedPtrStorage,
             vector_class<Requirement *> Requirements,
-            vector_class<detail::EventImplPtr> Events)
+            vector_class<detail::EventImplPtr> Events,
+            detail::code_location loc = {})
       : CG(FILL_USM, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MPattern(std::move(Pattern)), MDst(DstPtr), MLength(Length) {}
   void *getDst() { return MDst; }
   size_t getLength() { return MLength; }
@@ -552,10 +573,11 @@ class CGPrefetchUSM : public CG {
                 vector_class<detail::AccessorImplPtr> AccStorage,
                 vector_class<shared_ptr_class<const void>> SharedPtrStorage,
                 vector_class<Requirement *> Requirements,
-                vector_class<detail::EventImplPtr> Events)
+                vector_class<detail::EventImplPtr> Events,
+                detail::code_location loc = {})
       : CG(PREFETCH_USM, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MDst(DstPtr), MLength(Length) {}
   void *getDst() { return MDst; }
   size_t getLength() { return MLength; }
@@ -570,10 +592,11 @@ class CGInteropTask : public CG {
                 std::vector<detail::AccessorImplPtr> AccStorage,
                 std::vector<std::shared_ptr<const void>> SharedPtrStorage,
                 std::vector<Requirement *> Requirements,
-                std::vector<detail::EventImplPtr> Events, CGTYPE Type)
+                std::vector<detail::EventImplPtr> Events, CGTYPE Type,
+                detail::code_location loc = {})
       : CG(Type, std::move(ArgsStorage), std::move(AccStorage),
            std::move(SharedPtrStorage), std::move(Requirements),
-           std::move(Events)),
+           std::move(Events), std::move(loc)),
         MInteropTask(std::move(InteropTask)) {}
 };
 
diff --git a/sycl/include/CL/sycl/detail/common.hpp b/sycl/include/CL/sycl/detail/common.hpp
index 2dfe8ef71dc74..141d1fffdd7b8 100644
--- a/sycl/include/CL/sycl/detail/common.hpp
+++ b/sycl/include/CL/sycl/detail/common.hpp
@@ -16,24 +16,84 @@
 #include <CL/cl.h>
 #include <CL/cl_ext.h>
 #include <CL/cl_ext_intel.h>
+
+#include <cstdint>
 #include <string>
 #include <type_traits>
 
 #define STRINGIFY_LINE_HELP(s) #s
 #define STRINGIFY_LINE(s) STRINGIFY_LINE_HELP(s)
 
+// Default signature enables the passing of user code location information to
+// public methods as a default argument. If the end-user wants to disable the
+// code location information, they must compile the code with
+// -DDISABLE_SYCL_INSTRUMENTATION_METADATA flag
+__SYCL_INLINE_NAMESPACE(cl) {
+namespace sycl {
+namespace detail {
+// We define a sycl stream name and this will
+// be used by the instrumentation framework
+constexpr const char *SYCL_STREAM_NAME = "sycl";
+// Data structure that captures the user code
+// location information using the builtin capabilities
+// of the compiler
+struct code_location {
+#ifdef _MSC_VER
+  // Since MSVC does not support the required builtins, we
+  // implement the version with "unknown"s which is handled
+  // correctly by the instrumentation
+  static constexpr code_location current(const char *fileName = nullptr,
+                                         const char *funcName = nullptr,
+                                         unsigned long lineNo = 0,
+                                         unsigned long columnNo = 0) noexcept {
+    return code_location(fileName, funcName, lineNo, columnNo);
+  }
+#else
+  static constexpr code_location
+  current(const char *fileName = __builtin_FILE(),
+          const char *funcName = __builtin_FUNCTION(),
+          unsigned long lineNo = __builtin_LINE(),
+          unsigned long columnNo = 0) noexcept {
+    return code_location(fileName, funcName, lineNo, columnNo);
+  }
+#endif
+
+  constexpr code_location(const char *file, const char *func, int line,
+                          int col) noexcept
+      : MFileName(file), MFunctionName(func), MLineNo(line), MColumnNo(col) {}
+
+  constexpr code_location() noexcept
+      : MFileName(nullptr), MFunctionName(nullptr), MLineNo(0), MColumnNo(0) {}
+
+  constexpr unsigned long lineNumber() const noexcept { return MLineNo; }
+  constexpr unsigned long columnNumber() const noexcept { return MColumnNo; }
+  constexpr const char *fileName() const noexcept { return MFileName; }
+  constexpr const char *functionName() const noexcept { return MFunctionName; }
+
+private:
+  const char *MFileName;
+  const char *MFunctionName;
+  unsigned long MLineNo;
+  unsigned long MColumnNo;
+};
+} // namespace detail
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
 
 const char *stringifyErrorCode(cl_int error);
 
-static inline std::string codeToString(cl_int code){
-  return std::string(std::to_string(code) + " (" +
-         stringifyErrorCode(code) + ")");
+static inline std::string codeToString(cl_int code) {
+  return std::string(std::to_string(code) + " (" + stringifyErrorCode(code) +
+                     ")");
 }
 
-}}} // __SYCL_INLINE_NAMESPACE(cl)::sycl::detail
+} // namespace detail
+} // namespace sycl
+} // __SYCL_INLINE_NAMESPACE(cl)
 
 #ifdef __SYCL_DEVICE_ONLY__
 // TODO remove this when 'assert' is supported in device code
diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp
index bea05328c81b3..a8105d2cf4acd 100644
--- a/sycl/include/CL/sycl/detail/pi.hpp
+++ b/sycl/include/CL/sycl/detail/pi.hpp
@@ -18,6 +18,13 @@
 #include <cassert>
 #include <string>
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+// Forward declarations
+namespace xpti {
+struct trace_event_data_t;
+}
+#endif
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
index 1678db2dbd80c..c64dbd4b25fa4 100644
--- a/sycl/include/CL/sycl/handler.hpp
+++ b/sycl/include/CL/sycl/handler.hpp
@@ -195,8 +195,9 @@ class handler {
   /// It's expected that the method is the latest method executed before
   /// object destruction.
   ///
-  /// \return a SYCL event object representing the command group.
-  event finalize();
+  /// \param Payload contains the code location of user code
+  /// \return a SYCL event object representing the command group
+  event finalize(const cl::sycl::detail::code_location &Payload = {});
 
   /// Saves streams associated with this handler.
   ///
diff --git a/sycl/include/CL/sycl/ordered_queue.hpp b/sycl/include/CL/sycl/ordered_queue.hpp
index 0914254631d76..240d780645e8b 100644
--- a/sycl/include/CL/sycl/ordered_queue.hpp
+++ b/sycl/include/CL/sycl/ordered_queue.hpp
@@ -88,15 +88,62 @@ class __SYCL_DEPRECATED__ ordered_queue {
   template <info::queue param>
   typename info::param_traits<info::queue, param>::return_type get_info() const;
 
-  template <typename T> event submit(T cgf) { return submit_impl(cgf); }
+  /// @param Loc is the code location of the submit call (default argument)
+  template <typename T>
+  event
+  submit(T cgf
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+         ,
+         const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit_impl(cgf, CodeLoc);
+  }
+
+  template <typename T>
+  event
+  submit(T cgf, ordered_queue &secondaryQueue
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+         ,
+         const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit_impl(cgf, secondaryQueue, CodeLoc);
+  }
 
-  template <typename T> event submit(T cgf, ordered_queue &secondaryQueue) {
-    return submit_impl(cgf, secondaryQueue);
+  /// @param CodeLoc is the code location of the submit call (default argument)
+  void wait(
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    wait_proxy(CodeLoc);
   }
 
-  void wait();
+  /// @param CodeLoc is the code location of the submit call (default argument)
+  void wait_and_throw(
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    wait_and_throw_proxy(CodeLoc);
+  }
+
+  void wait_proxy(const detail::code_location &CodeLoc);
 
-  void wait_and_throw();
+  void wait_and_throw_proxy(const detail::code_location &CodeLoc);
 
   void throw_asynchronous();
 
@@ -114,45 +161,88 @@ class __SYCL_DEPRECATED__ ordered_queue {
 
   // single_task version with a kernel represented as a lambda.
   template <typename KernelName = detail::auto_name, typename KernelType>
-  void single_task(KernelType KernelFunc) {
-    submit([&](handler &cgh) {
-      cgh.template single_task<KernelName, KernelType>(KernelFunc);
-    });
+  void single_task(
+      KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    submit(
+        [&](handler &cgh) {
+          cgh.template single_task<KernelName, KernelType>(KernelFunc);
+        },
+        CodeLoc);
   }
 
   // parallel_for version with a kernel represented as a lambda + range that
   // specifies global size only.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  void parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
+  void parallel_for(
+      range<Dims> NumWorkItems, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
     // By-value or By-reference for this?
-    submit([&](handler &cgh) {
-      cgh.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
-                                                              KernelFunc);
-    });
+    submit(
+        [&](handler &cgh) {
+          cgh.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
+                                                                  KernelFunc);
+        },
+        CodeLoc);
   }
 
   // parallel_for version with a kernel represented as a lambda + range and
   // offset that specify global size and global offset correspondingly.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  void parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                    KernelType KernelFunc) {
-    submit([&](handler &cgh) {
-      cgh.template parallel_for<KernelName, KernelType, Dims>(
-          NumWorkItems, WorkItemOffset, KernelFunc);
-    });
+  void parallel_for(
+      range<Dims> NumWorkItems, id<Dims> WorkItemOffset, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    submit(
+        [&](handler &cgh) {
+          cgh.template parallel_for<KernelName, KernelType, Dims>(
+              NumWorkItems, WorkItemOffset, KernelFunc);
+        },
+        CodeLoc);
   }
 
   // parallel_for version with a kernel represented as a lambda + nd_range that
   // specifies global, local sizes and offset.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  void parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
-    submit([&](handler &cgh) {
-      cgh.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
-                                                              KernelFunc);
-    });
+  void parallel_for(
+      nd_range<Dims> ExecutionRange, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    submit(
+        [&](handler &cgh) {
+          cgh.template parallel_for<KernelName, KernelType, Dims>(
+              ExecutionRange, KernelFunc);
+        },
+        CodeLoc);
   }
 
 private:
@@ -160,9 +250,11 @@ class __SYCL_DEPRECATED__ ordered_queue {
   template <class Obj>
   friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject);
 
-  event submit_impl(function_class<void(handler &)> CGH);
   event submit_impl(function_class<void(handler &)> CGH,
-                    ordered_queue &secondQueue);
+                    const detail::code_location &CodeLoc);
+  event submit_impl(function_class<void(handler &)> CGH,
+                    ordered_queue &secondQueue,
+                    const detail::code_location &CodeLoc);
 };
 
 #undef __SYCL_DEPRECATED__
diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
index 8a5cd41e25b2a..c315ccff3e960 100644
--- a/sycl/include/CL/sycl/queue.hpp
+++ b/sycl/include/CL/sycl/queue.hpp
@@ -167,8 +167,21 @@ class queue {
   /// scheduled for execution on the device.
   ///
   /// \param CGF is a function object containing command group.
+  /// \param CodeLoc is the code location of the submit call (default argument)
   /// \return a SYCL event object for the submitted command group.
-  template <typename T> event submit(T CGF) { return submit_impl(CGF); }
+  template <typename T>
+  event
+  submit(T CGF
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+         ,
+         const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit_impl(CGF, CodeLoc);
+  }
 
   /// Submits a command group function object to the queue, in order to be
   /// scheduled for execution on the device.
@@ -178,17 +191,38 @@ class queue {
   ///
   /// \param CGF is a function object containing command group.
   /// \param SecondaryQueue is a fallback SYCL queue.
+  /// \param CodeLoc is the code location of the submit call (default argument)
   /// \return a SYCL event object, which corresponds to the queue the command
   /// group is being enqueued on.
-  template <typename T> event submit(T CGF, queue &SecondaryQueue) {
-    return submit_impl(CGF, SecondaryQueue);
+  template <typename T>
+  event
+  submit(T CGF, queue &SecondaryQueue
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+         ,
+         const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit_impl(CGF, SecondaryQueue, CodeLoc);
   }
 
   /// Performs a blocking wait for the completion of all enqueued tasks in the
   /// queue.
   ///
   /// Synchronous errors will be reported through SYCL exceptions.
-  void wait();
+  /// @param CodeLoc is the code location of the submit call (default argument)
+  void wait(
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    wait_proxy(CodeLoc);
+  }
 
   /// Performs a blocking wait for the completion of all enqueued tasks in the
   /// queue.
@@ -197,7 +231,24 @@ class queue {
   /// errors will be passed to the async_handler passed to the queue on
   /// construction. If no async_handler was provided then asynchronous
   /// exceptions will be lost.
-  void wait_and_throw();
+  /// @param CodeLoc is the code location of the submit call (default argument)
+  void wait_and_throw(
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    wait_and_throw_proxy(CodeLoc);
+  }
+
+  /// Proxy method for wait to forward the code location information to the
+  /// implementation
+  void wait_proxy(const detail::code_location &CodeLoc);
+  /// Proxy method for wait_and_throw to forward the code location information
+  /// to the implementation
+  void wait_and_throw_proxy(const detail::code_location &CodeLoc);
 
   /// Checks if any asynchronous errors have been produced by the queue and if
   /// so reports them to the async_handler passed on the queue construction.
@@ -254,23 +305,47 @@ class queue {
   /// single_task version with a kernel represented as a lambda.
   ///
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType>
-  event single_task(KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.template single_task<KernelName, KernelType>(KernelFunc);
-    });
+  event single_task(
+      KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.template single_task<KernelName, KernelType>(KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// single_task version with a kernel represented as a lambda.
   ///
   /// \param DepEvent is an event that specifies the kernel dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType>
-  event single_task(event DepEvent, KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvent);
-      CGH.template single_task<KernelName, KernelType>(KernelFunc);
-    });
+  event single_task(
+      event DepEvent, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvent);
+          CGH.template single_task<KernelName, KernelType>(KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// single_task version with a kernel represented as a lambda.
@@ -278,13 +353,24 @@ class queue {
   /// \param DepEvents is a vector of events that specifies the kernel
   /// dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType>
-  event single_task(const vector_class<event> &DepEvents,
-                    KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvents);
-      CGH.template single_task<KernelName, KernelType>(KernelFunc);
-    });
+  event single_task(
+      const vector_class<event> &DepEvents, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvents);
+          CGH.template single_task<KernelName, KernelType>(KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
@@ -292,13 +378,25 @@ class queue {
   ///
   /// \param NumWorkItems is a range that specifies the work space of the kernel
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
-                                                              KernelFunc);
-    });
+  event parallel_for(
+      range<Dims> NumWorkItems, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
+                                                                  KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
@@ -307,15 +405,26 @@ class queue {
   /// \param NumWorkItems is a range that specifies the work space of the kernel
   /// \param DepEvent is an event that specifies the kernel dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(range<Dims> NumWorkItems, event DepEvent,
-                     KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvent);
-      CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
-                                                              KernelFunc);
-    });
+  event parallel_for(
+      range<Dims> NumWorkItems, event DepEvent, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvent);
+          CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
+                                                                  KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
@@ -325,16 +434,27 @@ class queue {
   /// \param DepEvents is a vector of events that specifies the kernel
   /// dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(range<Dims> NumWorkItems,
-                     const vector_class<event> &DepEvents,
-                     KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvents);
-      CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
-                                                              KernelFunc);
-    });
+  event parallel_for(
+      range<Dims> NumWorkItems, const vector_class<event> &DepEvents,
+      KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvents);
+          CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
+                                                                  KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range and
@@ -343,14 +463,25 @@ class queue {
   /// \param NumWorkItems is a range that specifies the work space of the kernel
   /// \param WorkItemOffset specifies the offset for each work item id
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                     KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.template parallel_for<KernelName, KernelType, Dims>(
-          NumWorkItems, WorkItemOffset, KernelFunc);
-    });
+  event parallel_for(
+      range<Dims> NumWorkItems, id<Dims> WorkItemOffset, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.template parallel_for<KernelName, KernelType, Dims>(
+              NumWorkItems, WorkItemOffset, KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range and
@@ -360,15 +491,27 @@ class queue {
   /// \param WorkItemOffset specifies the offset for each work item id
   /// \param DepEvent is an event that specifies the kernel dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                     event DepEvent, KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvent);
-      CGH.template parallel_for<KernelName, KernelType, Dims>(
-          NumWorkItems, WorkItemOffset, KernelFunc);
-    });
+  event parallel_for(
+      range<Dims> NumWorkItems, id<Dims> WorkItemOffset, event DepEvent,
+      KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvent);
+          CGH.template parallel_for<KernelName, KernelType, Dims>(
+              NumWorkItems, WorkItemOffset, KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range and
@@ -379,16 +522,27 @@ class queue {
   /// \param DepEvents is a vector of events that specifies the kernel
   /// dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
-                     const vector_class<event> &DepEvents,
-                     KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvents);
-      CGH.template parallel_for<KernelName, KernelType, Dims>(
-          NumWorkItems, WorkItemOffset, KernelFunc);
-    });
+  event parallel_for(
+      range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
+      const vector_class<event> &DepEvents, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvents);
+          CGH.template parallel_for<KernelName, KernelType, Dims>(
+              NumWorkItems, WorkItemOffset, KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + nd_range that
@@ -397,13 +551,25 @@ class queue {
   /// \param ExecutionRange is a range that specifies the work space of the
   /// kernel
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
-                                                              KernelFunc);
-    });
+  event parallel_for(
+      nd_range<Dims> ExecutionRange, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.template parallel_for<KernelName, KernelType, Dims>(
+              ExecutionRange, KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + nd_range that
@@ -413,15 +579,26 @@ class queue {
   /// kernel
   /// \param DepEvent is an event that specifies the kernel dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(nd_range<Dims> ExecutionRange, event DepEvent,
-                     KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvent);
-      CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
-                                                              KernelFunc);
-    });
+  event parallel_for(
+      nd_range<Dims> ExecutionRange, event DepEvent, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvent);
+          CGH.template parallel_for<KernelName, KernelType, Dims>(
+              ExecutionRange, KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + nd_range that
@@ -432,16 +609,27 @@ class queue {
   /// \param DepEvents is a vector of events that specifies the kernel
   /// dependencies
   /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims>
-  event parallel_for(nd_range<Dims> ExecutionRange,
-                     const vector_class<event> &DepEvents,
-                     KernelType KernelFunc) {
-    return submit([&](handler &CGH) {
-      CGH.depends_on(DepEvents);
-      CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
-                                                              KernelFunc);
-    });
+  event parallel_for(
+      nd_range<Dims> ExecutionRange, const vector_class<event> &DepEvents,
+      KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return submit(
+        [&](handler &CGH) {
+          CGH.depends_on(DepEvents);
+          CGH.template parallel_for<KernelName, KernelType, Dims>(
+              ExecutionRange, KernelFunc);
+        },
+        CodeLoc);
   }
 
   /// Returns whether the queue is in order or OoO
@@ -455,9 +643,11 @@ class queue {
   friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject);
 
   /// A template-free version of submit.
-  event submit_impl(function_class<void(handler &)> CGH);
+  event submit_impl(function_class<void(handler &)> CGH,
+                    const detail::code_location &CodeLoc);
   /// A template-free version of submit.
-  event submit_impl(function_class<void(handler &)> CGH, queue secondQueue);
+  event submit_impl(function_class<void(handler &)> CGH, queue secondQueue,
+                    const detail::code_location &CodeLoc);
 };
 
 } // namespace sycl
diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt
index 6c2243fec3ed3..2cc51cd280fce 100644
--- a/sycl/source/CMakeLists.txt
+++ b/sycl/source/CMakeLists.txt
@@ -4,9 +4,19 @@
 #cmake_policy(SET CMP0057 NEW)
 #include(AddLLVM)
 
+if (SYCL_ENABLE_XPTI_TRACING)
+  if (NOT EXISTS ${LLVM_EXTERNAL_XPTI_SOURCE_DIR})
+    message (FATAL_ERROR "Undefined LLVM_EXTERNAL_XPTI_SOURCE_DIR variable: Must be set when XPTI tracing is set to ON")
+  endif()
+  include_directories(${LLVM_EXTERNAL_XPTI_SOURCE_DIR}/include)
+endif()
+
 function(add_sycl_rt_library LIB_NAME)
+  # Add an optional argument so we can get the library name to
+  # link with for Windows Debug version
+  cmake_parse_arguments(ARG "" "XPTI_LIB" "" ${ARGN})
 
-  add_library(${LIB_NAME} SHARED ${ARGN})
+  add_library(${LIB_NAME} SHARED ${ARG_UNPARSED_ARGUMENTS})
 
   #To-Do: Remove dependency on icd loader and opencl headers.
   add_dependencies(${LIB_NAME}
@@ -17,14 +27,22 @@ function(add_sycl_rt_library LIB_NAME)
 
   set_target_properties(${LIB_NAME} PROPERTIES LINKER_LANGUAGE CXX)
 
+  if (SYCL_ENABLE_XPTI_TRACING)
+    target_compile_definitions(${LIB_NAME} PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY)
+    target_link_libraries(${LIB_NAME} PRIVATE ${ARG_XPTI_LIB})
+  endif()
+
   if (MSVC)
-      target_compile_definitions(${LIB_NAME} PRIVATE __SYCL_BUILD_SYCL_DLL )
-      target_link_libraries(${LIB_NAME} PRIVATE shlwapi)
+    target_compile_definitions(${LIB_NAME} PRIVATE __SYCL_BUILD_SYCL_DLL )
+    target_link_libraries(${LIB_NAME} PRIVATE shlwapi)
   else()
-      set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt")
-      target_link_libraries(
-        ${LIB_NAME} PRIVATE "-Wl,--version-script=${linker_script}")
-      set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${linker_script})
+    set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt")
+    target_link_libraries(
+      ${LIB_NAME} PRIVATE "-Wl,--version-script=${linker_script}")
+    set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${linker_script})
+    if (SYCL_ENABLE_XPTI_TRACING)
+      target_link_libraries(${LIB_NAME} PRIVATE dl)
+    endif()
   endif()
 
   target_include_directories(
@@ -105,7 +123,11 @@ set(SYCL_SOURCES
     "$<$<OR:$<PLATFORM_ID:Linux>,$<PLATFORM_ID:Darwin>>:detail/posix_pi.cpp>"
 )
 
-add_sycl_rt_library(sycl ${SYCL_SOURCES})
+if (SYCL_ENABLE_XPTI_TRACING)
+  add_sycl_rt_library(sycl ${SYCL_SOURCES} XPTI_LIB xpti)
+else()
+  add_sycl_rt_library(sycl ${SYCL_SOURCES})
+endif()
 
 if (MSVC)
   # MSVC provides two incompatible build variants for its CRT: release and debug
@@ -143,8 +165,13 @@ if (MSVC)
 
   target_compile_options(sycl PRIVATE ${SYCL_CXX_FLAGS_RELEASE})
 
-  add_sycl_rt_library(sycld ${SYCL_SOURCES})
+  if (SYCL_ENABLE_XPTI_TRACING)
+    add_sycl_rt_library(sycld ${SYCL_SOURCES} XPTI_LIB xptid)
+  else()
+    add_sycl_rt_library(sycld ${SYCL_SOURCES})
+  endif()
   target_compile_options(sycld PRIVATE ${SYCL_CXX_FLAGS_DEBUG})
+
 endif()
 
 install(TARGETS ${SYCL_RT_LIBS}
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 3340f6b488891..1adb3b85dd39c 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -17,9 +17,18 @@
 
 #include <chrono>
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+#include "xpti_trace_framework.hpp"
+#include <atomic>
+#include <sstream>
+#endif
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+extern xpti::trace_event_data_t *GSYCLGraphEvent;
+#endif
 
 // Threat all devices that don't support interoperability as host devices to
 // avoid attempts to call method get on such events.
@@ -96,8 +105,65 @@ event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) {
   }
 }
 
+void *event_impl::instrumentationProlog(string_class &Name, int32_t StreamID,
+                                        uint64_t &IId) const {
+  void *TraceEvent = nullptr;
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return TraceEvent;
+  // Use a thread-safe counter to get a unique instance ID for the wait() on the
+  // event
+  static std::atomic<uint64_t> InstanceID = {1};
+  xpti::trace_event_data_t *WaitEvent = nullptr;
+
+  // Create a string with the event address so it
+  // can be associated with other debug data
+  xpti::utils::StringHelper SH;
+  Name = SH.nameWithAddress<RT::PiEvent>("event.wait", MEvent);
+
+  // We can emit the wait associated with the graph if the
+  // event does not have a command object or associated with
+  // the command object, if it exists
+  if (MCommand) {
+    Command *Cmd = (Command *)MCommand;
+    WaitEvent = Cmd->MTraceEvent ? static_cast<xpti_td *>(Cmd->MTraceEvent)
+                                 : GSYCLGraphEvent;
+  } else
+    WaitEvent = GSYCLGraphEvent;
+
+  // Record the current instance ID for use by Epilog
+  IId = InstanceID++;
+  xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
+                        IId, static_cast<const void *>(Name.c_str()));
+  TraceEvent = (void *)WaitEvent;
+#endif
+  return TraceEvent;
+}
+
+void event_impl::instrumentationEpilog(void *TelemetryEvent,
+                                       const string_class &Name,
+                                       int32_t StreamID, uint64_t IId) const {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!(xptiTraceEnabled() && TelemetryEvent))
+    return;
+  // Close the wait() scope
+  xpti::trace_event_data_t *TraceEvent =
+      (xpti::trace_event_data_t *)TelemetryEvent;
+  xptiNotifySubscribers(StreamID, xpti::trace_wait_end, nullptr, TraceEvent,
+                        IId, static_cast<const void *>(Name.c_str()));
+#endif
+}
+
 void event_impl::wait(
     std::shared_ptr<cl::sycl::detail::event_impl> Self) const {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  void *TelemetryEvent = nullptr;
+  uint64_t IId;
+  std::string Name;
+  int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
+  TelemetryEvent = instrumentationProlog(Name, StreamID, IId);
+#endif
+
   if (MEvent)
     // presence of MEvent means the command has been enqueued, so no need to
     // go via the slow path event waiting in the scheduler
@@ -106,6 +172,10 @@ void event_impl::wait(
     detail::Scheduler::getInstance().waitForEvent(Self);
   if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
     detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self));
+
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
+#endif
 }
 
 void event_impl::wait_and_throw(
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index cbe0c1f55aa66..d21c84c6b1a96 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -118,8 +118,8 @@ class event_impl {
   /// \return a shared pointer to a valid context_impl.
   const ContextImplPtr &getContextImpl();
 
-  // \return the Plugin associated with the context of this event.
-  // Should be called when this is not a Host Event.
+  /// \return the Plugin associated with the context of this event.
+  /// Should be called when this is not a Host Event.
   const plugin &getPlugin() const;
 
   /// Associate event with the context.
@@ -127,25 +127,33 @@ class event_impl {
   /// Provided PiContext inside ContextImplPtr must be associated
   /// with the PiEvent object stored in this class
   ///
-  /// \param Context is a shared pointer to an instance of valid context_impl.
+  /// @param Context is a shared pointer to an instance of valid context_impl.
   void setContextImpl(const ContextImplPtr &Context);
 
   /// Returns command that is associated with the event.
   ///
-  /// \return a generic pointer to Command object instance.
+  /// @return a generic pointer to Command object instance.
   void *getCommand() { return MCommand; }
 
   /// Associates this event with the command.
   ///
-  /// \param Command is a generic pointer to Command object instance.
+  /// @param Command is a generic pointer to Command object instance.
   void setCommand(void *Command) { MCommand = Command; }
 
   /// Returns host profiling information.
   ///
-  /// \return a pointer to HostProfilingInfo instance.
+  /// @return a pointer to HostProfilingInfo instance.
   HostProfilingInfo *getHostProfilingInfo() { return MHostProfilingInfo.get(); }
 
 private:
+  // When instrumentation is enabled emits trace event for event wait begin and
+  // returns the telemetry event generated for the wait
+  void *instrumentationProlog(string_class &Name, int32_t StreamID,
+                              uint64_t &instance_id) const;
+  // Uses events generated by the Prolog and emits event wait done event
+  void instrumentationEpilog(void *TelementryEvent, const string_class &Name,
+                             int32_t StreamID, uint64_t IId) const;
+
   RT::PiEvent MEvent = nullptr;
   ContextImplPtr MContext;
   QueueImplWPtr MQueue;
diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp
index 6e92c950e116d..9b0959063fbe5 100644
--- a/sycl/source/detail/pi.cpp
+++ b/sycl/source/detail/pi.cpp
@@ -18,11 +18,35 @@
 #include <string>
 #include <sstream>
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+// Include the headers necessary for emitting
+// traces using the trace framework
+#include "xpti_trace_framework.h"
+#endif
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+// Stream name being used for traces generated from the SYCL runtime
+constexpr const char *PICALL_STREAM_NAME = "sycl.pi";
+// Global (to the SYCL runtime) graph handle that all command groups are a
+// child of
+///< Event to be used by graph related activities
+xpti_td *GSYCLGraphEvent = nullptr;
+///< Event to be used by PI layer related activities
+xpti_td *GPICallEvent = nullptr;
+///< Constansts being used as placeholder until one is able to reliably get the
+///< version of the SYCL runtime
+constexpr uint32_t GMajVer = 1;
+constexpr uint32_t GMinVer = 0;
+constexpr const char *GVerStr = "sycl 1.0";
+#endif
+
 namespace pi {
 
+bool XPTIInitDone = false;
+
 std::string platformInfoToString(pi_platform_info info) {
   switch (info) {
   case PI_PLATFORM_INFO_PROFILE:
@@ -208,6 +232,49 @@ vector_class<plugin> initialize() {
     }
     Plugins.push_back(plugin(PluginInformation));
   }
+
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!(xptiTraceEnabled() && !XPTIInitDone))
+    return Plugins;
+  // Not sure this is the best place to initialize the framework; SYCL runtime
+  // team needs to advise on the right place, until then we piggy-back on the
+  // initialization of the PI layer.
+
+  // Initialize the global events just once, in the case pi::initialize() is
+  // called multiple times
+  XPTIInitDone = true;
+  // Registers a new stream for 'sycl' and any plugin that wants to listen to
+  // this stream will register itself using this string or stream ID for this
+  // string.
+  uint8_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
+  //  Let all tool plugins know that a stream by the name of 'sycl' has been
+  //  initialized and will be generating the trace stream.
+  //
+  //                                           +--- Minor version #
+  //            Major version # ------+        |   Version string
+  //                                  |        |       |
+  //                                  v        v       v
+  xptiInitialize(SYCL_STREAM_NAME, GMajVer, GMinVer, GVerStr);
+  // Create a tracepoint to indicate the graph creation
+  xpti::payload_t GraphPayload("application_graph");
+  uint64_t GraphInstanceNo;
+  GSYCLGraphEvent =
+      xptiMakeEvent("application_graph", &GraphPayload, xpti::trace_graph_event,
+                    xpti_at::active, &GraphInstanceNo);
+  if (GSYCLGraphEvent) {
+    // The graph event is a global event and will be used as the parent for
+    // all nodes (command groups)
+    xptiNotifySubscribers(StreamID, xpti::trace_graph_create, nullptr,
+                          GSYCLGraphEvent, GraphInstanceNo, nullptr);
+  }
+
+  xpti::payload_t PIPayload("Plugin Interface Layer");
+  uint64_t PiInstanceNo;
+  GPICallEvent =
+      xptiMakeEvent("PI Layer", &PIPayload, xpti::trace_algorithm_event,
+                    xpti_at::active, &PiInstanceNo);
+#endif
+
   return Plugins;
 }
 
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 5b7690cf5fa36..ddc9883cb610e 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -16,6 +16,11 @@
 
 #include <cstring>
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+#include "xpti_trace_framework.hpp"
+#include <sstream>
+#endif
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
@@ -86,6 +91,100 @@ void queue_impl::addEvent(event Event) {
   MEvents.push_back(std::move(Event));
 }
 
+void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
+                                        string_class &Name, int32_t StreamID,
+                                        uint64_t &IId) {
+  void *TraceEvent = nullptr;
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  xpti::trace_event_data_t *WaitEvent = nullptr;
+  if (!xptiTraceEnabled())
+    return TraceEvent;
+
+  xpti::payload_t Payload;
+  bool HasSourceInfo = false;
+  // We try to create a unique string for the wait() call by combining it with
+  // the queue address
+  xpti::utils::StringHelper NG;
+  Name = NG.nameWithAddress<queue_impl *>("queue.wait", this);
+
+  if (!CodeLoc.fileName()) {
+    // We have source code location information
+    Payload =
+        xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(),
+                        CodeLoc.columnNumber(), (void *)this);
+    HasSourceInfo = true;
+  } else {
+    // We have no location information, so we'll use the address of the queue
+    Payload = xpti::payload_t(Name.c_str(), (void *)this);
+  }
+  // wait() calls could be at different user-code locations; We create a new
+  // event based on the code location info and if this has been seen before, a
+  // previously created event will be returned.
+  uint64_t QWaitInstanceNo = 0;
+  WaitEvent = xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event,
+                            xpti_at::active, &QWaitInstanceNo);
+  IId = QWaitInstanceNo;
+  if (WaitEvent) {
+    device D = get_device();
+    std::string DevStr;
+    if (D.is_host())
+      DevStr = "HOST";
+    else if (D.is_cpu())
+      DevStr = "CPU";
+    else if (D.is_gpu())
+      DevStr = "GPU";
+    else if (D.is_accelerator())
+      DevStr = "ACCELERATOR";
+    else
+      DevStr = "UNKNOWN";
+    xptiAddMetadata(WaitEvent, "sycl_device", DevStr.c_str());
+    if (HasSourceInfo) {
+      xptiAddMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName());
+      xptiAddMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName());
+      xptiAddMetadata(WaitEvent, "sym_line_no",
+                      std::to_string(CodeLoc.lineNumber()).c_str());
+    }
+    xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent,
+                          QWaitInstanceNo,
+                          static_cast<const void *>(Name.c_str()));
+    TraceEvent = (void *)WaitEvent;
+  }
+#endif
+  return TraceEvent;
+}
+
+void queue_impl::instrumentationEpilog(void *TelemetryEvent, string_class &Name,
+                                       int32_t StreamID, uint64_t IId) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!(xptiTraceEnabled() && TelemetryEvent))
+    return;
+  // Close the wait() scope
+  xpti::trace_event_data_t *TraceEvent =
+      (xpti::trace_event_data_t *)TelemetryEvent;
+  xptiNotifySubscribers(StreamID, xpti::trace_wait_end, nullptr, TraceEvent,
+                        IId, static_cast<const void *>(Name.c_str()));
+#endif
+}
+
+void queue_impl::wait(const detail::code_location &CodeLoc) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  void *TelemetryEvent = nullptr;
+  uint64_t IId;
+  std::string Name;
+  int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME);
+  TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
+#endif
+
+  std::lock_guard<mutex_class> Guard(MMutex);
+  for (auto &Event : MEvents)
+    Event.wait();
+  MEvents.clear();
+
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
+#endif
+}
+
 } // namespace detail
 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 0b3acd2f32a80..ef94d73a086c6 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -152,19 +152,21 @@ class queue_impl {
   /// \param CGF is a function object containing command group.
   /// \param Self is a shared_ptr to this queue.
   /// \param SecondQueue is a shared_ptr to the secondary queue.
+  /// \param Loc is the code location of the submit call (default argument)
   /// \return a SYCL event object, which corresponds to the queue the command
   /// group is being enqueued on.
   event submit(const function_class<void(handler &)> &CGF,
                shared_ptr_class<queue_impl> Self,
-               shared_ptr_class<queue_impl> SecondQueue) {
+               shared_ptr_class<queue_impl> SecondQueue,
+               const detail::code_location &Loc) {
     try {
-      return submit_impl(CGF, Self);
+      return submit_impl(CGF, Self, Loc);
     } catch (...) {
       {
         std::lock_guard<mutex_class> Guard(MMutex);
         MExceptions.PushBack(std::current_exception());
       }
-      return SecondQueue->submit(CGF, SecondQueue);
+      return SecondQueue->submit(CGF, SecondQueue, Loc);
     }
   }
 
@@ -173,28 +175,27 @@ class queue_impl {
   ///
   /// \param CGF is a function object containing command group.
   /// \param Self is a shared_ptr to this queue.
+  /// \param Loc is the code location of the submit call (default argument)
   /// \return a SYCL event object for the submitted command group.
   event submit(const function_class<void(handler &)> &CGF,
-               shared_ptr_class<queue_impl> Self) {
-    return submit_impl(CGF, std::move(Self));
+               shared_ptr_class<queue_impl> Self,
+               const detail::code_location &Loc) {
+    return submit_impl(CGF, std::move(Self), Loc);
   }
 
   /// Performs a blocking wait for the completion of all enqueued tasks in the
   /// queue.
   ///
   /// Synchronous errors will be reported through SYCL exceptions.
-  void wait() {
-    std::lock_guard<mutex_class> Guard(MMutex);
-    for (auto &Event : MEvents)
-      Event.wait();
-    MEvents.clear();
-  }
+  /// @param Loc is the code location of the submit call (default argument)
+  void wait(const detail::code_location &Loc = {});
 
   /// \return list of asynchronous exceptions occurred during execution.
   exception_list getExceptionList() const { return MExceptions; }
 
-  void wait_and_throw() {
-    wait();
+  /// @param Loc is the code location of the submit call (default argument)
+  void wait_and_throw(const detail::code_location &Loc = {}) {
+    wait(Loc);
     throw_asynchronous();
   }
 
@@ -350,16 +351,27 @@ class queue_impl {
   ///
   /// \param CGF is a function object containing command group.
   /// \param Self is a pointer to this queue.
+  /// \param Loc is the code location of the submit call (default argument)
   /// \return a SYCL event representing submitted command group.
   event submit_impl(const function_class<void(handler &)> &CGF,
-                    shared_ptr_class<queue_impl> Self) {
+                    shared_ptr_class<queue_impl> Self,
+                    const detail::code_location &Loc) {
     handler Handler(std::move(Self), MHostQueue);
     CGF(Handler);
-    event Event = Handler.finalize();
+    event Event = Handler.finalize(Loc);
     addEvent(Event);
     return Event;
   }
 
+  // When instrumentation is enabled emits trace event for wait begin and
+  // returns the telemetry event generated for the wait
+  void *instrumentationProlog(const detail::code_location &CodeLoc,
+                              string_class &Name, int32_t StreamID,
+                              uint64_t &iid);
+  // Uses events generated by the Prolog and emits wait done event
+  void instrumentationEpilog(void *TelementryEvent, string_class &Name,
+                             int32_t StreamID, uint64_t IId);
+
   /// Stores an event that should be associated with the queue
   ///
   /// \param Event is the event to be stored
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 6343ddecc670a..e9425a3e47783 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -24,6 +24,7 @@
 #include <detail/scheduler/commands.hpp>
 #include <detail/scheduler/scheduler.hpp>
 
+#include <cassert>
 #include <string>
 #include <vector>
 
@@ -33,9 +34,17 @@
 #include <memory>
 #endif
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+#include "xpti_trace_framework.hpp"
+#endif
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace detail {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+// Global graph for the application
+extern xpti::trace_event_data_t *GSYCLGraphEvent;
+#endif
 
 #ifdef __GNUG__
 struct DemangleHandle {
@@ -82,6 +91,63 @@ static std::string accessModeToString(access::mode Mode) {
   }
 }
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+// Using the command group type to create node types for the asynchronous task
+// graph modeling
+static std::string commandToNodeType(Command::CommandType Type) {
+  switch (Type) {
+  case Command::CommandType::RUN_CG:
+    return "command_group_node";
+  case Command::CommandType::COPY_MEMORY:
+    return "memory_transfer_node";
+  case Command::CommandType::ALLOCA:
+    return "memory_allocation_node";
+  case Command::CommandType::ALLOCA_SUB_BUF:
+    return "sub_buffer_creation_node";
+  case Command::CommandType::RELEASE:
+    return "memory_deallocation_node";
+  case Command::CommandType::MAP_MEM_OBJ:
+    return "memory_transfer_node";
+  case Command::CommandType::UNMAP_MEM_OBJ:
+    return "memory_transfer_node";
+  case Command::CommandType::UPDATE_REQUIREMENT:
+    return "host_acc_create_buffer_lock_node";
+  case Command::CommandType::EMPTY_TASK:
+    return "host_acc_destroy_buffer_release_node";
+  default:
+    return "unknown_node";
+  }
+}
+
+// Using the names being generated and the string are subject to change to
+// something more meaningful to end-users as this will be visible in analysis
+// tools that subscribe to this data
+static std::string commandToName(Command::CommandType Type) {
+  switch (Type) {
+  case Command::CommandType::RUN_CG:
+    return "Command Group Action";
+  case Command::CommandType::COPY_MEMORY:
+    return "Memory Transfer (Copy)";
+  case Command::CommandType::ALLOCA:
+    return "Memory Allocation";
+  case Command::CommandType::ALLOCA_SUB_BUF:
+    return "Sub Buffer Creation";
+  case Command::CommandType::RELEASE:
+    return "Memory Deallocation";
+  case Command::CommandType::MAP_MEM_OBJ:
+    return "Memory Transfer (Map)";
+  case Command::CommandType::UNMAP_MEM_OBJ:
+    return "Memory Transfer (Unmap)";
+  case Command::CommandType::UPDATE_REQUIREMENT:
+    return "Host Accessor Creation/Buffer Lock";
+  case Command::CommandType::EMPTY_TASK:
+    return "Host Accessor Destruction/Buffer Lock Release";
+  default:
+    return "Unknown Action";
+  }
+}
+#endif
+
 static std::vector<RT::PiEvent>
 getPiEvents(const std::vector<EventImplPtr> &EventImpls) {
   std::vector<RT::PiEvent> RetPiEvents;
@@ -161,6 +227,221 @@ Command::Command(CommandType Type, QueueImplPtr Queue)
   MEvent.reset(new detail::event_impl(MQueue));
   MEvent->setCommand(this);
   MEvent->setContextImpl(detail::getSyclObjImpl(MQueue->get_context()));
+
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Obtain the stream ID so all commands can emit traces to that stream
+  MStreamID = xptiRegisterStream(SYCL_STREAM_NAME);
+#endif
+}
+
+void Command::emitInstrumentationDataProxy() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  emitInstrumentationData();
+#endif
+}
+
+/// Method takes in void * for the address as adding a template function to
+/// the command group object maybe undesirable.
+/// @param Cmd The command object of the source of the edge
+/// @param ObjAddr The address that defines the edge dependency; it is the event
+/// address when the edge is for an event and a memory object address if it is
+/// due to an accessor
+/// @param Prefix Contains "event" if the dependency is an edge and contains the
+/// access mode to the buffer if it is due to an accessor
+/// @param IsCommand True if the dependency has a command object as the source,
+/// false otherwise
+void Command::emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr,
+                                                const string_class &Prefix,
+                                                bool IsCommand) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  // Bail early if either the source or the target node for the given dependency
+  // is undefined or NULL
+  if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
+    return;
+  // If all the information we need for creating an edge event is available,
+  // then go ahead with creating it; if not, bail early!
+  xpti::utils::StringHelper SH;
+  std::string AddressStr = SH.addressAsString<void *>(ObjAddr);
+  std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr);
+  // Create an edge with the dependent buffer address for which a command
+  // object has been created as one of the properties of the edge
+  xpti::payload_t Payload(TypeString.c_str(), MAddress);
+  uint64_t EdgeInstanceNo;
+  xpti_td *EdgeEvent =
+      xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event,
+                    xpti_at::active, &EdgeInstanceNo);
+  if (EdgeEvent) {
+    xpti_td *SrcEvent = static_cast<xpti_td *>(Cmd->MTraceEvent);
+    xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
+    EdgeEvent->source_id = SrcEvent->unique_id;
+    EdgeEvent->target_id = TgtEvent->unique_id;
+    if (IsCommand) {
+      xptiAddMetadata(EdgeEvent, "access_mode", TypeString.c_str());
+      xptiAddMetadata(EdgeEvent, "memory_object", AddressStr.c_str());
+    } else {
+      xptiAddMetadata(EdgeEvent, "event", TypeString.c_str());
+    }
+    xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
+                          detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
+                          nullptr);
+  }
+  // General comment - None of these are serious errors as the instrumentation
+  // layer MUST be tolerant of errors. If we need to let the end user know, we
+  // throw exceptions in the future
+#endif
+}
+
+/// Creates an edge when the dependency is due to an event.
+/// @param Cmd The command object of the source of the edge
+/// @param PiEventAddr The address that defines the edge dependency, which in
+/// this case is an event
+void Command::emitEdgeEventForEventDependence(Command *Cmd,
+                                              RT::PiEvent &PiEventAddr) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  // If we have failed to create an event to represent the Command, then we
+  // cannot emit an edge event. Bail early!
+  if (!(xptiTraceEnabled() && MTraceEvent))
+    return;
+
+  if (Cmd && Cmd->MTraceEvent) {
+    // If the event is associated with a command, we use this command's trace
+    // event as the source of edge, hence modeling the control flow
+    emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, "Event", false);
+    return;
+  }
+  if (PiEventAddr) {
+    xpti::utils::StringHelper SH;
+    std::string AddressStr = SH.addressAsString<RT::PiEvent>(PiEventAddr);
+    // This is the case when it is a OCL event enqueued by the user or another
+    // event is registered by the runtime as a dependency The dependency on
+    // this occasion is an OCL event; so we build a virtual node in the graph
+    // with the event as the metadata for the node
+    std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr);
+    // Node name is "virtual_node[<event_addr>]"
+    xpti::payload_t VNPayload(NodeName.c_str(), MAddress);
+    uint64_t VNodeInstanceNo;
+    xpti_td *NodeEvent =
+        xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event,
+                      xpti_at::active, &VNodeInstanceNo);
+    // Emit the virtual node first
+    xptiAddMetadata(NodeEvent, "kernel_name", NodeName.c_str());
+    xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
+                          detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo,
+                          nullptr);
+    // Create a new event for the edge
+    std::string EdgeName = SH.nameWithAddressString("Event", AddressStr);
+    xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress);
+    uint64_t EdgeInstanceNo;
+    xpti_td *EdgeEvent =
+        xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event,
+                      xpti_at::active, &EdgeInstanceNo);
+    if (EdgeEvent && NodeEvent) {
+      // Source node represents the event and this event needs to be completed
+      // before target node can execute
+      xpti_td *TgtEvent = static_cast<xpti_td *>(MTraceEvent);
+      EdgeEvent->source_id = NodeEvent->unique_id;
+      EdgeEvent->target_id = TgtEvent->unique_id;
+      xptiAddMetadata(EdgeEvent, "event", EdgeName.c_str());
+      xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
+                            detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo,
+                            nullptr);
+    }
+    return;
+  }
+#endif
+}
+
+uint64_t Command::makeTraceEventProlog(void *MAddress) {
+  uint64_t CommandInstanceNo = 0;
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return CommandInstanceNo;
+
+  MTraceEventPrologComplete = true;
+  // Setup the member variables with information needed for event notification
+  MCommandNodeType = commandToNodeType(MType);
+  MCommandName = commandToName(MType);
+  xpti::utils::StringHelper SH;
+  MAddressString = SH.addressAsString<void *>(MAddress);
+  std::string CommandString =
+      SH.nameWithAddressString(MCommandName, MAddressString);
+
+  xpti::payload_t p(CommandString.c_str(), MAddress);
+  xpti_td *CmdTraceEvent =
+      xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event,
+                    xpti_at::active, &CommandInstanceNo);
+  MInstanceID = CommandInstanceNo;
+  if (CmdTraceEvent) {
+    MTraceEvent = (void *)CmdTraceEvent;
+    // If we are seeing this event again, then the instance ID will be greater
+    // than 1; in this case, we must skip sending a notification to create a
+    // node as this node has already been created. We return this value so the
+    // epilog method can be called selectively.
+    MFirstInstance = (CommandInstanceNo == 1);
+  }
+#endif
+  return CommandInstanceNo;
+}
+
+void Command::makeTraceEventEpilog() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!(xptiTraceEnabled() && MTraceEvent))
+    return;
+  assert(MTraceEventPrologComplete);
+  xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
+                        detail::GSYCLGraphEvent,
+                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
+                        static_cast<const void *>(MCommandNodeType.c_str()));
+#endif
+}
+
+void Command::addDep(DepDesc NewDep) {
+  if (NewDep.MDepCommand)
+    MDepsEvents.push_back(NewDep.MDepCommand->getEvent());
+  MDeps.push_back(NewDep);
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  emitEdgeEventForCommandDependence(
+      NewDep.MDepCommand, (void *)NewDep.MDepRequirement->MSYCLMemObj,
+      accessModeToString(NewDep.MDepRequirement->MAccessMode), true);
+#endif
+}
+
+void Command::addDep(EventImplPtr Event) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  // We need this for just the instrumentation, so guarding it will prevent
+  // unused variable warnings when instrumentation is turned off
+  Command *Cmd = (Command *)Event->getCommand();
+  RT::PiEvent &PiEventAddr = Event->getHandleRef();
+  // Now make an edge for the dependent event
+  emitEdgeEventForEventDependence(Cmd, PiEventAddr);
+#endif
+
+  MDepsEvents.push_back(std::move(Event));
+}
+
+void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!(xptiTraceEnabled() && MTraceEvent && PiEventAddr))
+    return;
+  // Asynchronous call, so send a signal with the event information as
+  // user_data
+  xptiNotifySubscribers(MStreamID, xpti::trace_signal, detail::GSYCLGraphEvent,
+                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
+                        (void *)PiEventAddr);
+#endif
+}
+
+void Command::emitInstrumentation(uint16_t Type, const char *Txt) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!(xptiTraceEnabled() && MTraceEvent))
+    return;
+  // Trace event notifier that emits a Type event
+  xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent,
+                        static_cast<xpti_td *>(MTraceEvent), MInstanceID,
+                        static_cast<const void *>(Txt));
+#endif
 }
 
 bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
@@ -182,9 +463,21 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
               std::string(MBlockReason),
           PI_INVALID_OPERATION);
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    // Scoped trace event notifier that emits a barrier begin and barrier end
+    // event, which models the barrier while enqueuing along with the blocked
+    // reason, as determined by the scheduler
+    std::string Info = "enqueue.barrier[";
+    Info += std::string(MBlockReason) + "]";
+    emitInstrumentation(xpti::trace_barrier_begin, Info.c_str());
+#endif
+
     // Wait if blocking
     while (!MCanEnqueue)
       ;
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    emitInstrumentation(xpti::trace_barrier_end, Info.c_str());
+#endif
   }
 
   std::lock_guard<std::mutex> Lock(MEnqueueMtx);
@@ -193,6 +486,10 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
   if (MEnqueued)
     return true;
 
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  emitInstrumentation(xpti::trace_task_begin, nullptr);
+#endif
+
   cl_int Res = enqueueImp();
 
   if (CL_SUCCESS != Res)
@@ -203,9 +500,107 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) {
     // CL_SUCCESS
     MEnqueued = true;
 
+  // Emit this correlation signal before the task end
+  emitEnqueuedEventSignal(MEvent->getHandleRef());
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  emitInstrumentation(xpti::trace_task_end, nullptr);
+#endif
   return static_cast<bool>(MEnqueued);
 }
 
+void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  assert(MType == CommandType::RELEASE && "Expected release command");
+  if (!MTraceEvent)
+    return;
+  // The current command is the target node for all dependencies as the source
+  // nodes have to be completed first before the current node can begin to
+  // execute; these edges model control flow
+  xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
+  // We have all the Commands that must be completed before the release command
+  // can be enqueued; here we'll find the command that is an Alloca with the
+  // same SYCLMemObject address and create a dependency line (edge) between them
+  // in our sematic modeling
+  for (auto &Item : DepList) {
+    if (Item->MTraceEvent && Item->MAddress == MAddress) {
+      xpti::utils::StringHelper SH;
+      std::string AddressStr = SH.addressAsString<void *>(MAddress);
+      std::string TypeString =
+          "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr);
+
+      // Create an edge with the dependent buffer address being one of the
+      // properties of the edge
+      xpti::payload_t p(TypeString.c_str(), MAddress);
+      uint64_t EdgeInstanceNo;
+      xpti_td *EdgeEvent =
+          xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event,
+                        xpti_at::active, &EdgeInstanceNo);
+      if (EdgeEvent) {
+        xpti_td *SrcTraceEvent = static_cast<xpti_td *>(Item->MTraceEvent);
+        EdgeEvent->target_id = TgtTraceEvent->unique_id;
+        EdgeEvent->source_id = SrcTraceEvent->unique_id;
+        xptiAddMetadata(EdgeEvent, "memory_object", AddressStr.c_str());
+        xptiNotifySubscribers(MStreamID, xpti::trace_edge_create,
+                              detail::GSYCLGraphEvent, EdgeEvent,
+                              EdgeInstanceNo, nullptr);
+      }
+    }
+  }
+#endif
+}
+
+AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue,
+                                     Requirement Req,
+                                     AllocaCommandBase *LinkedAllocaCmd)
+    : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
+      MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MRequirement(std::move(Req)),
+      MReleaseCmd(Queue, this) {
+  MRequirement.MAccessMode = access::mode::read_write;
+  emitInstrumentationDataProxy();
+}
+
+void AllocaCommandBase::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MRequirement.MSYCLMemObj;
+  makeTraceEventProlog(MAddress);
+  // Set the relevant meta data properties for this command
+  if (MTraceEvent && MFirstInstance) {
+    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(TE, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(TE, "memory_object", MAddressString.c_str());
+  }
+#endif
+}
+
+AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
+                             bool InitFromUserData,
+                             AllocaCommandBase *LinkedAllocaCmd)
+    : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
+                        LinkedAllocaCmd),
+      MInitFromUserData(InitFromUserData) {
+  // Node event must be created before the dependent edge is added to this node,
+  // so this call must be before the addDep() call.
+  emitInstrumentationDataProxy();
+  addDep(DepDesc(nullptr, getRequirement(), this));
+}
+
+void AllocaCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+
+  // Only if it is the first event, we emit a node create event
+  if (MFirstInstance) {
+    makeTraceEventEpilog();
+  }
+#endif
+}
+
 cl_int AllocaCommand::enqueueImp() {
   std::vector<EventImplPtr> EventImpls =
       Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
@@ -251,6 +646,39 @@ void AllocaCommand::printDot(std::ostream &Stream) const {
   }
 }
 
+AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
+                                         AllocaCommandBase *ParentAlloca)
+    : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
+                        std::move(Req),
+                        /*LinkedAllocaCmd*/ nullptr),
+      MParentAlloca(ParentAlloca) {
+  // Node event must be created before the dependent edge
+  // is added to this node, so this call must be before
+  // the addDep() call.
+  emitInstrumentationDataProxy();
+  addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
+}
+
+void AllocaSubBufCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+
+  // Only if it is the first event, we emit a node create event and any meta
+  // data that is available for the command
+  if (MFirstInstance) {
+    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(TE, "offset",
+                    std::to_string(this->MRequirement.MOffsetInBytes).c_str());
+    std::string range = std::to_string(this->MRequirement.MAccessRange[0]) +
+                        "-" +
+                        std::to_string(this->MRequirement.MAccessRange[1]);
+    xptiAddMetadata(TE, "access_range", range.c_str());
+    makeTraceEventEpilog();
+  }
+#endif
+}
+
 cl_int AllocaSubBufCommand::enqueueImp() {
   std::vector<EventImplPtr> EventImpls =
       Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
@@ -286,6 +714,31 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const {
   }
 }
 
+ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
+    : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
+  emitInstrumentationDataProxy();
+}
+
+void ReleaseCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MAllocaCmd->getSYCLMemObj();
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(TE, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(TE, "allocation_type",
+                    commandToName(MAllocaCmd->getType()).c_str());
+    makeTraceEventEpilog();
+  }
+#endif
+}
+
 cl_int ReleaseCommand::enqueueImp() {
   std::vector<EventImplPtr> EventImpls =
       Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
@@ -372,7 +825,28 @@ void ReleaseCommand::printDot(std::ostream &Stream) const {
 MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
                            void **DstPtr, QueueImplPtr Queue)
     : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)),
-      MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr) {}
+      MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr) {
+  emitInstrumentationDataProxy();
+}
+
+void MapMemObject::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MSrcAllocaCmd->getSYCLMemObj();
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(TE, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(TE, "memory_object", MAddressString.c_str());
+    makeTraceEventEpilog();
+  }
+#endif
+}
 
 cl_int MapMemObject::enqueueImp() {
   std::vector<EventImplPtr> EventImpls =
@@ -408,7 +882,28 @@ void MapMemObject::printDot(std::ostream &Stream) const {
 UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req,
                                void **SrcPtr, QueueImplPtr Queue)
     : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)),
-      MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {}
+      MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {
+  emitInstrumentationDataProxy();
+}
+
+void UnMapMemObject::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MDstAllocaCmd->getSYCLMemObj();
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *TE = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(TE, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(TE, "memory_object", MAddressString.c_str());
+    makeTraceEventEpilog();
+  }
+#endif
+}
 
 cl_int UnMapMemObject::enqueueImp() {
   std::vector<EventImplPtr> EventImpls =
@@ -450,6 +945,31 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq,
       MDstAllocaCmd(DstAllocaCmd) {
   if (!MSrcQueue->is_host())
     MEvent->setContextImpl(detail::getSyclObjImpl(MSrcQueue->get_context()));
+
+  emitInstrumentationDataProxy();
+}
+
+void MemCpyCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MSrcAllocaCmd->getSYCLMemObj();
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(CmdTraceEvent, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str());
+    std::string From = deviceToString(MSrcQueue->get_device());
+    std::string To = deviceToString(MQueue->get_device());
+    xptiAddMetadata(CmdTraceEvent, "copy_from", From.c_str());
+    xptiAddMetadata(CmdTraceEvent, "copy_to", To.c_str());
+    makeTraceEventEpilog();
+  }
+#endif
 }
 
 cl_int MemCpyCommand::enqueueImp() {
@@ -565,6 +1085,31 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq,
       MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) {
   if (!MSrcQueue->is_host())
     MEvent->setContextImpl(detail::getSyclObjImpl(MSrcQueue->get_context()));
+
+  emitInstrumentationDataProxy();
+}
+
+void MemCpyCommandHost::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MSrcAllocaCmd->getSYCLMemObj();
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(CmdTraceEvent, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str());
+    std::string From = deviceToString(MSrcQueue->get_device());
+    std::string To = deviceToString(MQueue->get_device());
+    xptiAddMetadata(CmdTraceEvent, "copy_from", From.c_str());
+    xptiAddMetadata(CmdTraceEvent, "copy_to", To.c_str());
+    makeTraceEventEpilog();
+  }
+#endif
 }
 
 cl_int MemCpyCommandHost::enqueueImp() {
@@ -592,6 +1137,32 @@ cl_int MemCpyCommandHost::enqueueImp() {
   return CL_SUCCESS;
 }
 
+EmptyCommand::EmptyCommand(QueueImplPtr Queue, Requirement Req)
+    : Command(CommandType::EMPTY_TASK, std::move(Queue)),
+      MRequirement(std::move(Req)) {
+
+  emitInstrumentationDataProxy();
+}
+
+void EmptyCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MRequirement.MSYCLMemObj;
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(CmdTraceEvent, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str());
+    makeTraceEventEpilog();
+  }
+#endif
+}
+
 void EmptyCommand::printDot(std::ostream &Stream) const {
   Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\"";
 
@@ -627,50 +1198,186 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const {
   }
 }
 
-void ExecCGCommand::printDot(std::ostream &Stream) const {
-  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
+UpdateHostRequirementCommand::UpdateHostRequirementCommand(
+    QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd,
+    void **DstPtr)
+    : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
+      MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {
 
-  Stream << "ID = " << this << "\\n";
-  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
+  emitInstrumentationDataProxy();
+}
 
-  switch (MCommandGroup->getType()) {
-  case detail::CG::KERNEL: {
-    auto *KernelCG =
-        reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
-    Stream << "Kernel name: ";
-    if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
-      Stream << "created from source";
-    else
-      Stream << demangleKernelName(KernelCG->getKernelName());
-    Stream << "\\n";
-    break;
+void UpdateHostRequirementCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  MAddress = MSrcAllocaCmd->getSYCLMemObj();
+  makeTraceEventProlog(MAddress);
+
+  if (MFirstInstance) {
+    xpti_td *CmdTraceEvent = static_cast<xpti_td *>(MTraceEvent);
+    xptiAddMetadata(CmdTraceEvent, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str());
+    makeTraceEventEpilog();
   }
+#endif
+}
+
+static std::string cgTypeToString(detail::CG::CGTYPE Type) {
+  switch (Type) {
+  case detail::CG::KERNEL:
+    return "Kernel";
+    break;
   case detail::CG::UPDATE_HOST:
-    Stream << "CG type: update_host\\n";
+    return "update_host";
     break;
   case detail::CG::FILL:
-    Stream << "CG type: fill\\n";
+    return "fill";
     break;
   case detail::CG::COPY_ACC_TO_ACC:
-    Stream << "CG type: copy acc to acc\\n";
+    return "copy acc to acc";
     break;
   case detail::CG::COPY_ACC_TO_PTR:
-    Stream << "CG type: copy acc to ptr\\n";
+    return "copy acc to ptr";
     break;
   case detail::CG::COPY_PTR_TO_ACC:
-    Stream << "CG type: copy ptr to acc\\n";
+    return "copy ptr to acc";
     break;
   case detail::CG::COPY_USM:
-    Stream << "CG type: copy usm\\n";
+    return "copy usm";
     break;
   case detail::CG::FILL_USM:
-    Stream << "CG type: fill usm\\n";
+    return "fill usm";
     break;
   case detail::CG::PREFETCH_USM:
-    Stream << "CG type: prefetch usm\\n";
+    return "prefetch usm";
     break;
   default:
-    Stream << "CG type: unknown\\n";
+    return "unknown";
+    break;
+  }
+}
+
+ExecCGCommand::ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup,
+                             QueueImplPtr Queue)
+    : Command(CommandType::RUN_CG, std::move(Queue)),
+      MCommandGroup(std::move(CommandGroup)) {
+
+  emitInstrumentationDataProxy();
+}
+
+void ExecCGCommand::emitInstrumentationData() {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  if (!xptiTraceEnabled())
+    return;
+  // Create a payload with the command name and an event using this payload to
+  // emit a node_create
+  bool HasSourceInfo = false;
+  std::string KernelName, FromSource;
+  switch (MCommandGroup->getType()) {
+  case detail::CG::KERNEL: {
+    auto KernelCG =
+        reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
+
+    if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
+      FromSource = "true";
+      pi_kernel KernelHandle = KernelCG->MSyclKernel->getHandleRef();
+      MAddress = KernelHandle;
+      KernelName = MCommandGroup->MFunctionName;
+    } else {
+      FromSource = "false";
+      KernelName = demangleKernelName(KernelCG->getKernelName());
+    }
+  } break;
+  default:
+    KernelName = cgTypeToString(MCommandGroup->getType());
+    break;
+  }
+  std::string CommandType = commandToNodeType(MType);
+  //  Get source file, line number information from the CommandGroup object
+  //  and create payload using name, address, and source info
+  //
+  //  On Windows, since the support for builtin functions is not available in
+  //  MSVC, the MFileName, MLine will be set to nullptr and "0" respectively.
+  //  Handle this condition explicitly here.
+  xpti::payload_t Payload;
+  if (!MCommandGroup->MFileName.empty()) {
+    // File name has a valid string
+    Payload =
+        xpti::payload_t(KernelName.c_str(), MCommandGroup->MFileName.c_str(),
+                        MCommandGroup->MLine, MCommandGroup->MColumn, MAddress);
+    HasSourceInfo = true;
+  } else if (MAddress) {
+    // We have a valid function name and an address
+    Payload = xpti::payload_t(KernelName.c_str(), MAddress);
+  } else {
+    // In any case, we will have a valid function name and we'll use that to
+    // create the hash
+    Payload = xpti::payload_t(KernelName.c_str());
+  }
+
+  uint64_t CGKernelInstanceNo;
+  // Create event using the payload
+  xpti_td *CmdTraceEvent =
+      xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event,
+                    xpti::trace_activity_type_t::active, &CGKernelInstanceNo);
+
+  if (CmdTraceEvent) {
+    MInstanceID = CGKernelInstanceNo;
+    MTraceEvent = (void *)CmdTraceEvent;
+    // If we are seeing this event again, then the instance ID will be greater
+    // than 1; in this case, we will skip sending a notification to create a
+    // node as this node has already been created.
+    if (CGKernelInstanceNo > 1)
+      return;
+
+    xptiAddMetadata(CmdTraceEvent, "sycl_device",
+                    deviceToString(MQueue->get_device()).c_str());
+    if (!KernelName.empty()) {
+      xptiAddMetadata(CmdTraceEvent, "kernel_name", KernelName.c_str());
+    }
+    if (!FromSource.empty()) {
+      xptiAddMetadata(CmdTraceEvent, "from_source", FromSource.c_str());
+    }
+    if (HasSourceInfo) {
+      xptiAddMetadata(CmdTraceEvent, "sym_function_name", KernelName.c_str());
+      xptiAddMetadata(CmdTraceEvent, "sym_source_file_name",
+                      MCommandGroup->MFileName.c_str());
+      xptiAddMetadata(CmdTraceEvent, "sym_line_no",
+                      std::to_string(MCommandGroup->MLine).c_str());
+    }
+
+    xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
+                          detail::GSYCLGraphEvent, CmdTraceEvent,
+                          CGKernelInstanceNo,
+                          static_cast<const void *>(CommandType.c_str()));
+  }
+#endif
+}
+
+void ExecCGCommand::printDot(std::ostream &Stream) const {
+  Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
+
+  Stream << "ID = " << this << "\n";
+  Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n";
+
+  switch (MCommandGroup->getType()) {
+  case detail::CG::KERNEL: {
+    auto KernelCG =
+        reinterpret_cast<detail::CGExecKernel *>(MCommandGroup.get());
+    Stream << "Kernel name: ";
+    if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource())
+      Stream << "created from source";
+    else
+      Stream << demangleKernelName(KernelCG->getKernelName());
+    Stream << "\\n";
+    break;
+  }
+  default:
+    Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n";
     break;
   }
 
@@ -1011,7 +1718,7 @@ cl_int ExecCGCommand::enqueueImp() {
     CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get();
     MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue,
                                 Prefetch->getLength(), std::move(RawEvents),
-                                Event); 
+                                Event);
     return CL_SUCCESS;
   }
   case CG::CGTYPE::INTEROP_TASK_CODEPLAY: {
diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp
index 315e0e366fd98..13763cc77a8d3 100644
--- a/sycl/source/detail/scheduler/commands.hpp
+++ b/sycl/source/detail/scheduler/commands.hpp
@@ -9,7 +9,9 @@
 #pragma once
 
 #include <atomic>
+#include <cstdint>
 #include <memory>
+#include <set>
 #include <unordered_set>
 #include <vector>
 
@@ -91,13 +93,9 @@ class Command {
 
   Command(CommandType Type, QueueImplPtr Queue);
 
-  void addDep(DepDesc NewDep) {
-    if (NewDep.MDepCommand)
-      MDepsEvents.push_back(NewDep.MDepCommand->getEvent());
-    MDeps.push_back(NewDep);
-  }
+  void addDep(DepDesc NewDep);
 
-  void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); }
+  void addDep(EventImplPtr Event);
 
   void addUser(Command *NewUser) { MUsers.insert(NewUser); }
 
@@ -118,6 +116,37 @@ class Command {
 
   std::shared_ptr<event_impl> getEvent() const { return MEvent; }
 
+  // Methods needed to support SYCL instrumentation
+  //
+  // Proxy method which calls emitInstrumentationData.
+  void emitInstrumentationDataProxy();
+  // Instrumentation method which emits telemetry data.
+  virtual void emitInstrumentationData() = 0;
+  // This function looks at all the dependencies for
+  // the release command and enables instrumentation
+  // to report these dependencies as edges
+  void resolveReleaseDependencies(std::set<Command *> &list);
+  // Creates an edge event when the dependency is a command
+  void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr,
+                                         const string_class &Prefix,
+                                         bool IsCommand);
+  // Creates an edge event when the dependency is an event
+  void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr);
+  // Creates a signal event with the enqueued kernel event handle
+  void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr);
+  /// Create a trace event of node_create type; this must be guarded by a
+  /// check for xptiTraceEnabled()
+  /// Post Condition: MTraceEvent will be set to the event created
+  /// @param MAddress  The address to use to create the payload
+  uint64_t makeTraceEventProlog(void *MAddress);
+  // If prolog has been run, run epilog; this must be guarded by a check for
+  // xptiTraceEnabled()
+  void makeTraceEventEpilog();
+  // Emits an event of Type
+  void emitInstrumentation(uint16_t Type, const char *Txt = nullptr);
+  //
+  // End Methods needed to support SYCL instrumentation
+
   virtual void printDot(std::ostream &Stream) const = 0;
 
   virtual const Requirement *getRequirement() const {
@@ -159,19 +188,45 @@ class Command {
   unsigned MLeafCounter = 0;
 
   const char *MBlockReason = "Unknown";
+
+  // All member variable defined here  are needed for the SYCL instrumentation
+  // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION
+  // to ensure we have the same object layout when the macro in the library and
+  // SYCL app are not the same.
+  //
+  // The event for node_create and task_begin
+  void *MTraceEvent = nullptr;
+  // The stream under which the traces are emitted; stream ids are
+  // positive integers and we set it to an invalid value
+  int32_t MStreamID = -1;
+  // Reserved for storing the object address such as SPIRV or memory object
+  // address
+  void *MAddress = nullptr;
+  // Buffer to build the address string
+  string_class MAddressString;
+  // Buffer to build the command node type
+  string_class MCommandNodeType;
+  // Buffer to build the command end-user understandable name
+  string_class MCommandName;
+  // Flag to indicate if makeTraceEventProlog() has been run
+  bool MTraceEventPrologComplete = false;
+  // Flag to indicate if this is the first time we are seeing this payload
+  bool MFirstInstance = false;
+  // Instance ID tracked for the command
+  uint64_t MInstanceID = 0;
 };
 
 // The command does nothing during enqueue. The task can be used to implement
 // lock in the graph, or to merge several nodes into one.
 class EmptyCommand : public Command {
 public:
-  EmptyCommand(QueueImplPtr Queue, Requirement Req)
-      : Command(CommandType::EMPTY_TASK, std::move(Queue)),
-        MRequirement(std::move(Req)) {}
+  EmptyCommand(QueueImplPtr Queue, Requirement Req);
 
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MRequirement; }
 
+  void emitInstrumentationData();
+
 private:
   cl_int enqueueImp() final { return CL_SUCCESS; }
 
@@ -182,11 +237,10 @@ class EmptyCommand : public Command {
 // underlying framework.
 class ReleaseCommand : public Command {
 public:
-  ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd)
-      : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) {
-  }
+  ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd);
 
   void printDot(std::ostream &Stream) const final;
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -198,12 +252,7 @@ class ReleaseCommand : public Command {
 class AllocaCommandBase : public Command {
 public:
   AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req,
-                    AllocaCommandBase *LinkedAllocaCmd)
-      : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd),
-        MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MReleaseCmd(Queue, this),
-        MRequirement(std::move(Req)) {
-    MRequirement.MAccessMode = access::mode::read_write;
-  }
+                    AllocaCommandBase *LinkedAllocaCmd);
 
   ReleaseCommand *getReleaseCmd() { return &MReleaseCmd; }
 
@@ -213,6 +262,8 @@ class AllocaCommandBase : public Command {
 
   const Requirement *getRequirement() const final { return &MRequirement; }
 
+  void emitInstrumentationData();
+
   void *MMemAllocation = nullptr;
 
   // Alloca command linked with current command.
@@ -229,8 +280,8 @@ class AllocaCommandBase : public Command {
   bool MIsLeaderAlloca = true;
 
 protected:
-  ReleaseCommand MReleaseCmd;
   Requirement MRequirement;
+  ReleaseCommand MReleaseCmd;
 };
 
 // The command enqueues allocation of instance of memory object on Host or
@@ -239,14 +290,10 @@ class AllocaCommand : public AllocaCommandBase {
 public:
   AllocaCommand(QueueImplPtr Queue, Requirement Req,
                 bool InitFromUserData = true,
-                AllocaCommandBase *LinkedAllocaCmd = nullptr)
-      : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
-                          LinkedAllocaCmd),
-        MInitFromUserData(InitFromUserData) {
-    addDep(DepDesc(nullptr, getRequirement(), this));
-  }
+                AllocaCommandBase *LinkedAllocaCmd = nullptr);
 
   void printDot(std::ostream &Stream) const final;
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -259,16 +306,11 @@ class AllocaCommand : public AllocaCommandBase {
 class AllocaSubBufCommand : public AllocaCommandBase {
 public:
   AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req,
-                      AllocaCommandBase *ParentAlloca)
-      : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue),
-                          std::move(Req),
-                          /*LinkedAllocaCmd*/ nullptr),
-        MParentAlloca(ParentAlloca) {
-    addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca));
-  }
+                      AllocaCommandBase *ParentAlloca);
 
   void printDot(std::ostream &Stream) const final;
   AllocaCommandBase *getParentAlloca() { return MParentAlloca; }
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -283,6 +325,7 @@ class MapMemObject : public Command {
 
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MSrcReq; }
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -299,6 +342,7 @@ class UnMapMemObject : public Command {
 
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MDstReq; }
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -317,6 +361,7 @@ class MemCpyCommand : public Command {
 
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MDstReq; }
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -337,6 +382,7 @@ class MemCpyCommandHost : public Command {
 
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MDstReq; }
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -351,13 +397,12 @@ class MemCpyCommandHost : public Command {
 // The command enqueues execution of kernel or explicit memory operation.
 class ExecCGCommand : public Command {
 public:
-  ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue)
-      : Command(CommandType::RUN_CG, std::move(Queue)),
-        MCommandGroup(std::move(CommandGroup)) {}
+  ExecCGCommand(std::unique_ptr<detail::CG> CommandGroup, QueueImplPtr Queue);
 
   void flushStreams();
 
   void printDot(std::ostream &Stream) const final;
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
@@ -370,12 +415,11 @@ class ExecCGCommand : public Command {
 class UpdateHostRequirementCommand : public Command {
 public:
   UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req,
-                               AllocaCommandBase *SrcAllocaCmd, void **DstPtr)
-      : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)),
-        MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {}
+                               AllocaCommandBase *SrcAllocaCmd, void **DstPtr);
 
   void printDot(std::ostream &Stream) const final;
   const Requirement *getRequirement() const final { return &MDstReq; }
+  void emitInstrumentationData();
 
 private:
   cl_int enqueueImp() final;
diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp
index 862507c1839cc..9a5f02fab02c8 100644
--- a/sycl/source/detail/scheduler/scheduler.cpp
+++ b/sycl/source/detail/scheduler/scheduler.cpp
@@ -27,11 +27,19 @@ EventImplPtr addHostAccessorToSchedulerInstance(Requirement *Req,
 }
 
 void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+  // Will contain the list of dependencies for the Release Command
+  std::set<Command *> DepCommands;
+#endif
   for (Command *Cmd : Record->MReadLeaves) {
     EnqueueResultT Res;
     bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    // Capture the dependencies
+    DepCommands.insert(Cmd);
+#endif
     GraphProcessor::waitForEvent(Cmd->getEvent());
   }
   for (Command *Cmd : Record->MWriteLeaves) {
@@ -39,6 +47,9 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
     bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    DepCommands.insert(Cmd);
+#endif
     GraphProcessor::waitForEvent(Cmd->getEvent());
   }
   for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) {
@@ -47,6 +58,11 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) {
     bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res);
     if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult)
       throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION);
+#ifdef XPTI_ENABLE_INSTRUMENTATION
+    // Report these dependencies to the Command so these dependencies can be
+    // reported as edges
+    ReleaseCmd->resolveReleaseDependencies(DepCommands);
+#endif
     GraphProcessor::waitForEvent(ReleaseCmd->getEvent());
   }
 }
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 828c344b97d7b..c0e8ec9e9edaa 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -19,7 +19,7 @@
 
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
-event handler::finalize() {
+event handler::finalize(const cl::sycl::detail::code_location &Payload) {
   sycl::event EventRet;
   unique_ptr_class<detail::CG> CommandGroup;
   switch (MCGType) {
@@ -30,52 +30,53 @@ event handler::finalize() {
         std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
         std::move(MEvents), std::move(MArgs), std::move(MKernelName),
-        std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType));
+        std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType,
+        Payload));
     break;
   }
   case detail::CG::INTEROP_TASK_CODEPLAY:
     CommandGroup.reset(new detail::CGInteropTask(
         std::move(MInteropTask), std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents), MCGType));
+        std::move(MRequirements), std::move(MEvents), MCGType, Payload));
     break;
   case detail::CG::COPY_ACC_TO_PTR:
   case detail::CG::COPY_PTR_TO_ACC:
   case detail::CG::COPY_ACC_TO_ACC:
-    CommandGroup.reset(
-        new detail::CGCopy(MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage),
-                           std::move(MAccStorage), std::move(MSharedPtrStorage),
-                           std::move(MRequirements), std::move(MEvents)));
+    CommandGroup.reset(new detail::CGCopy(
+        MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage),
+        std::move(MAccStorage), std::move(MSharedPtrStorage),
+        std::move(MRequirements), std::move(MEvents), Payload));
     break;
   case detail::CG::FILL:
     CommandGroup.reset(new detail::CGFill(
         std::move(MPattern), MDstPtr, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents)));
+        std::move(MRequirements), std::move(MEvents), Payload));
     break;
   case detail::CG::UPDATE_HOST:
     CommandGroup.reset(new detail::CGUpdateHost(
         MDstPtr, std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
-        std::move(MEvents)));
+        std::move(MEvents), Payload));
     break;
   case detail::CG::COPY_USM:
     CommandGroup.reset(new detail::CGCopyUSM(
         MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents)));
+        std::move(MRequirements), std::move(MEvents), Payload));
     break;
   case detail::CG::FILL_USM:
     CommandGroup.reset(new detail::CGFillUSM(
         std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage),
         std::move(MAccStorage), std::move(MSharedPtrStorage),
-        std::move(MRequirements), std::move(MEvents)));
+        std::move(MRequirements), std::move(MEvents), Payload));
     break;
   case detail::CG::PREFETCH_USM:
     CommandGroup.reset(new detail::CGPrefetchUSM(
         MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage),
         std::move(MSharedPtrStorage), std::move(MRequirements),
-        std::move(MEvents)));
+        std::move(MEvents), Payload));
     break;
   case detail::CG::NONE:
     throw runtime_error("Command group submitted without a kernel or a "
diff --git a/sycl/source/ordered_queue.cpp b/sycl/source/ordered_queue.cpp
index 4abf0923fef49..9829c446f7bde 100644
--- a/sycl/source/ordered_queue.cpp
+++ b/sycl/source/ordered_queue.cpp
@@ -74,10 +74,6 @@ device ordered_queue::get_device() const { return impl->get_device(); }
 
 bool ordered_queue::is_host() const { return impl->is_host(); }
 
-void ordered_queue::wait() { impl->wait(); }
-
-void ordered_queue::wait_and_throw() { impl->wait_and_throw(); }
-
 void ordered_queue::throw_asynchronous() { impl->throw_asynchronous(); }
 
 event ordered_queue::memset(void *ptr, int value, size_t count) {
@@ -88,13 +84,23 @@ event ordered_queue::memcpy(void *dest, const void *src, size_t count) {
   return impl->memcpy(impl, dest, src, count);
 }
 
-event ordered_queue::submit_impl(function_class<void(handler &)> CGH) {
-  return impl->submit(CGH, impl);
+event ordered_queue::submit_impl(function_class<void(handler &)> CGH,
+                                 const detail::code_location &CodeLoc) {
+  return impl->submit(CGH, impl, CodeLoc);
 }
 
 event ordered_queue::submit_impl(function_class<void(handler &)> CGH,
-                                 ordered_queue &secondQueue) {
-  return impl->submit(CGH, impl, secondQueue.impl);
+                                 ordered_queue &secondQueue,
+                                 const detail::code_location &CodeLoc) {
+  return impl->submit(CGH, impl, secondQueue.impl, CodeLoc);
+}
+
+void ordered_queue::wait_proxy(const detail::code_location &CodeLoc) {
+  impl->wait(CodeLoc);
+}
+
+void ordered_queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) {
+  impl->wait_and_throw(CodeLoc);
 }
 
 template <info::queue param>
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index c47eba506958d..e20b8e6016725 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -88,9 +88,6 @@ device queue::get_device() const { return impl->get_device(); }
 
 bool queue::is_host() const { return impl->is_host(); }
 
-void queue::wait() { impl->wait(); }
-
-void queue::wait_and_throw() { impl->wait_and_throw(); }
 
 void queue::throw_asynchronous() { impl->throw_asynchronous(); }
 
@@ -106,13 +103,22 @@ event queue::mem_advise(const void *ptr, size_t length, int advice) {
   return impl->mem_advise(ptr, length, advice);
 }
 
-event queue::submit_impl(function_class<void(handler &)> CGH) {
-  return impl->submit(CGH, impl);
+event queue::submit_impl(function_class<void(handler &)> CGH,
+                         const detail::code_location &CodeLoc) {
+  return impl->submit(CGH, impl, CodeLoc);
 }
 
-event queue::submit_impl(function_class<void(handler &)> CGH,
-                         queue secondQueue) {
-  return impl->submit(CGH, impl, secondQueue.impl);
+event queue::submit_impl(function_class<void(handler &)> CGH, queue secondQueue,
+                         const detail::code_location &CodeLoc) {
+  return impl->submit(CGH, impl, secondQueue.impl, CodeLoc);
+}
+
+void queue::wait_proxy(const detail::code_location &CodeLoc) {
+  impl->wait(CodeLoc);
+}
+
+void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) {
+  impl->wait_and_throw(CodeLoc);
 }
 
 template <info::queue param>
diff --git a/sycl/unittests/scheduler/BlockedCommands.cpp b/sycl/unittests/scheduler/BlockedCommands.cpp
index 1b9d192da8fac..df93a2b5f9dcd 100644
--- a/sycl/unittests/scheduler/BlockedCommands.cpp
+++ b/sycl/unittests/scheduler/BlockedCommands.cpp
@@ -22,6 +22,8 @@ class MockCommand : public detail::Command {
       : Command(detail::Command::ALLOCA, Queue) {}
   void printDot(std::ostream &Stream) const override {}
 
+  void emitInstrumentationData() override {}
+
   cl_int enqueueImp() override { return MRetVal; }
 
   cl_int MRetVal = CL_SUCCESS;
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index a7af5a7db06b1..59b77aba199be 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -23,6 +23,7 @@ class FakeCommand : public cl::sycl::detail::Command {
         MRequirement{std::move(Req)} {}
 
   void printDot(std::ostream &Stream) const override {}
+  void emitInstrumentationData() override {}
 
   const cl::sycl::detail::Requirement *getRequirement() const final {
     return &MRequirement;
diff --git a/xpti/CMakeLists.txt b/xpti/CMakeLists.txt
new file mode 100644
index 0000000000000..a23187e713809
--- /dev/null
+++ b/xpti/CMakeLists.txt
@@ -0,0 +1,55 @@
+cmake_minimum_required(VERSION 2.8.9)
+
+set(XPTI_VERSION 0.4.1)
+set(XPTI_DIR ${CMAKE_CURRENT_LIST_DIR})
+# Setting the same version as SYCL
+set(CMAKE_CXX_STANDARD 11)
+
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message(STATUS "No build type selected, default to Release")
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type (default Release)" FORCE)
+endif()
+
+project (xpti)
+
+if (MSVC)
+  # MSVC provides two incompatible build variants for its CRT: release and debug
+  # To avoid potential issues in user code we also need to provide two kinds
+  # of SYCL Runtime Library for release and debug configurations.
+  set(XPTI_CXX_FLAGS "")
+  if (CMAKE_BUILD_TYPE MATCHES "Debug")
+    set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_DEBUG}")
+    string(REPLACE "/MDd" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}")
+    string(REPLACE "/MTd" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}")
+  else()
+    if (CMAKE_BUILD_TYPE MATCHES "Release")
+      set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_RELEASE}")
+    elseif (CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo")
+      set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_MINSIZEREL}")
+    elseif (CMAKE_BUILD_TYPE MATCHES "MinSizeRel")
+      set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
+    endif()
+    string(REPLACE "/MD" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}")
+    string(REPLACE "/MT" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}")
+  endif()
+
+  # target_compile_options requires list of options, not a string
+  string(REPLACE " " ";" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}")
+
+  set(XPTI_CXX_FLAGS_RELEASE "${XPTI_CXX_FLAGS};/MD")
+  set(XPTI_CXX_FLAGS_DEBUG "${XPTI_CXX_FLAGS};/MDd")
+
+  # CMake automatically applies these flags to all targets. To override this
+  # behavior, options lists are reset.
+  set(CMAKE_CXX_FLAGS_RELEASE "")
+  set(CMAKE_CXX_FLAGS_MINSIZEREL "")
+  set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "")
+  set(CMAKE_CXX_FLAGS_DEBUG "")
+endif()
+
+set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/lib/${CMAKE_BUILD_TYPE})
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+
+include_directories(${CMAKE_SOURCE_DIR}/include)
+add_subdirectory(src)
diff --git a/xpti/README.md b/xpti/README.md
new file mode 100644
index 0000000000000..0c0e8868e305d
--- /dev/null
+++ b/xpti/README.md
@@ -0,0 +1,22 @@
+# XPTI proxy library
+
+Implementation of the instrumentation stub library to support SYCL
+Instrumentation. The stub library checks for two things before it can
+successfully dispatch event streams:
+
+1. Environment variable that indicates that tracing has been enabled.
+
+   This is defined by the variable `XPTI_TRACE_ENABLE`. The possible
+   values taken by this environment variable are:
+
+   To enable: `XPTI_TRACE_ENABLE=1` or `XPTI_TRACE_ENABLE=true`
+
+   To disable: `XPTI_TRACE_ENABLE=0` or `XPTI_TRACE_ENABLE=false`
+
+2. Environment variable that points to the XPTI dispatcher so the stub
+   library can dynamically load it and dispatch the calls to the dispatcher.
+   `XPTI_FRAMEWORK_DISPATCHER=/path/to/dispatcher.[so,dll,dylib]`
+
+The stub library requires both of these to be set for it to successfully
+dispatch the calls for the event streams. The dispatcher is required for
+tool developers to implement subscribers and register them with the dispatcher.
diff --git a/xpti/doc/SYCL_Tracing_Implementation.md b/xpti/doc/SYCL_Tracing_Implementation.md
new file mode 100644
index 0000000000000..f78823061c8b5
--- /dev/null
+++ b/xpti/doc/SYCL_Tracing_Implementation.md
@@ -0,0 +1,236 @@
+# SYCL Tracing - Part I
+
+In order to understand the various language constructs that are available
+in the SYCL layer, instrumenting them to provide a correlation between the
+end-user source code and the kernels that execute on a device must be made.
+A lightweight tracing framework (XPTI) was developed to facilitate this
+through explicit instrumentation of all the language constructs. The goal
+of this framework is to provide a low-overhead solution that tools can use
+to build performance analytical models. Based on tests, simulations and
+projections, the framework API can capture telemetry information for
+~60-70,000 events/sec with overheads less than 1% of the application runtime.
+
+This document outlines the use of this framework API at various points in
+the SYCL runtime to trace the language constructs. Each language construct
+used/expressed by the developer is associated with a source location payload
+information that includes the function name, source file name and line
+number where the construct is expressed. Using the source location
+information, a hash is created for the language construct along with a
+corresponding unique ID. The framework provides the ability to propagate
+this ID all the way to the driver layers for the device.
+
+This document discusses where in the SYCL runtime instrumentation has been
+added and the reasons behind adding this instrumentation.
+
+> **NOTE:** This document is better viewed with [Markdown Reader](https://chrome.google.com/webstore/detail/markdown-reader/gpoigdifkoadgajcincpilkjmejcaanc?hl=en) plugin for chrome or the [Markdown Preview Extension](https://github.com/shd101wyy/vscode-markdown-preview-enhanced/releases) for Visual Studio Code.
+
+## Instrumentation Trace Points
+
+This section will document all the places in the SYCL runtime that have been
+instrumented to capture the asynchronous task graphs created by the runtime.
+The task graphs are captured as graph, nodes and edges:
+
+>  - The graph encapsulates all of the disjoint task graphs generated by the application.
+>  - The nodes capture operations that are performed, such as kernel
+executions or memory transfers
+>  - The edges represent dependence relationships, the representation of
+which mimics control flow as opposed to a dependence graph. The source node
+in an edge must complete before the target node can begin execution.
+
+ All code changes to enable this have been guarded by
+ `XPTI_ENABLE_INSTRUMENTATION` macro and the CMake files have been updated to
+ have this as an option which is enabled by default and this change is under
+ `llvm/sycl/CMakeLists.txt`.
+
+```cmake
+...
+# Create a soft option for enabling or disabling the instrumentation
+# of the SYCL runtime
+option(SYCL_ENABLE_XPTI_TRACING "Enable tracing of SYCL constructs" ON)
+```
+
+### The Graph
+
+Any SYCL/DPC++ application can submit command groups to any active queue
+during the lifetime of the application. Each submission is handled by the
+runtime and the asynchronous task graphs are updated to reflect the new
+submission. This may be as simple as adding a new node to the task-graph or
+adding multiple nodes to the graph, where one of the nodes represents the
+computation and the others dependent memory transfers.
+
+To model this, we create a global graph for every application instantiation
+and all kernel executions in the applications are added as nodes in this
+global graph. In the SYCL runtime, there is no obvious location where the
+creation of the global graph can be inserted as many objects are
+instantiated statically. Currently, we embed the graph creation in the
+plugin interface (PI) layer `initialize()` call. In this call, we will
+perform two operations:
+
+1. Initialize all listeners and create a trace event to represent the graph.
+This is done in `sycl/include/CL/sycl/detail/pi.cpp`.
+2. Send a `graph_create` event to all subscribers. This notification
+will only be sent once.
+
+### The Nodes
+
+The command group lambdas are captured and encapsulated in a `Command`
+object. This object is evaluated for dependencies on data/memory or external
+OpenCL events and an asynchronous task graph is built by mapping all these
+dependencies, before it is enqueued on the device. In order to capture the
+command groups (nodes) and the dependencies (edges), the base class
+`Command` and any derived classes that are of interest are instrumented.
+
+In this section, we discuss the instrumentation of the Command object in two
+parts: (1) The changes made to capture end-user source code details for
+language constructs (2) The instrumentation that handles capturing the
+relevant metadata.
+
+1. In order to capture end-user source code information, we have implemented
+`cl::sycl::detail::code_location` class that uses the builtin functions
+in the compiler. However, equivalent implementations are unavailable on
+Windows and separate cross-platform implementation might be used in the
+future. To mitigate this, the Windows implementation will always report
+`unknown_file`, `unknown_func` and a line number of 0 for source
+file, function name and line number. We handle this case while processing
+this information.
+
+   The source information of a language construct, such as source file,
+   function name, line number and column number allow us to determine if a
+   Command that was previously created for a construct is being created
+   again. In such cases, we will not emit a `node_create` event, but we
+   will bump up the instance count recording the number of instances
+   created. Secondly, the source information allows us to associate a unique
+   ID with the source location and propagate it all the way to the driver,
+   if possible. This will allow us to associate a Kernel event with a source
+   location at all times. All instrumentation that identifies a command
+   object of a given type and emits the `node_create` event is located
+   in the `emitInstrumentationData()` and must be implemented by all
+   derived classes.
+
+   To enable this source location information, we start with enabling the
+   public methods in the queue class, such as `queue.submit()`,
+   `queue.parallel_for()`, `queue.wait()`, etc to include a default
+   argument that captures the source location information. The location of
+   the line in the caller that makes the call to `queue.submit()`,
+   `queue.parallel_for()`, etc is represented in this default argument.
+   These changes are present in `queue.hpp` and `ordered_queue.hpp`.
+   The default arguments for all public functions are guarded by
+   `#ifdef SYCL_INSTRUMENTATION_METADATA` that is currently enabled by
+   default.
+
+   The location information, when captured, is propagated all the way to the
+   `CommandGroup` object. So, for every `CommandGroup` object, we
+   will have the corresponding source location in end-user code where the
+   command group is submitted to the queue. This metadata is propagated by
+   the instrumentation to the subscribers of the stream.
+
+2. The base `Command class` and all derived classes are instrumented to capture
+   the relevant information for each command object and a `node_create` event is
+   generated.
+
+### The Node instance
+
+Once a command object is created, it is enqueued on the device for
+execution. To capture the execution of this node instance, we instrument the
+`enqueue()` method to determine the cost of this computation or memory
+related kernel. As the commands are enqueued, the enqueue method emits a
+pair of events indicating the `task_begin` and `task_end`events that
+capture the duration of the enqueued command.  For commands that are
+asynchronous, the pair of events capture just the kernel submission and the
+actual execution of the command on the device is tracked through the
+`cl_event` returned by the enqueue operation. In the case of host kernel
+execution or commands that are synchronous, the cost is measured directly.
+
+In the case of the command being submitted to an OpenCL device, we capture
+the event of the submitted kernel and propagate it to the subscriber tool.
+It is up to the tool to register a callback for this event completion and
+close the task opened for the command object.
+
+### The Edges
+
+As discussed in the previous section, the command groups submitted to the
+device queues form nodes in the asynchronous tasks graphs created by
+the SYCL runtime. In addition to these nodes, based on the memory references
+(through accessors or USM pointers), additional nodes to `allocate`,
+`copy` and `release` are created and they are necessary for the
+computation kernels to run. The computation kernel has dependencies on the
+memory objects and these dependencies are recorded as `event`s and in
+our model we represent them as edges between the dependent nodes.
+
+Tools monitoring the event stream then can start capturing the asynchronous
+task graph as it is being built. As dependencies are added to a command
+object, the instrumentation emits these dependencies as `edge_create`
+events. Each of these `edge_create`events encapsulate the two command
+objects that have a dependency through this edge. The source object of this
+edge event must complete execution first before the target object of the
+edge can begin execution.
+
+To instrument this part of the code, the `Command::addDep` methods of
+the Command object are instrumented to create the trace points and notify
+all subscribers.
+
+The `Release` command, as implemented in the SYCL runtime, has a
+reference to the memory object, but no explicit dependencies are created. To
+model the edges correctly, we instrument the `waitForRecordToFinish` method in
+the `Scheduler` where the release operation waits on all the
+dependent operations to complete to capture the edges.
+
+This concludes all the changes that were made to the SYCL runtime to support
+tracing. The next section talks about the XPTI framework that allows
+applications and runtimes to efficiently capture, record and emit trace
+notifications for important events during the run.
+
+# SYCL Tracing - Part II
+The architecture of the XPTI, when described at a rudimentary level, allows
+you to instrument any runtime or application and link it with the static
+stub library that implements all the functions in the library. If the
+tracing is enabled through the environment variable
+`XPTI_TRACE_ENABLE=1`, the stub library checks to see if the framework
+dispatcher is registered. This dispatcher is registered through an
+environment variable `XPTI_FRAMEWORK_DISPATCHER=/path/to/libxptifw.so`.
+If tracing is turned on and the dynamic loading of the framework dispatcher
+is successful, then the proxy library creates a dispatch table for all the
+trampoline functions used in the instrumentation of the application or
+runtime.
+
+The static library is the only dependency for building the SYCL runtime (or
+any application/runtime that uses this instrumentation mechanism) and is
+currently available under `llvm/xpti`. In the current organization at
+the file system level, the API specification for the instrumentation
+framework is available with the static library under `llvm/xpti/include`.
+The framework is divided into two parts: (1) the implementation of the proxy/
+stub library that will be compiled to create a static library to be linked
+with SYCL runtime and (2) a dynamic library with can be registered as the
+framework dispatcher and is not required by the SYCL runtime that is not a
+part of the `llvm` project. The dynamic library depends on the API
+specification that is a part of the static proxy library. Using the
+specification in the `llvm/xpti`, an implementation of the dynamic
+library can be built.
+
+## The Proxy library
+
+The proxy library implements all of the public functions that are a part of
+the XPTI tracing infrastructure. Each function however is a stub that checks
+to see if tracing has been enabled. If so, it forwards the call to the same
+function in the framework dispatcher or the dynamic component of this
+framework.
+
+   ```c++
+   XPTI_EXPORT_API xpti::result_t xptiInitialize(const char *stream,
+      uint32_t maj,
+      uint32_t min,
+      const char *version) {
+      // Static object g_loader will return true if
+      // XPTI_TRACE_ENABLE=1 and the dynamic dispatcher
+      // library in XPTI_FRAMEWORK_DISPATCHER is valid
+      // and successfully loaded.
+      //
+      if (xpti::g_loader.noErrors()) {
+          void *f = xpti::g_loader.functionByIndex(XPTI_INITIALIZE);
+          if (f) {
+              return (*(xpti_initialize_t)f)(stream, maj, min, version);
+          }
+      }
+      return xpti::result_t::XPTI_RESULT_FAIL;
+   }
+   ```
diff --git a/xpti/include/xpti_data_types.h b/xpti/include/xpti_data_types.h
new file mode 100644
index 0000000000000..91ef9afba6d35
--- /dev/null
+++ b/xpti/include/xpti_data_types.h
@@ -0,0 +1,543 @@
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+#pragma once
+#include <atomic>
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+
+namespace xpti {
+constexpr int invalid_id = -1;
+constexpr uint8_t default_vendor = 0;
+
+/// @brief Flag values used by the payload_t structure to mark the information
+/// present
+/// @details When a payload is created, it is conceivable that only partial
+/// information may be present and these flags are used to indicate the
+/// available information. The hash generator will generate a hash based on the
+/// flags set.
+///
+enum class payload_flag_t {
+  NameAvailable = 1,        ///< The name for the tracepoint is available
+  SourceFileAvailable = 2,  ///< Source file information available
+  CodePointerAvailable = 4, ///< Code pointer VA is available
+  LineInfoAvailable = 8,    ///< Line information available in the payload
+  ColumnInfoAvailable = 16, ///< Column information available in payload
+  HashAvailable = 2 << 16   ///< A hash is already available for this payload
+};
+
+//
+//  Helper macros for creating new tracepoint and
+//  event types
+//
+using trace_point_t = uint16_t;
+using event_type_t = uint16_t;
+using string_id_t = int32_t;
+
+using safe_flag_t = std::atomic<bool>;
+using safe_uint64_t = std::atomic<uint64_t>;
+using safe_uint32_t = std::atomic<uint32_t>;
+using safe_uint16_t = std::atomic<uint16_t>;
+using safe_int64_t = std::atomic<int64_t>;
+using safe_int32_t = std::atomic<int32_t>;
+using safe_int16_t = std::atomic<int16_t>;
+using metadata_t = std::unordered_map<string_id_t, string_id_t>;
+
+#define XPTI_EVENT(val) xpti::event_type_t(val)
+#define XPTI_TRACE_POINT_BEGIN(val) xpti::trace_point_t(val << 1 | 0)
+#define XPTI_TRACE_POINT_END(val) xpti::trace_point_t(val << 1 | 1)
+
+#define XPTI_PACK08_RET16(value1, value2) ((value1 << 8) | value2)
+#define XPTI_PACK16_RET32(value1, value2) ((value1 << 16) | value2)
+#define XPTI_PACK32_RET64(value1, value2) (((uint64_t)value1 << 32) | value2)
+
+/// @brief Payload data structure that is optional for trace point callback API
+/// @details The payload structure, if determined at compile time, can deliver
+/// the source association of various parallel constructs defined by the
+/// language. In the case it is defined, a lookup table will provide the
+/// association from a kernel/lambda (address) to a payload and the same address
+/// to a unique ID created at runtime.
+///
+/// All instances of a kernel will be associated with the same unique ID through
+/// the lifetime of an object. The hash maps that will be maintained would be:
+/// # [unique_id]->[payload]
+/// # [kernel address]->[unique_id]
+///
+/// Unique_id MUST be propagated downstream to the OpenCL runtime to ensure the
+/// associations back to the sources. This requires elp from the compiler
+/// front-end.
+///
+struct payload_t {
+  /// String ID of the function/kernel name; only valid if 'name' is valid
+  string_id_t name_sid = invalid_id;
+  /// String ID of the source file name
+  string_id_t source_file_sid = invalid_id;
+  /// Name of the trace point; graph, algorithm, lock names, for example.
+  const char *name = nullptr;
+  /// Absolute path of the source file; may have to to be unicode string
+  const char *source_file = nullptr;
+  /// Line number information to correlate the trace point
+  uint32_t line_no = invalid_id;
+  /// For a complex statement, column number may be needed to resolve the trace
+  /// point; currently none of the compiler builtins return a valid column no
+  uint32_t column_no = invalid_id;
+  /// Kernel/lambda/function address
+  const void *code_ptr_va = nullptr;
+  /// Internal bookkeeping slot - do not change.
+  uint64_t internal;
+  /// Flags indicating whether string name, codepointer, source file and hash
+  /// values are available
+  uint64_t flags = 0;
+
+  payload_t() = default;
+
+  //  If the address of the kernel/function name is provided, we mark it as
+  //  valid since we can potentially reconstruct the name and the source file
+  //  information during post-processing step of symbol resolution; this
+  //  indicates a partial but valid payload.
+  payload_t(void *codeptr) {
+    code_ptr_va = codeptr;
+    name_sid = invalid_id;        ///< Invalid string ID
+    source_file_sid = invalid_id; ///< Invalid string ID
+    name = nullptr;               ///< Invalid name string pointer
+    source_file = nullptr;        ///< Invalid source file string pointer
+    line_no = invalid_id;         ///< Invalid line number
+    column_no = invalid_id;       ///< Invalid column number
+    flags = (uint64_t)payload_flag_t::CodePointerAvailable;
+  }
+
+  //  If neither an address or the fully identifyable source file name and
+  //  location are not available, we take in the name of the
+  //  function/task/user-defined name as input and create a hash from it. We
+  //  mark it as valid since we can display the name in a timeline view, but the
+  //  payload is considered to be a partial but valid payload.
+  payload_t(const char *func_name) {
+    code_ptr_va = nullptr;
+    name_sid = invalid_id;        ///< Invalid string ID
+    source_file_sid = invalid_id; ///< Invalid string ID
+    name = func_name;             ///< Invalid name string pointer
+    source_file = nullptr;        ///< Invalid source file string pointer
+    line_no = invalid_id;         ///< Invalid line number
+    column_no = invalid_id;       ///< Invalid column number
+    flags = (uint64_t)(payload_flag_t::NameAvailable);
+  }
+
+  payload_t(const char *func_name, void *codeptr) {
+    code_ptr_va = codeptr;
+    name_sid = invalid_id;        ///< Invalid string ID
+    source_file_sid = invalid_id; ///< Invalid string ID
+    name = func_name;             ///< Invalid name string pointer
+    source_file = nullptr;        ///< Invalid source file string pointer
+    line_no = invalid_id;         ///< Invalid line number
+    column_no = invalid_id;       ///< Invalid column number
+    flags = (uint64_t)payload_flag_t::NameAvailable |
+            (uint64_t)payload_flag_t::CodePointerAvailable;
+  }
+
+  //  We need the payload to contain at the very least, the code pointer
+  //  information of the kernel or function. In the full payload case, we will
+  //  also have the function name and source file name along with the line and
+  //  column number of the trace point that forms the payload.
+  payload_t(const char *kname, const char *sf, int line, int col,
+            void *codeptr) {
+    code_ptr_va = codeptr;
+    /// Invalid string ID as the string hasn't been registered yet
+    name_sid = invalid_id;
+    source_file_sid = invalid_id;
+    /// Capture the rest of the parameters
+    name = kname;
+    source_file = sf;
+    line_no = line;
+    column_no = col;
+    flags = (uint64_t)payload_flag_t::NameAvailable |
+            (uint64_t)payload_flag_t::SourceFileAvailable |
+            (uint64_t)payload_flag_t::LineInfoAvailable |
+            (uint64_t)payload_flag_t::ColumnInfoAvailable |
+            (uint64_t)payload_flag_t::CodePointerAvailable;
+  }
+};
+
+///  @brief Enumerator defining the global/basic trace point types
+///  @details The frame work defines the global/basic trace point types
+///  that are necessary for modeling parallel runtimes. A helper macro
+///  provided to create the enum values as the LSB is reserved for
+///  determining if the trace point is a 'begin' trace or an 'end'
+///  trace. This reserved bit is used by the scoped_notify() class
+///  to automatically send the closing enum trace type for a given
+///  trace point type.
+///
+///  The provided macros TRACE_POINT_BEGIN(val) and TRACE_POINT_END(val)
+///  must be used in all user defined enums that are defined to extend
+///  the trace point types.
+///
+///  The trace_type data is of type uint8_t and the 7-LSB bits are used
+///  to enumerate trace types. the MSB bit is reserved for user-defined
+///  trace types and is set to 0 for predefined trace point types defined
+///  by the framework.
+///
+///  When user-defined trace types are being declared, a new ID is added
+///  to this value to create a uint16_t data type. The LSB 8-bits have
+///  the 8th bit set indicating that it is user-defined and the remaining
+///  7-bits will indicated the user defined trace point type. However,
+///  since multiple tools or vendors could create their own trace point
+///  types, we require the vendor_id to create a vendor namespace to avoid
+///  collisions.
+///
+///                                  user-defined bit
+///                                    |
+///                                    |
+///                                    |+-----+---- 127 possible values for
+///                                    ||     |     defining trace types.
+///                                    ||     |     Due to the scope bit,
+///                                    ||     |     63 unique scope types
+///                                    ||     |     can be defined.
+///                                    vv     v
+///   Field width (uint16_t) |........|........|
+///                          15      8 7      0
+///                           ^      ^
+///                           |      |
+///                           |      |
+///                           +------+----- Reserved for vendor ID
+///
+enum class trace_point_type_t : uint16_t {
+  unknown_type = 0,
+  /// Indicates that a graph has been instantiated
+  graph_create = XPTI_TRACE_POINT_BEGIN(1),
+  /// Indicates that a new node object has been instantiated
+  node_create = XPTI_TRACE_POINT_BEGIN(2),
+  /// Indicates that a new edge object has been instantiated
+  edge_create = XPTI_TRACE_POINT_BEGIN(3),
+  /// Indicates the beginning of a parallel region
+  region_begin = XPTI_TRACE_POINT_BEGIN(4),
+  /// Indicates the end of a parallel region
+  region_end = XPTI_TRACE_POINT_END(4),
+  /// Indicates the begin of a task execution, the parent of which could be a
+  /// graph or a parallel region
+  task_begin = XPTI_TRACE_POINT_BEGIN(5),
+  /// Indicates the end of an executing task
+  task_end = XPTI_TRACE_POINT_END(5),
+  /// Indicates the begin of a barrier call
+  barrier_begin = XPTI_TRACE_POINT_BEGIN(6),
+  /// Indicates the end of a barrier
+  barrier_end = XPTI_TRACE_POINT_END(6),
+  /// Similar to barrier begin, but captures the information for a lock
+  lock_begin = XPTI_TRACE_POINT_BEGIN(7),
+  /// Similar to barrier end, but captures the information for a lock
+  lock_end = XPTI_TRACE_POINT_END(7),
+  /// Use to model triggers (impulse) at various points in time - will not have
+  /// an end equivalent
+  signal = XPTI_TRACE_POINT_BEGIN(8),
+  /// Used to model the data transfer initiation from device A to device B
+  transfer_begin = XPTI_TRACE_POINT_BEGIN(9),
+  /// Used to model the completion of a previously initiated data transfer
+  /// event
+  transfer_end = XPTI_TRACE_POINT_END(9),
+  /// Is present for completeness to capture the spawning of new threads in a
+  /// runtime
+  thread_begin = XPTI_TRACE_POINT_BEGIN(10),
+  /// Models the end of the lifetime of a thread
+  thread_end = XPTI_TRACE_POINT_END(10),
+  /// Models the explicit barrier begin in SYCL
+  wait_begin = XPTI_TRACE_POINT_BEGIN(11),
+  /// Models the explicit barrier end in SYCL
+  wait_end = XPTI_TRACE_POINT_END(11),
+  /// Indicates that the trace point is user defined and only the tool defined
+  /// for a stream will be able to handle it
+  user_defined = 1 << 7
+};
+
+///  @brief Enumerator defining the global/basic trace event types
+///  @details The frame work defines the global/basic trace event types that are
+///  necessary for modeling parallel runtimes.
+///
+///  The event_type data is of type uint8_t and the 7-LSB bits are used to
+///  enumerate event types. the MSB bit is reserved for user-defined event types
+///  and is set to 0 for predefined event types defined by the framework.
+///
+///  When user-defined event types are being declared, a new ID is added to this
+///  value to create a uint16_t data type. The LSB 8-bits have the 8th bit set
+///  indicating that it is user-defined and the remaining 7-bits will indicated
+///  the user defined trace event type. However, since multiple tools or vendors
+///  could create their own trace event types, we require the vendor_id to
+///  create a vendor namespace to avoid collisions.
+///
+///                                  user-defined bit
+///                                    |
+///                                    |
+///                                    |+-----+---- 127 possible values for
+///                                    ||     |     defining event types.
+///                                    ||     |
+///                                    ||     |
+///                                    ||     |
+///                                    vv     v
+///   Field width (uint16_t) |........|........|
+///                          15      8 7      0
+///                           ^      ^
+///                           |      |
+///                           |      |
+///                           +------+----- Reserved for vendor ID
+enum class trace_event_type_t : uint16_t {
+  /// In this case, the callback can choose to map it to something called
+  /// unknown or ignore it entirely
+  unknown_event = 0,
+  /// Event type is graph - usually reported for traces from  graph or for
+  /// graph, node or edge object creation
+  graph = XPTI_EVENT(1),
+  /// Algorithm type describes a parallel algorithm such as a parallel_for
+  algorithm = XPTI_EVENT(2),
+  /// Barrier event is usually a synchronization type that causes threads to
+  /// wait until something happens and found in parallel algorithms and explicit
+  /// synchronization use cases in asynchronous programming
+  barrier = XPTI_EVENT(3),
+  /// Activity in the scheduler that is not useful work is reported as this
+  /// event type
+  scheduler = XPTI_EVENT(4),
+  /// Asynchronous activity event
+  async = XPTI_EVENT(5),
+  /// Synchronization event - only the contention time is captured by this
+  /// event and marked as overhead
+  lock = XPTI_EVENT(6),
+  /// Indicates that the current event is an offload read request
+  offload_read = XPTI_EVENT(7),
+  /// Indicates that the current event is an offload write request
+  offload_write = XPTI_EVENT(8),
+  /// User defined event for extensibility and will have to be registered by
+  /// the tool/runtime
+  user_defined = 1 << 7
+};
+
+enum class trace_activity_type_t {
+  /// Activity type is unknown; it is upto the collector handling the callback
+  /// to mark it as needed
+  unknown_activity = 0,
+  /// Any activity reported by the tracing that results in useful work, hence
+  /// active time
+  active = 1,
+  /// Activity that was primarily due to overheads such as time spent in
+  /// barriers and schedulers, acquiring locks, etc
+  overhead = 1 << 1,
+  /// Activities that may be considered as background tasks; for example,
+  /// asynchronous activities or region callbacks that are placeholders for
+  /// nested activities
+  background = 1 << 2,
+  /// Explicit sleeps could be a result of calling APIs that result in zero
+  /// active time
+  sleep_activity = 1 << 3
+};
+
+struct reserved_data_t {
+  /// Has a reference to the associated payload field for an event
+  payload_t *payload;
+  /// Has additional metadata that may be defined by the user as key-value
+  /// pairs
+  metadata_t metadata;
+};
+
+struct trace_event_data_t {
+  /// Unique id that corresponds to an event type or event group type
+  int64_t unique_id = invalid_id;
+  /// Data ID: ID that tracks the data elements streaming through the algorithm
+  /// (mostly graphs; will be the same as instance_id for algorithms)
+  uint64_t data_id = 0;
+  /// Instance id of an algorithm with id=unique_id
+  uint64_t instance_id = 0;
+  /// The type of event
+  uint16_t event_type;
+  /// How this event is classified: active, overhead, barrier etc
+  uint16_t activity_type;
+  /// Unused 32-bit slot that could be used for any ids that need to be
+  /// propagated in the future
+  uint32_t unused;
+  /// If event_type is "graph" and trace_type is "edge_create", then the source
+  /// ID is set
+  int64_t source_id = invalid_id;
+  /// If event_type is "graph" and trace_type is "edge_create", then the target
+  /// ID is set
+  int64_t target_id = invalid_id;
+  /// A reserved slot for memory growth, if required by the framework
+  reserved_data_t reserved;
+  /// User defined data, if required; owned by the user shared object and will
+  /// not be deleted when event data is destroyed
+  void *user_data = nullptr;
+};
+
+///
+///  The error code list is incomplete and still
+///  being defined.
+///
+enum class result_t : int32_t {
+  // Success codes here (values >=0)
+  XPTI_RESULT_SUCCESS = int32_t(0),
+  XPTI_RESULT_FALSE = int32_t(1),
+  // Error codes here (values < 0)
+  XPTI_RESULT_FAIL = int32_t(0x80004001),
+  XPTI_RESULT_NOTIMPL = int32_t(0x80004002),
+  XPTI_RESULT_DUPLICATE = int32_t(0x80004003),
+  XPTI_RESULT_NOTFOUND = int32_t(0x80004004),
+  XPTI_RESULT_UNDELETE = int32_t(0x80004005),
+  XPTI_RESULT_INVALIDARG = int32_t(0x80004006)
+};
+
+// These defines are present to enable plugin developers
+// who want to subscribe to the streams from the framework
+//
+#if defined(_WIN64) || defined(_WIN32) /* Windows */
+#ifdef XPTI_CALLBACK_API_EXPORTS
+#define XPTI_CALLBACK_API __declspec(dllexport)
+#else
+#define XPTI_CALLBACK_API __declspec(dllimport)
+#endif
+#else /* Generic Unix/Linux */
+#ifdef XPTI_CALLBACK_API_EXPORTS
+#define XPTI_CALLBACK_API __attribute__((visibility("default")))
+#else
+#define XPTI_CALLBACK_API
+#endif
+#endif
+/// @brief Callback function prototype
+/// @details All callback functions that are registered with
+/// the tracing framework have this signature.
+///
+/// @param [in] trace_type The trace type for which this callback has been
+/// invoked.
+/// @param [in] parent  Parent object for which the current object/trace is a
+/// child of. If the current trace is not nested, the parent object will be
+/// NULL.
+/// @param [in] child  Child object for this callback has been invoked.
+/// @param [in] user_data Data sent by the caller which can be anything and the
+/// tool trying to interpret it needs to know the type for the handshake to be
+/// successful. Most of the time, this field is used to send in const char *
+/// data.
+typedef void (*tracepoint_callback_api_t)(uint16_t trace_type,
+                                          xpti::trace_event_data_t *parent,
+                                          xpti::trace_event_data_t *child,
+                                          uint64_t instance,
+                                          const void *user_data);
+typedef void (*plugin_init_t)(unsigned int, unsigned int, const char *,
+                              const char *);
+typedef void (*plugin_fini_t)(const char *);
+
+constexpr uint16_t trace_task_begin =
+    static_cast<uint16_t>(xpti::trace_point_type_t::task_begin);
+constexpr uint16_t trace_task_end =
+    static_cast<uint16_t>(xpti::trace_point_type_t::task_end);
+constexpr uint16_t trace_wait_begin =
+    static_cast<uint16_t>(xpti::trace_point_type_t::wait_begin);
+constexpr uint16_t trace_wait_end =
+    static_cast<uint16_t>(xpti::trace_point_type_t::wait_end);
+constexpr uint16_t trace_barrier_begin =
+    static_cast<uint16_t>(xpti::trace_point_type_t::barrier_begin);
+constexpr uint16_t trace_barrier_end =
+    static_cast<uint16_t>(xpti::trace_point_type_t::barrier_end);
+constexpr uint16_t trace_graph_create =
+    static_cast<uint16_t>(xpti::trace_point_type_t::graph_create);
+constexpr uint16_t trace_node_create =
+    static_cast<uint16_t>(xpti::trace_point_type_t::node_create);
+constexpr uint16_t trace_edge_create =
+    static_cast<uint16_t>(xpti::trace_point_type_t::edge_create);
+constexpr uint16_t trace_signal =
+    static_cast<uint16_t>(xpti::trace_point_type_t::signal);
+
+constexpr uint16_t trace_graph_event =
+    static_cast<uint16_t>(xpti::trace_event_type_t::graph);
+constexpr uint16_t trace_algorithm_event =
+    static_cast<uint16_t>(xpti::trace_event_type_t::algorithm);
+} // namespace xpti
+
+using xpti_tp = xpti::trace_point_type_t;
+using xpti_te = xpti::trace_event_type_t;
+using xpti_at = xpti::trace_activity_type_t;
+using xpti_td = xpti::trace_event_data_t;
+
+extern "C" {
+/// @brief The framework loads the tool which implements xptiTraceInit() and
+/// calls it when the runtime is being initialized
+/// @details When tools implement callbacks and want to register them with
+/// the runtime, they must implement the xptiTraceInit() and xptiTraceFinish()
+/// functions and the runtime will try to resolve these symbols on load.
+/// xptiTraceInit() is then called by the runtime so that the tool knows when
+/// the runtime is instantiated so it can register its callbacks in the
+/// xptiTraceInit() function.
+///
+/// When the runtime calls the tool's implementation of the xptiTraceInit()
+/// function, it also provides the version of the runtime that is invoking the
+/// init call. This allows tools implementers to handle certain calls based on
+/// the runtime version the tools supports.
+///
+/// @code
+/// void XPTI_CALLBACK_API xptiTraceInit
+///     (
+///        unsigned int maj,
+///        unsigned int min,
+///        const char *version,
+///        const char *stream_name
+///     )
+/// {
+///     std::string v = version; // make a copy of the version string
+///     if(maj < 3) {
+///       // do something here like registering callbacks
+///       g_stream_id = xptiRegisterStream(stream_name);
+///       xptiRegisterCallback(g_stream_id, graph_create, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, node_create, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, edge_create, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, region_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, region_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, task_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, task_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, barrier_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, barrier_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, lock_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, lock_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, transfer_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, transfer_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, thread_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, thread_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, wait_begin, trace_point_begin);
+///       xptiRegisterCallback(g_stream_id, wait_end, trace_point_end);
+///       xptiRegisterCallback(g_stream_id, signal, trace_point_begin);
+///     } else {
+///       // report incompatible tool error message
+///     }
+/// }
+/// @endcode
+///
+/// @param [in]  major_version The major version of the runtime
+/// @param [in]  minor_version The minor version of the runtime. if the version
+/// consists a tertiary number, it will not be reported. For example, if we have
+/// a version number 5.1.23776, the only 5 and 1 we be reported for major and
+/// minor versions. The API assumes that semantic versioning is being used for
+/// the runtime/application.
+///
+/// @see https://semver.org/ Major revision number change will break API
+/// compatibility. Minor revision number change will always be backward
+/// compatible, but may contain additional functionality.
+///
+/// @param [in]  version_str Null terminated version string. This value is
+/// guaranteed to be valid for the duration of the xptiTraceInit() call.
+/// @param [in]  stream_name Null terminated string indicating the stream name
+/// that is invoking this xptiTraceInit() call. This value is guaranteed to be
+/// valid for the duration of the xptiTraceInit() call.
+/// @return none
+XPTI_CALLBACK_API void xptiTraceInit(unsigned int major_version,
+                                     unsigned int minor_version,
+                                     const char *version_str,
+                                     const char *stream_name);
+
+/// @brief Function to handle unloading of the module or termination of
+/// application
+/// @details This function will get called when the application
+/// or the runtime implementing the trace point is about to be
+/// unloaded or terminated.
+///
+/// @param [in]  stream_name Null terminated string indicating the stream name
+/// that is invoking this xptiTraceFinish() call. This value is guaranteed to be
+/// valid for the duration of the xptiTraceFinish() call. The subscriber who has
+/// subscribed to this stream can now free up all internal data structures and
+/// memory that has been allocated to manage the stream data.
+XPTI_CALLBACK_API void xptiTraceFinish(const char *stream_name);
+}
diff --git a/xpti/include/xpti_trace_framework.h b/xpti/include/xpti_trace_framework.h
new file mode 100644
index 0000000000000..c91294acd12e8
--- /dev/null
+++ b/xpti/include/xpti_trace_framework.h
@@ -0,0 +1,414 @@
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+#pragma once
+#include "xpti_data_types.h"
+
+#if defined(XPTI_STATIC_LIBRARY)
+// If we are building or using the proxy
+// static library we don't export any symbols
+//
+#define XPTI_EXPORT_API
+#else
+#if defined(_WIN64) || defined(_WIN32) /* Windows */
+#ifdef XPTI_API_EXPORTS
+#define XPTI_EXPORT_API __declspec(dllexport)
+#else
+#define XPTI_EXPORT_API __declspec(dllimport)
+#endif
+#else /* Generic Unix/Linux */
+#ifdef XPTI_API_EXPORTS
+#define XPTI_EXPORT_API __attribute__((visibility("default")))
+#else
+#define XPTI_EXPORT_API
+#endif
+#endif
+#endif
+
+#define XPTI_EXTRACT_USER_DEFINED_ID(val) ((uint16_t)val & 0x007f)
+#define XPTI_TOOL_ID(val) (((uint16_t)val >> 8) & 0x00ff)
+
+extern "C" {
+
+/// @brief Initialization function that is called when a new stream is generated
+/// @details When a runtime or application that uses XPTI instrumentation API
+/// starts to generate a new stream, a call to xptiInitialize() must be made to
+/// let all subscribers know that a new stream is being generated. If the
+/// subscribers are interested in this stream, they can the choose to subscribe
+/// to the stream.
+/// @param stream Name of the stream, for example "sycl", "opencl" etc
+/// @param maj Major version number
+/// @param min Minor version number
+/// @param version Full version as a string
+/// @return None
+XPTI_EXPORT_API xpti::result_t xptiInitialize(const char *stream, uint32_t maj,
+                                              uint32_t min,
+                                              const char *version);
+
+/// @brief Finalization function that is called when a stream halted
+/// @details When a runtime or application that uses XPTI instrumentation API
+/// stops generating the stream, a call to xptiFinalize() must be made to let
+/// all subscribers know that the stream identified by 'stream' has stopped
+/// generating events. If the subscribers are registered to receive events from
+/// this stream, they can choose to unsubscribe from the stream or handle the
+/// situation when the stream stop sending events.
+/// @param stream Name of the stream, for example "sycl", "opencl" etc
+/// @return None
+XPTI_EXPORT_API void xptiFinalize(const char *stream);
+
+/// @brief Generates a unique ID
+/// @details When a tool is subscribing to the event stream and wants to
+/// generate task IDs that do not collide with unique IDs currently being
+/// generated for nodes, edges and graphs, this API can be used. Any time, a
+/// task that represents the instance of a node executing on the device is being
+/// traced, the event ID corresponds to the unique ID of the node it represents
+/// and in order to disambiguate the instances, a task ID can be generated and
+/// sent as the instance ID for that task.
+XPTI_EXPORT_API uint64_t xptiGetUniqueId();
+
+/// @brief Register a string to the string table
+/// @details All strings in the XPTI framework are referred to by their string
+/// IDs and this method allow you to register a string and get the string ID for
+/// it. In addition to the string ID, a reference to the string in the string
+/// table is also returned. This lifetime of this string reference is equal to
+/// the lifetime of the XPTI framework.
+/// @param string The string to be registered with the string table. If the
+/// string already exists in the string table, the previous ID is returned along
+/// with the reference to the string in the string table.
+/// @param table_string A reference to the string in the string table. This
+/// string reference is guaranteed to be valid for the lifetime of the XPTI
+/// framework.
+/// @return The string ID of the string being registered. If an error occurs
+/// during registration, xpti::invalid_id is returned.
+XPTI_EXPORT_API xpti::string_id_t xptiRegisterString(const char *string,
+                                                     char **table_string);
+
+/// @brief Lookup a string in the string tablewith its string ID
+/// @details All strings in the XPTI framework are referred to by their string
+/// IDs and this method allows you to lookup a string by its string ID. The
+/// lifetime of the returned string reference is equal to the lifetime of the
+/// XPTI framework.
+/// @param id The string ID of the string to lookup.
+/// @return A reference to the string identified by the string ID.
+XPTI_EXPORT_API const char *xptiLookupString(xpti::string_id_t id);
+
+/// @brief Register a stream by its name and get a stream ID
+/// @details When events in a given stream have to be notified to the
+/// subscribers, the stream ID to which the events belong to is required. This
+/// method will register a stream by its name and return an ID that can be used
+/// for notifications.
+/// @param stream_name The stream name that needs to be registered.
+/// @return The stream ID. If the stream has already been registered, the
+/// previously generated stream ID is returned.
+XPTI_EXPORT_API uint8_t xptiRegisterStream(const char *stream_name);
+
+/// @brief Unregister a stream by its name
+/// @details Unregistering a stream will invalidate the stream ID associated
+/// with it by calling xptiFinalize() on all subscribers registered to this
+/// stream and disabling all registered callbacks for this stream.
+/// @param stream_name The stream name that needs to be unregistered.
+/// @return The result code is XPTI_RESULT_SUCCESS when successful and
+/// XPTI_RESULT_NOTFOUND if the stream is not found.
+XPTI_EXPORT_API xpti::result_t xptiUnregisterStream(const char *stream_name);
+
+/// @brief Registers a user defined trace point
+/// @details The framework allows applications or runtimes using the framework
+/// to extend the pre-defined tracepoint types. In order to facilitate this, a
+/// tool name must be provided. This allows multiple vendors to instrument and
+/// extend different software modules and have them behave well when put
+/// together. However, the tool_name must be unique for this to behave well.
+///
+/// @code
+///   typedef enum {
+///     my_tp_extn1_begin = XPTI_TRACE_POINT_BEGIN(0),
+///     my_tp_extn1_end = XPTI_TRACE_POINT_END(0),
+///     my_tp_extn2_begin = XPTI_TRACE_POINT_BEGIN(1),
+///     my_tp_extn2_end = XPTI_TRACE_POINT_END(1)
+///   }tp_extension_t;
+///   ...
+///   uint16_t tp1_start = xptiRegisterUserDefinedTracePoint("myTest",
+///                       my_tp_extn1_begin);
+///   uint16_t tp1_end = xptiRegisterUserDefinedTracePoint("myTest",
+///                         my_tp_extn1_end);
+///   uint16_t tp2_start = xptiRegisterUserDefinedTracePoint("myTest",
+///                       my_tp_extn2_begin);
+///   uint16_t tp2_end = xptiRegisterUserDefinedTracePoint("myTest",
+///                         my_tp_extn2_end);
+///   ...
+///   xptiNotifySubscribers(stream_id, tp1_start, parent, event, instance,
+///                         nullptr);
+/// @endcode
+///
+/// @param tool_name The tool name that is extending tracepoint types for its
+/// use.
+/// @param user_defined_tp The user defined tracepoint is a value ranging from
+/// 0-127, which would allow vendors to create 64 pairs of tracepoints.
+/// @return The result code is XPTI_RESULT_SUCCESS when successful and
+/// XPTI_RESULT_NOTFOUND if the stream is not found.
+XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedTracePoint(
+    const char *tool_name, uint8_t user_defined_tp);
+
+/// @brief Registers a user defined event type
+/// @details The framework allows applications or runtimes using the framework
+/// to extend the pre-defined event types. In order to facilitate this, a
+/// tool name must be provided. This allows multiple vendors to instrument and
+/// extend different software modules and have them behave well when put
+/// together. However, the tool_name must be unique for this to behave well.
+///
+/// @code
+///   typedef enum {
+///     my_ev_extn1 = XPTI_EVENT(0),
+///     my_ev_extn2 = XPTI_EVENT(1)
+///   } event_extension_t;
+///   ...
+///   uint16_t my_ev1 = xptiRegisterUserDefinedEventType("myTest", my_ev_extn1);
+///   uint16_t my_ev2 = xptiRegisterUserDefinedEventType("myTest", my_ev_extn2);
+///   ...
+///   uint64_t InstanceNo;
+///   MyEvent = xptiMakeEvent("application_foo", &Payload,
+///                           my_ev1, xpti::trace_activity_type_t::active,
+///                           &InstanceNo);
+/// @endcode
+///
+/// In order for an notification to be received for such an event, a callback
+/// must be registered.
+///
+/// @param tool_name The tool name that is extending tracepoint types for its
+/// use.
+/// @param user_defined_event The user defined event is a value ranging
+/// from 0-127, which would allow vendors to create 127 new events under
+/// tool_name.
+/// @return The result code is XPTI_RESULT_SUCCESS when successful and
+/// XPTI_RESULT_NOTFOUND if the stream is not found.
+XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedEventType(
+    const char *tool_name, uint8_t user_defined_event);
+
+/// @brief Creates a trace point event
+/// @details When the application or runtime wants to instrument interesting
+/// sections of the code, they can create trace point events that represent
+/// these sections and use the created event to notify subscribers that such an
+/// event ocurred. Each created event will have a unique ID. If the same payload
+/// is provided to the xptiMakeEvent() function, the same trace event is
+/// returned after looking up the invariant information in the payload
+/// parameter. If the unique ID or the event itself has been cached, there will
+/// be no lookup costs. However, if they are not cached, the same payload is
+/// provded each time the section is encountered and the event that has been
+/// created previously will be returned. This will however incur a lookup cost
+/// and it is recommended that this be avoided to keep the instrumentation
+/// overheads minimal.
+///
+/// @code
+///   uint64_t InstanceNo;
+///   trace_event_data_t *MyEvent;
+///   xpti::payload_t Payload("foo", "foo.cpp", 100, 0, (void *)this);
+///   MyEvent = xptiMakeEvent("foo", &Payload,
+///                           xpti::trace_event_type_t::algorithm,
+///                           xpti::trace_activity_type_t::active,
+///                           &InstanceNo);
+///
+///   // Cache MyEvent locally so it can be used the next time around by
+///   // avoiding a lookup
+/// @endcode
+///
+/// @param name The name of the event, typically the function name or kernel
+/// name, etc
+/// @param payload The payload that uniquely describes the trace point which can
+/// be done by the function name, source file name and line number within the
+/// source file and the address of the function, for example.
+/// @param event The event type of the current trace event being created, as in
+/// is it a graph event or an algorithm event, etc.
+/// @param activity The activity type for the event - as in active, background,
+/// overhead etc.
+/// @param instance_no This value is returned by the framework and represents
+/// the instance number of this event. If the same event is attempted to be
+/// created again, the instance ID give you an indication of how many times this
+/// section has been visited.
+/// @return The trace event representing the section's payload is returned.
+XPTI_EXPORT_API xpti::trace_event_data_t *
+xptiMakeEvent(const char *name, xpti::payload_t *payload, uint16_t event,
+              xpti::trace_activity_type_t activity, uint64_t *instance_no);
+
+/// @brief Retrieves a trace event given the unique id of the event
+/// @details If the unique ID of a trace event is cached, this function allows
+/// you to query the framework for the trace event data structure.
+///
+/// @param uid The unique ID of the event for which the lookup needs to be
+/// performed
+/// @return The trace event with unique ID equal to uid. If the unique ID is not
+/// present, then nullptr will be returned.
+XPTI_EXPORT_API const xpti::trace_event_data_t *xptiFindEvent(int64_t uid);
+
+/// @brief Retrieves the payload information associated with an event
+/// @details An event encapsulates the unique payload it represents and this
+/// function allows you to query the payload with the trace event data pointer.
+///
+/// @param lookup_object The trace event object for which the payload
+/// information must be retrieved.
+/// @return The payload data structure pointer for the event.
+XPTI_EXPORT_API const xpti::payload_t *
+xptiQueryPayload(xpti::trace_event_data_t *lookup_object);
+
+/// @brief Registers a callback for a trace point type
+/// @details Subscribers receive notifications to the trace point types they
+/// register a callback with. This function allows subscribers to register the
+/// same or different callback with all trace point types.
+///
+/// @param stream_id The stream for which the registration is requested
+/// @param trace_type The trace point type for which the registration must be
+/// made. For example, you can register a different call back for
+/// xpti::trace_point_type_t::task_begin and xpti::trace_point_type_t::task_end.
+/// @param cb The callback function who's signature is of the type
+/// xpti::tracepoint_callback_api_t
+/// @return The result code which can be one of:
+///            1. XPTI_RESULT_SUCCESS when the registration is successful
+///            2. XPTI_RESULT_DUPLICATE when the callback function has already
+///               been registered for the stream and trace point type
+///            3. XPTI_RESULT_UNDELETE when the registration is for a callback
+///               that had been previously unregistered.
+XPTI_EXPORT_API xpti::result_t
+xptiRegisterCallback(uint8_t stream_id, uint16_t trace_type,
+                     xpti::tracepoint_callback_api_t cb);
+
+/// @brief Unregisters a previously registered callback for a trace point type
+/// @details Subscribers receive notifications to the trace point types they
+/// register a callback with. This function allows subscribers to unregister
+/// any previously registered callback  functions with this function so they can
+/// stop receiving notifications.
+///
+/// @param stream_id The stream for which the registration must be disabled
+/// @param trace_type The trace point type for which the registration must be
+/// disabled.
+/// @param cb The callback function who's signature is of the type
+/// xpti::tracepoint_callback_api_t and must be disabled.
+/// @return The result code which can be one of:
+///            1. XPTI_RESULT_SUCCESS when the unregistration is successful
+///            2. XPTI_RESULT_DUPLICATE when the callback function has already
+///               been disabled for the stream and trace point type
+///            3. XPTI_RESULT_NOTFOUND if the callbackhas not been previously
+///               registered.
+XPTI_EXPORT_API xpti::result_t
+xptiUnregisterCallback(uint8_t stream_id, uint16_t trace_type,
+                       xpti::tracepoint_callback_api_t cb);
+
+/// @brief Notifies all registered subscribers that an event has occurred
+/// @details Subscribers receive notifications to the trace point types they
+/// register a callback with. This function allows subscribers to unregister
+/// any previously registered callback  functions with this function so they can
+/// stop receiving notifications.
+///
+/// @param stream_id The stream for which the registration must be disabled
+/// @param trace_type The trace point type for which the notification is being
+/// sent out
+/// @param parent The parent trace event type for the current event. If none
+/// exist, this can be nullptr.
+/// @param object The event object for which the notification must be sent out.
+/// @param instance The instance number of the current event and this value is
+/// guaranteed to be static for the duration of the callback handler.
+/// @param temporal_user_data This is the field where each tool can send in some
+/// state information and the handshake of the type of this data type must be
+/// handled by extending tracepoint types that handle diffent types od user
+/// data.
+/// @return The result code which can be one of:
+///            1. XPTI_RESULT_SUCCESS when the notification is successful
+XPTI_EXPORT_API xpti::result_t
+xptiNotifySubscribers(uint8_t stream_id, uint16_t trace_type,
+                      xpti::trace_event_data_t *parent,
+                      xpti::trace_event_data_t *object, uint64_t instance,
+                      const void *temporal_user_data);
+
+/// @brief Associates <key-value> pairs with an event
+/// @details If the instrumentation embedded in applications need to send
+/// additional metadata to the framwork and eventually the subscribers, this
+/// function can be used. The metadata is of the form of <key-value> pairs and
+/// are only of string types. Internall, the data is represented as <key-value>
+/// pairs of string IDs, so when one queries the metadata, they must look up the
+/// value's string ID.
+///
+/// @param e The event for which the metadata is being added
+/// @param key The key that identifies the metadata as a string
+/// @param value The value for the key as a string
+/// @return The result code which can be one of:
+///            1. XPTI_RESULT_SUCCESS when the add is successful
+///            2. XPTI_RESULT_INVALIDARG when the inputs are invalid
+///            3. XPTI_RESULT_DUPLICATE when the key-value pair already exists
+XPTI_EXPORT_API xpti::result_t xptiAddMetadata(xpti::trace_event_data_t *e,
+                                               const char *key,
+                                               const char *value);
+
+/// @brief Query the metadata table for a given event
+/// @details In order to retrieve metadata information for a given event, you
+/// must get the metadata tables and perform your queries on this table.
+/// @code
+///    auto m = xptiQueryMetadata(event);
+///    // Example of printing all the metadata contents
+///    for( auto &md : m ) {
+///        printf("++ %20s:%s\n", xptiLookupString(md.first),
+///                               xptiLookupString(md.second));
+///    }
+///    // Here's an example of a query on the table
+///    char *table_string;
+///    xpti::string_id_t key_id = xptiRegisterString("myKey", &table_string);
+///    auto index = m.find(key_id);
+///    if(index != m.end()) {
+///       // Retrieve the value
+///       const char *value = xptiLookupString((*index).second);
+///    }
+/// @endcode
+///
+/// @param e The event for which the metadata is being requested
+/// @return The metadata table of type xpti::metadata_t *
+XPTI_EXPORT_API xpti::metadata_t *
+xptiQueryMetadata(xpti::trace_event_data_t *e);
+
+/// @brief Returns a bool that indicates whether tracing is enabled or not
+/// @details If the tracing is enabled by the XPTI_TRACE_ENABLE=1 environment
+/// variable, a valid dispatcher for dispatching calls to the framework and if
+/// there exists one or more valid subscribers, then this function will return
+/// true, else false
+/// @return bool that indicates whether it is enabled or not
+XPTI_EXPORT_API bool xptiTraceEnabled();
+
+/// @brief Resets internal state
+/// @details This method is currently ONLY used by the tests and is NOT
+/// recommended for use in the instrumentation of applications or runtimes.
+/// The proxy/stub library does not implement this function.
+XPTI_EXPORT_API void xptiReset();
+
+/// @brief Force sets internal state to trace enabled
+/// @details This method is currently ONLY used by the tests and is NOT
+/// recommended for use in the instrumentation of applications or runtimes.
+/// The proxy/stub library does not implement this function.
+XPTI_EXPORT_API void xptiForceSetTraceEnabled(bool yesOrNo);
+
+typedef xpti::result_t (*xpti_initialize_t)(const char *, uint32_t, uint32_t,
+                                            const char *);
+typedef void (*xpti_finalize_t)(const char *);
+typedef uint64_t (*xpti_get_unique_id_t)();
+typedef xpti::string_id_t (*xpti_register_string_t)(const char *, char **);
+typedef const char *(*xpti_lookup_string_t)(xpti::string_id_t);
+typedef uint8_t (*xpti_register_stream_t)(const char *);
+typedef xpti::result_t (*xpti_unregister_stream_t)(const char *);
+typedef uint16_t (*xpti_register_user_defined_tp_t)(const char *, uint8_t);
+typedef uint16_t (*xpti_register_user_defined_et_t)(const char *, uint8_t);
+typedef xpti::trace_event_data_t *(*xpti_make_event_t)(
+    const char *, xpti::payload_t *, uint16_t, xpti::trace_activity_type_t,
+    uint64_t *);
+typedef const xpti::trace_event_data_t *(*xpti_find_event_t)(int64_t);
+typedef const xpti::payload_t *(*xpti_query_payload_t)(
+    xpti::trace_event_data_t *);
+typedef xpti::result_t (*xpti_register_cb_t)(uint8_t, uint16_t,
+                                             xpti::tracepoint_callback_api_t);
+typedef xpti::result_t (*xpti_unregister_cb_t)(uint8_t, uint16_t,
+                                               xpti::tracepoint_callback_api_t);
+typedef xpti::result_t (*xpti_notify_subscribers_t)(
+    uint8_t, uint16_t, xpti::trace_event_data_t *, xpti::trace_event_data_t *,
+    uint64_t instance, const void *temporal_user_data);
+typedef xpti::result_t (*xpti_add_metadata_t)(xpti::trace_event_data_t *,
+                                              const char *, const char *);
+typedef xpti::metadata_t *(*xpti_query_metadata_t)(xpti::trace_event_data_t *);
+typedef bool (*xpti_trace_enabled_t)();
+}
diff --git a/xpti/include/xpti_trace_framework.hpp b/xpti/include/xpti_trace_framework.hpp
new file mode 100644
index 0000000000000..65a030791f711
--- /dev/null
+++ b/xpti/include/xpti_trace_framework.hpp
@@ -0,0 +1,321 @@
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+#pragma once
+#include <cstdint>
+#include <sstream>
+
+#include "xpti_data_types.h"
+#include "xpti_trace_framework.h"
+
+#if defined(_WIN32) || defined(_WIN64)
+#include <string>
+#include <strsafe.h>
+#include <windows.h>
+typedef HINSTANCE xpti_plugin_handle_t;
+typedef FARPROC xpti_plugin_function_t;
+#define XPTI_PLUGIN_STRING "*.dll"
+#define XPTI_PATH_SEPARATOR "\\"
+// Windows does not have PATH_MAX defined or is inconsistent; Documentation
+// suggests that 32767 is the max string length of environment variables on
+// Windows
+constexpr auto WIN_PATH_MAX = 32767;
+#else // Linux and MacOSX
+#include <cstdlib>
+#include <dlfcn.h>
+#include <limits.h>
+#include <string>
+
+typedef void *xpti_plugin_handle_t;
+typedef void *xpti_plugin_function_t;
+#define XPTI_PATH_SEPARATOR "/"
+
+#if defined(__unix__) // Linux
+#define XPTI_PLUGIN_STRING "*.so"
+#elif defined(__APPLE__) // Mac
+#define XPTI_PLUGIN_STRING "*.dylib"
+#endif
+#endif
+
+/// Insert something when compiled with msvc
+/// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#ifdef _MSC_VER
+#define __XPTI_INSERT_IF_MSVC(x) x
+#else
+#define __XPTI_INSERT_IF_MSVC(x)
+#endif
+
+namespace xpti {
+namespace utils {
+
+class StringHelper {
+public:
+  template <class T> std::string addressAsString(T address) {
+    std::stringstream ss;
+    ss << std::hex << address;
+    return ss.str();
+  }
+
+  template <class T>
+  std::string nameWithAddress(const char *prefix, T address) {
+    std::string coded_string;
+
+    if (prefix)
+      coded_string = prefix;
+    else
+      coded_string = "unknown";
+
+    coded_string += "[" + addressAsString<T>(address) + "]";
+    return coded_string;
+  }
+
+  template <class T>
+  std::string nameWithAddress(std::string &prefix, T address) {
+    std::string coded_string;
+    if (!prefix.empty())
+      coded_string = prefix + "[" + addressAsString<T>(address) + "]";
+    else
+      coded_string = "unknown[" + addressAsString<T>(address) + "]";
+
+    return coded_string;
+  }
+
+  std::string nameWithAddressString(const char *prefix, std::string &address) {
+    std::string coded_string;
+
+    if (prefix)
+      coded_string = prefix;
+    else
+      coded_string = "unknown";
+
+    coded_string += "[" + address + "]";
+    return coded_string;
+  }
+
+  std::string nameWithAddressString(const std::string &prefix,
+                                    std::string &address) {
+    std::string coded_string;
+    ;
+    if (!prefix.empty())
+      coded_string = prefix + "[" + address + "]";
+    else
+      coded_string = "unknown[" + address + "]";
+
+    return coded_string;
+  }
+};
+
+class PlatformHelper {
+public:
+  /// @brief Retrieves the last error and represents it as a std::string
+  /// @details This function is a platform independent abstraction for
+  /// retrieving the last error that was captured.
+  ///
+  /// @return <addr>        The last error logged in the system
+  ///
+  std::string getLastError() {
+    std::string error;
+#if defined(_WIN32) || defined(_WIN64)
+    DWORD err = GetLastError();
+    LPVOID msgBuff;
+    size_t size = FormatMessageA(
+        FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
+            FORMAT_MESSAGE_IGNORE_INSERTS,
+        NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&msgBuff,
+        0, NULL);
+
+    if (size) {
+      LPCSTR msgStr = (LPCSTR)msgBuff;
+      error = std::string(msgStr, msgStr + size);
+    }
+    LocalFree(msgBuff);
+#else
+    const char *err_string = dlerror();
+    if (err_string)
+      error = err_string;
+#endif
+    return error;
+  }
+
+  std::string getEnvironmentVariable(const std::string &var) {
+    // Previous code that used a secure API for getting the environment variable
+    // and was replaced with a C++11 std::getenv for simplicity. However, if
+    // environments require the use of the suggested API from Microsoft, the
+    // code segment below may be un-commented and current std::getenv() code
+    // block encapsulated in the #else clause.
+    //
+    // #if defined(_WIN32) || defined(_WIN64)
+    //     // Implementation that uses the secure versions of the API to get an
+    //     // environment variable.
+    //     char *valuePtr = nullptr;
+    //     size_t length;
+    //     errno_t error = _dupenv_s(&valuePtr, &length, var.c_str());
+    //     // Variable doesn't exist
+    //     if (error)
+    //       return "";
+    //     // If the variable exists, then get the value into a temporary copy
+    //     std::string retValue = valuePtr;
+    //     free(valuePtr);
+    //     return retValue;
+    // #else
+
+    // Remove irrelevant deprecation warning in this context in case /sdl option
+    // is used to provide more information on deprecated API
+    // https://docs.microsoft.com/en-us/cpp/c-runtime-library/security-features-in-the-crt
+    // https://docs.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks
+    __XPTI_INSERT_IF_MSVC(__pragma(warning(suppress : 4996)))
+    const char *val = std::getenv(var.c_str());
+    return val ? val : "";
+    // #endif
+  }
+
+  /// @brief Finds a function defined by symbol in a shared object or DLL
+  /// @details This function is a platform independent abstraction for finding
+  /// a symbol in a shared object of DLL. If successful, it will return a
+  /// valid function pointer.
+  ///
+  /// @param [in]  h        The plugin handle which is required to make this
+  ///                       query
+  /// @param [in] symbol    The symbol that needs to be searched within the
+  ///                       shared object or DLL represented by the handle h
+  ///
+  /// @return <addr>        If the query is successful, the function returns
+  ///                       a valid address
+  /// @return nullptr       If the query fails, nullptr is returned
+  ///
+  xpti_plugin_function_t findFunction(xpti_plugin_handle_t h,
+                                      const char *symbol) {
+    xpti_plugin_function_t func = nullptr;
+    if (h && symbol) {
+#if defined(_WIN32) || defined(_WIN64)
+      func = GetProcAddress(h, symbol);
+#else
+      func = dlsym(h, symbol);
+#endif
+    }
+    return func;
+  }
+
+  /// @brief Loads a shared object or DLL and returns a plugin handle
+  /// @details This function is a platform independent abstraction for loading
+  /// a DLL or shared object and returns a valid plugin handle if successful.
+  ///
+  /// @param [in]  h        The plugin handle which is required to make this
+  ///                       query
+  /// @param [in] symbol    The symbol that needs to be searched within the
+  ///                       shared object or DLL represented by the handle h
+  ///
+  /// @return <addr>        If the query is successful, the function returns
+  ///                       a valid address
+  /// @return nullptr       If the query fails, nullptr is returned
+  ///
+  xpti_plugin_handle_t loadLibrary(const char *path, std::string &error) {
+    xpti_plugin_handle_t handle = 0;
+#if defined(_WIN32) || defined(_WIN64)
+    handle = LoadLibraryA(path);
+    if (!handle) {
+      error = getLastError();
+    }
+#else
+    handle = dlopen(path, RTLD_LAZY);
+    if (!handle) {
+      error = getLastError();
+    }
+#endif
+    return handle;
+  }
+
+  xpti::result_t unloadLibrary(xpti_plugin_handle_t h) {
+    xpti::result_t fr = xpti::result_t::XPTI_RESULT_SUCCESS;
+#if defined(_WIN32) || defined(_WIN64)
+    if (!FreeLibrary(h)) {
+      //  Error occurred while unloading the share object
+      return xpti::result_t::XPTI_RESULT_FAIL;
+    }
+#else
+    if (dlclose(h)) {
+      //  Error occurred while unloading the share object
+      return xpti::result_t::XPTI_RESULT_FAIL;
+    }
+#endif
+    return fr;
+  }
+
+  /// @brief Checks is tracing has been enabled through XPTI_TRACE_ENABLE
+  /// variable
+  /// @details The environment variable XPTI_TRACE_ENABLE is checked to see if
+  /// it is set. If it is not set, tracing is assumed to be enabled. If set,
+  /// then "true" or "1" indicates enabled and "false" or "0" indicates
+  /// disabled.
+  /// @return bool    true if set to "true" or "1" and false if set to "false"
+  /// or "0"
+  bool checkTraceEnv() {
+    std::string env = getEnvironmentVariable("XPTI_TRACE_ENABLE");
+    if (env.empty()) {
+      return true;
+    }
+    if (env == "true" || env == "1")
+      return true;
+    if (env == "false" || env == "0")
+      return false;
+    // If someone sets the variable to garbage, then we consider it as disabled
+    return false;
+  }
+};
+} // namespace utils
+
+namespace framework {
+constexpr uint16_t signal = (uint16_t)xpti::trace_point_type_t::signal;
+constexpr uint16_t graph_create =
+    (uint16_t)xpti::trace_point_type_t::graph_create;
+constexpr uint16_t node_create =
+    (uint16_t)xpti::trace_point_type_t::node_create;
+constexpr uint16_t edge_create =
+    (uint16_t)xpti::trace_point_type_t::edge_create;
+
+class scoped_notify {
+public:
+  scoped_notify(const char *stream, uint16_t trace_type,
+                xpti::trace_event_data_t *parent,
+                xpti::trace_event_data_t *object, uint64_t instance,
+                const void *user_data = nullptr)
+      : m_object(object), m_parent(parent), m_stream_id(0),
+        m_trace_type(trace_type), m_user_data(user_data), m_instance(instance) {
+    if (xptiTraceEnabled() && object) {
+      uint16_t open = m_trace_type & 0xfffe;
+      m_stream_id = xptiRegisterStream(stream);
+      xptiNotifySubscribers(m_stream_id, open, parent, object, instance,
+                            m_user_data);
+    }
+  }
+
+  ~scoped_notify() {
+    if (xptiTraceEnabled() && m_object) {
+      switch (m_trace_type) {
+      case signal:
+      case graph_create:
+      case node_create:
+      case edge_create:
+        break;
+      default: {
+        uint16_t close = m_trace_type | 1;
+        xptiNotifySubscribers(m_stream_id, close, m_parent, m_object,
+                              m_instance, m_user_data);
+      } break;
+      }
+    }
+  }
+
+private:
+  xpti::trace_event_data_t *m_object, *m_parent;
+  uint8_t m_stream_id;
+  uint16_t m_trace_type;
+  const void *m_user_data;
+  uint64_t m_instance;
+};
+} // namespace framework
+} // namespace xpti
diff --git a/xpti/src/CMakeLists.txt b/xpti/src/CMakeLists.txt
new file mode 100644
index 0000000000000..d2d35c126dc76
--- /dev/null
+++ b/xpti/src/CMakeLists.txt
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 2.8.9)
+project (xpti)
+file(GLOB SOURCES *.cpp)
+include_directories(${XPTI_DIR}/include)
+add_definitions(-DXPTI_STATIC_LIBRARY)
+add_library(xpti STATIC ${SOURCES})
+
+# Handle the debug version for the Microsoft compiler as a special case by
+# creating a debug version of the static library that uses the flags used by
+# the SYCL runtime
+if (MSVC)
+  add_library(xptid STATIC ${SOURCES})
+  target_compile_options(xptid PRIVATE ${XPTI_CXX_FLAGS_DEBUG})
+  target_compile_options(xpti PRIVATE ${XPTI_CXX_FLAGS_RELEASE})
+endif()
+
+# Set the location of the library installation
+install(TARGETS xpti DESTINATION ${CMAKE_BINARY_DIR})
diff --git a/xpti/src/xpti_proxy.cpp b/xpti/src/xpti_proxy.cpp
new file mode 100644
index 0000000000000..df41f770e235a
--- /dev/null
+++ b/xpti/src/xpti_proxy.cpp
@@ -0,0 +1,329 @@
+//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+#include "xpti_trace_framework.hpp"
+
+#include <iostream>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+enum functions_t {
+  XPTI_INITIALIZE,
+  XPTI_FINALIZE,
+  XPTI_GET_UNIQUE_ID,
+  XPTI_REGISTER_STRING,
+  XPTI_LOOKUP_STRING,
+  XPTI_REGISTER_STREAM,
+  XPTI_UNREGISTER_STREAM,
+  XPTI_REGISTER_USER_DEFINED_TP,
+  XPTI_REGISTER_USER_DEFINED_ET,
+  XPTI_MAKE_EVENT,
+  XPTI_FIND_EVENT,
+  XPTI_QUERY_PAYLOAD,
+  XPTI_REGISTER_CALLBACK,
+  XPTI_UNREGISTER_CALLBACK,
+  XPTI_NOTIFY_SUBSCRIBERS,
+  XPTI_ADD_METADATA,
+  XPTI_QUERY_METADATA,
+  XPTI_TRACE_ENABLED,
+
+  // All additional functions need to appear before
+  // the XPTI_FW_API_COUNT enum
+  XPTI_FW_API_COUNT ///< This enum must always be the last one in the list
+};
+
+namespace xpti {
+class ProxyLoader {
+  std::unordered_map<int, const char *> m_function_names = {
+      {XPTI_INITIALIZE, "xptiInitialize"},
+      {XPTI_FINALIZE, "xptiFinalize"},
+      {XPTI_GET_UNIQUE_ID, "xptiGetUniqueId"},
+      {XPTI_REGISTER_STRING, "xptiRegisterString"},
+      {XPTI_LOOKUP_STRING, "xptiLookupString"},
+      {XPTI_REGISTER_STREAM, "xptiRegisterStream"},
+      {XPTI_UNREGISTER_STREAM, "xptiUnregisterStream"},
+      {XPTI_REGISTER_USER_DEFINED_TP, "xptiRegisterUserDefinedTracePoint"},
+      {XPTI_REGISTER_USER_DEFINED_ET, "xptiRegisterUserDefinedEventType"},
+      {XPTI_MAKE_EVENT, "xptiMakeEvent"},
+      {XPTI_FIND_EVENT, "xptiFindEvent"},
+      {XPTI_QUERY_PAYLOAD, "xptiQueryPayload"},
+      {XPTI_REGISTER_CALLBACK, "xptiRegisterCallback"},
+      {XPTI_UNREGISTER_CALLBACK, "xptiUnregisterCallback"},
+      {XPTI_NOTIFY_SUBSCRIBERS, "xptiNotifySubscribers"},
+      {XPTI_ADD_METADATA, "xptiAddMetadata"},
+      {XPTI_QUERY_METADATA, "xptiQueryMetadata"},
+      {XPTI_TRACE_ENABLED, "xptiTraceEnabled"}};
+
+public:
+  typedef std::vector<xpti_plugin_function_t> dispatch_table_t;
+
+  ProxyLoader() : m_loaded(false), m_fw_plugin_handle(nullptr) {
+    // When this object is created, we attempt to load
+    // the share object implementation. We look for the
+    // environment variable XPTI_FRAMEWORK_DISPATCHER to
+    // see if it has been set. If not, all methods in
+    // the proxy should end up being close to no-ops
+    //
+    std::string env =
+        m_loader.getEnvironmentVariable("XPTI_FRAMEWORK_DISPATCHER");
+    if (env.empty())
+      return;
+    std::string error;
+    m_fw_plugin_handle = m_loader.loadLibrary(env.c_str(), error);
+    if (m_fw_plugin_handle) {
+      // We will defer changing m_loaded = true until the
+      // end of this block after we are able to resolve
+      // all of the entry points
+      //
+      m_dispatch_table.resize(XPTI_FW_API_COUNT);
+      for (auto &func_name : m_function_names) {
+        xpti_plugin_function_t func =
+            m_loader.findFunction(m_fw_plugin_handle, func_name.second);
+        if (!func) { // Return if we fail on even one function
+          m_loader.unloadLibrary(m_fw_plugin_handle);
+          m_fw_plugin_handle = nullptr;
+          return;
+        }
+        m_dispatch_table[func_name.first] = func;
+      }
+      // Only if all the functions are found and loaded,
+      // do we set the m_loaded = true
+      //
+      m_loaded = true;
+    }
+  }
+
+  ~ProxyLoader() {
+    // If the loading of the framework library was
+    // successful, we should close the handle in the
+    // destructor to decrement the reference count
+    // maintained by the loader.
+    //
+    if (m_fw_plugin_handle) {
+      m_loader.unloadLibrary(m_fw_plugin_handle);
+    }
+  }
+
+  inline bool noErrors() { return m_loaded; }
+
+  void *functionByIndex(int index) {
+    if (index >= XPTI_INITIALIZE && index < XPTI_FW_API_COUNT) {
+      return m_dispatch_table[index];
+    }
+    return nullptr;
+  }
+
+private:
+  bool m_loaded;
+  xpti_plugin_handle_t m_fw_plugin_handle;
+  dispatch_table_t m_dispatch_table;
+  xpti::utils::PlatformHelper m_loader;
+};
+
+static ProxyLoader g_loader;
+} // namespace xpti
+
+XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedTracePoint(
+    const char *tool_name, uint8_t user_defined_tp) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_USER_DEFINED_TP);
+    if (f) {
+      return (*(xpti_register_user_defined_tp_t)f)(tool_name, user_defined_tp);
+    }
+  }
+  return xpti::invalid_id;
+}
+
+XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedEventType(
+    const char *tool_name, uint8_t user_defined_event) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_USER_DEFINED_ET);
+    if (f) {
+      return (*(xpti_register_user_defined_et_t)f)(tool_name,
+                                                   user_defined_event);
+    }
+  }
+  return xpti::invalid_id;
+}
+
+XPTI_EXPORT_API xpti::result_t xptiInitialize(const char *stream, uint32_t maj,
+                                              uint32_t min,
+                                              const char *version) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_INITIALIZE);
+    if (f) {
+      return (*(xpti_initialize_t)f)(stream, maj, min, version);
+    }
+  }
+  return xpti::result_t::XPTI_RESULT_FAIL;
+}
+
+XPTI_EXPORT_API void xptiFinalize(const char *stream) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_FINALIZE);
+    if (f) {
+      (*(xpti_finalize_t)f)(stream);
+    }
+  }
+}
+
+XPTI_EXPORT_API uint64_t xptiGetUniqueId() {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_GET_UNIQUE_ID);
+    if (f) {
+      return (*(xpti_get_unique_id_t)f)();
+    }
+  }
+  return xpti::invalid_id;
+}
+
+XPTI_EXPORT_API xpti::string_id_t xptiRegisterString(const char *string,
+                                                     char **table_string) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_STRING);
+    if (f) {
+      return (*(xpti_register_string_t)f)(string, table_string);
+    }
+  }
+  return xpti::invalid_id;
+}
+
+XPTI_EXPORT_API const char *xptiLookupString(xpti::string_id_t id) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_LOOKUP_STRING);
+    if (f) {
+      return (*(xpti_lookup_string_t)f)(id);
+    }
+  }
+  return nullptr;
+}
+
+XPTI_EXPORT_API uint8_t xptiRegisterStream(const char *stream_name) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_STREAM);
+    if (f) {
+      return (*(xpti_register_stream_t)f)(stream_name);
+    }
+  }
+  return xpti::invalid_id;
+}
+
+XPTI_EXPORT_API xpti::result_t xptiUnregisterStream(const char *stream_name) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_UNREGISTER_STREAM);
+    if (f) {
+      return (*(xpti_unregister_stream_t)f)(stream_name);
+    }
+  }
+  return xpti::result_t::XPTI_RESULT_FAIL;
+}
+XPTI_EXPORT_API xpti::trace_event_data_t *
+xptiMakeEvent(const char *name, xpti::payload_t *payload, uint16_t event,
+              xpti::trace_activity_type_t activity, uint64_t *instance_no) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_MAKE_EVENT);
+    if (f) {
+      return (*(xpti_make_event_t)f)(name, payload, event, activity,
+                                     instance_no);
+    }
+  }
+  return nullptr;
+}
+
+XPTI_EXPORT_API const xpti::trace_event_data_t *xptiFindEvent(int64_t uid) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_FIND_EVENT);
+    if (f) {
+      return (*(xpti_find_event_t)f)(uid);
+    }
+  }
+  return nullptr;
+}
+
+XPTI_EXPORT_API const xpti::payload_t *
+xptiQueryPayload(xpti::trace_event_data_t *lookup_object) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_QUERY_PAYLOAD);
+    if (f) {
+      return (*(xpti_query_payload_t)f)(lookup_object);
+    }
+  }
+  return nullptr;
+}
+
+XPTI_EXPORT_API xpti::result_t
+xptiRegisterCallback(uint8_t stream_id, uint16_t trace_type,
+                     xpti::tracepoint_callback_api_t cb) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_CALLBACK);
+    if (f) {
+      return (*(xpti_register_cb_t)f)(stream_id, trace_type, cb);
+    }
+  }
+  return xpti::result_t::XPTI_RESULT_FAIL;
+}
+
+XPTI_EXPORT_API xpti::result_t
+xptiUnregisterCallback(uint8_t stream_id, uint16_t trace_type,
+                       xpti::tracepoint_callback_api_t cb) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_UNREGISTER_CALLBACK);
+    if (f) {
+      return (*(xpti_unregister_cb_t)f)(stream_id, trace_type, cb);
+    }
+  }
+  return xpti::result_t::XPTI_RESULT_FAIL;
+}
+
+XPTI_EXPORT_API xpti::result_t
+xptiNotifySubscribers(uint8_t stream_id, uint16_t trace_type,
+                      xpti::trace_event_data_t *parent,
+                      xpti::trace_event_data_t *object, uint64_t instance,
+                      const void *user_data) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_NOTIFY_SUBSCRIBERS);
+    if (f) {
+      return (*(xpti_notify_subscribers_t)f)(stream_id, trace_type, parent,
+                                             object, instance, user_data);
+    }
+  }
+  return xpti::result_t::XPTI_RESULT_FAIL;
+}
+
+XPTI_EXPORT_API bool xptiTraceEnabled() {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_TRACE_ENABLED);
+    if (f) {
+      return (*(xpti_trace_enabled_t)f)();
+    }
+  }
+  return false;
+}
+
+XPTI_EXPORT_API xpti::result_t xptiAddMetadata(xpti::trace_event_data_t *e,
+                                               const char *key,
+                                               const char *value) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_ADD_METADATA);
+    if (f) {
+      return (*(xpti_add_metadata_t)f)(e, key, value);
+    }
+  }
+  return xpti::result_t::XPTI_RESULT_FAIL;
+}
+
+XPTI_EXPORT_API xpti::metadata_t *
+xptiQueryMetadata(xpti::trace_event_data_t *lookup_object) {
+  if (xpti::g_loader.noErrors()) {
+    auto f = xpti::g_loader.functionByIndex(XPTI_QUERY_METADATA);
+    if (f) {
+      return (*(xpti_query_metadata_t)f)(lookup_object);
+    }
+  }
+  return nullptr;
+}