From 8d7eed9d77f766d377c0e89bf8f8062d2aafe9a4 Mon Sep 17 00:00:00 2001 From: Vasanth Tovinkere Date: Tue, 10 Mar 2020 07:23:25 -0700 Subject: [PATCH] [SYCL][XPTI] Instrumentation of SYCL runtime with XPTI + XPTI proxy library to provide entry points into the xpti instrumentation framework + Header specification of the XPTI framework API calls + Documentation describing the changes to SYCL runtime available under xpti/doc + SYCL runtime instrumentation using XPTI proxy library to monitor the creation of the asynchronous task graph (nodes and dependencies) + Instrumentation of entry points in queue to capture the end-user source code locations of calls to submit, parallel_for etc. + Updates to the CMakeLists.txt to soft enable the XPTI instrumentation and linking of the SYCL library with the XPTI proxy/stub library + Updates to the CI scripts to include XPTI proxy library in building along with enabling of the instrumentation in the SYCL library Signed-off-by: Vasanth Tovinkere --- buildbot/configure.py | 7 +- sycl/CMakeLists.txt | 11 + sycl/include/CL/sycl/detail/cg.hpp | 59 +- sycl/include/CL/sycl/detail/common.hpp | 68 +- sycl/include/CL/sycl/detail/pi.hpp | 7 + sycl/include/CL/sycl/handler.hpp | 5 +- sycl/include/CL/sycl/ordered_queue.hpp | 146 +++- sycl/include/CL/sycl/queue.hpp | 356 ++++++-- sycl/source/CMakeLists.txt | 45 +- sycl/source/detail/event_impl.cpp | 70 ++ sycl/source/detail/event_impl.hpp | 20 +- sycl/source/detail/pi.cpp | 67 ++ sycl/source/detail/queue_impl.cpp | 99 +++ sycl/source/detail/queue_impl.hpp | 42 +- sycl/source/detail/scheduler/commands.cpp | 761 +++++++++++++++++- sycl/source/detail/scheduler/commands.hpp | 120 ++- sycl/source/detail/scheduler/scheduler.cpp | 16 + sycl/source/handler.cpp | 25 +- sycl/source/ordered_queue.cpp | 22 +- sycl/source/queue.cpp | 22 +- sycl/unittests/scheduler/BlockedCommands.cpp | 2 + .../scheduler/SchedulerTestUtils.hpp | 1 + xpti/CMakeLists.txt | 55 ++ xpti/README.md | 22 + xpti/doc/SYCL_Tracing_Implementation.md | 236 ++++++ xpti/include/xpti_data_types.h | 543 +++++++++++++ xpti/include/xpti_trace_framework.h | 414 ++++++++++ xpti/include/xpti_trace_framework.hpp | 321 ++++++++ xpti/src/CMakeLists.txt | 18 + xpti/src/xpti_proxy.cpp | 329 ++++++++ 30 files changed, 3650 insertions(+), 259 deletions(-) create mode 100644 xpti/CMakeLists.txt create mode 100644 xpti/README.md create mode 100644 xpti/doc/SYCL_Tracing_Implementation.md create mode 100644 xpti/include/xpti_data_types.h create mode 100644 xpti/include/xpti_trace_framework.h create mode 100644 xpti/include/xpti_trace_framework.hpp create mode 100644 xpti/src/CMakeLists.txt create mode 100644 xpti/src/xpti_proxy.cpp diff --git a/buildbot/configure.py b/buildbot/configure.py index c0e4dc3eb6d31..3d6c95393cae2 100644 --- a/buildbot/configure.py +++ b/buildbot/configure.py @@ -10,10 +10,11 @@ def do_configure(args): llvm_dir = os.path.join(args.src_dir, "llvm") sycl_dir = os.path.join(args.src_dir, "sycl") spirv_dir = os.path.join(args.src_dir, "llvm-spirv") + xpti_dir = os.path.join(args.src_dir, "xpti") ocl_header_dir = os.path.join(args.obj_dir, "OpenCL-Headers") icd_loader_lib = os.path.join(args.obj_dir, "OpenCL-ICD-Loader", "build") llvm_targets_to_build = 'X86' - llvm_enable_projects = 'clang;llvm-spirv;sycl;opencl-aot' + llvm_enable_projects = 'clang;llvm-spirv;sycl;opencl-aot;xpti' libclc_targets_to_build = '' sycl_build_pi_cuda = 'OFF' llvm_enable_assertions = 'ON' @@ -44,9 +45,10 @@ def do_configure(args): "-DCMAKE_BUILD_TYPE={}".format(args.build_type), "-DLLVM_ENABLE_ASSERTIONS={}".format(llvm_enable_assertions), "-DLLVM_TARGETS_TO_BUILD={}".format(llvm_targets_to_build), - "-DLLVM_EXTERNAL_PROJECTS=sycl;llvm-spirv;opencl-aot", + "-DLLVM_EXTERNAL_PROJECTS=sycl;llvm-spirv;opencl-aot;xpti", "-DLLVM_EXTERNAL_SYCL_SOURCE_DIR={}".format(sycl_dir), "-DLLVM_EXTERNAL_LLVM_SPIRV_SOURCE_DIR={}".format(spirv_dir), + "-DLLVM_EXTERNAL_XPTI_SOURCE_DIR={}".format(xpti_dir), "-DLLVM_ENABLE_PROJECTS={}".format(llvm_enable_projects), "-DLIBCLC_TARGETS_TO_BUILD={}".format(libclc_targets_to_build), "-DOpenCL_INCLUDE_DIR={}".format(ocl_header_dir), @@ -57,6 +59,7 @@ def do_configure(args): "-DCMAKE_INSTALL_PREFIX={}".format(install_dir), "-DSYCL_INCLUDE_TESTS=ON", # Explicitly include all kinds of SYCL tests. "-DLLVM_ENABLE_DOXYGEN={}".format(llvm_enable_doxygen), + "-DSYCL_ENABLE_XPTI_TRACING=ON", # Explicitly turn on XPTI tracing llvm_dir ] diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index 8b2883ceca745..85c67129fe4e2 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -26,6 +26,10 @@ if(SYCL_ENABLE_WERROR) endif() endif() +# Create a soft option for enabling or disabling the instrumentation +# of the SYCL runtime and expect enabling +option(SYCL_ENABLE_XPTI_TRACING "Enable tracing of SYCL constructs" OFF) + if(MSVC) set_property(GLOBAL PROPERTY USE_FOLDERS ON) # Skip asynchronous C++ exceptions catching and assume "extern C" functions @@ -218,6 +222,13 @@ add_custom_target( sycl-toolchain COMMENT "Building SYCL compiler toolchain..." ) +if (SYCL_ENABLE_XPTI_TRACING) + add_dependencies( sycl-toolchain xpti) + if (MSVC) + add_dependencies( sycl-toolchain xptid) + endif() +endif() + if (NOT DEFINED LLVM_INCLUDE_TESTS) set(LLVM_INCLUDE_TESTS ON) endif() diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp index e4d0a0d30967c..a87daa3e8e154 100644 --- a/sycl/include/CL/sycl/detail/cg.hpp +++ b/sycl/include/CL/sycl/detail/cg.hpp @@ -368,11 +368,21 @@ class CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, detail::code_location loc = {}) : MType(Type), MArgsStorage(std::move(ArgsStorage)), MAccStorage(std::move(AccStorage)), MSharedPtrStorage(std::move(SharedPtrStorage)), - MRequirements(std::move(Requirements)), MEvents(std::move(Events)) {} + MRequirements(std::move(Requirements)), MEvents(std::move(Events)) { + // Capture the user code-location from Q.submit(), Q.parallel_for() + // etc for later use; if code location information is not available, + // the file name and function name members will be empty strings + if (loc.functionName()) + MFunctionName = loc.functionName(); + if (loc.fileName()) + MFileName = loc.fileName(); + MLine = loc.lineNumber(); + MColumn = loc.columnNumber(); + } CG(CG &&CommandGroup) = default; @@ -397,6 +407,12 @@ class CG { vector_class MRequirements; // List of events that order the execution of this CG vector_class MEvents; + // Member variables to capture the user code-location + // information from Q.submit(), Q.parallel_for() etc + // Storage for function name and source file name + string_class MFunctionName, MFileName; + // Storage for line and column of code location + int32_t MLine, MColumn; }; // The class which represents "execute kernel" command group. @@ -420,10 +436,10 @@ class CGExecKernel : public CG { vector_class Args, string_class KernelName, detail::OSModuleHandle OSModuleHandle, vector_class> Streams, - CGTYPE Type) + CGTYPE Type, detail::code_location loc = {}) : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle), @@ -450,10 +466,11 @@ class CGCopy : public CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, + detail::code_location loc = {}) : CG(CopyType, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MSrc(Src), MDst(Dst) {} void *getSrc() { return MSrc; } void *getDst() { return MDst; } @@ -470,10 +487,11 @@ class CGFill : public CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, + detail::code_location loc = {}) : CG(FILL, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MPattern(std::move(Pattern)), MPtr((Requirement *)Ptr) {} Requirement *getReqToFill() { return MPtr; } }; @@ -487,10 +505,11 @@ class CGUpdateHost : public CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, + detail::code_location loc = {}) : CG(UPDATE_HOST, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MPtr((Requirement *)Ptr) {} Requirement *getReqToUpdate() { return MPtr; } @@ -508,10 +527,11 @@ class CGCopyUSM : public CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, + detail::code_location loc = {}) : CG(COPY_USM, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MSrc(Src), MDst(Dst), MLength(Length) {} void *getSrc() { return MSrc; } @@ -531,10 +551,11 @@ class CGFillUSM : public CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, + detail::code_location loc = {}) : CG(FILL_USM, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MPattern(std::move(Pattern)), MDst(DstPtr), MLength(Length) {} void *getDst() { return MDst; } size_t getLength() { return MLength; } @@ -552,10 +573,11 @@ class CGPrefetchUSM : public CG { vector_class AccStorage, vector_class> SharedPtrStorage, vector_class Requirements, - vector_class Events) + vector_class Events, + detail::code_location loc = {}) : CG(PREFETCH_USM, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MDst(DstPtr), MLength(Length) {} void *getDst() { return MDst; } size_t getLength() { return MLength; } @@ -570,10 +592,11 @@ class CGInteropTask : public CG { std::vector AccStorage, std::vector> SharedPtrStorage, std::vector Requirements, - std::vector Events, CGTYPE Type) + std::vector Events, CGTYPE Type, + detail::code_location loc = {}) : CG(Type, std::move(ArgsStorage), std::move(AccStorage), std::move(SharedPtrStorage), std::move(Requirements), - std::move(Events)), + std::move(Events), std::move(loc)), MInteropTask(std::move(InteropTask)) {} }; diff --git a/sycl/include/CL/sycl/detail/common.hpp b/sycl/include/CL/sycl/detail/common.hpp index 2dfe8ef71dc74..141d1fffdd7b8 100644 --- a/sycl/include/CL/sycl/detail/common.hpp +++ b/sycl/include/CL/sycl/detail/common.hpp @@ -16,24 +16,84 @@ #include #include #include + +#include #include #include #define STRINGIFY_LINE_HELP(s) #s #define STRINGIFY_LINE(s) STRINGIFY_LINE_HELP(s) +// Default signature enables the passing of user code location information to +// public methods as a default argument. If the end-user wants to disable the +// code location information, they must compile the code with +// -DDISABLE_SYCL_INSTRUMENTATION_METADATA flag +__SYCL_INLINE_NAMESPACE(cl) { +namespace sycl { +namespace detail { +// We define a sycl stream name and this will +// be used by the instrumentation framework +constexpr const char *SYCL_STREAM_NAME = "sycl"; +// Data structure that captures the user code +// location information using the builtin capabilities +// of the compiler +struct code_location { +#ifdef _MSC_VER + // Since MSVC does not support the required builtins, we + // implement the version with "unknown"s which is handled + // correctly by the instrumentation + static constexpr code_location current(const char *fileName = nullptr, + const char *funcName = nullptr, + unsigned long lineNo = 0, + unsigned long columnNo = 0) noexcept { + return code_location(fileName, funcName, lineNo, columnNo); + } +#else + static constexpr code_location + current(const char *fileName = __builtin_FILE(), + const char *funcName = __builtin_FUNCTION(), + unsigned long lineNo = __builtin_LINE(), + unsigned long columnNo = 0) noexcept { + return code_location(fileName, funcName, lineNo, columnNo); + } +#endif + + constexpr code_location(const char *file, const char *func, int line, + int col) noexcept + : MFileName(file), MFunctionName(func), MLineNo(line), MColumnNo(col) {} + + constexpr code_location() noexcept + : MFileName(nullptr), MFunctionName(nullptr), MLineNo(0), MColumnNo(0) {} + + constexpr unsigned long lineNumber() const noexcept { return MLineNo; } + constexpr unsigned long columnNumber() const noexcept { return MColumnNo; } + constexpr const char *fileName() const noexcept { return MFileName; } + constexpr const char *functionName() const noexcept { return MFunctionName; } + +private: + const char *MFileName; + const char *MFunctionName; + unsigned long MLineNo; + unsigned long MColumnNo; +}; +} // namespace detail +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { const char *stringifyErrorCode(cl_int error); -static inline std::string codeToString(cl_int code){ - return std::string(std::to_string(code) + " (" + - stringifyErrorCode(code) + ")"); +static inline std::string codeToString(cl_int code) { + return std::string(std::to_string(code) + " (" + stringifyErrorCode(code) + + ")"); } -}}} // __SYCL_INLINE_NAMESPACE(cl)::sycl::detail +} // namespace detail +} // namespace sycl +} // __SYCL_INLINE_NAMESPACE(cl) #ifdef __SYCL_DEVICE_ONLY__ // TODO remove this when 'assert' is supported in device code diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index bea05328c81b3..a8105d2cf4acd 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -18,6 +18,13 @@ #include #include +#ifdef XPTI_ENABLE_INSTRUMENTATION +// Forward declarations +namespace xpti { +struct trace_event_data_t; +} +#endif + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 1678db2dbd80c..c64dbd4b25fa4 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -195,8 +195,9 @@ class handler { /// It's expected that the method is the latest method executed before /// object destruction. /// - /// \return a SYCL event object representing the command group. - event finalize(); + /// \param Payload contains the code location of user code + /// \return a SYCL event object representing the command group + event finalize(const cl::sycl::detail::code_location &Payload = {}); /// Saves streams associated with this handler. /// diff --git a/sycl/include/CL/sycl/ordered_queue.hpp b/sycl/include/CL/sycl/ordered_queue.hpp index 0914254631d76..240d780645e8b 100644 --- a/sycl/include/CL/sycl/ordered_queue.hpp +++ b/sycl/include/CL/sycl/ordered_queue.hpp @@ -88,15 +88,62 @@ class __SYCL_DEPRECATED__ ordered_queue { template typename info::param_traits::return_type get_info() const; - template event submit(T cgf) { return submit_impl(cgf); } + /// @param Loc is the code location of the submit call (default argument) + template + event + submit(T cgf +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit_impl(cgf, CodeLoc); + } + + template + event + submit(T cgf, ordered_queue &secondaryQueue +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit_impl(cgf, secondaryQueue, CodeLoc); + } - template event submit(T cgf, ordered_queue &secondaryQueue) { - return submit_impl(cgf, secondaryQueue); + /// @param CodeLoc is the code location of the submit call (default argument) + void wait( +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + wait_proxy(CodeLoc); } - void wait(); + /// @param CodeLoc is the code location of the submit call (default argument) + void wait_and_throw( +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + wait_and_throw_proxy(CodeLoc); + } + + void wait_proxy(const detail::code_location &CodeLoc); - void wait_and_throw(); + void wait_and_throw_proxy(const detail::code_location &CodeLoc); void throw_asynchronous(); @@ -114,45 +161,88 @@ class __SYCL_DEPRECATED__ ordered_queue { // single_task version with a kernel represented as a lambda. template - void single_task(KernelType KernelFunc) { - submit([&](handler &cgh) { - cgh.template single_task(KernelFunc); - }); + void single_task( + KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + submit( + [&](handler &cgh) { + cgh.template single_task(KernelFunc); + }, + CodeLoc); } // parallel_for version with a kernel represented as a lambda + range that // specifies global size only. template - void parallel_for(range NumWorkItems, KernelType KernelFunc) { + void parallel_for( + range NumWorkItems, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif // By-value or By-reference for this? - submit([&](handler &cgh) { - cgh.template parallel_for(NumWorkItems, - KernelFunc); - }); + submit( + [&](handler &cgh) { + cgh.template parallel_for(NumWorkItems, + KernelFunc); + }, + CodeLoc); } // parallel_for version with a kernel represented as a lambda + range and // offset that specify global size and global offset correspondingly. template - void parallel_for(range NumWorkItems, id WorkItemOffset, - KernelType KernelFunc) { - submit([&](handler &cgh) { - cgh.template parallel_for( - NumWorkItems, WorkItemOffset, KernelFunc); - }); + void parallel_for( + range NumWorkItems, id WorkItemOffset, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + submit( + [&](handler &cgh) { + cgh.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + CodeLoc); } // parallel_for version with a kernel represented as a lambda + nd_range that // specifies global, local sizes and offset. template - void parallel_for(nd_range ExecutionRange, KernelType KernelFunc) { - submit([&](handler &cgh) { - cgh.template parallel_for(ExecutionRange, - KernelFunc); - }); + void parallel_for( + nd_range ExecutionRange, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + submit( + [&](handler &cgh) { + cgh.template parallel_for( + ExecutionRange, KernelFunc); + }, + CodeLoc); } private: @@ -160,9 +250,11 @@ class __SYCL_DEPRECATED__ ordered_queue { template friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); - event submit_impl(function_class CGH); event submit_impl(function_class CGH, - ordered_queue &secondQueue); + const detail::code_location &CodeLoc); + event submit_impl(function_class CGH, + ordered_queue &secondQueue, + const detail::code_location &CodeLoc); }; #undef __SYCL_DEPRECATED__ diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp index 8a5cd41e25b2a..c315ccff3e960 100644 --- a/sycl/include/CL/sycl/queue.hpp +++ b/sycl/include/CL/sycl/queue.hpp @@ -167,8 +167,21 @@ class queue { /// scheduled for execution on the device. /// /// \param CGF is a function object containing command group. + /// \param CodeLoc is the code location of the submit call (default argument) /// \return a SYCL event object for the submitted command group. - template event submit(T CGF) { return submit_impl(CGF); } + template + event + submit(T CGF +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit_impl(CGF, CodeLoc); + } /// Submits a command group function object to the queue, in order to be /// scheduled for execution on the device. @@ -178,17 +191,38 @@ class queue { /// /// \param CGF is a function object containing command group. /// \param SecondaryQueue is a fallback SYCL queue. + /// \param CodeLoc is the code location of the submit call (default argument) /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. - template event submit(T CGF, queue &SecondaryQueue) { - return submit_impl(CGF, SecondaryQueue); + template + event + submit(T CGF, queue &SecondaryQueue +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit_impl(CGF, SecondaryQueue, CodeLoc); } /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// /// Synchronous errors will be reported through SYCL exceptions. - void wait(); + /// @param CodeLoc is the code location of the submit call (default argument) + void wait( +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + wait_proxy(CodeLoc); + } /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. @@ -197,7 +231,24 @@ class queue { /// errors will be passed to the async_handler passed to the queue on /// construction. If no async_handler was provided then asynchronous /// exceptions will be lost. - void wait_and_throw(); + /// @param CodeLoc is the code location of the submit call (default argument) + void wait_and_throw( +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + wait_and_throw_proxy(CodeLoc); + } + + /// Proxy method for wait to forward the code location information to the + /// implementation + void wait_proxy(const detail::code_location &CodeLoc); + /// Proxy method for wait_and_throw to forward the code location information + /// to the implementation + void wait_and_throw_proxy(const detail::code_location &CodeLoc); /// Checks if any asynchronous errors have been produced by the queue and if /// so reports them to the async_handler passed on the queue construction. @@ -254,23 +305,47 @@ class queue { /// single_task version with a kernel represented as a lambda. /// /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event single_task(KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.template single_task(KernelFunc); - }); + event single_task( + KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.template single_task(KernelFunc); + }, + CodeLoc); } /// single_task version with a kernel represented as a lambda. /// /// \param DepEvent is an event that specifies the kernel dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event single_task(event DepEvent, KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template single_task(KernelFunc); - }); + event single_task( + event DepEvent, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template single_task(KernelFunc); + }, + CodeLoc); } /// single_task version with a kernel represented as a lambda. @@ -278,13 +353,24 @@ class queue { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event single_task(const vector_class &DepEvents, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template single_task(KernelFunc); - }); + event single_task( + const vector_class &DepEvents, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template single_task(KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range that @@ -292,13 +378,25 @@ class queue { /// /// \param NumWorkItems is a range that specifies the work space of the kernel /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(range NumWorkItems, KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.template parallel_for(NumWorkItems, - KernelFunc); - }); + event parallel_for( + range NumWorkItems, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.template parallel_for(NumWorkItems, + KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range that @@ -307,15 +405,26 @@ class queue { /// \param NumWorkItems is a range that specifies the work space of the kernel /// \param DepEvent is an event that specifies the kernel dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(range NumWorkItems, event DepEvent, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for(NumWorkItems, - KernelFunc); - }); + event parallel_for( + range NumWorkItems, event DepEvent, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for(NumWorkItems, + KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range that @@ -325,16 +434,27 @@ class queue { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(range NumWorkItems, - const vector_class &DepEvents, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for(NumWorkItems, - KernelFunc); - }); + event parallel_for( + range NumWorkItems, const vector_class &DepEvents, + KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for(NumWorkItems, + KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range and @@ -343,14 +463,25 @@ class queue { /// \param NumWorkItems is a range that specifies the work space of the kernel /// \param WorkItemOffset specifies the offset for each work item id /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(range NumWorkItems, id WorkItemOffset, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.template parallel_for( - NumWorkItems, WorkItemOffset, KernelFunc); - }); + event parallel_for( + range NumWorkItems, id WorkItemOffset, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range and @@ -360,15 +491,27 @@ class queue { /// \param WorkItemOffset specifies the offset for each work item id /// \param DepEvent is an event that specifies the kernel dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(range NumWorkItems, id WorkItemOffset, - event DepEvent, KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for( - NumWorkItems, WorkItemOffset, KernelFunc); - }); + event parallel_for( + range NumWorkItems, id WorkItemOffset, event DepEvent, + KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + range and @@ -379,16 +522,27 @@ class queue { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(range NumWorkItems, id WorkItemOffset, - const vector_class &DepEvents, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for( - NumWorkItems, WorkItemOffset, KernelFunc); - }); + event parallel_for( + range NumWorkItems, id WorkItemOffset, + const vector_class &DepEvents, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for( + NumWorkItems, WorkItemOffset, KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -397,13 +551,25 @@ class queue { /// \param ExecutionRange is a range that specifies the work space of the /// kernel /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(nd_range ExecutionRange, KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.template parallel_for(ExecutionRange, - KernelFunc); - }); + event parallel_for( + nd_range ExecutionRange, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.template parallel_for( + ExecutionRange, KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -413,15 +579,26 @@ class queue { /// kernel /// \param DepEvent is an event that specifies the kernel dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(nd_range ExecutionRange, event DepEvent, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.template parallel_for(ExecutionRange, - KernelFunc); - }); + event parallel_for( + nd_range ExecutionRange, event DepEvent, KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvent); + CGH.template parallel_for( + ExecutionRange, KernelFunc); + }, + CodeLoc); } /// parallel_for version with a kernel represented as a lambda + nd_range that @@ -432,16 +609,27 @@ class queue { /// \param DepEvents is a vector of events that specifies the kernel /// dependencies /// \param KernelFunc is the Kernel functor or lambda + /// \param CodeLoc contains the code location of user code template - event parallel_for(nd_range ExecutionRange, - const vector_class &DepEvents, - KernelType KernelFunc) { - return submit([&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.template parallel_for(ExecutionRange, - KernelFunc); - }); + event parallel_for( + nd_range ExecutionRange, const vector_class &DepEvents, + KernelType KernelFunc +#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA + , + const detail::code_location &CodeLoc = detail::code_location::current() +#endif + ) { +#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA + const detail::code_location &CodeLoc = {}; +#endif + return submit( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + CGH.template parallel_for( + ExecutionRange, KernelFunc); + }, + CodeLoc); } /// Returns whether the queue is in order or OoO @@ -455,9 +643,11 @@ class queue { friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); /// A template-free version of submit. - event submit_impl(function_class CGH); + event submit_impl(function_class CGH, + const detail::code_location &CodeLoc); /// A template-free version of submit. - event submit_impl(function_class CGH, queue secondQueue); + event submit_impl(function_class CGH, queue secondQueue, + const detail::code_location &CodeLoc); }; } // namespace sycl diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 6c2243fec3ed3..2cc51cd280fce 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -4,9 +4,19 @@ #cmake_policy(SET CMP0057 NEW) #include(AddLLVM) +if (SYCL_ENABLE_XPTI_TRACING) + if (NOT EXISTS ${LLVM_EXTERNAL_XPTI_SOURCE_DIR}) + message (FATAL_ERROR "Undefined LLVM_EXTERNAL_XPTI_SOURCE_DIR variable: Must be set when XPTI tracing is set to ON") + endif() + include_directories(${LLVM_EXTERNAL_XPTI_SOURCE_DIR}/include) +endif() + function(add_sycl_rt_library LIB_NAME) + # Add an optional argument so we can get the library name to + # link with for Windows Debug version + cmake_parse_arguments(ARG "" "XPTI_LIB" "" ${ARGN}) - add_library(${LIB_NAME} SHARED ${ARGN}) + add_library(${LIB_NAME} SHARED ${ARG_UNPARSED_ARGUMENTS}) #To-Do: Remove dependency on icd loader and opencl headers. add_dependencies(${LIB_NAME} @@ -17,14 +27,22 @@ function(add_sycl_rt_library LIB_NAME) set_target_properties(${LIB_NAME} PROPERTIES LINKER_LANGUAGE CXX) + if (SYCL_ENABLE_XPTI_TRACING) + target_compile_definitions(${LIB_NAME} PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY) + target_link_libraries(${LIB_NAME} PRIVATE ${ARG_XPTI_LIB}) + endif() + if (MSVC) - target_compile_definitions(${LIB_NAME} PRIVATE __SYCL_BUILD_SYCL_DLL ) - target_link_libraries(${LIB_NAME} PRIVATE shlwapi) + target_compile_definitions(${LIB_NAME} PRIVATE __SYCL_BUILD_SYCL_DLL ) + target_link_libraries(${LIB_NAME} PRIVATE shlwapi) else() - set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt") - target_link_libraries( - ${LIB_NAME} PRIVATE "-Wl,--version-script=${linker_script}") - set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${linker_script}) + set(linker_script "${CMAKE_CURRENT_SOURCE_DIR}/ld-version-script.txt") + target_link_libraries( + ${LIB_NAME} PRIVATE "-Wl,--version-script=${linker_script}") + set_target_properties(${LIB_NAME} PROPERTIES LINK_DEPENDS ${linker_script}) + if (SYCL_ENABLE_XPTI_TRACING) + target_link_libraries(${LIB_NAME} PRIVATE dl) + endif() endif() target_include_directories( @@ -105,7 +123,11 @@ set(SYCL_SOURCES "$<$,$>:detail/posix_pi.cpp>" ) -add_sycl_rt_library(sycl ${SYCL_SOURCES}) +if (SYCL_ENABLE_XPTI_TRACING) + add_sycl_rt_library(sycl ${SYCL_SOURCES} XPTI_LIB xpti) +else() + add_sycl_rt_library(sycl ${SYCL_SOURCES}) +endif() if (MSVC) # MSVC provides two incompatible build variants for its CRT: release and debug @@ -143,8 +165,13 @@ if (MSVC) target_compile_options(sycl PRIVATE ${SYCL_CXX_FLAGS_RELEASE}) - add_sycl_rt_library(sycld ${SYCL_SOURCES}) + if (SYCL_ENABLE_XPTI_TRACING) + add_sycl_rt_library(sycld ${SYCL_SOURCES} XPTI_LIB xptid) + else() + add_sycl_rt_library(sycld ${SYCL_SOURCES}) + endif() target_compile_options(sycld PRIVATE ${SYCL_CXX_FLAGS_DEBUG}) + endif() install(TARGETS ${SYCL_RT_LIBS} diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 3340f6b488891..1adb3b85dd39c 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -17,9 +17,18 @@ #include +#ifdef XPTI_ENABLE_INSTRUMENTATION +#include "xpti_trace_framework.hpp" +#include +#include +#endif + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { +#ifdef XPTI_ENABLE_INSTRUMENTATION +extern xpti::trace_event_data_t *GSYCLGraphEvent; +#endif // Threat all devices that don't support interoperability as host devices to // avoid attempts to call method get on such events. @@ -96,8 +105,65 @@ event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) { } } +void *event_impl::instrumentationProlog(string_class &Name, int32_t StreamID, + uint64_t &IId) const { + void *TraceEvent = nullptr; +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return TraceEvent; + // Use a thread-safe counter to get a unique instance ID for the wait() on the + // event + static std::atomic InstanceID = {1}; + xpti::trace_event_data_t *WaitEvent = nullptr; + + // Create a string with the event address so it + // can be associated with other debug data + xpti::utils::StringHelper SH; + Name = SH.nameWithAddress("event.wait", MEvent); + + // We can emit the wait associated with the graph if the + // event does not have a command object or associated with + // the command object, if it exists + if (MCommand) { + Command *Cmd = (Command *)MCommand; + WaitEvent = Cmd->MTraceEvent ? static_cast(Cmd->MTraceEvent) + : GSYCLGraphEvent; + } else + WaitEvent = GSYCLGraphEvent; + + // Record the current instance ID for use by Epilog + IId = InstanceID++; + xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent, + IId, static_cast(Name.c_str())); + TraceEvent = (void *)WaitEvent; +#endif + return TraceEvent; +} + +void event_impl::instrumentationEpilog(void *TelemetryEvent, + const string_class &Name, + int32_t StreamID, uint64_t IId) const { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!(xptiTraceEnabled() && TelemetryEvent)) + return; + // Close the wait() scope + xpti::trace_event_data_t *TraceEvent = + (xpti::trace_event_data_t *)TelemetryEvent; + xptiNotifySubscribers(StreamID, xpti::trace_wait_end, nullptr, TraceEvent, + IId, static_cast(Name.c_str())); +#endif +} + void event_impl::wait( std::shared_ptr Self) const { +#ifdef XPTI_ENABLE_INSTRUMENTATION + void *TelemetryEvent = nullptr; + uint64_t IId; + std::string Name; + int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME); + TelemetryEvent = instrumentationProlog(Name, StreamID, IId); +#endif + if (MEvent) // presence of MEvent means the command has been enqueued, so no need to // go via the slow path event waiting in the scheduler @@ -106,6 +172,10 @@ void event_impl::wait( detail::Scheduler::getInstance().waitForEvent(Self); if (MCommand && !SYCLConfig::get()) detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self)); + +#ifdef XPTI_ENABLE_INSTRUMENTATION + instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); +#endif } void event_impl::wait_and_throw( diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index cbe0c1f55aa66..d21c84c6b1a96 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -118,8 +118,8 @@ class event_impl { /// \return a shared pointer to a valid context_impl. const ContextImplPtr &getContextImpl(); - // \return the Plugin associated with the context of this event. - // Should be called when this is not a Host Event. + /// \return the Plugin associated with the context of this event. + /// Should be called when this is not a Host Event. const plugin &getPlugin() const; /// Associate event with the context. @@ -127,25 +127,33 @@ class event_impl { /// Provided PiContext inside ContextImplPtr must be associated /// with the PiEvent object stored in this class /// - /// \param Context is a shared pointer to an instance of valid context_impl. + /// @param Context is a shared pointer to an instance of valid context_impl. void setContextImpl(const ContextImplPtr &Context); /// Returns command that is associated with the event. /// - /// \return a generic pointer to Command object instance. + /// @return a generic pointer to Command object instance. void *getCommand() { return MCommand; } /// Associates this event with the command. /// - /// \param Command is a generic pointer to Command object instance. + /// @param Command is a generic pointer to Command object instance. void setCommand(void *Command) { MCommand = Command; } /// Returns host profiling information. /// - /// \return a pointer to HostProfilingInfo instance. + /// @return a pointer to HostProfilingInfo instance. HostProfilingInfo *getHostProfilingInfo() { return MHostProfilingInfo.get(); } private: + // When instrumentation is enabled emits trace event for event wait begin and + // returns the telemetry event generated for the wait + void *instrumentationProlog(string_class &Name, int32_t StreamID, + uint64_t &instance_id) const; + // Uses events generated by the Prolog and emits event wait done event + void instrumentationEpilog(void *TelementryEvent, const string_class &Name, + int32_t StreamID, uint64_t IId) const; + RT::PiEvent MEvent = nullptr; ContextImplPtr MContext; QueueImplWPtr MQueue; diff --git a/sycl/source/detail/pi.cpp b/sycl/source/detail/pi.cpp index 6e92c950e116d..9b0959063fbe5 100644 --- a/sycl/source/detail/pi.cpp +++ b/sycl/source/detail/pi.cpp @@ -18,11 +18,35 @@ #include #include +#ifdef XPTI_ENABLE_INSTRUMENTATION +// Include the headers necessary for emitting +// traces using the trace framework +#include "xpti_trace_framework.h" +#endif + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { +#ifdef XPTI_ENABLE_INSTRUMENTATION +// Stream name being used for traces generated from the SYCL runtime +constexpr const char *PICALL_STREAM_NAME = "sycl.pi"; +// Global (to the SYCL runtime) graph handle that all command groups are a +// child of +///< Event to be used by graph related activities +xpti_td *GSYCLGraphEvent = nullptr; +///< Event to be used by PI layer related activities +xpti_td *GPICallEvent = nullptr; +///< Constansts being used as placeholder until one is able to reliably get the +///< version of the SYCL runtime +constexpr uint32_t GMajVer = 1; +constexpr uint32_t GMinVer = 0; +constexpr const char *GVerStr = "sycl 1.0"; +#endif + namespace pi { +bool XPTIInitDone = false; + std::string platformInfoToString(pi_platform_info info) { switch (info) { case PI_PLATFORM_INFO_PROFILE: @@ -208,6 +232,49 @@ vector_class initialize() { } Plugins.push_back(plugin(PluginInformation)); } + +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!(xptiTraceEnabled() && !XPTIInitDone)) + return Plugins; + // Not sure this is the best place to initialize the framework; SYCL runtime + // team needs to advise on the right place, until then we piggy-back on the + // initialization of the PI layer. + + // Initialize the global events just once, in the case pi::initialize() is + // called multiple times + XPTIInitDone = true; + // Registers a new stream for 'sycl' and any plugin that wants to listen to + // this stream will register itself using this string or stream ID for this + // string. + uint8_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME); + // Let all tool plugins know that a stream by the name of 'sycl' has been + // initialized and will be generating the trace stream. + // + // +--- Minor version # + // Major version # ------+ | Version string + // | | | + // v v v + xptiInitialize(SYCL_STREAM_NAME, GMajVer, GMinVer, GVerStr); + // Create a tracepoint to indicate the graph creation + xpti::payload_t GraphPayload("application_graph"); + uint64_t GraphInstanceNo; + GSYCLGraphEvent = + xptiMakeEvent("application_graph", &GraphPayload, xpti::trace_graph_event, + xpti_at::active, &GraphInstanceNo); + if (GSYCLGraphEvent) { + // The graph event is a global event and will be used as the parent for + // all nodes (command groups) + xptiNotifySubscribers(StreamID, xpti::trace_graph_create, nullptr, + GSYCLGraphEvent, GraphInstanceNo, nullptr); + } + + xpti::payload_t PIPayload("Plugin Interface Layer"); + uint64_t PiInstanceNo; + GPICallEvent = + xptiMakeEvent("PI Layer", &PIPayload, xpti::trace_algorithm_event, + xpti_at::active, &PiInstanceNo); +#endif + return Plugins; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 5b7690cf5fa36..ddc9883cb610e 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -16,6 +16,11 @@ #include +#ifdef XPTI_ENABLE_INSTRUMENTATION +#include "xpti_trace_framework.hpp" +#include +#endif + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { @@ -86,6 +91,100 @@ void queue_impl::addEvent(event Event) { MEvents.push_back(std::move(Event)); } +void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, + string_class &Name, int32_t StreamID, + uint64_t &IId) { + void *TraceEvent = nullptr; +#ifdef XPTI_ENABLE_INSTRUMENTATION + xpti::trace_event_data_t *WaitEvent = nullptr; + if (!xptiTraceEnabled()) + return TraceEvent; + + xpti::payload_t Payload; + bool HasSourceInfo = false; + // We try to create a unique string for the wait() call by combining it with + // the queue address + xpti::utils::StringHelper NG; + Name = NG.nameWithAddress("queue.wait", this); + + if (!CodeLoc.fileName()) { + // We have source code location information + Payload = + xpti::payload_t(Name.c_str(), CodeLoc.fileName(), CodeLoc.lineNumber(), + CodeLoc.columnNumber(), (void *)this); + HasSourceInfo = true; + } else { + // We have no location information, so we'll use the address of the queue + Payload = xpti::payload_t(Name.c_str(), (void *)this); + } + // wait() calls could be at different user-code locations; We create a new + // event based on the code location info and if this has been seen before, a + // previously created event will be returned. + uint64_t QWaitInstanceNo = 0; + WaitEvent = xptiMakeEvent(Name.c_str(), &Payload, xpti::trace_graph_event, + xpti_at::active, &QWaitInstanceNo); + IId = QWaitInstanceNo; + if (WaitEvent) { + device D = get_device(); + std::string DevStr; + if (D.is_host()) + DevStr = "HOST"; + else if (D.is_cpu()) + DevStr = "CPU"; + else if (D.is_gpu()) + DevStr = "GPU"; + else if (D.is_accelerator()) + DevStr = "ACCELERATOR"; + else + DevStr = "UNKNOWN"; + xptiAddMetadata(WaitEvent, "sycl_device", DevStr.c_str()); + if (HasSourceInfo) { + xptiAddMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName()); + xptiAddMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName()); + xptiAddMetadata(WaitEvent, "sym_line_no", + std::to_string(CodeLoc.lineNumber()).c_str()); + } + xptiNotifySubscribers(StreamID, xpti::trace_wait_begin, nullptr, WaitEvent, + QWaitInstanceNo, + static_cast(Name.c_str())); + TraceEvent = (void *)WaitEvent; + } +#endif + return TraceEvent; +} + +void queue_impl::instrumentationEpilog(void *TelemetryEvent, string_class &Name, + int32_t StreamID, uint64_t IId) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!(xptiTraceEnabled() && TelemetryEvent)) + return; + // Close the wait() scope + xpti::trace_event_data_t *TraceEvent = + (xpti::trace_event_data_t *)TelemetryEvent; + xptiNotifySubscribers(StreamID, xpti::trace_wait_end, nullptr, TraceEvent, + IId, static_cast(Name.c_str())); +#endif +} + +void queue_impl::wait(const detail::code_location &CodeLoc) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + void *TelemetryEvent = nullptr; + uint64_t IId; + std::string Name; + int32_t StreamID = xptiRegisterStream(SYCL_STREAM_NAME); + TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId); +#endif + + std::lock_guard Guard(MMutex); + for (auto &Event : MEvents) + Event.wait(); + MEvents.clear(); + +#ifdef XPTI_ENABLE_INSTRUMENTATION + instrumentationEpilog(TelemetryEvent, Name, StreamID, IId); +#endif +} + } // namespace detail } // namespace sycl } // __SYCL_INLINE_NAMESPACE(cl) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 0b3acd2f32a80..ef94d73a086c6 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -152,19 +152,21 @@ class queue_impl { /// \param CGF is a function object containing command group. /// \param Self is a shared_ptr to this queue. /// \param SecondQueue is a shared_ptr to the secondary queue. + /// \param Loc is the code location of the submit call (default argument) /// \return a SYCL event object, which corresponds to the queue the command /// group is being enqueued on. event submit(const function_class &CGF, shared_ptr_class Self, - shared_ptr_class SecondQueue) { + shared_ptr_class SecondQueue, + const detail::code_location &Loc) { try { - return submit_impl(CGF, Self); + return submit_impl(CGF, Self, Loc); } catch (...) { { std::lock_guard Guard(MMutex); MExceptions.PushBack(std::current_exception()); } - return SecondQueue->submit(CGF, SecondQueue); + return SecondQueue->submit(CGF, SecondQueue, Loc); } } @@ -173,28 +175,27 @@ class queue_impl { /// /// \param CGF is a function object containing command group. /// \param Self is a shared_ptr to this queue. + /// \param Loc is the code location of the submit call (default argument) /// \return a SYCL event object for the submitted command group. event submit(const function_class &CGF, - shared_ptr_class Self) { - return submit_impl(CGF, std::move(Self)); + shared_ptr_class Self, + const detail::code_location &Loc) { + return submit_impl(CGF, std::move(Self), Loc); } /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// /// Synchronous errors will be reported through SYCL exceptions. - void wait() { - std::lock_guard Guard(MMutex); - for (auto &Event : MEvents) - Event.wait(); - MEvents.clear(); - } + /// @param Loc is the code location of the submit call (default argument) + void wait(const detail::code_location &Loc = {}); /// \return list of asynchronous exceptions occurred during execution. exception_list getExceptionList() const { return MExceptions; } - void wait_and_throw() { - wait(); + /// @param Loc is the code location of the submit call (default argument) + void wait_and_throw(const detail::code_location &Loc = {}) { + wait(Loc); throw_asynchronous(); } @@ -350,16 +351,27 @@ class queue_impl { /// /// \param CGF is a function object containing command group. /// \param Self is a pointer to this queue. + /// \param Loc is the code location of the submit call (default argument) /// \return a SYCL event representing submitted command group. event submit_impl(const function_class &CGF, - shared_ptr_class Self) { + shared_ptr_class Self, + const detail::code_location &Loc) { handler Handler(std::move(Self), MHostQueue); CGF(Handler); - event Event = Handler.finalize(); + event Event = Handler.finalize(Loc); addEvent(Event); return Event; } + // When instrumentation is enabled emits trace event for wait begin and + // returns the telemetry event generated for the wait + void *instrumentationProlog(const detail::code_location &CodeLoc, + string_class &Name, int32_t StreamID, + uint64_t &iid); + // Uses events generated by the Prolog and emits wait done event + void instrumentationEpilog(void *TelementryEvent, string_class &Name, + int32_t StreamID, uint64_t IId); + /// Stores an event that should be associated with the queue /// /// \param Event is the event to be stored diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6343ddecc670a..e9425a3e47783 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -33,9 +34,17 @@ #include #endif +#ifdef XPTI_ENABLE_INSTRUMENTATION +#include "xpti_trace_framework.hpp" +#endif + __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { namespace detail { +#ifdef XPTI_ENABLE_INSTRUMENTATION +// Global graph for the application +extern xpti::trace_event_data_t *GSYCLGraphEvent; +#endif #ifdef __GNUG__ struct DemangleHandle { @@ -82,6 +91,63 @@ static std::string accessModeToString(access::mode Mode) { } } +#ifdef XPTI_ENABLE_INSTRUMENTATION +// Using the command group type to create node types for the asynchronous task +// graph modeling +static std::string commandToNodeType(Command::CommandType Type) { + switch (Type) { + case Command::CommandType::RUN_CG: + return "command_group_node"; + case Command::CommandType::COPY_MEMORY: + return "memory_transfer_node"; + case Command::CommandType::ALLOCA: + return "memory_allocation_node"; + case Command::CommandType::ALLOCA_SUB_BUF: + return "sub_buffer_creation_node"; + case Command::CommandType::RELEASE: + return "memory_deallocation_node"; + case Command::CommandType::MAP_MEM_OBJ: + return "memory_transfer_node"; + case Command::CommandType::UNMAP_MEM_OBJ: + return "memory_transfer_node"; + case Command::CommandType::UPDATE_REQUIREMENT: + return "host_acc_create_buffer_lock_node"; + case Command::CommandType::EMPTY_TASK: + return "host_acc_destroy_buffer_release_node"; + default: + return "unknown_node"; + } +} + +// Using the names being generated and the string are subject to change to +// something more meaningful to end-users as this will be visible in analysis +// tools that subscribe to this data +static std::string commandToName(Command::CommandType Type) { + switch (Type) { + case Command::CommandType::RUN_CG: + return "Command Group Action"; + case Command::CommandType::COPY_MEMORY: + return "Memory Transfer (Copy)"; + case Command::CommandType::ALLOCA: + return "Memory Allocation"; + case Command::CommandType::ALLOCA_SUB_BUF: + return "Sub Buffer Creation"; + case Command::CommandType::RELEASE: + return "Memory Deallocation"; + case Command::CommandType::MAP_MEM_OBJ: + return "Memory Transfer (Map)"; + case Command::CommandType::UNMAP_MEM_OBJ: + return "Memory Transfer (Unmap)"; + case Command::CommandType::UPDATE_REQUIREMENT: + return "Host Accessor Creation/Buffer Lock"; + case Command::CommandType::EMPTY_TASK: + return "Host Accessor Destruction/Buffer Lock Release"; + default: + return "Unknown Action"; + } +} +#endif + static std::vector getPiEvents(const std::vector &EventImpls) { std::vector RetPiEvents; @@ -161,6 +227,221 @@ Command::Command(CommandType Type, QueueImplPtr Queue) MEvent.reset(new detail::event_impl(MQueue)); MEvent->setCommand(this); MEvent->setContextImpl(detail::getSyclObjImpl(MQueue->get_context())); + +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Obtain the stream ID so all commands can emit traces to that stream + MStreamID = xptiRegisterStream(SYCL_STREAM_NAME); +#endif +} + +void Command::emitInstrumentationDataProxy() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + emitInstrumentationData(); +#endif +} + +/// Method takes in void * for the address as adding a template function to +/// the command group object maybe undesirable. +/// @param Cmd The command object of the source of the edge +/// @param ObjAddr The address that defines the edge dependency; it is the event +/// address when the edge is for an event and a memory object address if it is +/// due to an accessor +/// @param Prefix Contains "event" if the dependency is an edge and contains the +/// access mode to the buffer if it is due to an accessor +/// @param IsCommand True if the dependency has a command object as the source, +/// false otherwise +void Command::emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, + const string_class &Prefix, + bool IsCommand) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + // Bail early if either the source or the target node for the given dependency + // is undefined or NULL + if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent)) + return; + // If all the information we need for creating an edge event is available, + // then go ahead with creating it; if not, bail early! + xpti::utils::StringHelper SH; + std::string AddressStr = SH.addressAsString(ObjAddr); + std::string TypeString = SH.nameWithAddressString(Prefix, AddressStr); + // Create an edge with the dependent buffer address for which a command + // object has been created as one of the properties of the edge + xpti::payload_t Payload(TypeString.c_str(), MAddress); + uint64_t EdgeInstanceNo; + xpti_td *EdgeEvent = + xptiMakeEvent(TypeString.c_str(), &Payload, xpti::trace_graph_event, + xpti_at::active, &EdgeInstanceNo); + if (EdgeEvent) { + xpti_td *SrcEvent = static_cast(Cmd->MTraceEvent); + xpti_td *TgtEvent = static_cast(MTraceEvent); + EdgeEvent->source_id = SrcEvent->unique_id; + EdgeEvent->target_id = TgtEvent->unique_id; + if (IsCommand) { + xptiAddMetadata(EdgeEvent, "access_mode", TypeString.c_str()); + xptiAddMetadata(EdgeEvent, "memory_object", AddressStr.c_str()); + } else { + xptiAddMetadata(EdgeEvent, "event", TypeString.c_str()); + } + xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, + detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo, + nullptr); + } + // General comment - None of these are serious errors as the instrumentation + // layer MUST be tolerant of errors. If we need to let the end user know, we + // throw exceptions in the future +#endif +} + +/// Creates an edge when the dependency is due to an event. +/// @param Cmd The command object of the source of the edge +/// @param PiEventAddr The address that defines the edge dependency, which in +/// this case is an event +void Command::emitEdgeEventForEventDependence(Command *Cmd, + RT::PiEvent &PiEventAddr) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + // If we have failed to create an event to represent the Command, then we + // cannot emit an edge event. Bail early! + if (!(xptiTraceEnabled() && MTraceEvent)) + return; + + if (Cmd && Cmd->MTraceEvent) { + // If the event is associated with a command, we use this command's trace + // event as the source of edge, hence modeling the control flow + emitEdgeEventForCommandDependence(Cmd, (void *)PiEventAddr, "Event", false); + return; + } + if (PiEventAddr) { + xpti::utils::StringHelper SH; + std::string AddressStr = SH.addressAsString(PiEventAddr); + // This is the case when it is a OCL event enqueued by the user or another + // event is registered by the runtime as a dependency The dependency on + // this occasion is an OCL event; so we build a virtual node in the graph + // with the event as the metadata for the node + std::string NodeName = SH.nameWithAddressString("virtual_node", AddressStr); + // Node name is "virtual_node[]" + xpti::payload_t VNPayload(NodeName.c_str(), MAddress); + uint64_t VNodeInstanceNo; + xpti_td *NodeEvent = + xptiMakeEvent(NodeName.c_str(), &VNPayload, xpti::trace_graph_event, + xpti_at::active, &VNodeInstanceNo); + // Emit the virtual node first + xptiAddMetadata(NodeEvent, "kernel_name", NodeName.c_str()); + xptiNotifySubscribers(MStreamID, xpti::trace_node_create, + detail::GSYCLGraphEvent, NodeEvent, VNodeInstanceNo, + nullptr); + // Create a new event for the edge + std::string EdgeName = SH.nameWithAddressString("Event", AddressStr); + xpti::payload_t EdgePayload(EdgeName.c_str(), MAddress); + uint64_t EdgeInstanceNo; + xpti_td *EdgeEvent = + xptiMakeEvent(EdgeName.c_str(), &EdgePayload, xpti::trace_graph_event, + xpti_at::active, &EdgeInstanceNo); + if (EdgeEvent && NodeEvent) { + // Source node represents the event and this event needs to be completed + // before target node can execute + xpti_td *TgtEvent = static_cast(MTraceEvent); + EdgeEvent->source_id = NodeEvent->unique_id; + EdgeEvent->target_id = TgtEvent->unique_id; + xptiAddMetadata(EdgeEvent, "event", EdgeName.c_str()); + xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, + detail::GSYCLGraphEvent, EdgeEvent, EdgeInstanceNo, + nullptr); + } + return; + } +#endif +} + +uint64_t Command::makeTraceEventProlog(void *MAddress) { + uint64_t CommandInstanceNo = 0; +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return CommandInstanceNo; + + MTraceEventPrologComplete = true; + // Setup the member variables with information needed for event notification + MCommandNodeType = commandToNodeType(MType); + MCommandName = commandToName(MType); + xpti::utils::StringHelper SH; + MAddressString = SH.addressAsString(MAddress); + std::string CommandString = + SH.nameWithAddressString(MCommandName, MAddressString); + + xpti::payload_t p(CommandString.c_str(), MAddress); + xpti_td *CmdTraceEvent = + xptiMakeEvent(CommandString.c_str(), &p, xpti::trace_graph_event, + xpti_at::active, &CommandInstanceNo); + MInstanceID = CommandInstanceNo; + if (CmdTraceEvent) { + MTraceEvent = (void *)CmdTraceEvent; + // If we are seeing this event again, then the instance ID will be greater + // than 1; in this case, we must skip sending a notification to create a + // node as this node has already been created. We return this value so the + // epilog method can be called selectively. + MFirstInstance = (CommandInstanceNo == 1); + } +#endif + return CommandInstanceNo; +} + +void Command::makeTraceEventEpilog() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!(xptiTraceEnabled() && MTraceEvent)) + return; + assert(MTraceEventPrologComplete); + xptiNotifySubscribers(MStreamID, xpti::trace_node_create, + detail::GSYCLGraphEvent, + static_cast(MTraceEvent), MInstanceID, + static_cast(MCommandNodeType.c_str())); +#endif +} + +void Command::addDep(DepDesc NewDep) { + if (NewDep.MDepCommand) + MDepsEvents.push_back(NewDep.MDepCommand->getEvent()); + MDeps.push_back(NewDep); +#ifdef XPTI_ENABLE_INSTRUMENTATION + emitEdgeEventForCommandDependence( + NewDep.MDepCommand, (void *)NewDep.MDepRequirement->MSYCLMemObj, + accessModeToString(NewDep.MDepRequirement->MAccessMode), true); +#endif +} + +void Command::addDep(EventImplPtr Event) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + // We need this for just the instrumentation, so guarding it will prevent + // unused variable warnings when instrumentation is turned off + Command *Cmd = (Command *)Event->getCommand(); + RT::PiEvent &PiEventAddr = Event->getHandleRef(); + // Now make an edge for the dependent event + emitEdgeEventForEventDependence(Cmd, PiEventAddr); +#endif + + MDepsEvents.push_back(std::move(Event)); +} + +void Command::emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!(xptiTraceEnabled() && MTraceEvent && PiEventAddr)) + return; + // Asynchronous call, so send a signal with the event information as + // user_data + xptiNotifySubscribers(MStreamID, xpti::trace_signal, detail::GSYCLGraphEvent, + static_cast(MTraceEvent), MInstanceID, + (void *)PiEventAddr); +#endif +} + +void Command::emitInstrumentation(uint16_t Type, const char *Txt) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!(xptiTraceEnabled() && MTraceEvent)) + return; + // Trace event notifier that emits a Type event + xptiNotifySubscribers(MStreamID, Type, detail::GSYCLGraphEvent, + static_cast(MTraceEvent), MInstanceID, + static_cast(Txt)); +#endif } bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { @@ -182,9 +463,21 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { std::string(MBlockReason), PI_INVALID_OPERATION); +#ifdef XPTI_ENABLE_INSTRUMENTATION + // Scoped trace event notifier that emits a barrier begin and barrier end + // event, which models the barrier while enqueuing along with the blocked + // reason, as determined by the scheduler + std::string Info = "enqueue.barrier["; + Info += std::string(MBlockReason) + "]"; + emitInstrumentation(xpti::trace_barrier_begin, Info.c_str()); +#endif + // Wait if blocking while (!MCanEnqueue) ; +#ifdef XPTI_ENABLE_INSTRUMENTATION + emitInstrumentation(xpti::trace_barrier_end, Info.c_str()); +#endif } std::lock_guard Lock(MEnqueueMtx); @@ -193,6 +486,10 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { if (MEnqueued) return true; +#ifdef XPTI_ENABLE_INSTRUMENTATION + emitInstrumentation(xpti::trace_task_begin, nullptr); +#endif + cl_int Res = enqueueImp(); if (CL_SUCCESS != Res) @@ -203,9 +500,107 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking) { // CL_SUCCESS MEnqueued = true; + // Emit this correlation signal before the task end + emitEnqueuedEventSignal(MEvent->getHandleRef()); +#ifdef XPTI_ENABLE_INSTRUMENTATION + emitInstrumentation(xpti::trace_task_end, nullptr); +#endif return static_cast(MEnqueued); } +void Command::resolveReleaseDependencies(std::set &DepList) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + assert(MType == CommandType::RELEASE && "Expected release command"); + if (!MTraceEvent) + return; + // The current command is the target node for all dependencies as the source + // nodes have to be completed first before the current node can begin to + // execute; these edges model control flow + xpti_td *TgtTraceEvent = static_cast(MTraceEvent); + // We have all the Commands that must be completed before the release command + // can be enqueued; here we'll find the command that is an Alloca with the + // same SYCLMemObject address and create a dependency line (edge) between them + // in our sematic modeling + for (auto &Item : DepList) { + if (Item->MTraceEvent && Item->MAddress == MAddress) { + xpti::utils::StringHelper SH; + std::string AddressStr = SH.addressAsString(MAddress); + std::string TypeString = + "Edge:" + SH.nameWithAddressString(commandToName(MType), AddressStr); + + // Create an edge with the dependent buffer address being one of the + // properties of the edge + xpti::payload_t p(TypeString.c_str(), MAddress); + uint64_t EdgeInstanceNo; + xpti_td *EdgeEvent = + xptiMakeEvent(TypeString.c_str(), &p, xpti::trace_graph_event, + xpti_at::active, &EdgeInstanceNo); + if (EdgeEvent) { + xpti_td *SrcTraceEvent = static_cast(Item->MTraceEvent); + EdgeEvent->target_id = TgtTraceEvent->unique_id; + EdgeEvent->source_id = SrcTraceEvent->unique_id; + xptiAddMetadata(EdgeEvent, "memory_object", AddressStr.c_str()); + xptiNotifySubscribers(MStreamID, xpti::trace_edge_create, + detail::GSYCLGraphEvent, EdgeEvent, + EdgeInstanceNo, nullptr); + } + } + } +#endif +} + +AllocaCommandBase::AllocaCommandBase(CommandType Type, QueueImplPtr Queue, + Requirement Req, + AllocaCommandBase *LinkedAllocaCmd) + : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd), + MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MRequirement(std::move(Req)), + MReleaseCmd(Queue, this) { + MRequirement.MAccessMode = access::mode::read_write; + emitInstrumentationDataProxy(); +} + +void AllocaCommandBase::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MRequirement.MSYCLMemObj; + makeTraceEventProlog(MAddress); + // Set the relevant meta data properties for this command + if (MTraceEvent && MFirstInstance) { + xpti_td *TE = static_cast(MTraceEvent); + xptiAddMetadata(TE, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(TE, "memory_object", MAddressString.c_str()); + } +#endif +} + +AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req, + bool InitFromUserData, + AllocaCommandBase *LinkedAllocaCmd) + : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req), + LinkedAllocaCmd), + MInitFromUserData(InitFromUserData) { + // Node event must be created before the dependent edge is added to this node, + // so this call must be before the addDep() call. + emitInstrumentationDataProxy(); + addDep(DepDesc(nullptr, getRequirement(), this)); +} + +void AllocaCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + + // Only if it is the first event, we emit a node create event + if (MFirstInstance) { + makeTraceEventEpilog(); + } +#endif +} + cl_int AllocaCommand::enqueueImp() { std::vector EventImpls = Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); @@ -251,6 +646,39 @@ void AllocaCommand::printDot(std::ostream &Stream) const { } } +AllocaSubBufCommand::AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, + AllocaCommandBase *ParentAlloca) + : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue), + std::move(Req), + /*LinkedAllocaCmd*/ nullptr), + MParentAlloca(ParentAlloca) { + // Node event must be created before the dependent edge + // is added to this node, so this call must be before + // the addDep() call. + emitInstrumentationDataProxy(); + addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca)); +} + +void AllocaSubBufCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + + // Only if it is the first event, we emit a node create event and any meta + // data that is available for the command + if (MFirstInstance) { + xpti_td *TE = static_cast(MTraceEvent); + xptiAddMetadata(TE, "offset", + std::to_string(this->MRequirement.MOffsetInBytes).c_str()); + std::string range = std::to_string(this->MRequirement.MAccessRange[0]) + + "-" + + std::to_string(this->MRequirement.MAccessRange[1]); + xptiAddMetadata(TE, "access_range", range.c_str()); + makeTraceEventEpilog(); + } +#endif +} + cl_int AllocaSubBufCommand::enqueueImp() { std::vector EventImpls = Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); @@ -286,6 +714,31 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { } } +ReleaseCommand::ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) + : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { + emitInstrumentationDataProxy(); +} + +void ReleaseCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MAllocaCmd->getSYCLMemObj(); + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *TE = static_cast(MTraceEvent); + xptiAddMetadata(TE, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(TE, "allocation_type", + commandToName(MAllocaCmd->getType()).c_str()); + makeTraceEventEpilog(); + } +#endif +} + cl_int ReleaseCommand::enqueueImp() { std::vector EventImpls = Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); @@ -372,7 +825,28 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req, void **DstPtr, QueueImplPtr Queue) : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)), - MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr) {} + MSrcAllocaCmd(SrcAllocaCmd), MSrcReq(std::move(Req)), MDstPtr(DstPtr) { + emitInstrumentationDataProxy(); +} + +void MapMemObject::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MSrcAllocaCmd->getSYCLMemObj(); + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *TE = static_cast(MTraceEvent); + xptiAddMetadata(TE, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(TE, "memory_object", MAddressString.c_str()); + makeTraceEventEpilog(); + } +#endif +} cl_int MapMemObject::enqueueImp() { std::vector EventImpls = @@ -408,7 +882,28 @@ void MapMemObject::printDot(std::ostream &Stream) const { UnMapMemObject::UnMapMemObject(AllocaCommandBase *DstAllocaCmd, Requirement Req, void **SrcPtr, QueueImplPtr Queue) : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), - MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) {} + MDstAllocaCmd(DstAllocaCmd), MDstReq(std::move(Req)), MSrcPtr(SrcPtr) { + emitInstrumentationDataProxy(); +} + +void UnMapMemObject::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MDstAllocaCmd->getSYCLMemObj(); + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *TE = static_cast(MTraceEvent); + xptiAddMetadata(TE, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(TE, "memory_object", MAddressString.c_str()); + makeTraceEventEpilog(); + } +#endif +} cl_int UnMapMemObject::enqueueImp() { std::vector EventImpls = @@ -450,6 +945,31 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MDstAllocaCmd(DstAllocaCmd) { if (!MSrcQueue->is_host()) MEvent->setContextImpl(detail::getSyclObjImpl(MSrcQueue->get_context())); + + emitInstrumentationDataProxy(); +} + +void MemCpyCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MSrcAllocaCmd->getSYCLMemObj(); + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *CmdTraceEvent = static_cast(MTraceEvent); + xptiAddMetadata(CmdTraceEvent, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str()); + std::string From = deviceToString(MSrcQueue->get_device()); + std::string To = deviceToString(MQueue->get_device()); + xptiAddMetadata(CmdTraceEvent, "copy_from", From.c_str()); + xptiAddMetadata(CmdTraceEvent, "copy_to", To.c_str()); + makeTraceEventEpilog(); + } +#endif } cl_int MemCpyCommand::enqueueImp() { @@ -565,6 +1085,31 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { if (!MSrcQueue->is_host()) MEvent->setContextImpl(detail::getSyclObjImpl(MSrcQueue->get_context())); + + emitInstrumentationDataProxy(); +} + +void MemCpyCommandHost::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MSrcAllocaCmd->getSYCLMemObj(); + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *CmdTraceEvent = static_cast(MTraceEvent); + xptiAddMetadata(CmdTraceEvent, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str()); + std::string From = deviceToString(MSrcQueue->get_device()); + std::string To = deviceToString(MQueue->get_device()); + xptiAddMetadata(CmdTraceEvent, "copy_from", From.c_str()); + xptiAddMetadata(CmdTraceEvent, "copy_to", To.c_str()); + makeTraceEventEpilog(); + } +#endif } cl_int MemCpyCommandHost::enqueueImp() { @@ -592,6 +1137,32 @@ cl_int MemCpyCommandHost::enqueueImp() { return CL_SUCCESS; } +EmptyCommand::EmptyCommand(QueueImplPtr Queue, Requirement Req) + : Command(CommandType::EMPTY_TASK, std::move(Queue)), + MRequirement(std::move(Req)) { + + emitInstrumentationDataProxy(); +} + +void EmptyCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MRequirement.MSYCLMemObj; + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *CmdTraceEvent = static_cast(MTraceEvent); + xptiAddMetadata(CmdTraceEvent, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str()); + makeTraceEventEpilog(); + } +#endif +} + void EmptyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#8d8f29\", label=\""; @@ -627,50 +1198,186 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { } } -void ExecCGCommand::printDot(std::ostream &Stream) const { - Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; +UpdateHostRequirementCommand::UpdateHostRequirementCommand( + QueueImplPtr Queue, Requirement Req, AllocaCommandBase *SrcAllocaCmd, + void **DstPtr) + : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)), + MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) { - Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n"; + emitInstrumentationDataProxy(); +} - switch (MCommandGroup->getType()) { - case detail::CG::KERNEL: { - auto *KernelCG = - reinterpret_cast(MCommandGroup.get()); - Stream << "Kernel name: "; - if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) - Stream << "created from source"; - else - Stream << demangleKernelName(KernelCG->getKernelName()); - Stream << "\\n"; - break; +void UpdateHostRequirementCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + MAddress = MSrcAllocaCmd->getSYCLMemObj(); + makeTraceEventProlog(MAddress); + + if (MFirstInstance) { + xpti_td *CmdTraceEvent = static_cast(MTraceEvent); + xptiAddMetadata(CmdTraceEvent, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + xptiAddMetadata(CmdTraceEvent, "memory_object", MAddressString.c_str()); + makeTraceEventEpilog(); } +#endif +} + +static std::string cgTypeToString(detail::CG::CGTYPE Type) { + switch (Type) { + case detail::CG::KERNEL: + return "Kernel"; + break; case detail::CG::UPDATE_HOST: - Stream << "CG type: update_host\\n"; + return "update_host"; break; case detail::CG::FILL: - Stream << "CG type: fill\\n"; + return "fill"; break; case detail::CG::COPY_ACC_TO_ACC: - Stream << "CG type: copy acc to acc\\n"; + return "copy acc to acc"; break; case detail::CG::COPY_ACC_TO_PTR: - Stream << "CG type: copy acc to ptr\\n"; + return "copy acc to ptr"; break; case detail::CG::COPY_PTR_TO_ACC: - Stream << "CG type: copy ptr to acc\\n"; + return "copy ptr to acc"; break; case detail::CG::COPY_USM: - Stream << "CG type: copy usm\\n"; + return "copy usm"; break; case detail::CG::FILL_USM: - Stream << "CG type: fill usm\\n"; + return "fill usm"; break; case detail::CG::PREFETCH_USM: - Stream << "CG type: prefetch usm\\n"; + return "prefetch usm"; break; default: - Stream << "CG type: unknown\\n"; + return "unknown"; + break; + } +} + +ExecCGCommand::ExecCGCommand(std::unique_ptr CommandGroup, + QueueImplPtr Queue) + : Command(CommandType::RUN_CG, std::move(Queue)), + MCommandGroup(std::move(CommandGroup)) { + + emitInstrumentationDataProxy(); +} + +void ExecCGCommand::emitInstrumentationData() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (!xptiTraceEnabled()) + return; + // Create a payload with the command name and an event using this payload to + // emit a node_create + bool HasSourceInfo = false; + std::string KernelName, FromSource; + switch (MCommandGroup->getType()) { + case detail::CG::KERNEL: { + auto KernelCG = + reinterpret_cast(MCommandGroup.get()); + + if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) { + FromSource = "true"; + pi_kernel KernelHandle = KernelCG->MSyclKernel->getHandleRef(); + MAddress = KernelHandle; + KernelName = MCommandGroup->MFunctionName; + } else { + FromSource = "false"; + KernelName = demangleKernelName(KernelCG->getKernelName()); + } + } break; + default: + KernelName = cgTypeToString(MCommandGroup->getType()); + break; + } + std::string CommandType = commandToNodeType(MType); + // Get source file, line number information from the CommandGroup object + // and create payload using name, address, and source info + // + // On Windows, since the support for builtin functions is not available in + // MSVC, the MFileName, MLine will be set to nullptr and "0" respectively. + // Handle this condition explicitly here. + xpti::payload_t Payload; + if (!MCommandGroup->MFileName.empty()) { + // File name has a valid string + Payload = + xpti::payload_t(KernelName.c_str(), MCommandGroup->MFileName.c_str(), + MCommandGroup->MLine, MCommandGroup->MColumn, MAddress); + HasSourceInfo = true; + } else if (MAddress) { + // We have a valid function name and an address + Payload = xpti::payload_t(KernelName.c_str(), MAddress); + } else { + // In any case, we will have a valid function name and we'll use that to + // create the hash + Payload = xpti::payload_t(KernelName.c_str()); + } + + uint64_t CGKernelInstanceNo; + // Create event using the payload + xpti_td *CmdTraceEvent = + xptiMakeEvent("ExecCG", &Payload, xpti::trace_graph_event, + xpti::trace_activity_type_t::active, &CGKernelInstanceNo); + + if (CmdTraceEvent) { + MInstanceID = CGKernelInstanceNo; + MTraceEvent = (void *)CmdTraceEvent; + // If we are seeing this event again, then the instance ID will be greater + // than 1; in this case, we will skip sending a notification to create a + // node as this node has already been created. + if (CGKernelInstanceNo > 1) + return; + + xptiAddMetadata(CmdTraceEvent, "sycl_device", + deviceToString(MQueue->get_device()).c_str()); + if (!KernelName.empty()) { + xptiAddMetadata(CmdTraceEvent, "kernel_name", KernelName.c_str()); + } + if (!FromSource.empty()) { + xptiAddMetadata(CmdTraceEvent, "from_source", FromSource.c_str()); + } + if (HasSourceInfo) { + xptiAddMetadata(CmdTraceEvent, "sym_function_name", KernelName.c_str()); + xptiAddMetadata(CmdTraceEvent, "sym_source_file_name", + MCommandGroup->MFileName.c_str()); + xptiAddMetadata(CmdTraceEvent, "sym_line_no", + std::to_string(MCommandGroup->MLine).c_str()); + } + + xptiNotifySubscribers(MStreamID, xpti::trace_node_create, + detail::GSYCLGraphEvent, CmdTraceEvent, + CGKernelInstanceNo, + static_cast(CommandType.c_str())); + } +#endif +} + +void ExecCGCommand::printDot(std::ostream &Stream) const { + Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; + + Stream << "ID = " << this << "\n"; + Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n"; + + switch (MCommandGroup->getType()) { + case detail::CG::KERNEL: { + auto KernelCG = + reinterpret_cast(MCommandGroup.get()); + Stream << "Kernel name: "; + if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) + Stream << "created from source"; + else + Stream << demangleKernelName(KernelCG->getKernelName()); + Stream << "\\n"; + break; + } + default: + Stream << "CG type: " << cgTypeToString(MCommandGroup->getType()) << "\\n"; break; } @@ -1011,7 +1718,7 @@ cl_int ExecCGCommand::enqueueImp() { CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get(); MemoryManager::prefetch_usm(Prefetch->getDst(), MQueue, Prefetch->getLength(), std::move(RawEvents), - Event); + Event); return CL_SUCCESS; } case CG::CGTYPE::INTEROP_TASK_CODEPLAY: { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 315e0e366fd98..13763cc77a8d3 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -9,7 +9,9 @@ #pragma once #include +#include #include +#include #include #include @@ -91,13 +93,9 @@ class Command { Command(CommandType Type, QueueImplPtr Queue); - void addDep(DepDesc NewDep) { - if (NewDep.MDepCommand) - MDepsEvents.push_back(NewDep.MDepCommand->getEvent()); - MDeps.push_back(NewDep); - } + void addDep(DepDesc NewDep); - void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); } + void addDep(EventImplPtr Event); void addUser(Command *NewUser) { MUsers.insert(NewUser); } @@ -118,6 +116,37 @@ class Command { std::shared_ptr getEvent() const { return MEvent; } + // Methods needed to support SYCL instrumentation + // + // Proxy method which calls emitInstrumentationData. + void emitInstrumentationDataProxy(); + // Instrumentation method which emits telemetry data. + virtual void emitInstrumentationData() = 0; + // This function looks at all the dependencies for + // the release command and enables instrumentation + // to report these dependencies as edges + void resolveReleaseDependencies(std::set &list); + // Creates an edge event when the dependency is a command + void emitEdgeEventForCommandDependence(Command *Cmd, void *ObjAddr, + const string_class &Prefix, + bool IsCommand); + // Creates an edge event when the dependency is an event + void emitEdgeEventForEventDependence(Command *Cmd, RT::PiEvent &EventAddr); + // Creates a signal event with the enqueued kernel event handle + void emitEnqueuedEventSignal(RT::PiEvent &PiEventAddr); + /// Create a trace event of node_create type; this must be guarded by a + /// check for xptiTraceEnabled() + /// Post Condition: MTraceEvent will be set to the event created + /// @param MAddress The address to use to create the payload + uint64_t makeTraceEventProlog(void *MAddress); + // If prolog has been run, run epilog; this must be guarded by a check for + // xptiTraceEnabled() + void makeTraceEventEpilog(); + // Emits an event of Type + void emitInstrumentation(uint16_t Type, const char *Txt = nullptr); + // + // End Methods needed to support SYCL instrumentation + virtual void printDot(std::ostream &Stream) const = 0; virtual const Requirement *getRequirement() const { @@ -159,19 +188,45 @@ class Command { unsigned MLeafCounter = 0; const char *MBlockReason = "Unknown"; + + // All member variable defined here are needed for the SYCL instrumentation + // layer. Do not guard these variables below with XPTI_ENABLE_INSTRUMENTATION + // to ensure we have the same object layout when the macro in the library and + // SYCL app are not the same. + // + // The event for node_create and task_begin + void *MTraceEvent = nullptr; + // The stream under which the traces are emitted; stream ids are + // positive integers and we set it to an invalid value + int32_t MStreamID = -1; + // Reserved for storing the object address such as SPIRV or memory object + // address + void *MAddress = nullptr; + // Buffer to build the address string + string_class MAddressString; + // Buffer to build the command node type + string_class MCommandNodeType; + // Buffer to build the command end-user understandable name + string_class MCommandName; + // Flag to indicate if makeTraceEventProlog() has been run + bool MTraceEventPrologComplete = false; + // Flag to indicate if this is the first time we are seeing this payload + bool MFirstInstance = false; + // Instance ID tracked for the command + uint64_t MInstanceID = 0; }; // The command does nothing during enqueue. The task can be used to implement // lock in the graph, or to merge several nodes into one. class EmptyCommand : public Command { public: - EmptyCommand(QueueImplPtr Queue, Requirement Req) - : Command(CommandType::EMPTY_TASK, std::move(Queue)), - MRequirement(std::move(Req)) {} + EmptyCommand(QueueImplPtr Queue, Requirement Req); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MRequirement; } + void emitInstrumentationData(); + private: cl_int enqueueImp() final { return CL_SUCCESS; } @@ -182,11 +237,10 @@ class EmptyCommand : public Command { // underlying framework. class ReleaseCommand : public Command { public: - ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd) - : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { - } + ReleaseCommand(QueueImplPtr Queue, AllocaCommandBase *AllocaCmd); void printDot(std::ostream &Stream) const final; + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -198,12 +252,7 @@ class ReleaseCommand : public Command { class AllocaCommandBase : public Command { public: AllocaCommandBase(CommandType Type, QueueImplPtr Queue, Requirement Req, - AllocaCommandBase *LinkedAllocaCmd) - : Command(Type, Queue), MLinkedAllocaCmd(LinkedAllocaCmd), - MIsLeaderAlloca(nullptr == LinkedAllocaCmd), MReleaseCmd(Queue, this), - MRequirement(std::move(Req)) { - MRequirement.MAccessMode = access::mode::read_write; - } + AllocaCommandBase *LinkedAllocaCmd); ReleaseCommand *getReleaseCmd() { return &MReleaseCmd; } @@ -213,6 +262,8 @@ class AllocaCommandBase : public Command { const Requirement *getRequirement() const final { return &MRequirement; } + void emitInstrumentationData(); + void *MMemAllocation = nullptr; // Alloca command linked with current command. @@ -229,8 +280,8 @@ class AllocaCommandBase : public Command { bool MIsLeaderAlloca = true; protected: - ReleaseCommand MReleaseCmd; Requirement MRequirement; + ReleaseCommand MReleaseCmd; }; // The command enqueues allocation of instance of memory object on Host or @@ -239,14 +290,10 @@ class AllocaCommand : public AllocaCommandBase { public: AllocaCommand(QueueImplPtr Queue, Requirement Req, bool InitFromUserData = true, - AllocaCommandBase *LinkedAllocaCmd = nullptr) - : AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req), - LinkedAllocaCmd), - MInitFromUserData(InitFromUserData) { - addDep(DepDesc(nullptr, getRequirement(), this)); - } + AllocaCommandBase *LinkedAllocaCmd = nullptr); void printDot(std::ostream &Stream) const final; + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -259,16 +306,11 @@ class AllocaCommand : public AllocaCommandBase { class AllocaSubBufCommand : public AllocaCommandBase { public: AllocaSubBufCommand(QueueImplPtr Queue, Requirement Req, - AllocaCommandBase *ParentAlloca) - : AllocaCommandBase(CommandType::ALLOCA_SUB_BUF, std::move(Queue), - std::move(Req), - /*LinkedAllocaCmd*/ nullptr), - MParentAlloca(ParentAlloca) { - addDep(DepDesc(MParentAlloca, getRequirement(), MParentAlloca)); - } + AllocaCommandBase *ParentAlloca); void printDot(std::ostream &Stream) const final; AllocaCommandBase *getParentAlloca() { return MParentAlloca; } + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -283,6 +325,7 @@ class MapMemObject : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MSrcReq; } + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -299,6 +342,7 @@ class UnMapMemObject : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -317,6 +361,7 @@ class MemCpyCommand : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -337,6 +382,7 @@ class MemCpyCommandHost : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -351,13 +397,12 @@ class MemCpyCommandHost : public Command { // The command enqueues execution of kernel or explicit memory operation. class ExecCGCommand : public Command { public: - ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue) - : Command(CommandType::RUN_CG, std::move(Queue)), - MCommandGroup(std::move(CommandGroup)) {} + ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue); void flushStreams(); void printDot(std::ostream &Stream) const final; + void emitInstrumentationData(); private: cl_int enqueueImp() final; @@ -370,12 +415,11 @@ class ExecCGCommand : public Command { class UpdateHostRequirementCommand : public Command { public: UpdateHostRequirementCommand(QueueImplPtr Queue, Requirement Req, - AllocaCommandBase *SrcAllocaCmd, void **DstPtr) - : Command(CommandType::UPDATE_REQUIREMENT, std::move(Queue)), - MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(Req)), MDstPtr(DstPtr) {} + AllocaCommandBase *SrcAllocaCmd, void **DstPtr); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } + void emitInstrumentationData(); private: cl_int enqueueImp() final; diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 862507c1839cc..9a5f02fab02c8 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -27,11 +27,19 @@ EventImplPtr addHostAccessorToSchedulerInstance(Requirement *Req, } void Scheduler::waitForRecordToFinish(MemObjRecord *Record) { +#ifdef XPTI_ENABLE_INSTRUMENTATION + // Will contain the list of dependencies for the Release Command + std::set DepCommands; +#endif for (Command *Cmd : Record->MReadLeaves) { EnqueueResultT Res; bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); +#ifdef XPTI_ENABLE_INSTRUMENTATION + // Capture the dependencies + DepCommands.insert(Cmd); +#endif GraphProcessor::waitForEvent(Cmd->getEvent()); } for (Command *Cmd : Record->MWriteLeaves) { @@ -39,6 +47,9 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) { bool Enqueued = GraphProcessor::enqueueCommand(Cmd, Res); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); +#ifdef XPTI_ENABLE_INSTRUMENTATION + DepCommands.insert(Cmd); +#endif GraphProcessor::waitForEvent(Cmd->getEvent()); } for (AllocaCommandBase *AllocaCmd : Record->MAllocaCommands) { @@ -47,6 +58,11 @@ void Scheduler::waitForRecordToFinish(MemObjRecord *Record) { bool Enqueued = GraphProcessor::enqueueCommand(ReleaseCmd, Res); if (!Enqueued && EnqueueResultT::SyclEnqueueFailed == Res.MResult) throw runtime_error("Enqueue process failed.", PI_INVALID_OPERATION); +#ifdef XPTI_ENABLE_INSTRUMENTATION + // Report these dependencies to the Command so these dependencies can be + // reported as edges + ReleaseCmd->resolveReleaseDependencies(DepCommands); +#endif GraphProcessor::waitForEvent(ReleaseCmd->getEvent()); } } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 828c344b97d7b..c0e8ec9e9edaa 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -19,7 +19,7 @@ __SYCL_INLINE_NAMESPACE(cl) { namespace sycl { -event handler::finalize() { +event handler::finalize(const cl::sycl::detail::code_location &Payload) { sycl::event EventRet; unique_ptr_class CommandGroup; switch (MCGType) { @@ -30,52 +30,53 @@ event handler::finalize() { std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), std::move(MEvents), std::move(MArgs), std::move(MKernelName), - std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType)); + std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType, + Payload)); break; } case detail::CG::INTEROP_TASK_CODEPLAY: CommandGroup.reset(new detail::CGInteropTask( std::move(MInteropTask), std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents), MCGType)); + std::move(MRequirements), std::move(MEvents), MCGType, Payload)); break; case detail::CG::COPY_ACC_TO_PTR: case detail::CG::COPY_PTR_TO_ACC: case detail::CG::COPY_ACC_TO_ACC: - CommandGroup.reset( - new detail::CGCopy(MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), - std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents))); + CommandGroup.reset(new detail::CGCopy( + MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements), std::move(MEvents), Payload)); break; case detail::CG::FILL: CommandGroup.reset(new detail::CGFill( std::move(MPattern), MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents))); + std::move(MRequirements), std::move(MEvents), Payload)); break; case detail::CG::UPDATE_HOST: CommandGroup.reset(new detail::CGUpdateHost( MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), - std::move(MEvents))); + std::move(MEvents), Payload)); break; case detail::CG::COPY_USM: CommandGroup.reset(new detail::CGCopyUSM( MSrcPtr, MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents))); + std::move(MRequirements), std::move(MEvents), Payload)); break; case detail::CG::FILL_USM: CommandGroup.reset(new detail::CGFillUSM( std::move(MPattern), MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), - std::move(MRequirements), std::move(MEvents))); + std::move(MRequirements), std::move(MEvents), Payload)); break; case detail::CG::PREFETCH_USM: CommandGroup.reset(new detail::CGPrefetchUSM( MDstPtr, MLength, std::move(MArgsStorage), std::move(MAccStorage), std::move(MSharedPtrStorage), std::move(MRequirements), - std::move(MEvents))); + std::move(MEvents), Payload)); break; case detail::CG::NONE: throw runtime_error("Command group submitted without a kernel or a " diff --git a/sycl/source/ordered_queue.cpp b/sycl/source/ordered_queue.cpp index 4abf0923fef49..9829c446f7bde 100644 --- a/sycl/source/ordered_queue.cpp +++ b/sycl/source/ordered_queue.cpp @@ -74,10 +74,6 @@ device ordered_queue::get_device() const { return impl->get_device(); } bool ordered_queue::is_host() const { return impl->is_host(); } -void ordered_queue::wait() { impl->wait(); } - -void ordered_queue::wait_and_throw() { impl->wait_and_throw(); } - void ordered_queue::throw_asynchronous() { impl->throw_asynchronous(); } event ordered_queue::memset(void *ptr, int value, size_t count) { @@ -88,13 +84,23 @@ event ordered_queue::memcpy(void *dest, const void *src, size_t count) { return impl->memcpy(impl, dest, src, count); } -event ordered_queue::submit_impl(function_class CGH) { - return impl->submit(CGH, impl); +event ordered_queue::submit_impl(function_class CGH, + const detail::code_location &CodeLoc) { + return impl->submit(CGH, impl, CodeLoc); } event ordered_queue::submit_impl(function_class CGH, - ordered_queue &secondQueue) { - return impl->submit(CGH, impl, secondQueue.impl); + ordered_queue &secondQueue, + const detail::code_location &CodeLoc) { + return impl->submit(CGH, impl, secondQueue.impl, CodeLoc); +} + +void ordered_queue::wait_proxy(const detail::code_location &CodeLoc) { + impl->wait(CodeLoc); +} + +void ordered_queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { + impl->wait_and_throw(CodeLoc); } template diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index c47eba506958d..e20b8e6016725 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -88,9 +88,6 @@ device queue::get_device() const { return impl->get_device(); } bool queue::is_host() const { return impl->is_host(); } -void queue::wait() { impl->wait(); } - -void queue::wait_and_throw() { impl->wait_and_throw(); } void queue::throw_asynchronous() { impl->throw_asynchronous(); } @@ -106,13 +103,22 @@ event queue::mem_advise(const void *ptr, size_t length, int advice) { return impl->mem_advise(ptr, length, advice); } -event queue::submit_impl(function_class CGH) { - return impl->submit(CGH, impl); +event queue::submit_impl(function_class CGH, + const detail::code_location &CodeLoc) { + return impl->submit(CGH, impl, CodeLoc); } -event queue::submit_impl(function_class CGH, - queue secondQueue) { - return impl->submit(CGH, impl, secondQueue.impl); +event queue::submit_impl(function_class CGH, queue secondQueue, + const detail::code_location &CodeLoc) { + return impl->submit(CGH, impl, secondQueue.impl, CodeLoc); +} + +void queue::wait_proxy(const detail::code_location &CodeLoc) { + impl->wait(CodeLoc); +} + +void queue::wait_and_throw_proxy(const detail::code_location &CodeLoc) { + impl->wait_and_throw(CodeLoc); } template diff --git a/sycl/unittests/scheduler/BlockedCommands.cpp b/sycl/unittests/scheduler/BlockedCommands.cpp index 1b9d192da8fac..df93a2b5f9dcd 100644 --- a/sycl/unittests/scheduler/BlockedCommands.cpp +++ b/sycl/unittests/scheduler/BlockedCommands.cpp @@ -22,6 +22,8 @@ class MockCommand : public detail::Command { : Command(detail::Command::ALLOCA, Queue) {} void printDot(std::ostream &Stream) const override {} + void emitInstrumentationData() override {} + cl_int enqueueImp() override { return MRetVal; } cl_int MRetVal = CL_SUCCESS; diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index a7af5a7db06b1..59b77aba199be 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -23,6 +23,7 @@ class FakeCommand : public cl::sycl::detail::Command { MRequirement{std::move(Req)} {} void printDot(std::ostream &Stream) const override {} + void emitInstrumentationData() override {} const cl::sycl::detail::Requirement *getRequirement() const final { return &MRequirement; diff --git a/xpti/CMakeLists.txt b/xpti/CMakeLists.txt new file mode 100644 index 0000000000000..a23187e713809 --- /dev/null +++ b/xpti/CMakeLists.txt @@ -0,0 +1,55 @@ +cmake_minimum_required(VERSION 2.8.9) + +set(XPTI_VERSION 0.4.1) +set(XPTI_DIR ${CMAKE_CURRENT_LIST_DIR}) +# Setting the same version as SYCL +set(CMAKE_CXX_STANDARD 11) + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "No build type selected, default to Release") + set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type (default Release)" FORCE) +endif() + +project (xpti) + +if (MSVC) + # MSVC provides two incompatible build variants for its CRT: release and debug + # To avoid potential issues in user code we also need to provide two kinds + # of SYCL Runtime Library for release and debug configurations. + set(XPTI_CXX_FLAGS "") + if (CMAKE_BUILD_TYPE MATCHES "Debug") + set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_DEBUG}") + string(REPLACE "/MDd" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}") + string(REPLACE "/MTd" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}") + else() + if (CMAKE_BUILD_TYPE MATCHES "Release") + set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_RELEASE}") + elseif (CMAKE_BUILD_TYPE MATCHES "RelWithDebInfo") + set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_MINSIZEREL}") + elseif (CMAKE_BUILD_TYPE MATCHES "MinSizeRel") + set(XPTI_CXX_FLAGS "${CMAKE_CXX_FLAGS_RELWITHDEBINFO}") + endif() + string(REPLACE "/MD" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}") + string(REPLACE "/MT" "" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}") + endif() + + # target_compile_options requires list of options, not a string + string(REPLACE " " ";" XPTI_CXX_FLAGS "${XPTI_CXX_FLAGS}") + + set(XPTI_CXX_FLAGS_RELEASE "${XPTI_CXX_FLAGS};/MD") + set(XPTI_CXX_FLAGS_DEBUG "${XPTI_CXX_FLAGS};/MDd") + + # CMake automatically applies these flags to all targets. To override this + # behavior, options lists are reset. + set(CMAKE_CXX_FLAGS_RELEASE "") + set(CMAKE_CXX_FLAGS_MINSIZEREL "") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "") + set(CMAKE_CXX_FLAGS_DEBUG "") +endif() + +set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/lib/${CMAKE_BUILD_TYPE}) +set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR}) +set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR}) + +include_directories(${CMAKE_SOURCE_DIR}/include) +add_subdirectory(src) diff --git a/xpti/README.md b/xpti/README.md new file mode 100644 index 0000000000000..0c0e8868e305d --- /dev/null +++ b/xpti/README.md @@ -0,0 +1,22 @@ +# XPTI proxy library + +Implementation of the instrumentation stub library to support SYCL +Instrumentation. The stub library checks for two things before it can +successfully dispatch event streams: + +1. Environment variable that indicates that tracing has been enabled. + + This is defined by the variable `XPTI_TRACE_ENABLE`. The possible + values taken by this environment variable are: + + To enable: `XPTI_TRACE_ENABLE=1` or `XPTI_TRACE_ENABLE=true` + + To disable: `XPTI_TRACE_ENABLE=0` or `XPTI_TRACE_ENABLE=false` + +2. Environment variable that points to the XPTI dispatcher so the stub + library can dynamically load it and dispatch the calls to the dispatcher. + `XPTI_FRAMEWORK_DISPATCHER=/path/to/dispatcher.[so,dll,dylib]` + +The stub library requires both of these to be set for it to successfully +dispatch the calls for the event streams. The dispatcher is required for +tool developers to implement subscribers and register them with the dispatcher. diff --git a/xpti/doc/SYCL_Tracing_Implementation.md b/xpti/doc/SYCL_Tracing_Implementation.md new file mode 100644 index 0000000000000..f78823061c8b5 --- /dev/null +++ b/xpti/doc/SYCL_Tracing_Implementation.md @@ -0,0 +1,236 @@ +# SYCL Tracing - Part I + +In order to understand the various language constructs that are available +in the SYCL layer, instrumenting them to provide a correlation between the +end-user source code and the kernels that execute on a device must be made. +A lightweight tracing framework (XPTI) was developed to facilitate this +through explicit instrumentation of all the language constructs. The goal +of this framework is to provide a low-overhead solution that tools can use +to build performance analytical models. Based on tests, simulations and +projections, the framework API can capture telemetry information for +~60-70,000 events/sec with overheads less than 1% of the application runtime. + +This document outlines the use of this framework API at various points in +the SYCL runtime to trace the language constructs. Each language construct +used/expressed by the developer is associated with a source location payload +information that includes the function name, source file name and line +number where the construct is expressed. Using the source location +information, a hash is created for the language construct along with a +corresponding unique ID. The framework provides the ability to propagate +this ID all the way to the driver layers for the device. + +This document discusses where in the SYCL runtime instrumentation has been +added and the reasons behind adding this instrumentation. + +> **NOTE:** This document is better viewed with [Markdown Reader](https://chrome.google.com/webstore/detail/markdown-reader/gpoigdifkoadgajcincpilkjmejcaanc?hl=en) plugin for chrome or the [Markdown Preview Extension](https://github.com/shd101wyy/vscode-markdown-preview-enhanced/releases) for Visual Studio Code. + +## Instrumentation Trace Points + +This section will document all the places in the SYCL runtime that have been +instrumented to capture the asynchronous task graphs created by the runtime. +The task graphs are captured as graph, nodes and edges: + +> - The graph encapsulates all of the disjoint task graphs generated by the application. +> - The nodes capture operations that are performed, such as kernel +executions or memory transfers +> - The edges represent dependence relationships, the representation of +which mimics control flow as opposed to a dependence graph. The source node +in an edge must complete before the target node can begin execution. + + All code changes to enable this have been guarded by + `XPTI_ENABLE_INSTRUMENTATION` macro and the CMake files have been updated to + have this as an option which is enabled by default and this change is under + `llvm/sycl/CMakeLists.txt`. + +```cmake +... +# Create a soft option for enabling or disabling the instrumentation +# of the SYCL runtime +option(SYCL_ENABLE_XPTI_TRACING "Enable tracing of SYCL constructs" ON) +``` + +### The Graph + +Any SYCL/DPC++ application can submit command groups to any active queue +during the lifetime of the application. Each submission is handled by the +runtime and the asynchronous task graphs are updated to reflect the new +submission. This may be as simple as adding a new node to the task-graph or +adding multiple nodes to the graph, where one of the nodes represents the +computation and the others dependent memory transfers. + +To model this, we create a global graph for every application instantiation +and all kernel executions in the applications are added as nodes in this +global graph. In the SYCL runtime, there is no obvious location where the +creation of the global graph can be inserted as many objects are +instantiated statically. Currently, we embed the graph creation in the +plugin interface (PI) layer `initialize()` call. In this call, we will +perform two operations: + +1. Initialize all listeners and create a trace event to represent the graph. +This is done in `sycl/include/CL/sycl/detail/pi.cpp`. +2. Send a `graph_create` event to all subscribers. This notification +will only be sent once. + +### The Nodes + +The command group lambdas are captured and encapsulated in a `Command` +object. This object is evaluated for dependencies on data/memory or external +OpenCL events and an asynchronous task graph is built by mapping all these +dependencies, before it is enqueued on the device. In order to capture the +command groups (nodes) and the dependencies (edges), the base class +`Command` and any derived classes that are of interest are instrumented. + +In this section, we discuss the instrumentation of the Command object in two +parts: (1) The changes made to capture end-user source code details for +language constructs (2) The instrumentation that handles capturing the +relevant metadata. + +1. In order to capture end-user source code information, we have implemented +`cl::sycl::detail::code_location` class that uses the builtin functions +in the compiler. However, equivalent implementations are unavailable on +Windows and separate cross-platform implementation might be used in the +future. To mitigate this, the Windows implementation will always report +`unknown_file`, `unknown_func` and a line number of 0 for source +file, function name and line number. We handle this case while processing +this information. + + The source information of a language construct, such as source file, + function name, line number and column number allow us to determine if a + Command that was previously created for a construct is being created + again. In such cases, we will not emit a `node_create` event, but we + will bump up the instance count recording the number of instances + created. Secondly, the source information allows us to associate a unique + ID with the source location and propagate it all the way to the driver, + if possible. This will allow us to associate a Kernel event with a source + location at all times. All instrumentation that identifies a command + object of a given type and emits the `node_create` event is located + in the `emitInstrumentationData()` and must be implemented by all + derived classes. + + To enable this source location information, we start with enabling the + public methods in the queue class, such as `queue.submit()`, + `queue.parallel_for()`, `queue.wait()`, etc to include a default + argument that captures the source location information. The location of + the line in the caller that makes the call to `queue.submit()`, + `queue.parallel_for()`, etc is represented in this default argument. + These changes are present in `queue.hpp` and `ordered_queue.hpp`. + The default arguments for all public functions are guarded by + `#ifdef SYCL_INSTRUMENTATION_METADATA` that is currently enabled by + default. + + The location information, when captured, is propagated all the way to the + `CommandGroup` object. So, for every `CommandGroup` object, we + will have the corresponding source location in end-user code where the + command group is submitted to the queue. This metadata is propagated by + the instrumentation to the subscribers of the stream. + +2. The base `Command class` and all derived classes are instrumented to capture + the relevant information for each command object and a `node_create` event is + generated. + +### The Node instance + +Once a command object is created, it is enqueued on the device for +execution. To capture the execution of this node instance, we instrument the +`enqueue()` method to determine the cost of this computation or memory +related kernel. As the commands are enqueued, the enqueue method emits a +pair of events indicating the `task_begin` and `task_end`events that +capture the duration of the enqueued command. For commands that are +asynchronous, the pair of events capture just the kernel submission and the +actual execution of the command on the device is tracked through the +`cl_event` returned by the enqueue operation. In the case of host kernel +execution or commands that are synchronous, the cost is measured directly. + +In the case of the command being submitted to an OpenCL device, we capture +the event of the submitted kernel and propagate it to the subscriber tool. +It is up to the tool to register a callback for this event completion and +close the task opened for the command object. + +### The Edges + +As discussed in the previous section, the command groups submitted to the +device queues form nodes in the asynchronous tasks graphs created by +the SYCL runtime. In addition to these nodes, based on the memory references +(through accessors or USM pointers), additional nodes to `allocate`, +`copy` and `release` are created and they are necessary for the +computation kernels to run. The computation kernel has dependencies on the +memory objects and these dependencies are recorded as `event`s and in +our model we represent them as edges between the dependent nodes. + +Tools monitoring the event stream then can start capturing the asynchronous +task graph as it is being built. As dependencies are added to a command +object, the instrumentation emits these dependencies as `edge_create` +events. Each of these `edge_create`events encapsulate the two command +objects that have a dependency through this edge. The source object of this +edge event must complete execution first before the target object of the +edge can begin execution. + +To instrument this part of the code, the `Command::addDep` methods of +the Command object are instrumented to create the trace points and notify +all subscribers. + +The `Release` command, as implemented in the SYCL runtime, has a +reference to the memory object, but no explicit dependencies are created. To +model the edges correctly, we instrument the `waitForRecordToFinish` method in +the `Scheduler` where the release operation waits on all the +dependent operations to complete to capture the edges. + +This concludes all the changes that were made to the SYCL runtime to support +tracing. The next section talks about the XPTI framework that allows +applications and runtimes to efficiently capture, record and emit trace +notifications for important events during the run. + +# SYCL Tracing - Part II +The architecture of the XPTI, when described at a rudimentary level, allows +you to instrument any runtime or application and link it with the static +stub library that implements all the functions in the library. If the +tracing is enabled through the environment variable +`XPTI_TRACE_ENABLE=1`, the stub library checks to see if the framework +dispatcher is registered. This dispatcher is registered through an +environment variable `XPTI_FRAMEWORK_DISPATCHER=/path/to/libxptifw.so`. +If tracing is turned on and the dynamic loading of the framework dispatcher +is successful, then the proxy library creates a dispatch table for all the +trampoline functions used in the instrumentation of the application or +runtime. + +The static library is the only dependency for building the SYCL runtime (or +any application/runtime that uses this instrumentation mechanism) and is +currently available under `llvm/xpti`. In the current organization at +the file system level, the API specification for the instrumentation +framework is available with the static library under `llvm/xpti/include`. +The framework is divided into two parts: (1) the implementation of the proxy/ +stub library that will be compiled to create a static library to be linked +with SYCL runtime and (2) a dynamic library with can be registered as the +framework dispatcher and is not required by the SYCL runtime that is not a +part of the `llvm` project. The dynamic library depends on the API +specification that is a part of the static proxy library. Using the +specification in the `llvm/xpti`, an implementation of the dynamic +library can be built. + +## The Proxy library + +The proxy library implements all of the public functions that are a part of +the XPTI tracing infrastructure. Each function however is a stub that checks +to see if tracing has been enabled. If so, it forwards the call to the same +function in the framework dispatcher or the dynamic component of this +framework. + + ```c++ + XPTI_EXPORT_API xpti::result_t xptiInitialize(const char *stream, + uint32_t maj, + uint32_t min, + const char *version) { + // Static object g_loader will return true if + // XPTI_TRACE_ENABLE=1 and the dynamic dispatcher + // library in XPTI_FRAMEWORK_DISPATCHER is valid + // and successfully loaded. + // + if (xpti::g_loader.noErrors()) { + void *f = xpti::g_loader.functionByIndex(XPTI_INITIALIZE); + if (f) { + return (*(xpti_initialize_t)f)(stream, maj, min, version); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; + } + ``` diff --git a/xpti/include/xpti_data_types.h b/xpti/include/xpti_data_types.h new file mode 100644 index 0000000000000..91ef9afba6d35 --- /dev/null +++ b/xpti/include/xpti_data_types.h @@ -0,0 +1,543 @@ +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +#pragma once +#include +#include +#include +#include + +namespace xpti { +constexpr int invalid_id = -1; +constexpr uint8_t default_vendor = 0; + +/// @brief Flag values used by the payload_t structure to mark the information +/// present +/// @details When a payload is created, it is conceivable that only partial +/// information may be present and these flags are used to indicate the +/// available information. The hash generator will generate a hash based on the +/// flags set. +/// +enum class payload_flag_t { + NameAvailable = 1, ///< The name for the tracepoint is available + SourceFileAvailable = 2, ///< Source file information available + CodePointerAvailable = 4, ///< Code pointer VA is available + LineInfoAvailable = 8, ///< Line information available in the payload + ColumnInfoAvailable = 16, ///< Column information available in payload + HashAvailable = 2 << 16 ///< A hash is already available for this payload +}; + +// +// Helper macros for creating new tracepoint and +// event types +// +using trace_point_t = uint16_t; +using event_type_t = uint16_t; +using string_id_t = int32_t; + +using safe_flag_t = std::atomic; +using safe_uint64_t = std::atomic; +using safe_uint32_t = std::atomic; +using safe_uint16_t = std::atomic; +using safe_int64_t = std::atomic; +using safe_int32_t = std::atomic; +using safe_int16_t = std::atomic; +using metadata_t = std::unordered_map; + +#define XPTI_EVENT(val) xpti::event_type_t(val) +#define XPTI_TRACE_POINT_BEGIN(val) xpti::trace_point_t(val << 1 | 0) +#define XPTI_TRACE_POINT_END(val) xpti::trace_point_t(val << 1 | 1) + +#define XPTI_PACK08_RET16(value1, value2) ((value1 << 8) | value2) +#define XPTI_PACK16_RET32(value1, value2) ((value1 << 16) | value2) +#define XPTI_PACK32_RET64(value1, value2) (((uint64_t)value1 << 32) | value2) + +/// @brief Payload data structure that is optional for trace point callback API +/// @details The payload structure, if determined at compile time, can deliver +/// the source association of various parallel constructs defined by the +/// language. In the case it is defined, a lookup table will provide the +/// association from a kernel/lambda (address) to a payload and the same address +/// to a unique ID created at runtime. +/// +/// All instances of a kernel will be associated with the same unique ID through +/// the lifetime of an object. The hash maps that will be maintained would be: +/// # [unique_id]->[payload] +/// # [kernel address]->[unique_id] +/// +/// Unique_id MUST be propagated downstream to the OpenCL runtime to ensure the +/// associations back to the sources. This requires elp from the compiler +/// front-end. +/// +struct payload_t { + /// String ID of the function/kernel name; only valid if 'name' is valid + string_id_t name_sid = invalid_id; + /// String ID of the source file name + string_id_t source_file_sid = invalid_id; + /// Name of the trace point; graph, algorithm, lock names, for example. + const char *name = nullptr; + /// Absolute path of the source file; may have to to be unicode string + const char *source_file = nullptr; + /// Line number information to correlate the trace point + uint32_t line_no = invalid_id; + /// For a complex statement, column number may be needed to resolve the trace + /// point; currently none of the compiler builtins return a valid column no + uint32_t column_no = invalid_id; + /// Kernel/lambda/function address + const void *code_ptr_va = nullptr; + /// Internal bookkeeping slot - do not change. + uint64_t internal; + /// Flags indicating whether string name, codepointer, source file and hash + /// values are available + uint64_t flags = 0; + + payload_t() = default; + + // If the address of the kernel/function name is provided, we mark it as + // valid since we can potentially reconstruct the name and the source file + // information during post-processing step of symbol resolution; this + // indicates a partial but valid payload. + payload_t(void *codeptr) { + code_ptr_va = codeptr; + name_sid = invalid_id; ///< Invalid string ID + source_file_sid = invalid_id; ///< Invalid string ID + name = nullptr; ///< Invalid name string pointer + source_file = nullptr; ///< Invalid source file string pointer + line_no = invalid_id; ///< Invalid line number + column_no = invalid_id; ///< Invalid column number + flags = (uint64_t)payload_flag_t::CodePointerAvailable; + } + + // If neither an address or the fully identifyable source file name and + // location are not available, we take in the name of the + // function/task/user-defined name as input and create a hash from it. We + // mark it as valid since we can display the name in a timeline view, but the + // payload is considered to be a partial but valid payload. + payload_t(const char *func_name) { + code_ptr_va = nullptr; + name_sid = invalid_id; ///< Invalid string ID + source_file_sid = invalid_id; ///< Invalid string ID + name = func_name; ///< Invalid name string pointer + source_file = nullptr; ///< Invalid source file string pointer + line_no = invalid_id; ///< Invalid line number + column_no = invalid_id; ///< Invalid column number + flags = (uint64_t)(payload_flag_t::NameAvailable); + } + + payload_t(const char *func_name, void *codeptr) { + code_ptr_va = codeptr; + name_sid = invalid_id; ///< Invalid string ID + source_file_sid = invalid_id; ///< Invalid string ID + name = func_name; ///< Invalid name string pointer + source_file = nullptr; ///< Invalid source file string pointer + line_no = invalid_id; ///< Invalid line number + column_no = invalid_id; ///< Invalid column number + flags = (uint64_t)payload_flag_t::NameAvailable | + (uint64_t)payload_flag_t::CodePointerAvailable; + } + + // We need the payload to contain at the very least, the code pointer + // information of the kernel or function. In the full payload case, we will + // also have the function name and source file name along with the line and + // column number of the trace point that forms the payload. + payload_t(const char *kname, const char *sf, int line, int col, + void *codeptr) { + code_ptr_va = codeptr; + /// Invalid string ID as the string hasn't been registered yet + name_sid = invalid_id; + source_file_sid = invalid_id; + /// Capture the rest of the parameters + name = kname; + source_file = sf; + line_no = line; + column_no = col; + flags = (uint64_t)payload_flag_t::NameAvailable | + (uint64_t)payload_flag_t::SourceFileAvailable | + (uint64_t)payload_flag_t::LineInfoAvailable | + (uint64_t)payload_flag_t::ColumnInfoAvailable | + (uint64_t)payload_flag_t::CodePointerAvailable; + } +}; + +/// @brief Enumerator defining the global/basic trace point types +/// @details The frame work defines the global/basic trace point types +/// that are necessary for modeling parallel runtimes. A helper macro +/// provided to create the enum values as the LSB is reserved for +/// determining if the trace point is a 'begin' trace or an 'end' +/// trace. This reserved bit is used by the scoped_notify() class +/// to automatically send the closing enum trace type for a given +/// trace point type. +/// +/// The provided macros TRACE_POINT_BEGIN(val) and TRACE_POINT_END(val) +/// must be used in all user defined enums that are defined to extend +/// the trace point types. +/// +/// The trace_type data is of type uint8_t and the 7-LSB bits are used +/// to enumerate trace types. the MSB bit is reserved for user-defined +/// trace types and is set to 0 for predefined trace point types defined +/// by the framework. +/// +/// When user-defined trace types are being declared, a new ID is added +/// to this value to create a uint16_t data type. The LSB 8-bits have +/// the 8th bit set indicating that it is user-defined and the remaining +/// 7-bits will indicated the user defined trace point type. However, +/// since multiple tools or vendors could create their own trace point +/// types, we require the vendor_id to create a vendor namespace to avoid +/// collisions. +/// +/// user-defined bit +/// | +/// | +/// |+-----+---- 127 possible values for +/// || | defining trace types. +/// || | Due to the scope bit, +/// || | 63 unique scope types +/// || | can be defined. +/// vv v +/// Field width (uint16_t) |........|........| +/// 15 8 7 0 +/// ^ ^ +/// | | +/// | | +/// +------+----- Reserved for vendor ID +/// +enum class trace_point_type_t : uint16_t { + unknown_type = 0, + /// Indicates that a graph has been instantiated + graph_create = XPTI_TRACE_POINT_BEGIN(1), + /// Indicates that a new node object has been instantiated + node_create = XPTI_TRACE_POINT_BEGIN(2), + /// Indicates that a new edge object has been instantiated + edge_create = XPTI_TRACE_POINT_BEGIN(3), + /// Indicates the beginning of a parallel region + region_begin = XPTI_TRACE_POINT_BEGIN(4), + /// Indicates the end of a parallel region + region_end = XPTI_TRACE_POINT_END(4), + /// Indicates the begin of a task execution, the parent of which could be a + /// graph or a parallel region + task_begin = XPTI_TRACE_POINT_BEGIN(5), + /// Indicates the end of an executing task + task_end = XPTI_TRACE_POINT_END(5), + /// Indicates the begin of a barrier call + barrier_begin = XPTI_TRACE_POINT_BEGIN(6), + /// Indicates the end of a barrier + barrier_end = XPTI_TRACE_POINT_END(6), + /// Similar to barrier begin, but captures the information for a lock + lock_begin = XPTI_TRACE_POINT_BEGIN(7), + /// Similar to barrier end, but captures the information for a lock + lock_end = XPTI_TRACE_POINT_END(7), + /// Use to model triggers (impulse) at various points in time - will not have + /// an end equivalent + signal = XPTI_TRACE_POINT_BEGIN(8), + /// Used to model the data transfer initiation from device A to device B + transfer_begin = XPTI_TRACE_POINT_BEGIN(9), + /// Used to model the completion of a previously initiated data transfer + /// event + transfer_end = XPTI_TRACE_POINT_END(9), + /// Is present for completeness to capture the spawning of new threads in a + /// runtime + thread_begin = XPTI_TRACE_POINT_BEGIN(10), + /// Models the end of the lifetime of a thread + thread_end = XPTI_TRACE_POINT_END(10), + /// Models the explicit barrier begin in SYCL + wait_begin = XPTI_TRACE_POINT_BEGIN(11), + /// Models the explicit barrier end in SYCL + wait_end = XPTI_TRACE_POINT_END(11), + /// Indicates that the trace point is user defined and only the tool defined + /// for a stream will be able to handle it + user_defined = 1 << 7 +}; + +/// @brief Enumerator defining the global/basic trace event types +/// @details The frame work defines the global/basic trace event types that are +/// necessary for modeling parallel runtimes. +/// +/// The event_type data is of type uint8_t and the 7-LSB bits are used to +/// enumerate event types. the MSB bit is reserved for user-defined event types +/// and is set to 0 for predefined event types defined by the framework. +/// +/// When user-defined event types are being declared, a new ID is added to this +/// value to create a uint16_t data type. The LSB 8-bits have the 8th bit set +/// indicating that it is user-defined and the remaining 7-bits will indicated +/// the user defined trace event type. However, since multiple tools or vendors +/// could create their own trace event types, we require the vendor_id to +/// create a vendor namespace to avoid collisions. +/// +/// user-defined bit +/// | +/// | +/// |+-----+---- 127 possible values for +/// || | defining event types. +/// || | +/// || | +/// || | +/// vv v +/// Field width (uint16_t) |........|........| +/// 15 8 7 0 +/// ^ ^ +/// | | +/// | | +/// +------+----- Reserved for vendor ID +enum class trace_event_type_t : uint16_t { + /// In this case, the callback can choose to map it to something called + /// unknown or ignore it entirely + unknown_event = 0, + /// Event type is graph - usually reported for traces from graph or for + /// graph, node or edge object creation + graph = XPTI_EVENT(1), + /// Algorithm type describes a parallel algorithm such as a parallel_for + algorithm = XPTI_EVENT(2), + /// Barrier event is usually a synchronization type that causes threads to + /// wait until something happens and found in parallel algorithms and explicit + /// synchronization use cases in asynchronous programming + barrier = XPTI_EVENT(3), + /// Activity in the scheduler that is not useful work is reported as this + /// event type + scheduler = XPTI_EVENT(4), + /// Asynchronous activity event + async = XPTI_EVENT(5), + /// Synchronization event - only the contention time is captured by this + /// event and marked as overhead + lock = XPTI_EVENT(6), + /// Indicates that the current event is an offload read request + offload_read = XPTI_EVENT(7), + /// Indicates that the current event is an offload write request + offload_write = XPTI_EVENT(8), + /// User defined event for extensibility and will have to be registered by + /// the tool/runtime + user_defined = 1 << 7 +}; + +enum class trace_activity_type_t { + /// Activity type is unknown; it is upto the collector handling the callback + /// to mark it as needed + unknown_activity = 0, + /// Any activity reported by the tracing that results in useful work, hence + /// active time + active = 1, + /// Activity that was primarily due to overheads such as time spent in + /// barriers and schedulers, acquiring locks, etc + overhead = 1 << 1, + /// Activities that may be considered as background tasks; for example, + /// asynchronous activities or region callbacks that are placeholders for + /// nested activities + background = 1 << 2, + /// Explicit sleeps could be a result of calling APIs that result in zero + /// active time + sleep_activity = 1 << 3 +}; + +struct reserved_data_t { + /// Has a reference to the associated payload field for an event + payload_t *payload; + /// Has additional metadata that may be defined by the user as key-value + /// pairs + metadata_t metadata; +}; + +struct trace_event_data_t { + /// Unique id that corresponds to an event type or event group type + int64_t unique_id = invalid_id; + /// Data ID: ID that tracks the data elements streaming through the algorithm + /// (mostly graphs; will be the same as instance_id for algorithms) + uint64_t data_id = 0; + /// Instance id of an algorithm with id=unique_id + uint64_t instance_id = 0; + /// The type of event + uint16_t event_type; + /// How this event is classified: active, overhead, barrier etc + uint16_t activity_type; + /// Unused 32-bit slot that could be used for any ids that need to be + /// propagated in the future + uint32_t unused; + /// If event_type is "graph" and trace_type is "edge_create", then the source + /// ID is set + int64_t source_id = invalid_id; + /// If event_type is "graph" and trace_type is "edge_create", then the target + /// ID is set + int64_t target_id = invalid_id; + /// A reserved slot for memory growth, if required by the framework + reserved_data_t reserved; + /// User defined data, if required; owned by the user shared object and will + /// not be deleted when event data is destroyed + void *user_data = nullptr; +}; + +/// +/// The error code list is incomplete and still +/// being defined. +/// +enum class result_t : int32_t { + // Success codes here (values >=0) + XPTI_RESULT_SUCCESS = int32_t(0), + XPTI_RESULT_FALSE = int32_t(1), + // Error codes here (values < 0) + XPTI_RESULT_FAIL = int32_t(0x80004001), + XPTI_RESULT_NOTIMPL = int32_t(0x80004002), + XPTI_RESULT_DUPLICATE = int32_t(0x80004003), + XPTI_RESULT_NOTFOUND = int32_t(0x80004004), + XPTI_RESULT_UNDELETE = int32_t(0x80004005), + XPTI_RESULT_INVALIDARG = int32_t(0x80004006) +}; + +// These defines are present to enable plugin developers +// who want to subscribe to the streams from the framework +// +#if defined(_WIN64) || defined(_WIN32) /* Windows */ +#ifdef XPTI_CALLBACK_API_EXPORTS +#define XPTI_CALLBACK_API __declspec(dllexport) +#else +#define XPTI_CALLBACK_API __declspec(dllimport) +#endif +#else /* Generic Unix/Linux */ +#ifdef XPTI_CALLBACK_API_EXPORTS +#define XPTI_CALLBACK_API __attribute__((visibility("default"))) +#else +#define XPTI_CALLBACK_API +#endif +#endif +/// @brief Callback function prototype +/// @details All callback functions that are registered with +/// the tracing framework have this signature. +/// +/// @param [in] trace_type The trace type for which this callback has been +/// invoked. +/// @param [in] parent Parent object for which the current object/trace is a +/// child of. If the current trace is not nested, the parent object will be +/// NULL. +/// @param [in] child Child object for this callback has been invoked. +/// @param [in] user_data Data sent by the caller which can be anything and the +/// tool trying to interpret it needs to know the type for the handshake to be +/// successful. Most of the time, this field is used to send in const char * +/// data. +typedef void (*tracepoint_callback_api_t)(uint16_t trace_type, + xpti::trace_event_data_t *parent, + xpti::trace_event_data_t *child, + uint64_t instance, + const void *user_data); +typedef void (*plugin_init_t)(unsigned int, unsigned int, const char *, + const char *); +typedef void (*plugin_fini_t)(const char *); + +constexpr uint16_t trace_task_begin = + static_cast(xpti::trace_point_type_t::task_begin); +constexpr uint16_t trace_task_end = + static_cast(xpti::trace_point_type_t::task_end); +constexpr uint16_t trace_wait_begin = + static_cast(xpti::trace_point_type_t::wait_begin); +constexpr uint16_t trace_wait_end = + static_cast(xpti::trace_point_type_t::wait_end); +constexpr uint16_t trace_barrier_begin = + static_cast(xpti::trace_point_type_t::barrier_begin); +constexpr uint16_t trace_barrier_end = + static_cast(xpti::trace_point_type_t::barrier_end); +constexpr uint16_t trace_graph_create = + static_cast(xpti::trace_point_type_t::graph_create); +constexpr uint16_t trace_node_create = + static_cast(xpti::trace_point_type_t::node_create); +constexpr uint16_t trace_edge_create = + static_cast(xpti::trace_point_type_t::edge_create); +constexpr uint16_t trace_signal = + static_cast(xpti::trace_point_type_t::signal); + +constexpr uint16_t trace_graph_event = + static_cast(xpti::trace_event_type_t::graph); +constexpr uint16_t trace_algorithm_event = + static_cast(xpti::trace_event_type_t::algorithm); +} // namespace xpti + +using xpti_tp = xpti::trace_point_type_t; +using xpti_te = xpti::trace_event_type_t; +using xpti_at = xpti::trace_activity_type_t; +using xpti_td = xpti::trace_event_data_t; + +extern "C" { +/// @brief The framework loads the tool which implements xptiTraceInit() and +/// calls it when the runtime is being initialized +/// @details When tools implement callbacks and want to register them with +/// the runtime, they must implement the xptiTraceInit() and xptiTraceFinish() +/// functions and the runtime will try to resolve these symbols on load. +/// xptiTraceInit() is then called by the runtime so that the tool knows when +/// the runtime is instantiated so it can register its callbacks in the +/// xptiTraceInit() function. +/// +/// When the runtime calls the tool's implementation of the xptiTraceInit() +/// function, it also provides the version of the runtime that is invoking the +/// init call. This allows tools implementers to handle certain calls based on +/// the runtime version the tools supports. +/// +/// @code +/// void XPTI_CALLBACK_API xptiTraceInit +/// ( +/// unsigned int maj, +/// unsigned int min, +/// const char *version, +/// const char *stream_name +/// ) +/// { +/// std::string v = version; // make a copy of the version string +/// if(maj < 3) { +/// // do something here like registering callbacks +/// g_stream_id = xptiRegisterStream(stream_name); +/// xptiRegisterCallback(g_stream_id, graph_create, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, node_create, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, edge_create, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, region_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, region_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, task_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, task_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, barrier_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, barrier_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, lock_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, lock_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, transfer_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, transfer_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, thread_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, thread_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, wait_begin, trace_point_begin); +/// xptiRegisterCallback(g_stream_id, wait_end, trace_point_end); +/// xptiRegisterCallback(g_stream_id, signal, trace_point_begin); +/// } else { +/// // report incompatible tool error message +/// } +/// } +/// @endcode +/// +/// @param [in] major_version The major version of the runtime +/// @param [in] minor_version The minor version of the runtime. if the version +/// consists a tertiary number, it will not be reported. For example, if we have +/// a version number 5.1.23776, the only 5 and 1 we be reported for major and +/// minor versions. The API assumes that semantic versioning is being used for +/// the runtime/application. +/// +/// @see https://semver.org/ Major revision number change will break API +/// compatibility. Minor revision number change will always be backward +/// compatible, but may contain additional functionality. +/// +/// @param [in] version_str Null terminated version string. This value is +/// guaranteed to be valid for the duration of the xptiTraceInit() call. +/// @param [in] stream_name Null terminated string indicating the stream name +/// that is invoking this xptiTraceInit() call. This value is guaranteed to be +/// valid for the duration of the xptiTraceInit() call. +/// @return none +XPTI_CALLBACK_API void xptiTraceInit(unsigned int major_version, + unsigned int minor_version, + const char *version_str, + const char *stream_name); + +/// @brief Function to handle unloading of the module or termination of +/// application +/// @details This function will get called when the application +/// or the runtime implementing the trace point is about to be +/// unloaded or terminated. +/// +/// @param [in] stream_name Null terminated string indicating the stream name +/// that is invoking this xptiTraceFinish() call. This value is guaranteed to be +/// valid for the duration of the xptiTraceFinish() call. The subscriber who has +/// subscribed to this stream can now free up all internal data structures and +/// memory that has been allocated to manage the stream data. +XPTI_CALLBACK_API void xptiTraceFinish(const char *stream_name); +} diff --git a/xpti/include/xpti_trace_framework.h b/xpti/include/xpti_trace_framework.h new file mode 100644 index 0000000000000..c91294acd12e8 --- /dev/null +++ b/xpti/include/xpti_trace_framework.h @@ -0,0 +1,414 @@ +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +#pragma once +#include "xpti_data_types.h" + +#if defined(XPTI_STATIC_LIBRARY) +// If we are building or using the proxy +// static library we don't export any symbols +// +#define XPTI_EXPORT_API +#else +#if defined(_WIN64) || defined(_WIN32) /* Windows */ +#ifdef XPTI_API_EXPORTS +#define XPTI_EXPORT_API __declspec(dllexport) +#else +#define XPTI_EXPORT_API __declspec(dllimport) +#endif +#else /* Generic Unix/Linux */ +#ifdef XPTI_API_EXPORTS +#define XPTI_EXPORT_API __attribute__((visibility("default"))) +#else +#define XPTI_EXPORT_API +#endif +#endif +#endif + +#define XPTI_EXTRACT_USER_DEFINED_ID(val) ((uint16_t)val & 0x007f) +#define XPTI_TOOL_ID(val) (((uint16_t)val >> 8) & 0x00ff) + +extern "C" { + +/// @brief Initialization function that is called when a new stream is generated +/// @details When a runtime or application that uses XPTI instrumentation API +/// starts to generate a new stream, a call to xptiInitialize() must be made to +/// let all subscribers know that a new stream is being generated. If the +/// subscribers are interested in this stream, they can the choose to subscribe +/// to the stream. +/// @param stream Name of the stream, for example "sycl", "opencl" etc +/// @param maj Major version number +/// @param min Minor version number +/// @param version Full version as a string +/// @return None +XPTI_EXPORT_API xpti::result_t xptiInitialize(const char *stream, uint32_t maj, + uint32_t min, + const char *version); + +/// @brief Finalization function that is called when a stream halted +/// @details When a runtime or application that uses XPTI instrumentation API +/// stops generating the stream, a call to xptiFinalize() must be made to let +/// all subscribers know that the stream identified by 'stream' has stopped +/// generating events. If the subscribers are registered to receive events from +/// this stream, they can choose to unsubscribe from the stream or handle the +/// situation when the stream stop sending events. +/// @param stream Name of the stream, for example "sycl", "opencl" etc +/// @return None +XPTI_EXPORT_API void xptiFinalize(const char *stream); + +/// @brief Generates a unique ID +/// @details When a tool is subscribing to the event stream and wants to +/// generate task IDs that do not collide with unique IDs currently being +/// generated for nodes, edges and graphs, this API can be used. Any time, a +/// task that represents the instance of a node executing on the device is being +/// traced, the event ID corresponds to the unique ID of the node it represents +/// and in order to disambiguate the instances, a task ID can be generated and +/// sent as the instance ID for that task. +XPTI_EXPORT_API uint64_t xptiGetUniqueId(); + +/// @brief Register a string to the string table +/// @details All strings in the XPTI framework are referred to by their string +/// IDs and this method allow you to register a string and get the string ID for +/// it. In addition to the string ID, a reference to the string in the string +/// table is also returned. This lifetime of this string reference is equal to +/// the lifetime of the XPTI framework. +/// @param string The string to be registered with the string table. If the +/// string already exists in the string table, the previous ID is returned along +/// with the reference to the string in the string table. +/// @param table_string A reference to the string in the string table. This +/// string reference is guaranteed to be valid for the lifetime of the XPTI +/// framework. +/// @return The string ID of the string being registered. If an error occurs +/// during registration, xpti::invalid_id is returned. +XPTI_EXPORT_API xpti::string_id_t xptiRegisterString(const char *string, + char **table_string); + +/// @brief Lookup a string in the string tablewith its string ID +/// @details All strings in the XPTI framework are referred to by their string +/// IDs and this method allows you to lookup a string by its string ID. The +/// lifetime of the returned string reference is equal to the lifetime of the +/// XPTI framework. +/// @param id The string ID of the string to lookup. +/// @return A reference to the string identified by the string ID. +XPTI_EXPORT_API const char *xptiLookupString(xpti::string_id_t id); + +/// @brief Register a stream by its name and get a stream ID +/// @details When events in a given stream have to be notified to the +/// subscribers, the stream ID to which the events belong to is required. This +/// method will register a stream by its name and return an ID that can be used +/// for notifications. +/// @param stream_name The stream name that needs to be registered. +/// @return The stream ID. If the stream has already been registered, the +/// previously generated stream ID is returned. +XPTI_EXPORT_API uint8_t xptiRegisterStream(const char *stream_name); + +/// @brief Unregister a stream by its name +/// @details Unregistering a stream will invalidate the stream ID associated +/// with it by calling xptiFinalize() on all subscribers registered to this +/// stream and disabling all registered callbacks for this stream. +/// @param stream_name The stream name that needs to be unregistered. +/// @return The result code is XPTI_RESULT_SUCCESS when successful and +/// XPTI_RESULT_NOTFOUND if the stream is not found. +XPTI_EXPORT_API xpti::result_t xptiUnregisterStream(const char *stream_name); + +/// @brief Registers a user defined trace point +/// @details The framework allows applications or runtimes using the framework +/// to extend the pre-defined tracepoint types. In order to facilitate this, a +/// tool name must be provided. This allows multiple vendors to instrument and +/// extend different software modules and have them behave well when put +/// together. However, the tool_name must be unique for this to behave well. +/// +/// @code +/// typedef enum { +/// my_tp_extn1_begin = XPTI_TRACE_POINT_BEGIN(0), +/// my_tp_extn1_end = XPTI_TRACE_POINT_END(0), +/// my_tp_extn2_begin = XPTI_TRACE_POINT_BEGIN(1), +/// my_tp_extn2_end = XPTI_TRACE_POINT_END(1) +/// }tp_extension_t; +/// ... +/// uint16_t tp1_start = xptiRegisterUserDefinedTracePoint("myTest", +/// my_tp_extn1_begin); +/// uint16_t tp1_end = xptiRegisterUserDefinedTracePoint("myTest", +/// my_tp_extn1_end); +/// uint16_t tp2_start = xptiRegisterUserDefinedTracePoint("myTest", +/// my_tp_extn2_begin); +/// uint16_t tp2_end = xptiRegisterUserDefinedTracePoint("myTest", +/// my_tp_extn2_end); +/// ... +/// xptiNotifySubscribers(stream_id, tp1_start, parent, event, instance, +/// nullptr); +/// @endcode +/// +/// @param tool_name The tool name that is extending tracepoint types for its +/// use. +/// @param user_defined_tp The user defined tracepoint is a value ranging from +/// 0-127, which would allow vendors to create 64 pairs of tracepoints. +/// @return The result code is XPTI_RESULT_SUCCESS when successful and +/// XPTI_RESULT_NOTFOUND if the stream is not found. +XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedTracePoint( + const char *tool_name, uint8_t user_defined_tp); + +/// @brief Registers a user defined event type +/// @details The framework allows applications or runtimes using the framework +/// to extend the pre-defined event types. In order to facilitate this, a +/// tool name must be provided. This allows multiple vendors to instrument and +/// extend different software modules and have them behave well when put +/// together. However, the tool_name must be unique for this to behave well. +/// +/// @code +/// typedef enum { +/// my_ev_extn1 = XPTI_EVENT(0), +/// my_ev_extn2 = XPTI_EVENT(1) +/// } event_extension_t; +/// ... +/// uint16_t my_ev1 = xptiRegisterUserDefinedEventType("myTest", my_ev_extn1); +/// uint16_t my_ev2 = xptiRegisterUserDefinedEventType("myTest", my_ev_extn2); +/// ... +/// uint64_t InstanceNo; +/// MyEvent = xptiMakeEvent("application_foo", &Payload, +/// my_ev1, xpti::trace_activity_type_t::active, +/// &InstanceNo); +/// @endcode +/// +/// In order for an notification to be received for such an event, a callback +/// must be registered. +/// +/// @param tool_name The tool name that is extending tracepoint types for its +/// use. +/// @param user_defined_event The user defined event is a value ranging +/// from 0-127, which would allow vendors to create 127 new events under +/// tool_name. +/// @return The result code is XPTI_RESULT_SUCCESS when successful and +/// XPTI_RESULT_NOTFOUND if the stream is not found. +XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedEventType( + const char *tool_name, uint8_t user_defined_event); + +/// @brief Creates a trace point event +/// @details When the application or runtime wants to instrument interesting +/// sections of the code, they can create trace point events that represent +/// these sections and use the created event to notify subscribers that such an +/// event ocurred. Each created event will have a unique ID. If the same payload +/// is provided to the xptiMakeEvent() function, the same trace event is +/// returned after looking up the invariant information in the payload +/// parameter. If the unique ID or the event itself has been cached, there will +/// be no lookup costs. However, if they are not cached, the same payload is +/// provded each time the section is encountered and the event that has been +/// created previously will be returned. This will however incur a lookup cost +/// and it is recommended that this be avoided to keep the instrumentation +/// overheads minimal. +/// +/// @code +/// uint64_t InstanceNo; +/// trace_event_data_t *MyEvent; +/// xpti::payload_t Payload("foo", "foo.cpp", 100, 0, (void *)this); +/// MyEvent = xptiMakeEvent("foo", &Payload, +/// xpti::trace_event_type_t::algorithm, +/// xpti::trace_activity_type_t::active, +/// &InstanceNo); +/// +/// // Cache MyEvent locally so it can be used the next time around by +/// // avoiding a lookup +/// @endcode +/// +/// @param name The name of the event, typically the function name or kernel +/// name, etc +/// @param payload The payload that uniquely describes the trace point which can +/// be done by the function name, source file name and line number within the +/// source file and the address of the function, for example. +/// @param event The event type of the current trace event being created, as in +/// is it a graph event or an algorithm event, etc. +/// @param activity The activity type for the event - as in active, background, +/// overhead etc. +/// @param instance_no This value is returned by the framework and represents +/// the instance number of this event. If the same event is attempted to be +/// created again, the instance ID give you an indication of how many times this +/// section has been visited. +/// @return The trace event representing the section's payload is returned. +XPTI_EXPORT_API xpti::trace_event_data_t * +xptiMakeEvent(const char *name, xpti::payload_t *payload, uint16_t event, + xpti::trace_activity_type_t activity, uint64_t *instance_no); + +/// @brief Retrieves a trace event given the unique id of the event +/// @details If the unique ID of a trace event is cached, this function allows +/// you to query the framework for the trace event data structure. +/// +/// @param uid The unique ID of the event for which the lookup needs to be +/// performed +/// @return The trace event with unique ID equal to uid. If the unique ID is not +/// present, then nullptr will be returned. +XPTI_EXPORT_API const xpti::trace_event_data_t *xptiFindEvent(int64_t uid); + +/// @brief Retrieves the payload information associated with an event +/// @details An event encapsulates the unique payload it represents and this +/// function allows you to query the payload with the trace event data pointer. +/// +/// @param lookup_object The trace event object for which the payload +/// information must be retrieved. +/// @return The payload data structure pointer for the event. +XPTI_EXPORT_API const xpti::payload_t * +xptiQueryPayload(xpti::trace_event_data_t *lookup_object); + +/// @brief Registers a callback for a trace point type +/// @details Subscribers receive notifications to the trace point types they +/// register a callback with. This function allows subscribers to register the +/// same or different callback with all trace point types. +/// +/// @param stream_id The stream for which the registration is requested +/// @param trace_type The trace point type for which the registration must be +/// made. For example, you can register a different call back for +/// xpti::trace_point_type_t::task_begin and xpti::trace_point_type_t::task_end. +/// @param cb The callback function who's signature is of the type +/// xpti::tracepoint_callback_api_t +/// @return The result code which can be one of: +/// 1. XPTI_RESULT_SUCCESS when the registration is successful +/// 2. XPTI_RESULT_DUPLICATE when the callback function has already +/// been registered for the stream and trace point type +/// 3. XPTI_RESULT_UNDELETE when the registration is for a callback +/// that had been previously unregistered. +XPTI_EXPORT_API xpti::result_t +xptiRegisterCallback(uint8_t stream_id, uint16_t trace_type, + xpti::tracepoint_callback_api_t cb); + +/// @brief Unregisters a previously registered callback for a trace point type +/// @details Subscribers receive notifications to the trace point types they +/// register a callback with. This function allows subscribers to unregister +/// any previously registered callback functions with this function so they can +/// stop receiving notifications. +/// +/// @param stream_id The stream for which the registration must be disabled +/// @param trace_type The trace point type for which the registration must be +/// disabled. +/// @param cb The callback function who's signature is of the type +/// xpti::tracepoint_callback_api_t and must be disabled. +/// @return The result code which can be one of: +/// 1. XPTI_RESULT_SUCCESS when the unregistration is successful +/// 2. XPTI_RESULT_DUPLICATE when the callback function has already +/// been disabled for the stream and trace point type +/// 3. XPTI_RESULT_NOTFOUND if the callbackhas not been previously +/// registered. +XPTI_EXPORT_API xpti::result_t +xptiUnregisterCallback(uint8_t stream_id, uint16_t trace_type, + xpti::tracepoint_callback_api_t cb); + +/// @brief Notifies all registered subscribers that an event has occurred +/// @details Subscribers receive notifications to the trace point types they +/// register a callback with. This function allows subscribers to unregister +/// any previously registered callback functions with this function so they can +/// stop receiving notifications. +/// +/// @param stream_id The stream for which the registration must be disabled +/// @param trace_type The trace point type for which the notification is being +/// sent out +/// @param parent The parent trace event type for the current event. If none +/// exist, this can be nullptr. +/// @param object The event object for which the notification must be sent out. +/// @param instance The instance number of the current event and this value is +/// guaranteed to be static for the duration of the callback handler. +/// @param temporal_user_data This is the field where each tool can send in some +/// state information and the handshake of the type of this data type must be +/// handled by extending tracepoint types that handle diffent types od user +/// data. +/// @return The result code which can be one of: +/// 1. XPTI_RESULT_SUCCESS when the notification is successful +XPTI_EXPORT_API xpti::result_t +xptiNotifySubscribers(uint8_t stream_id, uint16_t trace_type, + xpti::trace_event_data_t *parent, + xpti::trace_event_data_t *object, uint64_t instance, + const void *temporal_user_data); + +/// @brief Associates pairs with an event +/// @details If the instrumentation embedded in applications need to send +/// additional metadata to the framwork and eventually the subscribers, this +/// function can be used. The metadata is of the form of pairs and +/// are only of string types. Internall, the data is represented as +/// pairs of string IDs, so when one queries the metadata, they must look up the +/// value's string ID. +/// +/// @param e The event for which the metadata is being added +/// @param key The key that identifies the metadata as a string +/// @param value The value for the key as a string +/// @return The result code which can be one of: +/// 1. XPTI_RESULT_SUCCESS when the add is successful +/// 2. XPTI_RESULT_INVALIDARG when the inputs are invalid +/// 3. XPTI_RESULT_DUPLICATE when the key-value pair already exists +XPTI_EXPORT_API xpti::result_t xptiAddMetadata(xpti::trace_event_data_t *e, + const char *key, + const char *value); + +/// @brief Query the metadata table for a given event +/// @details In order to retrieve metadata information for a given event, you +/// must get the metadata tables and perform your queries on this table. +/// @code +/// auto m = xptiQueryMetadata(event); +/// // Example of printing all the metadata contents +/// for( auto &md : m ) { +/// printf("++ %20s:%s\n", xptiLookupString(md.first), +/// xptiLookupString(md.second)); +/// } +/// // Here's an example of a query on the table +/// char *table_string; +/// xpti::string_id_t key_id = xptiRegisterString("myKey", &table_string); +/// auto index = m.find(key_id); +/// if(index != m.end()) { +/// // Retrieve the value +/// const char *value = xptiLookupString((*index).second); +/// } +/// @endcode +/// +/// @param e The event for which the metadata is being requested +/// @return The metadata table of type xpti::metadata_t * +XPTI_EXPORT_API xpti::metadata_t * +xptiQueryMetadata(xpti::trace_event_data_t *e); + +/// @brief Returns a bool that indicates whether tracing is enabled or not +/// @details If the tracing is enabled by the XPTI_TRACE_ENABLE=1 environment +/// variable, a valid dispatcher for dispatching calls to the framework and if +/// there exists one or more valid subscribers, then this function will return +/// true, else false +/// @return bool that indicates whether it is enabled or not +XPTI_EXPORT_API bool xptiTraceEnabled(); + +/// @brief Resets internal state +/// @details This method is currently ONLY used by the tests and is NOT +/// recommended for use in the instrumentation of applications or runtimes. +/// The proxy/stub library does not implement this function. +XPTI_EXPORT_API void xptiReset(); + +/// @brief Force sets internal state to trace enabled +/// @details This method is currently ONLY used by the tests and is NOT +/// recommended for use in the instrumentation of applications or runtimes. +/// The proxy/stub library does not implement this function. +XPTI_EXPORT_API void xptiForceSetTraceEnabled(bool yesOrNo); + +typedef xpti::result_t (*xpti_initialize_t)(const char *, uint32_t, uint32_t, + const char *); +typedef void (*xpti_finalize_t)(const char *); +typedef uint64_t (*xpti_get_unique_id_t)(); +typedef xpti::string_id_t (*xpti_register_string_t)(const char *, char **); +typedef const char *(*xpti_lookup_string_t)(xpti::string_id_t); +typedef uint8_t (*xpti_register_stream_t)(const char *); +typedef xpti::result_t (*xpti_unregister_stream_t)(const char *); +typedef uint16_t (*xpti_register_user_defined_tp_t)(const char *, uint8_t); +typedef uint16_t (*xpti_register_user_defined_et_t)(const char *, uint8_t); +typedef xpti::trace_event_data_t *(*xpti_make_event_t)( + const char *, xpti::payload_t *, uint16_t, xpti::trace_activity_type_t, + uint64_t *); +typedef const xpti::trace_event_data_t *(*xpti_find_event_t)(int64_t); +typedef const xpti::payload_t *(*xpti_query_payload_t)( + xpti::trace_event_data_t *); +typedef xpti::result_t (*xpti_register_cb_t)(uint8_t, uint16_t, + xpti::tracepoint_callback_api_t); +typedef xpti::result_t (*xpti_unregister_cb_t)(uint8_t, uint16_t, + xpti::tracepoint_callback_api_t); +typedef xpti::result_t (*xpti_notify_subscribers_t)( + uint8_t, uint16_t, xpti::trace_event_data_t *, xpti::trace_event_data_t *, + uint64_t instance, const void *temporal_user_data); +typedef xpti::result_t (*xpti_add_metadata_t)(xpti::trace_event_data_t *, + const char *, const char *); +typedef xpti::metadata_t *(*xpti_query_metadata_t)(xpti::trace_event_data_t *); +typedef bool (*xpti_trace_enabled_t)(); +} diff --git a/xpti/include/xpti_trace_framework.hpp b/xpti/include/xpti_trace_framework.hpp new file mode 100644 index 0000000000000..65a030791f711 --- /dev/null +++ b/xpti/include/xpti_trace_framework.hpp @@ -0,0 +1,321 @@ +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +#pragma once +#include +#include + +#include "xpti_data_types.h" +#include "xpti_trace_framework.h" + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#include +typedef HINSTANCE xpti_plugin_handle_t; +typedef FARPROC xpti_plugin_function_t; +#define XPTI_PLUGIN_STRING "*.dll" +#define XPTI_PATH_SEPARATOR "\\" +// Windows does not have PATH_MAX defined or is inconsistent; Documentation +// suggests that 32767 is the max string length of environment variables on +// Windows +constexpr auto WIN_PATH_MAX = 32767; +#else // Linux and MacOSX +#include +#include +#include +#include + +typedef void *xpti_plugin_handle_t; +typedef void *xpti_plugin_function_t; +#define XPTI_PATH_SEPARATOR "/" + +#if defined(__unix__) // Linux +#define XPTI_PLUGIN_STRING "*.so" +#elif defined(__APPLE__) // Mac +#define XPTI_PLUGIN_STRING "*.dylib" +#endif +#endif + +/// Insert something when compiled with msvc +/// https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros +#ifdef _MSC_VER +#define __XPTI_INSERT_IF_MSVC(x) x +#else +#define __XPTI_INSERT_IF_MSVC(x) +#endif + +namespace xpti { +namespace utils { + +class StringHelper { +public: + template std::string addressAsString(T address) { + std::stringstream ss; + ss << std::hex << address; + return ss.str(); + } + + template + std::string nameWithAddress(const char *prefix, T address) { + std::string coded_string; + + if (prefix) + coded_string = prefix; + else + coded_string = "unknown"; + + coded_string += "[" + addressAsString(address) + "]"; + return coded_string; + } + + template + std::string nameWithAddress(std::string &prefix, T address) { + std::string coded_string; + if (!prefix.empty()) + coded_string = prefix + "[" + addressAsString(address) + "]"; + else + coded_string = "unknown[" + addressAsString(address) + "]"; + + return coded_string; + } + + std::string nameWithAddressString(const char *prefix, std::string &address) { + std::string coded_string; + + if (prefix) + coded_string = prefix; + else + coded_string = "unknown"; + + coded_string += "[" + address + "]"; + return coded_string; + } + + std::string nameWithAddressString(const std::string &prefix, + std::string &address) { + std::string coded_string; + ; + if (!prefix.empty()) + coded_string = prefix + "[" + address + "]"; + else + coded_string = "unknown[" + address + "]"; + + return coded_string; + } +}; + +class PlatformHelper { +public: + /// @brief Retrieves the last error and represents it as a std::string + /// @details This function is a platform independent abstraction for + /// retrieving the last error that was captured. + /// + /// @return The last error logged in the system + /// + std::string getLastError() { + std::string error; +#if defined(_WIN32) || defined(_WIN64) + DWORD err = GetLastError(); + LPVOID msgBuff; + size_t size = FormatMessageA( + FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS, + NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&msgBuff, + 0, NULL); + + if (size) { + LPCSTR msgStr = (LPCSTR)msgBuff; + error = std::string(msgStr, msgStr + size); + } + LocalFree(msgBuff); +#else + const char *err_string = dlerror(); + if (err_string) + error = err_string; +#endif + return error; + } + + std::string getEnvironmentVariable(const std::string &var) { + // Previous code that used a secure API for getting the environment variable + // and was replaced with a C++11 std::getenv for simplicity. However, if + // environments require the use of the suggested API from Microsoft, the + // code segment below may be un-commented and current std::getenv() code + // block encapsulated in the #else clause. + // + // #if defined(_WIN32) || defined(_WIN64) + // // Implementation that uses the secure versions of the API to get an + // // environment variable. + // char *valuePtr = nullptr; + // size_t length; + // errno_t error = _dupenv_s(&valuePtr, &length, var.c_str()); + // // Variable doesn't exist + // if (error) + // return ""; + // // If the variable exists, then get the value into a temporary copy + // std::string retValue = valuePtr; + // free(valuePtr); + // return retValue; + // #else + + // Remove irrelevant deprecation warning in this context in case /sdl option + // is used to provide more information on deprecated API + // https://docs.microsoft.com/en-us/cpp/c-runtime-library/security-features-in-the-crt + // https://docs.microsoft.com/en-us/cpp/build/reference/sdl-enable-additional-security-checks + __XPTI_INSERT_IF_MSVC(__pragma(warning(suppress : 4996))) + const char *val = std::getenv(var.c_str()); + return val ? val : ""; + // #endif + } + + /// @brief Finds a function defined by symbol in a shared object or DLL + /// @details This function is a platform independent abstraction for finding + /// a symbol in a shared object of DLL. If successful, it will return a + /// valid function pointer. + /// + /// @param [in] h The plugin handle which is required to make this + /// query + /// @param [in] symbol The symbol that needs to be searched within the + /// shared object or DLL represented by the handle h + /// + /// @return If the query is successful, the function returns + /// a valid address + /// @return nullptr If the query fails, nullptr is returned + /// + xpti_plugin_function_t findFunction(xpti_plugin_handle_t h, + const char *symbol) { + xpti_plugin_function_t func = nullptr; + if (h && symbol) { +#if defined(_WIN32) || defined(_WIN64) + func = GetProcAddress(h, symbol); +#else + func = dlsym(h, symbol); +#endif + } + return func; + } + + /// @brief Loads a shared object or DLL and returns a plugin handle + /// @details This function is a platform independent abstraction for loading + /// a DLL or shared object and returns a valid plugin handle if successful. + /// + /// @param [in] h The plugin handle which is required to make this + /// query + /// @param [in] symbol The symbol that needs to be searched within the + /// shared object or DLL represented by the handle h + /// + /// @return If the query is successful, the function returns + /// a valid address + /// @return nullptr If the query fails, nullptr is returned + /// + xpti_plugin_handle_t loadLibrary(const char *path, std::string &error) { + xpti_plugin_handle_t handle = 0; +#if defined(_WIN32) || defined(_WIN64) + handle = LoadLibraryA(path); + if (!handle) { + error = getLastError(); + } +#else + handle = dlopen(path, RTLD_LAZY); + if (!handle) { + error = getLastError(); + } +#endif + return handle; + } + + xpti::result_t unloadLibrary(xpti_plugin_handle_t h) { + xpti::result_t fr = xpti::result_t::XPTI_RESULT_SUCCESS; +#if defined(_WIN32) || defined(_WIN64) + if (!FreeLibrary(h)) { + // Error occurred while unloading the share object + return xpti::result_t::XPTI_RESULT_FAIL; + } +#else + if (dlclose(h)) { + // Error occurred while unloading the share object + return xpti::result_t::XPTI_RESULT_FAIL; + } +#endif + return fr; + } + + /// @brief Checks is tracing has been enabled through XPTI_TRACE_ENABLE + /// variable + /// @details The environment variable XPTI_TRACE_ENABLE is checked to see if + /// it is set. If it is not set, tracing is assumed to be enabled. If set, + /// then "true" or "1" indicates enabled and "false" or "0" indicates + /// disabled. + /// @return bool true if set to "true" or "1" and false if set to "false" + /// or "0" + bool checkTraceEnv() { + std::string env = getEnvironmentVariable("XPTI_TRACE_ENABLE"); + if (env.empty()) { + return true; + } + if (env == "true" || env == "1") + return true; + if (env == "false" || env == "0") + return false; + // If someone sets the variable to garbage, then we consider it as disabled + return false; + } +}; +} // namespace utils + +namespace framework { +constexpr uint16_t signal = (uint16_t)xpti::trace_point_type_t::signal; +constexpr uint16_t graph_create = + (uint16_t)xpti::trace_point_type_t::graph_create; +constexpr uint16_t node_create = + (uint16_t)xpti::trace_point_type_t::node_create; +constexpr uint16_t edge_create = + (uint16_t)xpti::trace_point_type_t::edge_create; + +class scoped_notify { +public: + scoped_notify(const char *stream, uint16_t trace_type, + xpti::trace_event_data_t *parent, + xpti::trace_event_data_t *object, uint64_t instance, + const void *user_data = nullptr) + : m_object(object), m_parent(parent), m_stream_id(0), + m_trace_type(trace_type), m_user_data(user_data), m_instance(instance) { + if (xptiTraceEnabled() && object) { + uint16_t open = m_trace_type & 0xfffe; + m_stream_id = xptiRegisterStream(stream); + xptiNotifySubscribers(m_stream_id, open, parent, object, instance, + m_user_data); + } + } + + ~scoped_notify() { + if (xptiTraceEnabled() && m_object) { + switch (m_trace_type) { + case signal: + case graph_create: + case node_create: + case edge_create: + break; + default: { + uint16_t close = m_trace_type | 1; + xptiNotifySubscribers(m_stream_id, close, m_parent, m_object, + m_instance, m_user_data); + } break; + } + } + } + +private: + xpti::trace_event_data_t *m_object, *m_parent; + uint8_t m_stream_id; + uint16_t m_trace_type; + const void *m_user_data; + uint64_t m_instance; +}; +} // namespace framework +} // namespace xpti diff --git a/xpti/src/CMakeLists.txt b/xpti/src/CMakeLists.txt new file mode 100644 index 0000000000000..d2d35c126dc76 --- /dev/null +++ b/xpti/src/CMakeLists.txt @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 2.8.9) +project (xpti) +file(GLOB SOURCES *.cpp) +include_directories(${XPTI_DIR}/include) +add_definitions(-DXPTI_STATIC_LIBRARY) +add_library(xpti STATIC ${SOURCES}) + +# Handle the debug version for the Microsoft compiler as a special case by +# creating a debug version of the static library that uses the flags used by +# the SYCL runtime +if (MSVC) + add_library(xptid STATIC ${SOURCES}) + target_compile_options(xptid PRIVATE ${XPTI_CXX_FLAGS_DEBUG}) + target_compile_options(xpti PRIVATE ${XPTI_CXX_FLAGS_RELEASE}) +endif() + +# Set the location of the library installation +install(TARGETS xpti DESTINATION ${CMAKE_BINARY_DIR}) diff --git a/xpti/src/xpti_proxy.cpp b/xpti/src/xpti_proxy.cpp new file mode 100644 index 0000000000000..df41f770e235a --- /dev/null +++ b/xpti/src/xpti_proxy.cpp @@ -0,0 +1,329 @@ +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// +#include "xpti_trace_framework.hpp" + +#include +#include +#include +#include + +enum functions_t { + XPTI_INITIALIZE, + XPTI_FINALIZE, + XPTI_GET_UNIQUE_ID, + XPTI_REGISTER_STRING, + XPTI_LOOKUP_STRING, + XPTI_REGISTER_STREAM, + XPTI_UNREGISTER_STREAM, + XPTI_REGISTER_USER_DEFINED_TP, + XPTI_REGISTER_USER_DEFINED_ET, + XPTI_MAKE_EVENT, + XPTI_FIND_EVENT, + XPTI_QUERY_PAYLOAD, + XPTI_REGISTER_CALLBACK, + XPTI_UNREGISTER_CALLBACK, + XPTI_NOTIFY_SUBSCRIBERS, + XPTI_ADD_METADATA, + XPTI_QUERY_METADATA, + XPTI_TRACE_ENABLED, + + // All additional functions need to appear before + // the XPTI_FW_API_COUNT enum + XPTI_FW_API_COUNT ///< This enum must always be the last one in the list +}; + +namespace xpti { +class ProxyLoader { + std::unordered_map m_function_names = { + {XPTI_INITIALIZE, "xptiInitialize"}, + {XPTI_FINALIZE, "xptiFinalize"}, + {XPTI_GET_UNIQUE_ID, "xptiGetUniqueId"}, + {XPTI_REGISTER_STRING, "xptiRegisterString"}, + {XPTI_LOOKUP_STRING, "xptiLookupString"}, + {XPTI_REGISTER_STREAM, "xptiRegisterStream"}, + {XPTI_UNREGISTER_STREAM, "xptiUnregisterStream"}, + {XPTI_REGISTER_USER_DEFINED_TP, "xptiRegisterUserDefinedTracePoint"}, + {XPTI_REGISTER_USER_DEFINED_ET, "xptiRegisterUserDefinedEventType"}, + {XPTI_MAKE_EVENT, "xptiMakeEvent"}, + {XPTI_FIND_EVENT, "xptiFindEvent"}, + {XPTI_QUERY_PAYLOAD, "xptiQueryPayload"}, + {XPTI_REGISTER_CALLBACK, "xptiRegisterCallback"}, + {XPTI_UNREGISTER_CALLBACK, "xptiUnregisterCallback"}, + {XPTI_NOTIFY_SUBSCRIBERS, "xptiNotifySubscribers"}, + {XPTI_ADD_METADATA, "xptiAddMetadata"}, + {XPTI_QUERY_METADATA, "xptiQueryMetadata"}, + {XPTI_TRACE_ENABLED, "xptiTraceEnabled"}}; + +public: + typedef std::vector dispatch_table_t; + + ProxyLoader() : m_loaded(false), m_fw_plugin_handle(nullptr) { + // When this object is created, we attempt to load + // the share object implementation. We look for the + // environment variable XPTI_FRAMEWORK_DISPATCHER to + // see if it has been set. If not, all methods in + // the proxy should end up being close to no-ops + // + std::string env = + m_loader.getEnvironmentVariable("XPTI_FRAMEWORK_DISPATCHER"); + if (env.empty()) + return; + std::string error; + m_fw_plugin_handle = m_loader.loadLibrary(env.c_str(), error); + if (m_fw_plugin_handle) { + // We will defer changing m_loaded = true until the + // end of this block after we are able to resolve + // all of the entry points + // + m_dispatch_table.resize(XPTI_FW_API_COUNT); + for (auto &func_name : m_function_names) { + xpti_plugin_function_t func = + m_loader.findFunction(m_fw_plugin_handle, func_name.second); + if (!func) { // Return if we fail on even one function + m_loader.unloadLibrary(m_fw_plugin_handle); + m_fw_plugin_handle = nullptr; + return; + } + m_dispatch_table[func_name.first] = func; + } + // Only if all the functions are found and loaded, + // do we set the m_loaded = true + // + m_loaded = true; + } + } + + ~ProxyLoader() { + // If the loading of the framework library was + // successful, we should close the handle in the + // destructor to decrement the reference count + // maintained by the loader. + // + if (m_fw_plugin_handle) { + m_loader.unloadLibrary(m_fw_plugin_handle); + } + } + + inline bool noErrors() { return m_loaded; } + + void *functionByIndex(int index) { + if (index >= XPTI_INITIALIZE && index < XPTI_FW_API_COUNT) { + return m_dispatch_table[index]; + } + return nullptr; + } + +private: + bool m_loaded; + xpti_plugin_handle_t m_fw_plugin_handle; + dispatch_table_t m_dispatch_table; + xpti::utils::PlatformHelper m_loader; +}; + +static ProxyLoader g_loader; +} // namespace xpti + +XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedTracePoint( + const char *tool_name, uint8_t user_defined_tp) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_USER_DEFINED_TP); + if (f) { + return (*(xpti_register_user_defined_tp_t)f)(tool_name, user_defined_tp); + } + } + return xpti::invalid_id; +} + +XPTI_EXPORT_API uint16_t xptiRegisterUserDefinedEventType( + const char *tool_name, uint8_t user_defined_event) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_USER_DEFINED_ET); + if (f) { + return (*(xpti_register_user_defined_et_t)f)(tool_name, + user_defined_event); + } + } + return xpti::invalid_id; +} + +XPTI_EXPORT_API xpti::result_t xptiInitialize(const char *stream, uint32_t maj, + uint32_t min, + const char *version) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_INITIALIZE); + if (f) { + return (*(xpti_initialize_t)f)(stream, maj, min, version); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; +} + +XPTI_EXPORT_API void xptiFinalize(const char *stream) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_FINALIZE); + if (f) { + (*(xpti_finalize_t)f)(stream); + } + } +} + +XPTI_EXPORT_API uint64_t xptiGetUniqueId() { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_GET_UNIQUE_ID); + if (f) { + return (*(xpti_get_unique_id_t)f)(); + } + } + return xpti::invalid_id; +} + +XPTI_EXPORT_API xpti::string_id_t xptiRegisterString(const char *string, + char **table_string) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_STRING); + if (f) { + return (*(xpti_register_string_t)f)(string, table_string); + } + } + return xpti::invalid_id; +} + +XPTI_EXPORT_API const char *xptiLookupString(xpti::string_id_t id) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_LOOKUP_STRING); + if (f) { + return (*(xpti_lookup_string_t)f)(id); + } + } + return nullptr; +} + +XPTI_EXPORT_API uint8_t xptiRegisterStream(const char *stream_name) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_STREAM); + if (f) { + return (*(xpti_register_stream_t)f)(stream_name); + } + } + return xpti::invalid_id; +} + +XPTI_EXPORT_API xpti::result_t xptiUnregisterStream(const char *stream_name) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_UNREGISTER_STREAM); + if (f) { + return (*(xpti_unregister_stream_t)f)(stream_name); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; +} +XPTI_EXPORT_API xpti::trace_event_data_t * +xptiMakeEvent(const char *name, xpti::payload_t *payload, uint16_t event, + xpti::trace_activity_type_t activity, uint64_t *instance_no) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_MAKE_EVENT); + if (f) { + return (*(xpti_make_event_t)f)(name, payload, event, activity, + instance_no); + } + } + return nullptr; +} + +XPTI_EXPORT_API const xpti::trace_event_data_t *xptiFindEvent(int64_t uid) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_FIND_EVENT); + if (f) { + return (*(xpti_find_event_t)f)(uid); + } + } + return nullptr; +} + +XPTI_EXPORT_API const xpti::payload_t * +xptiQueryPayload(xpti::trace_event_data_t *lookup_object) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_QUERY_PAYLOAD); + if (f) { + return (*(xpti_query_payload_t)f)(lookup_object); + } + } + return nullptr; +} + +XPTI_EXPORT_API xpti::result_t +xptiRegisterCallback(uint8_t stream_id, uint16_t trace_type, + xpti::tracepoint_callback_api_t cb) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_REGISTER_CALLBACK); + if (f) { + return (*(xpti_register_cb_t)f)(stream_id, trace_type, cb); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; +} + +XPTI_EXPORT_API xpti::result_t +xptiUnregisterCallback(uint8_t stream_id, uint16_t trace_type, + xpti::tracepoint_callback_api_t cb) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_UNREGISTER_CALLBACK); + if (f) { + return (*(xpti_unregister_cb_t)f)(stream_id, trace_type, cb); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; +} + +XPTI_EXPORT_API xpti::result_t +xptiNotifySubscribers(uint8_t stream_id, uint16_t trace_type, + xpti::trace_event_data_t *parent, + xpti::trace_event_data_t *object, uint64_t instance, + const void *user_data) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_NOTIFY_SUBSCRIBERS); + if (f) { + return (*(xpti_notify_subscribers_t)f)(stream_id, trace_type, parent, + object, instance, user_data); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; +} + +XPTI_EXPORT_API bool xptiTraceEnabled() { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_TRACE_ENABLED); + if (f) { + return (*(xpti_trace_enabled_t)f)(); + } + } + return false; +} + +XPTI_EXPORT_API xpti::result_t xptiAddMetadata(xpti::trace_event_data_t *e, + const char *key, + const char *value) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_ADD_METADATA); + if (f) { + return (*(xpti_add_metadata_t)f)(e, key, value); + } + } + return xpti::result_t::XPTI_RESULT_FAIL; +} + +XPTI_EXPORT_API xpti::metadata_t * +xptiQueryMetadata(xpti::trace_event_data_t *lookup_object) { + if (xpti::g_loader.noErrors()) { + auto f = xpti::g_loader.functionByIndex(XPTI_QUERY_METADATA); + if (f) { + return (*(xpti_query_metadata_t)f)(lookup_object); + } + } + return nullptr; +}