Skip to content

[SYCL] Reduce number of getenv calls to improve performance of short-running parallel_for kernels on Windows. #4321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Aug 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 17 additions & 28 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -773,33 +773,17 @@ class __SYCL_EXPORT handler {
// Range should be at least this to make rounding worthwhile.
size_t MinRangeX = 1024;

// Parse optional parameters of this form:
// MinRound:PreferredRound:MinRange
char *RoundParams = getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS");
if (RoundParams != nullptr) {
std::string Params(RoundParams);
size_t Pos = Params.find(':');
if (Pos != std::string::npos) {
MinFactorX = std::stoi(Params.substr(0, Pos));
Params.erase(0, Pos + 1);
Pos = Params.find(':');
if (Pos != std::string::npos) {
GoodFactorX = std::stoi(Params.substr(0, Pos));
Params.erase(0, Pos + 1);
MinRangeX = std::stoi(Params);
}
}
}
// Check if rounding parameters have been set through environment:
// SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=MinRound:PreferredRound:MinRange
this->GetRangeRoundingSettings(MinFactorX, GoodFactorX, MinRangeX);

// Disable the rounding-up optimizations under these conditions:
// 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
// 2. The string SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is in
// the kernel name.
// 3. The kernel is provided via an interoperability method.
// 4. The API "this_item" is used inside the kernel.
// 5. The range is already a multiple of the rounding factor.
// 2. The kernel is provided via an interoperability method.
// 3. The API "this_item" is used inside the kernel.
// 4. The range is already a multiple of the rounding factor.
//
// Cases 3 and 4 could be supported with extra effort.
// Cases 2 and 3 could be supported with extra effort.
// As an optimization for the common case it is an
// implementation choice to not support those scenarios.
// Note that "this_item" is a free function, i.e. not tied to any
Expand All @@ -809,13 +793,11 @@ class __SYCL_EXPORT handler {
// call-graph to make this_item calls kernel-specific but this is
// not considered worthwhile.

// Get the kernal name to check condition 3.
// Get the kernel name to check condition 2.
std::string KName = typeid(NameT *).name();
using KI = detail::KernelInfo<KernelName>;
bool DisableRounding =
(getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) ||
Copy link
Contributor

@s-kanaev s-kanaev Aug 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This environment variable is documented.
I don't know if there are users of this environment variable.
Anyway, I believe, it should become deprecated first with or without any effect. It can be removed being deprecated for at least one update. @bader , could you, please, confirm it?
Also, if the reason for eliminating of this env var is reducing host overhead, one could employ SYCLConfig class from source/detail/config.hpp.

(KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") !=
std::string::npos) ||
this->DisableRangeRounding() ||
(KI::getName() == nullptr || KI::getName()[0] == '\0') ||
(KI::callsThisItem());

Expand All @@ -830,7 +812,7 @@ class __SYCL_EXPORT handler {
// will yield a rounded-up value for the total range.
size_t NewValX =
((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
if (getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE") != nullptr)
if (this->RangeRoundingTrace())
std::cout << "parallel_for range adjusted from " << NumWorkItems[0]
<< " to " << NewValX << std::endl;

Expand Down Expand Up @@ -2444,6 +2426,13 @@ class __SYCL_EXPORT handler {

friend class ::MockHandler;

bool DisableRangeRounding();

bool RangeRoundingTrace();

void GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
size_t &MinRange);

template <typename WrapperT, typename TransformedArgType, int Dims,
typename KernelType>
auto getRangeRoundedKernelLambda(KernelType KernelFunc,
Expand Down
3 changes: 3 additions & 0 deletions sycl/source/detail/config.def
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ CONFIG(SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP, 1, __SYCL_DISABLE_EXECUTION_GRAPH_C
CONFIG(SYCL_DEVICE_ALLOWLIST, 1024, __SYCL_DEVICE_ALLOWLIST)
CONFIG(SYCL_BE, 16, __SYCL_BE)
CONFIG(SYCL_PI_TRACE, 16, __SYCL_PI_TRACE)
CONFIG(SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE, 16, __SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE)
CONFIG(SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING, 16, __SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING)
CONFIG(SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS, 64, __SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS)
CONFIG(SYCL_DEVICELIB_NO_FALLBACK, 1, __SYCL_DEVICELIB_NO_FALLBACK)
CONFIG(SYCL_DEVICE_FILTER, 1024, __SYCL_DEVICE_FILTER)
CONFIG(SYCL_PROGRAM_LINK_OPTIONS, 64, __SYCL_PROGRAM_LINK_OPTIONS)
Expand Down
58 changes: 58 additions & 0 deletions sycl/source/detail/config.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,64 @@ template <> class SYCLConfig<SYCL_PI_TRACE> {
}
};

template <> class SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE> {
using BaseT = SYCLConfigBase<SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE>;

public:
static bool get() {
static const char *ValStr = BaseT::getRawValue();
return ValStr != nullptr;
}
};

template <> class SYCLConfig<SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING> {
using BaseT = SYCLConfigBase<SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING>;

public:
static bool get() {
static const char *ValStr = BaseT::getRawValue();
return ValStr != nullptr;
}
};

template <> class SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS> {
using BaseT = SYCLConfigBase<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS>;

private:
public:
static void GetSettings(size_t &MinFactor, size_t &GoodFactor,
size_t &MinRange) {
static const char *RoundParams = BaseT::getRawValue();
if (RoundParams == nullptr)
return;

static bool ProcessedFactors = false;
static size_t MF;
static size_t GF;
static size_t MR;
if (!ProcessedFactors) {
// Parse optional parameters of this form (all values required):
// MinRound:PreferredRound:MinRange
std::string Params(RoundParams);
size_t Pos = Params.find(':');
if (Pos != std::string::npos) {
MF = std::stoi(Params.substr(0, Pos));
Params.erase(0, Pos + 1);
Pos = Params.find(':');
if (Pos != std::string::npos) {
GF = std::stoi(Params.substr(0, Pos));
Params.erase(0, Pos + 1);
MR = std::stoi(Params);
}
}
ProcessedFactors = true;
}
MinFactor = MF;
GoodFactor = GF;
MinRange = MR;
}
};

// Array is used by SYCL_DEVICE_FILTER and SYCL_DEVICE_ALLOWLIST
static const std::array<std::pair<std::string, info::device_type>, 5>
SyclDeviceTypeMap = {{{"host", info::device_type::host},
Expand Down
16 changes: 16 additions & 0 deletions sycl/source/handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <CL/sycl/handler.hpp>
#include <CL/sycl/info/info_desc.hpp>
#include <CL/sycl/stream.hpp>
#include <detail/config.hpp>
#include <detail/global_handler.hpp>
#include <detail/kernel_bundle_impl.hpp>
#include <detail/kernel_impl.hpp>
Expand Down Expand Up @@ -483,6 +484,21 @@ void handler::barrier(const std::vector<event> &WaitList) {
[](const event &Event) { return detail::getSyclObjImpl(Event); });
}

using namespace sycl::detail;
bool handler::DisableRangeRounding() {
return SYCLConfig<SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING>::get();
}

bool handler::RangeRoundingTrace() {
return SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE>::get();
}

void handler::GetRangeRoundingSettings(size_t &MinFactor, size_t &GoodFactor,
size_t &MinRange) {
SYCLConfig<SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS>::GetSettings(
MinFactor, GoodFactor, MinRange);
}

void handler::memcpy(void *Dest, const void *Src, size_t Count) {
throwIfActionIsCreated();
MSrcPtr = const_cast<void *>(Src);
Expand Down
3 changes: 3 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3899,9 +3899,12 @@ _ZN2cl4sycl7handler10mem_adviseEPKvmi
_ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmb
_ZN2cl4sycl7handler10processArgEPvRKNS0_6detail19kernel_param_kind_tEimRmbb
_ZN2cl4sycl7handler13getKernelNameB5cxx11Ev
_ZN2cl4sycl7handler18RangeRoundingTraceEv
_ZN2cl4sycl7handler18extractArgsAndReqsEv
_ZN2cl4sycl7handler20DisableRangeRoundingEv
_ZN2cl4sycl7handler20associateWithHandlerEPNS0_6detail16AccessorBaseHostENS0_6access6targetE
_ZN2cl4sycl7handler22setHandlerKernelBundleERKSt10shared_ptrINS0_6detail18kernel_bundle_implEE
_ZN2cl4sycl7handler24GetRangeRoundingSettingsERmS2_S2_
_ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tE
_ZN2cl4sycl7handler28extractArgsAndReqsFromLambdaEPcmPKNS0_6detail19kernel_param_desc_tEb
_ZN2cl4sycl7handler6memcpyEPvPKvm
Expand Down
3 changes: 3 additions & 0 deletions sycl/test/abi/sycl_symbols_windows.dump
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,7 @@
?Any@__host_std@cl@@YAHV?$vec@_J$0BA@@sycl@2@@Z
?Clear@exception_list@sycl@cl@@AEAAXXZ
?DirSep@OSUtil@detail@sycl@cl@@2QEBDEB
?DisableRangeRounding@handler@sycl@cl@@AEAA_NXZ
?Dot@__host_std@cl@@YA?AVhalf@half_impl@detail@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$00@62@0@Z
?Dot@__host_std@cl@@YA?AVhalf@half_impl@detail@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$01@62@0@Z
?Dot@__host_std@cl@@YA?AVhalf@half_impl@detail@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$02@62@0@Z
Expand Down Expand Up @@ -747,6 +748,7 @@
?FUnordNotEqual@__host_std@cl@@YAHVhalf@half_impl@detail@sycl@2@0@Z
?GetNativeMem@interop_handler@sycl@cl@@AEBA_KPEAVAccessorImplHost@detail@23@@Z
?GetNativeQueue@interop_handler@sycl@cl@@AEBA_KXZ
?GetRangeRoundingSettings@handler@sycl@cl@@AEAAXAEA_K00@Z
?IsFinite@__host_std@cl@@YA?AV?$vec@F$00@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$00@42@@Z
?IsFinite@__host_std@cl@@YA?AV?$vec@F$01@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$01@42@@Z
?IsFinite@__host_std@cl@@YA?AV?$vec@F$02@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$02@42@@Z
Expand Down Expand Up @@ -877,6 +879,7 @@
?PushBack@exception_list@sycl@cl@@AEAAX$$QEAVexception_ptr@std@@@Z
?PushBack@exception_list@sycl@cl@@AEAAXAEBVexception_ptr@std@@@Z
?REJECT_DEVICE_SCORE@device_selector@sycl@cl@@1HB
?RangeRoundingTrace@handler@sycl@cl@@AEAA_NXZ
?SignBitSet@__host_std@cl@@YA?AV?$vec@F$00@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$00@42@@Z
?SignBitSet@__host_std@cl@@YA?AV?$vec@F$01@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$01@42@@Z
?SignBitSet@__host_std@cl@@YA?AV?$vec@F$02@sycl@2@V?$vec@Vhalf@half_impl@detail@sycl@cl@@$02@42@@Z
Expand Down