Skip to content

Commit 2f632f8

Browse files
author
Jeff Hammond
authored
[SYCL] Support non-x86 platforms (#2333)
Tested on AArch64: - Cavium ThunderX2 with Linux 4.15 - Raspberry Pi 4 with Linux 5.5 Signed-off-by: Jeff R. Hammond <jeff.r.hammond@intel.com>
1 parent a21d7ef commit 2f632f8

File tree

1 file changed

+40
-4
lines changed

1 file changed

+40
-4
lines changed

sycl/source/detail/platform_util.cpp

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,11 @@
1111
#include <detail/platform_util.hpp>
1212

1313
#if defined(SYCL_RT_OS_LINUX)
14+
#include <errno.h>
15+
#include <unistd.h>
16+
#if defined(__x86_64__) || defined(__i386__)
1417
#include <cpuid.h>
18+
#endif
1519
#elif defined(SYCL_RT_OS_WINDOWS)
1620
#include <intrin.h>
1721
#endif
@@ -20,6 +24,7 @@ __SYCL_INLINE_NAMESPACE(cl) {
2024
namespace sycl {
2125
namespace detail {
2226

27+
#if defined(__x86_64__) || defined(__i386__)
2328
// Used by methods that duplicate OpenCL behaviour in order to get CPU info
2429
static void cpuid(uint32_t *CPUInfo, uint32_t Type, uint32_t SubType = 0) {
2530
#if defined(SYCL_RT_OS_LINUX)
@@ -28,11 +33,13 @@ static void cpuid(uint32_t *CPUInfo, uint32_t Type, uint32_t SubType = 0) {
2833
__cpuidex(reinterpret_cast<int *>(CPUInfo), Type, SubType);
2934
#endif
3035
}
36+
#endif
3137

3238
uint32_t PlatformUtil::getMaxClockFrequency() {
3339
throw runtime_error(
3440
"max_clock_frequency parameter is not supported for host device",
3541
PI_INVALID_DEVICE);
42+
#if defined(__x86_64__) || defined(__i386__)
3643
uint32_t CPUInfo[4];
3744
string_class Buff(sizeof(CPUInfo) * 3 + 1, 0);
3845
size_t Offset = 0;
@@ -62,21 +69,43 @@ uint32_t PlatformUtil::getMaxClockFrequency() {
6269
Buff = Buff.substr(Buff.rfind(' '), Buff.length());
6370
Freq *= std::stod(Buff);
6471
return Freq;
72+
#endif
73+
return 0;
6574
}
6675

6776
uint32_t PlatformUtil::getMemCacheLineSize() {
77+
#if defined(__x86_64__) || defined(__i386__)
6878
uint32_t CPUInfo[4];
6979
cpuid(CPUInfo, 0x80000006);
7080
return CPUInfo[2] & 0xff;
81+
#elif defined(SYCL_RT_OS_LINUX) && defined(_SC_LEVEL2_DCACHE_LINESIZE)
82+
long lineSize = sysconf(_SC_LEVEL2_DCACHE_LINESIZE);
83+
if (lineSize > 0) {
84+
return lineSize;
85+
}
86+
#endif
87+
return 8;
7188
}
7289

7390
uint64_t PlatformUtil::getMemCacheSize() {
91+
#if defined(__x86_64__) || defined(__i386__)
7492
uint32_t CPUInfo[4];
7593
cpuid(CPUInfo, 0x80000006);
7694
return static_cast<uint64_t>(CPUInfo[2] >> 16) * 1024;
95+
#elif defined(SYCL_RT_OS_LINUX) && defined(_SC_LEVEL2_DCACHE_SIZE)
96+
long cacheSize = sysconf(_SC_LEVEL2_DCACHE_SIZE);
97+
if (cacheSize > 0) {
98+
return cacheSize;
99+
}
100+
#endif
101+
return static_cast<uint64_t>(16 * 1024);
77102
}
78103

79104
uint32_t PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex TIndex) {
105+
106+
#if defined(__x86_64__) || defined(__i386__)
107+
uint32_t Index = static_cast<uint32_t>(TIndex);
108+
80109
// SSE4.2 has 16 byte (XMM) registers
81110
static constexpr uint32_t VECTOR_WIDTH_SSE42[] = {16, 8, 4, 2, 4, 2, 0};
82111
// AVX supports 32 byte (YMM) registers only for floats and doubles
@@ -86,8 +115,6 @@ uint32_t PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex TIndex) {
86115
// AVX512 has 64 byte (ZMM) registers
87116
static constexpr uint32_t VECTOR_WIDTH_AVX512[] = {64, 32, 16, 8, 16, 8, 0};
88117

89-
uint32_t Index = static_cast<uint32_t>(TIndex);
90-
91118
#if defined(SYCL_RT_OS_LINUX)
92119
if (__builtin_cpu_supports("avx512f"))
93120
return VECTOR_WIDTH_AVX512[Index];
@@ -119,14 +146,23 @@ uint32_t PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex TIndex) {
119146
#endif
120147

121148
return VECTOR_WIDTH_SSE42[Index];
149+
150+
#elif defined(__ARM_NEON)
151+
uint32_t Index = static_cast<uint32_t>(TIndex);
152+
153+
// NEON has 16 byte registers
154+
static constexpr uint32_t VECTOR_WIDTH_NEON[] = {16, 8, 4, 2, 4, 2, 0};
155+
return VECTOR_WIDTH_NEON[Index];
156+
157+
#endif
158+
return 0;
122159
}
123160

124161
void PlatformUtil::prefetch(const char *Ptr, size_t NumBytes) {
125162
if (!Ptr)
126163
return;
127164

128-
// The current implementation assumes 64-byte x86 cache lines.
129-
const size_t CacheLineSize = 64;
165+
const size_t CacheLineSize = PlatformUtil::getMemCacheLineSize();
130166
const size_t CacheLineMask = ~(CacheLineSize - 1);
131167
const char *PtrEnd = Ptr + NumBytes;
132168

0 commit comments

Comments
 (0)