diff --git a/SYCL/ESIMD/api/slm_gather_scatter_rgba.cpp b/SYCL/ESIMD/api/slm_gather_scatter_rgba.cpp
new file mode 100644
index 0000000000..e0286c7ee4
--- /dev/null
+++ b/SYCL/ESIMD/api/slm_gather_scatter_rgba.cpp
@@ -0,0 +1,187 @@
+// REQUIRES: gpu
+// UNSUPPORTED: cuda
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+//
+// The test checks functionality of the slm_gather_rgba/slm_scatter_rgba ESIMD
+// API.
+
+#include "../esimd_test_utils.hpp"
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <sycl/ext/intel/experimental/esimd.hpp>
+
+using namespace cl::sycl;
+
+constexpr int MASKED_LANE_NUM_REV = 1;
+constexpr int NUM_RGBA_CHANNELS = get_num_channels_enabled(
+    sycl::ext::intel::experimental::esimd::rgba_channel_mask::ABGR);
+
+template <typename T, unsigned VL,
+          sycl::ext::intel::experimental::esimd::rgba_channel_mask CH_MASK>
+struct Kernel {
+  T *bufOut;
+  Kernel(T *bufOut) : bufOut(bufOut) {}
+
+  void operator()(sycl::nd_item<1> ndi) const SYCL_ESIMD_KERNEL {
+    using namespace sycl::ext::intel::experimental::esimd;
+    constexpr int numChannels = get_num_channels_enabled(CH_MASK);
+    uint32_t i = ndi.get_global_id(0);
+
+    // In this test, each group consists of one workitem. No barriers required.
+    // Each workitem accesses a contiguous block of VL elements, where
+    // each element consists of RGBA channels.
+    slm_init(VL * NUM_RGBA_CHANNELS * sizeof(T));
+
+    // Prepare initial values in SLM:
+    // 0, -1, -2, -3, -4 ...
+    // slm_scatter only supports VL = 16 or 32, so conservatively write in
+    // chunks of 16 elements.
+    constexpr unsigned numStores = (VL * NUM_RGBA_CHANNELS) / 16;
+    for (int i = 0; i < numStores; i++) {
+      simd<T, 16> vals(-i * 16, -1);
+      simd<uint32_t, 16> fourByteOffsets(i * 16 * sizeof(T), sizeof(T));
+      slm_scatter(vals, fourByteOffsets);
+    }
+
+    // Prepare values to store into SLM in a SOA manner, e.g.:
+    // R R R R ... G G G G ... B B B B ... A A A A ...
+    // 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
+    simd<T, VL * numChannels> valsIn;
+    for (unsigned i = 0; i < numChannels; i++)
+      for (unsigned j = 0; j < VL; j++)
+        valsIn[i * VL + j] = j * numChannels + i;
+
+    // Store values to SLM. In SLM they will be transposed into AOS:
+    // R G B A R G B A ...
+    // 0, 1, 2, 3, 4, 5, 6, 7 ...
+    simd<uint32_t, VL> byteOffsets(0, sizeof(T) * NUM_RGBA_CHANNELS);
+    slm_scatter_rgba<T, VL, CH_MASK>(valsIn, byteOffsets);
+
+    // Load back values from SLM. They will be transposed back to SOA.
+    simd<uint16_t, VL> pred = 1;
+    pred[VL - MASKED_LANE_NUM_REV] = 0; // mask out the last lane
+    simd<T, VL * numChannels> valsOut =
+        slm_gather_rgba<T, VL, CH_MASK>(byteOffsets, pred);
+
+    // Copy results to the output USM buffer. The maximum write block size is
+    // 8 owords, so conservatively write in chunks of 8 elements.
+    uint32_t global_offset = i * VL * NUM_RGBA_CHANNELS;
+    for (unsigned i = 0; i < (VL * numChannels) / 8; i++) {
+      simd<T, 8> valsToWrite = valsOut.template select<8, 1>(i * 8);
+      valsToWrite.copy_to(bufOut + global_offset + i * 8);
+    }
+  }
+};
+
+std::string convertMaskToStr(
+    sycl::ext::intel::experimental::esimd::rgba_channel_mask mask) {
+  using namespace sycl::ext::intel::experimental::esimd;
+  switch (mask) {
+  case rgba_channel_mask::R:
+    return "R";
+  case rgba_channel_mask::GR:
+    return "GR";
+  case rgba_channel_mask::ABGR:
+    return "ABGR";
+  default:
+    return "";
+  }
+  return "";
+}
+
+template <typename T, unsigned VL,
+          sycl::ext::intel::experimental::esimd::rgba_channel_mask CH_MASK>
+bool test(queue q) {
+  using namespace sycl::ext::intel::experimental::esimd;
+  constexpr int numChannels = get_num_channels_enabled(CH_MASK);
+  constexpr size_t size = VL * numChannels;
+
+  std::cout << "Testing T=" << typeid(T).name() << " VL=" << VL
+            << " MASK=" << convertMaskToStr(CH_MASK) << "...\n";
+
+  auto dev = q.get_device();
+  auto ctxt = q.get_context();
+  T *A = static_cast<T *>(malloc_shared(size * sizeof(T), dev, ctxt));
+  T *gold = new T[size];
+
+  for (int i = 0; i < size; ++i) {
+    A[i] = (T)-i;
+  }
+
+  // Fill out the array with gold values.
+  // R R R R ... G G G G ... B B B B ... A A A A ...
+  // 0, 4, 8, 12, ... 1, 5, 9, 13, ... 2, 6, 10, 14, ... 3, 7, 11, 15 ...
+  for (unsigned i = 0; i < numChannels; i++)
+    for (unsigned j = 0; j < VL; j++)
+      gold[i * VL + j] = j * numChannels + i;
+
+  // Account for masked-out last lanes (the pred argument to slm_gather_rgba).
+  unsigned maskedIndex = VL - 1;
+  for (unsigned i = 0; i < numChannels; i++, maskedIndex += VL)
+    gold[maskedIndex] = 0;
+
+  try {
+    // Number of workgroups
+    sycl::range<1> GlobalRange{1};
+    // Number of workitems in a workgroup
+    sycl::range<1> LocalRange{1};
+    sycl::nd_range<1> Range{GlobalRange * LocalRange, LocalRange};
+
+    auto e = q.submit([&](handler &cgh) {
+      Kernel<T, VL, CH_MASK> kernel(A);
+      cgh.parallel_for(Range, kernel);
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cerr << "SYCL exception caught: " << e.what() << '\n';
+    free(A, ctxt);
+    delete[] gold;
+    return static_cast<bool>(e.code());
+  }
+
+  int err_cnt = 0;
+  for (unsigned i = 0; i < size; ++i) {
+    if (A[i] != gold[i]) {
+      if (++err_cnt < 35) {
+        std::cerr << "failed at index " << i << ": " << A[i]
+                  << " != " << gold[i] << " (gold)\n";
+      }
+    }
+  }
+
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(size - err_cnt) / (float)size) * 100.0f << "% ("
+              << (size - err_cnt) << "/" << size << ")\n";
+  }
+
+  free(A, ctxt);
+  delete[] gold;
+
+  std::cout << (err_cnt > 0 ? "  FAILED\n" : "  Passed\n");
+  return err_cnt > 0 ? false : true;
+}
+
+template <typename T, unsigned VL> bool test(queue q) {
+  using namespace sycl::ext::intel::experimental::esimd;
+  bool passed = true;
+  passed &= test<T, VL, rgba_channel_mask::R>(q);
+  passed &= test<T, VL, rgba_channel_mask::GR>(q);
+  passed &= test<T, VL, rgba_channel_mask::ABGR>(q);
+  return passed;
+}
+
+int main(void) {
+  queue q(esimd_test::ESIMDSelector{}, esimd_test::createExceptionHandler());
+
+  auto dev = q.get_device();
+  std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+  bool passed = true;
+  passed &= test<int, 16>(q);
+  passed &= test<int, 32>(q);
+  passed &= test<unsigned int, 16>(q);
+  passed &= test<unsigned int, 32>(q);
+  passed &= test<float, 16>(q);
+  passed &= test<float, 32>(q);
+
+  return passed ? 0 : 1;
+}
diff --git a/SYCL/ESIMD/histogram_256_slm.cpp b/SYCL/ESIMD/histogram_256_slm.cpp
index 1013967a54..e43b289b99 100644
--- a/SYCL/ESIMD/histogram_256_slm.cpp
+++ b/SYCL/ESIMD/histogram_256_slm.cpp
@@ -36,7 +36,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   slm_offset += 16 * lid;
   slm_offset *= sizeof(int);
   simd<unsigned int, 16> slm_data = 0;
-  slm_store(slm_data, slm_offset);
+  slm_scatter(slm_data, slm_offset);
   esimd_barrier();
 
   // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
@@ -60,7 +60,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
 
   // Update global sum by atomically adding each local histogram
   simd<unsigned int, 16> local_histogram;
-  local_histogram = slm_load<unsigned int, 16>(slm_offset);
+  local_histogram = slm_gather<unsigned int, 16>(slm_offset);
   flat_atomic(output, slm_offset.select<8, 1>(0),
               local_histogram.select<8, 1>(0), 1);
   flat_atomic(output, slm_offset.select<8, 1>(8),
diff --git a/SYCL/ESIMD/histogram_256_slm_spec.cpp b/SYCL/ESIMD/histogram_256_slm_spec.cpp
index ac70a597ab..1b7a5cb423 100644
--- a/SYCL/ESIMD/histogram_256_slm_spec.cpp
+++ b/SYCL/ESIMD/histogram_256_slm_spec.cpp
@@ -37,7 +37,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   slm_offset += 16 * lid;
   slm_offset *= sizeof(int);
   simd<unsigned int, 16> slm_data = 0;
-  slm_store(slm_data, slm_offset);
+  slm_scatter(slm_data, slm_offset);
   esimd_barrier();
 
   // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
@@ -61,7 +61,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
 
   // Update global sum by atomically adding each local histogram
   simd<unsigned int, 16> local_histogram;
-  local_histogram = slm_load<unsigned int, 16>(slm_offset);
+  local_histogram = slm_gather<unsigned int, 16>(slm_offset);
   flat_atomic(output, slm_offset.select<8, 1>(0),
               local_histogram.select<8, 1>(0), 1);
   flat_atomic(output, slm_offset.select<8, 1>(8),
diff --git a/SYCL/ESIMD/histogram_256_slm_spec_2020.cpp b/SYCL/ESIMD/histogram_256_slm_spec_2020.cpp
index 8a794ffd67..a4208cdbcc 100644
--- a/SYCL/ESIMD/histogram_256_slm_spec_2020.cpp
+++ b/SYCL/ESIMD/histogram_256_slm_spec_2020.cpp
@@ -32,7 +32,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
   slm_offset += 16 * lid;
   slm_offset *= sizeof(int);
   simd<unsigned int, 16> slm_data = 0;
-  slm_store(slm_data, slm_offset);
+  slm_scatter(slm_data, slm_offset);
   esimd_barrier();
 
   // Each thread handles NUM_BLOCKSxBLOCK_WIDTH pixel blocks
@@ -55,7 +55,7 @@ ESIMD_INLINE void histogram_atomic(const uint32_t *input_ptr, uint32_t *output,
 
   // Update global sum by atomically adding each local histogram
   simd<unsigned int, 16> local_histogram;
-  local_histogram = slm_load<unsigned int, 16>(slm_offset);
+  local_histogram = slm_gather<unsigned int, 16>(slm_offset);
   flat_atomic(output, slm_offset.select<8, 1>(0),
               local_histogram.select<8, 1>(0), 1);
   flat_atomic(output, slm_offset.select<8, 1>(8),
diff --git a/SYCL/ESIMD/slm_barrier.cpp b/SYCL/ESIMD/slm_barrier.cpp
index b29af975cb..4cab01e8bf 100644
--- a/SYCL/ESIMD/slm_barrier.cpp
+++ b/SYCL/ESIMD/slm_barrier.cpp
@@ -62,7 +62,7 @@ void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
   rowTrans.select<8, 1>(40) = row1.select<8, 4>(2);
   rowTrans.select<8, 1>(56) = row1.select<8, 4>(3);
 
-  slm_store4(rowTrans, vOffsets);
+  slm_scatter_rgba(rowTrans, vOffsets);
   threadOffsetInMemory += grpSize * 256;
   vOffsets += (grpSize * 256);
 }
@@ -123,7 +123,7 @@ int main(void) {
 
         v_Off = v_Off + shiftID * 64;
 
-        v_slmData = slm_load(v_Off);
+        v_slmData = slm_gather(v_Off);
 
         v_slmData.copy_to(B + globalID * VL);
       });
diff --git a/SYCL/ESIMD/slm_split_barrier.cpp b/SYCL/ESIMD/slm_split_barrier.cpp
index a4ecbf09aa..b00e503648 100644
--- a/SYCL/ESIMD/slm_split_barrier.cpp
+++ b/SYCL/ESIMD/slm_split_barrier.cpp
@@ -62,7 +62,7 @@ void load_to_slm(uint grpSize, uint localId, uint slmOffset, char *addr,
   rowTrans.select<8, 1>(40) = row1.select<8, 4>(2);
   rowTrans.select<8, 1>(56) = row1.select<8, 4>(3);
 
-  slm_store4(rowTrans, vOffsets);
+  slm_scatter_rgba(rowTrans, vOffsets);
   threadOffsetInMemory += grpSize * 256;
   vOffsets += (grpSize * 256);
 }
@@ -125,7 +125,7 @@ int main(void) {
 
         v_Off = v_Off + shiftID * 64;
 
-        v_slmData = slm_load(v_Off);
+        v_slmData = slm_gather(v_Off);
 
         v_slmData.copy_to(B + globalID * VL);
       });