Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 66c65ac

Browse files
[SYCL][L0] check fixed queue indices when submitting to sub-sub-devices (#1139)
1 parent 1f66a4f commit 66c65ac

File tree

1 file changed

+169
-0
lines changed

1 file changed

+169
-0
lines changed
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
// REQUIRES: gpu-intel-pvc, level_zero
2+
3+
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
4+
// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
5+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
6+
7+
// Check that queues created on sub-sub-devices are going to specific compute
8+
// engines:
9+
// CHECK: [getZeQueue]: create queue ordinal = 0, index = 0 (round robin in [0, 0])
10+
// CHECK: [getZeQueue]: create queue ordinal = 0, index = 1 (round robin in [1, 1])
11+
// CHECK: [getZeQueue]: create queue ordinal = 0, index = 2 (round robin in [2, 2])
12+
// CHECK: [getZeQueue]: create queue ordinal = 0, index = 3 (round robin in [3, 3])
13+
14+
#include "CL/sycl.hpp"
15+
#include "CL/sycl/usm.hpp"
16+
#include <chrono>
17+
#include <cmath>
18+
#include <iostream>
19+
#include <math.h>
20+
#include <unistd.h>
21+
22+
namespace sycl = cl::sycl;
23+
using namespace std::chrono;
24+
25+
#define random_float() (rand() / double(RAND_MAX))
26+
#define INTER_NUM (150)
27+
#define KERNEL_NUM (2000)
28+
29+
void run(std::vector<sycl::queue> &queues) {
30+
auto N = 1024 * 16;
31+
size_t global_range = 1024;
32+
size_t local_range = 16;
33+
34+
float *buffer_host0 = sycl::malloc_host<float>(N, queues[0]);
35+
float *buffer_device0 = sycl::malloc_device<float>(N, queues[0]);
36+
37+
float *buffer_host1 = sycl::malloc_host<float>(N, queues[1]);
38+
float *buffer_device1 = sycl::malloc_device<float>(N, queues[1]);
39+
40+
float *buffer_host2 = sycl::malloc_host<float>(N, queues[2]);
41+
float *buffer_device2 = sycl::malloc_device<float>(N, queues[2]);
42+
43+
float *buffer_host3 = sycl::malloc_host<float>(N, queues[3]);
44+
float *buffer_device3 = sycl::malloc_device<float>(N, queues[3]);
45+
46+
for (int i = 0; i < N; ++i) {
47+
buffer_host0[i] = static_cast<float>(random_float());
48+
buffer_host1[i] = static_cast<float>(random_float());
49+
buffer_host2[i] = static_cast<float>(random_float());
50+
buffer_host3[i] = static_cast<float>(random_float());
51+
}
52+
53+
queues[0].memcpy(buffer_device0, buffer_host0, N * sizeof(float)).wait();
54+
queues[1].memcpy(buffer_device1, buffer_host1, N * sizeof(float)).wait();
55+
queues[2].memcpy(buffer_device2, buffer_host2, N * sizeof(float)).wait();
56+
queues[3].memcpy(buffer_device3, buffer_host3, N * sizeof(float)).wait();
57+
58+
for (auto m = 0; m < INTER_NUM; ++m) {
59+
for (int k = 0; k < KERNEL_NUM; ++k) {
60+
auto event0 = queues[0].submit([&](sycl::handler &h) {
61+
h.parallel_for<class kernel0>(
62+
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
63+
cl::sycl::range<1>{local_range}),
64+
[=](cl::sycl::nd_item<1> item) {
65+
int i = item.get_global_linear_id();
66+
buffer_device0[i] = buffer_device0[i] + float(2.0);
67+
});
68+
});
69+
auto event1 = queues[1].submit([&](sycl::handler &h) {
70+
h.parallel_for<class kernel1>(
71+
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
72+
cl::sycl::range<1>{local_range}),
73+
[=](cl::sycl::nd_item<1> item) {
74+
int i = item.get_global_linear_id();
75+
buffer_device1[i] = buffer_device1[i] + float(2.0);
76+
});
77+
});
78+
auto event2 = queues[2].submit([&](sycl::handler &h) {
79+
h.parallel_for<class kernel2>(
80+
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
81+
cl::sycl::range<1>{local_range}),
82+
[=](cl::sycl::nd_item<1> item) {
83+
int i = item.get_global_linear_id();
84+
buffer_device2[i] = buffer_device2[i] + float(2.0);
85+
});
86+
});
87+
auto event3 = queues[3].submit([&](sycl::handler &h) {
88+
h.parallel_for<class kernel3>(
89+
cl::sycl::nd_range<1>(cl::sycl::range<1>{global_range},
90+
cl::sycl::range<1>{local_range}),
91+
[=](cl::sycl::nd_item<1> item) {
92+
int i = item.get_global_linear_id();
93+
buffer_device3[i] = buffer_device3[i] + float(2.0);
94+
});
95+
});
96+
}
97+
queues[0].wait();
98+
queues[1].wait();
99+
queues[2].wait();
100+
queues[3].wait();
101+
}
102+
103+
free(buffer_host0, queues[0]);
104+
free(buffer_device0, queues[0]);
105+
106+
free(buffer_host1, queues[1]);
107+
free(buffer_device1, queues[1]);
108+
109+
free(buffer_host2, queues[2]);
110+
free(buffer_device2, queues[2]);
111+
112+
free(buffer_host3, queues[3]);
113+
free(buffer_device3, queues[3]);
114+
115+
std::cout << "[info] Finish all" << std::endl;
116+
}
117+
118+
int main(void) {
119+
std::cout << "[info] this case is used to submit workloads to queues on "
120+
"subsub device"
121+
<< std::endl;
122+
std::vector<sycl::device> subsub;
123+
124+
auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
125+
std::cout << "[info] device count = " << devices.size() << std::endl;
126+
127+
// watch out device here
128+
auto subdevices =
129+
devices[1]
130+
.create_sub_devices<
131+
sycl::info::partition_property::partition_by_affinity_domain>(
132+
sycl::info::partition_affinity_domain::next_partitionable);
133+
std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
134+
for (auto &subdev : subdevices) {
135+
auto subsubdevices = subdev.create_sub_devices<
136+
sycl::info::partition_property::partition_by_affinity_domain>(
137+
sycl::info::partition_affinity_domain::next_partitionable);
138+
std::cout << "[info] sub-sub device size = " << subsubdevices.size()
139+
<< std::endl;
140+
for (auto &subsubdev : subsubdevices) {
141+
subsub.push_back(subsubdev);
142+
}
143+
}
144+
145+
std::cout << "[info] all sub-sub devices count: " << subsub.size()
146+
<< std::endl;
147+
std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices"
148+
<< std::endl;
149+
150+
sycl::queue q0(subsub[0], {cl::sycl::property::queue::enable_profiling(),
151+
cl::sycl::property::queue::in_order()});
152+
sycl::queue q1(subsub[1], {cl::sycl::property::queue::enable_profiling(),
153+
cl::sycl::property::queue::in_order()});
154+
sycl::queue q2(subsub[2], {cl::sycl::property::queue::enable_profiling(),
155+
cl::sycl::property::queue::in_order()});
156+
sycl::queue q3(subsub[4], {cl::sycl::property::queue::enable_profiling(),
157+
cl::sycl::property::queue::in_order()});
158+
159+
std::vector<sycl::queue> queues;
160+
161+
queues.push_back(std::move(q0));
162+
queues.push_back(std::move(q1));
163+
queues.push_back(std::move(q2));
164+
queues.push_back(std::move(q3));
165+
166+
run(queues);
167+
168+
return 0;
169+
}

0 commit comments

Comments
 (0)