11// REQUIRES: gpu-intel-pvc, level_zero
22
33// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
4- // RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
5- // RUN: %GPU_RUN_PLACEHOLDER %t.out
4+ // RUN: env ZE_DEBUG=1 env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
5+ // RUN: env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out
66
77// Check that queues created on sub-sub-devices are going to specific compute
88// engines:
@@ -25,89 +25,62 @@ using namespace std::chrono;
2525#define INTER_NUM (150 )
2626#define KERNEL_NUM (2000 )
2727
28- void run (std::vector<queue> &queues) {
28+ void make_queue_and_run_workload (std::vector<device> &subsubdevices) {
29+ std::cout << " [important] create " << subsubdevices.size ()
30+ << " sycl queues, one for each sub-sub device" << std::endl;
31+
2932 auto N = 1024 * 16 ;
3033 size_t global_range = 1024 ;
3134 size_t local_range = 16 ;
3235
33- float *buffer_host0 = malloc_host<float >(N, queues[0 ]);
34- float *buffer_device0 = malloc_device<float >(N, queues[0 ]);
35-
36- float *buffer_host1 = malloc_host<float >(N, queues[1 ]);
37- float *buffer_device1 = malloc_device<float >(N, queues[1 ]);
38-
39- float *buffer_host2 = malloc_host<float >(N, queues[2 ]);
40- float *buffer_device2 = malloc_device<float >(N, queues[2 ]);
36+ std::vector<queue> queues;
37+ std::vector<float *> host_mem_ptrs;
38+ std::vector<float *> device_mem_ptrs;
39+
40+ // Create queues for each subdevice.
41+ for (auto &ccs : subsubdevices) {
42+ queue q (ccs,
43+ {property::queue::enable_profiling (), property::queue::in_order ()});
44+ auto *host_mem_ptr = malloc_host<float >(N, q);
45+ auto *device_mem_ptr = malloc_device<float >(N, q);
46+
47+ for (int i = 0 ; i < N; ++i) {
48+ host_mem_ptr[i] = static_cast <float >(random_float ());
49+ }
4150
42- float *buffer_host3 = malloc_host<float >(N, queues[3 ]);
43- float *buffer_device3 = malloc_device<float >(N, queues[3 ]);
51+ q.memcpy (device_mem_ptr, host_mem_ptr, N * sizeof (float )).wait ();
4452
45- for (int i = 0 ; i < N; ++i) {
46- buffer_host0[i] = static_cast <float >(random_float ());
47- buffer_host1[i] = static_cast <float >(random_float ());
48- buffer_host2[i] = static_cast <float >(random_float ());
49- buffer_host3[i] = static_cast <float >(random_float ());
53+ host_mem_ptrs.push_back (host_mem_ptr);
54+ device_mem_ptrs.push_back (device_mem_ptr);
55+ queues.push_back (q);
5056 }
5157
52- queues[0 ].memcpy (buffer_device0, buffer_host0, N * sizeof (float )).wait ();
53- queues[1 ].memcpy (buffer_device1, buffer_host1, N * sizeof (float )).wait ();
54- queues[2 ].memcpy (buffer_device2, buffer_host2, N * sizeof (float )).wait ();
55- queues[3 ].memcpy (buffer_device3, buffer_host3, N * sizeof (float )).wait ();
56-
58+ // Run workload.
5759 for (auto m = 0 ; m < INTER_NUM; ++m) {
5860 for (int k = 0 ; k < KERNEL_NUM; ++k) {
59- auto event0 = queues[0 ].submit ([&](handler &h) {
60- h.parallel_for <class kernel0 >(
61- nd_range<1 >(range<1 >{global_range}, range<1 >{local_range}),
62- [=](nd_item<1 > item) {
63- int i = item.get_global_linear_id ();
64- buffer_device0[i] = buffer_device0[i] + float (2.0 );
65- });
66- });
67- auto event1 = queues[1 ].submit ([&](handler &h) {
68- h.parallel_for <class kernel1 >(
69- nd_range<1 >(range<1 >{global_range}, range<1 >{local_range}),
70- [=](nd_item<1 > item) {
71- int i = item.get_global_linear_id ();
72- buffer_device1[i] = buffer_device1[i] + float (2.0 );
73- });
74- });
75- auto event2 = queues[2 ].submit ([&](handler &h) {
76- h.parallel_for <class kernel2 >(
77- nd_range<1 >(range<1 >{global_range}, range<1 >{local_range}),
78- [=](nd_item<1 > item) {
79- int i = item.get_global_linear_id ();
80- buffer_device2[i] = buffer_device2[i] + float (2.0 );
81- });
82- });
83- auto event3 = queues[3 ].submit ([&](handler &h) {
84- h.parallel_for <class kernel3 >(
61+ for (int j = 0 ; j < queues.size (); j++) {
62+ queue current_queue = queues[j];
63+ float *device_mem_ptr = device_mem_ptrs[j];
64+
65+ auto event0 = current_queue.parallel_for <>(
8566 nd_range<1 >(range<1 >{global_range}, range<1 >{local_range}),
8667 [=](nd_item<1 > item) {
8768 int i = item.get_global_linear_id ();
88- buffer_device3 [i] = buffer_device3 [i] + float (2.0 );
69+ device_mem_ptr [i] = device_mem_ptr [i] + float (2.0 );
8970 });
90- });
71+ }
9172 }
92- queues[0 ].wait ();
93- queues[1 ].wait ();
94- queues[2 ].wait ();
95- queues[3 ].wait ();
96- }
97-
98- free (buffer_host0, queues[0 ]);
99- free (buffer_device0, queues[0 ]);
10073
101- free (buffer_host1, queues[1 ]);
102- free (buffer_device1, queues[1 ]);
103-
104- free (buffer_host2, queues[2 ]);
105- free (buffer_device2, queues[2 ]);
74+ for (auto q : queues)
75+ q.wait ();
76+ }
10677
107- free (buffer_host3, queues[3 ]);
108- free (buffer_device3, queues[3 ]);
78+ for (int j = 0 ; j < queues.size (); j++) {
79+ sycl::free (device_mem_ptrs[j], queues[j]);
80+ sycl::free (host_mem_ptrs[j], queues[j]);
81+ }
10982
110- std::cout << " [info] Finish all " << std::endl;
83+ std::cout << " [info] Finish running workload " << std::endl;
11184}
11285
11386int main (void ) {
@@ -116,20 +89,17 @@ int main(void) {
11689 << std::endl;
11790 std::vector<device> subsub;
11891
119- auto devices = device::get_devices (info::device_type::gpu);
120- std::cout << " [info] device count = " << devices.size () << std::endl;
92+ device d;
12193
12294 // watch out device here
123- auto subdevices =
124- devices[1 ]
125- .create_sub_devices <
126- info::partition_property::partition_by_affinity_domain>(
127- info::partition_affinity_domain::next_partitionable);
95+ auto subdevices = d.create_sub_devices <
96+ info::partition_property::partition_by_affinity_domain>(
97+ info::partition_affinity_domain::next_partitionable);
12898 std::cout << " [info] sub device size = " << subdevices.size () << std::endl;
12999 for (auto &subdev : subdevices) {
130100 auto subsubdevices = subdev.create_sub_devices <
131- info::partition_property::partition_by_affinity_domain>(
132- info::partition_affinity_domain::next_partitionable);
101+ info::partition_property::ext_intel_partition_by_cslice>();
102+
133103 std::cout << " [info] sub-sub device size = " << subsubdevices.size ()
134104 << std::endl;
135105 for (auto &subsubdev : subsubdevices) {
@@ -139,26 +109,8 @@ int main(void) {
139109
140110 std::cout << " [info] all sub-sub devices count: " << subsub.size ()
141111 << std::endl;
142- std::cout << " [important] create 4 sycl queues on first 4 sub-sub devices"
143- << std::endl;
144-
145- queue q0 (subsub[0 ],
146- {property::queue::enable_profiling (), property::queue::in_order ()});
147- queue q1 (subsub[1 ],
148- {property::queue::enable_profiling (), property::queue::in_order ()});
149- queue q2 (subsub[2 ],
150- {property::queue::enable_profiling (), property::queue::in_order ()});
151- queue q3 (subsub[4 ],
152- {property::queue::enable_profiling (), property::queue::in_order ()});
153-
154- std::vector<queue> queues;
155-
156- queues.push_back (std::move (q0));
157- queues.push_back (std::move (q1));
158- queues.push_back (std::move (q2));
159- queues.push_back (std::move (q3));
160112
161- run (queues );
113+ make_queue_and_run_workload (subsub );
162114
163115 return 0 ;
164116}
0 commit comments