oneAPI-samples/Publications/GPU-Opt-Guide/prefetch/prefetch_3.cpp at main · oneapi-src/oneAPI-samples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
//==============================================================
// Copyright © 2023 Intel Corporation
//
// SPDX-License-Identifier: MIT
// =============================================================
#include <iostream>
#include <sycl/sycl.hpp>

enum LSC_LDCC {
  LSC_LDCC_DEFAULT,
  LSC_LDCC_L1UC_L3UC, // 1 // Override to L1 uncached and L3 uncached
  LSC_LDCC_L1UC_L3C,  // 2 // Override to L1 uncached and L3 cached
  LSC_LDCC_L1C_L3UC,  // 3 // Override to L1 cached and L3 uncached
  LSC_LDCC_L1C_L3C,   // 4 // Override to L1 cached and L3 cached
  LSC_LDCC_L1S_L3UC,  // 5 // Override to L1 streaming load and L3 uncached
  LSC_LDCC_L1S_L3C,   // 6 // Override to L1 streaming load and L3 cached
  LSC_LDCC_L1IAR_L3C, // 7 // Override to L1 invalidate-after-read, and L3
                      // cached
};

extern "C" {
SYCL_EXTERNAL void
__builtin_IB_lsc_prefetch_global_uint(const __attribute__((opencl_global))
                                      uint32_t *base,
                                      int immElemOff, enum LSC_LDCC cacheOpt);
}

int main() {
  sycl::queue q{sycl::gpu_selector_v,
                sycl::property::queue::enable_profiling{}};

  std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>()
            << std::endl;

  srand(time(NULL));

  constexpr int SIMD_SIZE = 32;
  constexpr int SUBGROUPS = 16;
  constexpr int GLOBAL_WORK_SIZE = SIMD_SIZE * SUBGROUPS;

  constexpr int ITEMS_PER_LANE = 2048;
  constexpr int ITEMS_PER_SUBGROUP = ITEMS_PER_LANE * SIMD_SIZE;
  constexpr int DATA_SIZE = ITEMS_PER_SUBGROUP * SUBGROUPS;

  float *result = sycl::malloc_shared<float>(GLOBAL_WORK_SIZE, q);
  float *values = sycl::malloc_shared<float>(DATA_SIZE, q);
  int *indices = sycl::malloc_shared<int>(DATA_SIZE, q);
  for (auto i = 0; i < DATA_SIZE; ++i) {
    values[i] = static_cast<float>(i);
    // Fill indices in random order.
    indices[i] = rand() % DATA_SIZE;
  }

  // Snippet begin
  auto e = q.submit([&](auto &h) {
    h.parallel_for(
        sycl::nd_range(sycl::range{GLOBAL_WORK_SIZE},
                       sycl::range{GLOBAL_WORK_SIZE}),
        [=](sycl::nd_item<1> it) [[intel::reqd_sub_group_size(SIMD_SIZE)]] {
          const int i = it.get_global_linear_id();
          const int lane = it.get_sub_group().get_local_id()[0];
          const int subgroup = it.get_sub_group().get_group_id()[0];

          // Index starting position
          int *indexCurrent = indices + lane + subgroup * ITEMS_PER_SUBGROUP;
          int *indexNext = indexCurrent + SIMD_SIZE;

          float dx = 0.0f;
#pragma unroll(0)
          for (int j = 0; j < ITEMS_PER_LANE; ++j) {
        // Prefetch next index to cache
#if __SYCL_DEVICE_ONLY__
            if (j < ITEMS_PER_LANE - 1)
              __builtin_IB_lsc_prefetch_global_uint(
                  (const __attribute__((opencl_global)) uint32_t *)indexNext, 0,
                  LSC_LDCC_L1C_L3C);
#endif
            // Load index, might be cached
            int index = *indexCurrent;
            // Latency might be reduced if index was cached
            float v = values[index];
            for (int k = 0; k < 64; ++k)
              dx += sycl::sqrt(v + k);

            indexCurrent = indexNext;
            indexNext += SIMD_SIZE;
          }

          result[i] = dx;
        });
  });
  // Snippet end
  q.wait();
  std::cout << "Kernel time = "
            << (e.template get_profiling_info<
                    sycl::info::event_profiling::command_end>() -
                e.template get_profiling_info<
                    sycl::info::event_profiling::command_start>())
            << " ns" << std::endl;
  return 0;
}