Skip to content

Commit 4c86a11

Browse files
committed
Reduce redundant computation in CCL kernel
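When converting the grid-based CCL kernel to a graph-based one, we currently reify the definitionally symmetric adjacency relation in both directions, which costs us additional compute as well as additional storage. This commit removes the redundant computation to increase the CCL performance: each cell now records only the adjacent cells that follow it in the sorted order (at most 4 instead of 8), and the kernel applies every recorded adjacency to both of its endpoints.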
1 parent 1804636 commit 4c86a11

3 files changed

Lines changed: 20 additions & 32 deletions

File tree

device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,20 @@ TRACCC_DEVICE void fast_sv_1(const thread_id_t& thread_id,
7575
const auto cid = static_cast<details::index_t>(
7676
tst * thread_id.getBlockDimX() + thread_id.getLocalThreadIdX());
7777

78-
TRACCC_ASSUME(adjc[tst] <= 8);
78+
TRACCC_ASSUME(adjc[tst] <= 4);
7979
for (unsigned char k = 0; k < adjc[tst]; ++k) {
80-
details::index_t q = gf.at(adjv[8 * tst + k]);
80+
const auto cid2 = adjv[4 * tst + k];
8181

82-
if (gf.at(cid) > q) {
83-
f.at(f.at(cid)) = q;
84-
f.at(cid) = q;
82+
details::index_t q2 = gf.at(cid2);
83+
details::index_t q1 = gf.at(cid);
84+
85+
if (gf.at(cid) > q2) {
86+
f.at(f.at(cid)) = q2;
87+
f.at(cid) = q2;
88+
}
89+
if (gf.at(cid2) > q1) {
90+
f.at(f.at(cid2)) = q1;
91+
f.at(cid2) = q1;
8592
}
8693
}
8794
}
@@ -169,7 +176,7 @@ TRACCC_DEVICE inline void ccl_core(
169176
reduce_problem_cell(cells_device, cid,
170177
static_cast<unsigned int>(partition_start),
171178
static_cast<unsigned int>(partition_end), adjc[tst],
172-
&adjv[8 * tst]);
179+
&adjv[4 * tst]);
173180

174181
f.at(cid) = cid;
175182
gf.at(cid) = cid;
@@ -303,7 +310,7 @@ TRACCC_DEVICE inline void ccl_kernel(
303310
barrier.blockBarrier();
304311

305312
// Vector of indices of the adjacent cells
306-
details::index_t _adjv[details::CELLS_PER_THREAD_STACK_LIMIT * 8];
313+
details::index_t _adjv[details::CELLS_PER_THREAD_STACK_LIMIT * 4];
307314

308315
/*
309316
* The number of adjacent cells for each cell must start at zero, to
@@ -349,7 +356,7 @@ TRACCC_DEVICE inline void ccl_kernel(
349356
(thread_id.getLocalThreadIdX() * cfg.max_cells_per_thread *
350357
cfg.backup_size_multiplier);
351358
adjv = adjv_backup.data() +
352-
(thread_id.getLocalThreadIdX() * 8 * cfg.max_cells_per_thread *
359+
(thread_id.getLocalThreadIdX() * 4 * cfg.max_cells_per_thread *
353360
cfg.backup_size_multiplier);
354361
use_scratch = true;
355362
} else {

device/common/include/traccc/clusterization/device/impl/reduce_problem_cell.ipp

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,18 @@ TRACCC_HOST_DEVICE inline void reduce_problem_cell(
3030
const edm::silicon_cell reference_cell = cells.at(pos);
3131

3232
/*
33-
* First, we traverse the cells backwards, starting from the current
33+
* We traverse the cells forwards, starting just after the current
3434
* cell and working towards the end, collecting adjacent cells
3535
* along the way.
3636
*/
37-
for (unsigned int j = pos - 1; j < pos; --j) {
37+
for (unsigned int j = pos + 1; j < end; ++j) {
3838
/*
3939
* Since the data is sorted, we can assume that if we see a cell
4040
* sufficiently far away in both directions, it becomes
4141
* impossible for that cell to ever be adjacent to this one.
4242
* This is a small optimisation.
4343
*/
44-
if (traccc::details::is_far_enough(reference_cell, cells.at(j))) {
44+
if (traccc::details::is_far_enough(cells.at(j), reference_cell)) {
4545
break;
4646
}
4747

@@ -50,26 +50,7 @@ TRACCC_HOST_DEVICE inline void reduce_problem_cell(
5050
* in the current cell's adjacency set.
5151
*/
5252
if (traccc::details::is_adjacent(reference_cell, cells.at(j))) {
53-
assert(adjc < 8);
54-
adjv[adjc++] = static_cast<unsigned short>(j - start);
55-
}
56-
}
57-
58-
/*
59-
* Now we examine all the cells past the current one, using almost
60-
* the same logic as in the backwards pass.
61-
*/
62-
for (unsigned int j = pos + 1; j < end; ++j) {
63-
/*
64-
* Note that this check now looks in the opposite direction! An
65-
* important difference.
66-
*/
67-
if (traccc::details::is_far_enough(cells.at(j), reference_cell)) {
68-
break;
69-
}
70-
71-
if (traccc::details::is_adjacent(reference_cell, cells.at(j))) {
72-
assert(adjc < 8);
53+
assert(adjc < 4);
7354
adjv[adjc++] = static_cast<unsigned short>(j - start);
7455
}
7556
}

device/common/src/clusterization/clusterization_algorithm.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ clusterization_algorithm::clusterization_algorithm(
2020
m_gf_backup{m_config.backup_size(), mr.main},
2121
m_backup_mutex{vecmem::make_unique_alloc<unsigned int>(mr.main)},
2222
m_adjc_backup{m_config.backup_size(), mr.main},
23-
m_adjv_backup{m_config.backup_size() * 8, mr.main} {
23+
m_adjv_backup{m_config.backup_size() * 4, mr.main} {
2424

2525
copy().setup(m_f_backup)->wait();
2626
copy().setup(m_gf_backup)->wait();

0 commit comments

Comments
 (0)