Skip to content

Commit 4d2316d

Browse files
github-actions[bot]wmaxeyfbusatobernhardmgruber
authored
Fix cuda::barrier missing accounting of results in try_wait (#7538) (#7635)
* Fix cuda::barrier missing accounting of results in try_wait * Cleanup redundant return * Apply suggestion from @fbusato --------- (cherry picked from commit ce23268) Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com> Co-authored-by: Federico Busato <50413820+fbusato@users.noreply.github.com> Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
1 parent 9ec2e8d commit 4d2316d

File tree

3 files changed

+72
-36
lines changed

3 files changed

+72
-36
lines changed

libcudacxx/include/cuda/__barrier/barrier_block_scope.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -243,11 +243,11 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
243243
bool __ready = 0;
244244
::cuda::std::chrono::high_resolution_clock::time_point const __start =
245245
::cuda::std::chrono::high_resolution_clock::now();
246-
::cuda::std::chrono::nanoseconds __elapsed;
246+
::cuda::std::chrono::nanoseconds __elapsed(0);
247247
do
248248
{
249249
const ::cuda::std::uint32_t __wait_nsec = static_cast<::cuda::std::uint32_t>((__nanosec - __elapsed).count());
250-
::cuda::ptx::mbarrier_try_wait(__native_handle(), __token, __wait_nsec);
250+
__ready = ::cuda::ptx::mbarrier_try_wait(__native_handle(), __token, __wait_nsec);
251251
__elapsed = ::cuda::std::chrono::high_resolution_clock::now() - __start;
252252
} while (!__ready && (__nanosec > __elapsed));
253253
return __ready;
@@ -343,11 +343,11 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
343343
int32_t __ready = 0;
344344
::cuda::std::chrono::high_resolution_clock::time_point const __start =
345345
::cuda::std::chrono::high_resolution_clock::now();
346-
::cuda::std::chrono::nanoseconds __elapsed;
346+
::cuda::std::chrono::nanoseconds __elapsed(0);
347347
do
348348
{
349349
const ::cuda::std::uint32_t __wait_nsec = static_cast<::cuda::std::uint32_t>((__nanosec - __elapsed).count());
350-
::cuda::ptx::mbarrier_try_wait_parity(__native_handle(), __phase_parity, __wait_nsec);
350+
__ready = ::cuda::ptx::mbarrier_try_wait_parity(__native_handle(), __phase_parity, __wait_nsec);
351351
__elapsed = ::cuda::std::chrono::high_resolution_clock::now() - __start;
352352
} while (!__ready && (__nanosec > __elapsed));
353353

@@ -397,6 +397,7 @@ class barrier<thread_scope_block, ::cuda::std::__empty_completion> : public __bl
397397
NV_ANY_TARGET,
398398
(return ::cuda::std::__cccl_thread_poll_with_backoff(
399399
::cuda::std::__barrier_poll_tester_parity<barrier>(this, __phase_parity), __nanosec);))
400+
_CCCL_UNREACHABLE();
400401
}
401402

402403
public:

libcudacxx/test/libcudacxx/force_include.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,12 @@ int main(int argc, char** argv)
7777
int ret = fake_main(argc, argv);
7878
if (ret != 0)
7979
{
80+
printf("Host testing returned failure\n");
8081
return ret;
8182
}
8283

84+
printf("Testing on device:\n");
85+
fflush(stdout);
8386
list_devices();
8487
int* cuda_ret = 0;
8588
CUDA_CALL(err, cudaMalloc(&cuda_ret, sizeof(int)));
@@ -111,7 +114,12 @@ int main(int argc, char** argv)
111114
CUDA_CALL(err, cudaDeviceSynchronize());
112115
CUDA_CALL(err, cudaMemcpy(&ret, cuda_ret, sizeof(int), cudaMemcpyDeviceToHost));
113116
CUDA_CALL(err, cudaFree(cuda_ret));
117+
fflush(stdout);
114118

119+
if (ret != 0)
120+
{
121+
printf("Device testing returned failure\n");
122+
}
115123
return ret;
116124
}
117125

libcudacxx/test/libcudacxx/std/thread/thread.barrier/try_wait_for.pass.cpp

Lines changed: 59 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,57 +18,84 @@
1818
#include "test_macros.h"
1919

2020
template <typename Barrier, template <typename, typename> class Selector, typename Initializer = constructor_initializer>
21-
__host__ __device__ void test(bool add_delay = false)
21+
__host__ __device__ int test(bool add_delay = false)
2222
{
23+
printf("delay %s\r\n", add_delay ? "enabled" : "disabled");
24+
2325
Selector<Barrier, Initializer> sel;
2426
SHARED Barrier* b;
25-
b = sel.construct(2);
26-
auto delay = cuda::std::chrono::duration<int>(0);
27+
b = sel.construct(2);
28+
auto delay = cuda::std::chrono::nanoseconds(0);
29+
auto timeout = cuda::std::chrono::nanoseconds(100000000);
2730

2831
if (add_delay)
2932
{
30-
delay = cuda::std::chrono::duration<int>(1);
33+
delay = cuda::std::chrono::nanoseconds(100000);
3134
}
3235

33-
typename Barrier::arrival_token* tok = nullptr;
34-
execute_on_main_thread([&] {
35-
tok = new auto(b->arrive());
36-
});
36+
auto time = cuda::std::chrono::high_resolution_clock::now();
37+
cuda::std::atomic_ref<decltype(time)> time_ref(time);
3738

38-
auto awaiter = LAMBDA()
39+
auto measure = LAMBDA()->cuda::std::chrono::nanoseconds
3940
{
40-
while (b->try_wait_for(cuda::std::move(*tok), delay) == false)
41-
{
42-
}
41+
return cuda::std::chrono::duration_cast<cuda::std::chrono::nanoseconds>(
42+
cuda::std::chrono::high_resolution_clock::now() - time_ref.load());
4343
};
44-
auto arriver = LAMBDA()
44+
4545
{
46-
(void) b->arrive();
47-
};
48-
concurrent_agents_launch(awaiter, arriver);
46+
typename Barrier::arrival_token* tok = nullptr;
47+
execute_on_main_thread([&] {
48+
tok = new auto(b->arrive());
49+
});
4950

50-
execute_on_main_thread([&] {
51-
auto tok2 = b->arrive(2);
52-
while (b->try_wait_for(cuda::std::move(tok2), delay) == false)
51+
auto awaiter = LAMBDA()
52+
{
53+
time_ref = cuda::std::chrono::high_resolution_clock::now();
54+
while ((b->try_wait_for(cuda::std::move(*tok), delay) == false) && (measure() < timeout))
55+
{
56+
}
57+
printf("p1 barrier delay: %lluns\r\n", measure().count());
58+
};
59+
auto arriver = LAMBDA()
5360
{
61+
(void) b->arrive();
62+
};
63+
concurrent_agents_launch(awaiter, arriver);
64+
if (measure() > timeout)
65+
{
66+
printf("Deadlock detected in p1\r\n");
67+
return 1;
5468
}
55-
});
69+
}
70+
{
71+
execute_on_main_thread([&] {
72+
auto tok2 = b->arrive(2);
73+
time_ref = ::cuda::std::chrono::high_resolution_clock::now();
74+
while ((b->try_wait_for(cuda::std::move(tok2), delay) == false) && (measure() < timeout))
75+
{
76+
}
77+
printf("p2 barrier delay: %lluns\r\n", measure().count());
78+
});
79+
if (measure() > timeout)
80+
{
81+
printf("Deadlock detected in p2\r\n");
82+
return 1;
83+
}
84+
}
85+
return 0;
5686
}
5787

5888
int main(int, char**)
5989
{
60-
NV_IF_ELSE_TARGET(
90+
int failure = 0;
91+
NV_IF_TARGET(
6192
NV_IS_HOST,
62-
(
63-
// Required by concurrent_agents_launch to know how many we're launching
64-
cuda_thread_count = 2;
65-
66-
test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>();
67-
test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true);),
68-
(test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>();
69-
test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>();
70-
test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true);
71-
test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true);))
93+
(cuda_thread_count = 2; failure |= test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>();
94+
failure |= test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true);),
95+
(failure |= test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>();
96+
failure |= test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>();
97+
failure |= test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true);
98+
failure |= test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true);))
7299

73-
return 0;
100+
return failure;
74101
}

0 commit comments

Comments
 (0)