|
18 | 18 | #include "test_macros.h" |
19 | 19 |
|
20 | 20 | template <typename Barrier, template <typename, typename> class Selector, typename Initializer = constructor_initializer> |
21 | | -__host__ __device__ void test(bool add_delay = false) |
| 21 | +__host__ __device__ int test(bool add_delay = false) |
22 | 22 | { |
| 23 | + printf("delay %s\r\n", add_delay ? "enabled" : "disabled"); |
| 24 | + |
23 | 25 | Selector<Barrier, Initializer> sel; |
24 | 26 | SHARED Barrier* b; |
25 | | - b = sel.construct(2); |
26 | | - auto delay = cuda::std::chrono::duration<int>(0); |
| 27 | + b = sel.construct(2); |
| 28 | + auto delay = cuda::std::chrono::nanoseconds(0); |
| 29 | + auto timeout = cuda::std::chrono::nanoseconds(100000000); |
27 | 30 |
|
28 | 31 | if (add_delay) |
29 | 32 | { |
30 | | - delay = cuda::std::chrono::duration<int>(1); |
| 33 | + delay = cuda::std::chrono::nanoseconds(100000); |
31 | 34 | } |
32 | 35 |
|
33 | | - typename Barrier::arrival_token* tok = nullptr; |
34 | | - execute_on_main_thread([&] { |
35 | | - tok = new auto(b->arrive()); |
36 | | - }); |
| 36 | + auto time = cuda::std::chrono::high_resolution_clock::now(); |
| 37 | + cuda::std::atomic_ref<decltype(time)> time_ref(time); |
37 | 38 |
|
38 | | - auto awaiter = LAMBDA() |
| 39 | + auto measure = LAMBDA()->cuda::std::chrono::nanoseconds |
39 | 40 | { |
40 | | - while (b->try_wait_for(cuda::std::move(*tok), delay) == false) |
41 | | - { |
42 | | - } |
| 41 | + return cuda::std::chrono::duration_cast<cuda::std::chrono::nanoseconds>( |
| 42 | + cuda::std::chrono::high_resolution_clock::now() - time_ref.load()); |
43 | 43 | }; |
44 | | - auto arriver = LAMBDA() |
| 44 | + |
45 | 45 | { |
46 | | - (void) b->arrive(); |
47 | | - }; |
48 | | - concurrent_agents_launch(awaiter, arriver); |
| 46 | + typename Barrier::arrival_token* tok = nullptr; |
| 47 | + execute_on_main_thread([&] { |
| 48 | + tok = new auto(b->arrive()); |
| 49 | + }); |
49 | 50 |
|
50 | | - execute_on_main_thread([&] { |
51 | | - auto tok2 = b->arrive(2); |
52 | | - while (b->try_wait_for(cuda::std::move(tok2), delay) == false) |
| 51 | + auto awaiter = LAMBDA() |
| 52 | + { |
| 53 | + time_ref = cuda::std::chrono::high_resolution_clock::now(); |
| 54 | + while ((b->try_wait_for(cuda::std::move(*tok), delay) == false) && (measure() < timeout)) |
| 55 | + { |
| 56 | + } |
| 57 | + printf("p1 barrier delay: %lluns\r\n", measure().count()); |
| 58 | + }; |
| 59 | + auto arriver = LAMBDA() |
53 | 60 | { |
| 61 | + (void) b->arrive(); |
| 62 | + }; |
| 63 | + concurrent_agents_launch(awaiter, arriver); |
| 64 | + if (measure() > timeout) |
| 65 | + { |
| 66 | + printf("Deadlock detected in p1\r\n"); |
| 67 | + return 1; |
54 | 68 | } |
55 | | - }); |
| 69 | + } |
| 70 | + { |
| 71 | + execute_on_main_thread([&] { |
| 72 | + auto tok2 = b->arrive(2); |
| 73 | + time_ref = ::cuda::std::chrono::high_resolution_clock::now(); |
| 74 | + while ((b->try_wait_for(cuda::std::move(tok2), delay) == false) && (measure() < timeout)) |
| 75 | + { |
| 76 | + } |
| 77 | + printf("p2 barrier delay: %lluns\r\n", measure().count()); |
| 78 | + }); |
| 79 | + if (measure() > timeout) |
| 80 | + { |
| 81 | + printf("Deadlock detected in p2\r\n"); |
| 82 | + return 1; |
| 83 | + } |
| 84 | + } |
| 85 | + return 0; |
56 | 86 | } |
57 | 87 |
|
58 | 88 | int main(int, char**) |
59 | 89 | { |
60 | | - NV_IF_ELSE_TARGET( |
| 90 | + int failure = 0; |
| 91 | + NV_IF_TARGET( |
61 | 92 | NV_IS_HOST, |
62 | | - ( |
63 | | - // Required by concurrent_agents_launch to know how many we're launching |
64 | | - cuda_thread_count = 2; |
65 | | - |
66 | | - test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(); |
67 | | - test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true);), |
68 | | - (test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(); |
69 | | - test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(); |
70 | | - test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true); |
71 | | - test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true);)) |
| 93 | + (cuda_thread_count = 2; failure |= test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(); |
| 94 | + failure |= test<cuda::barrier<cuda::thread_scope_block>, local_memory_selector>(true);), |
| 95 | + (failure |= test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(); |
| 96 | + failure |= test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(); |
| 97 | + failure |= test<cuda::barrier<cuda::thread_scope_block>, shared_memory_selector>(true); |
| 98 | + failure |= test<cuda::barrier<cuda::thread_scope_block>, global_memory_selector>(true);)) |
72 | 99 |
|
73 | | - return 0; |
| 100 | + return failure; |
74 | 101 | } |
0 commit comments