Skip to content

Commit 29ed855

Browse files
caugonnetandralexbernhardmgruber
authored andcommitted
[STF] Document dot sections (NVIDIA#3506)
* Start to document STF dot sections * fix formatting * Minor fixes in the doc * Add missing file * clang-format * Remove dot_push_section and dot_pop_section and also fix a bazillion warnings * Format * More Format * Add missing mv * misc. C++ fixes and clang-format * Update dot_section doc to reflect that we removed dot_push_section and dot_pop_section * - Fix documentation error - Use the dot_section doc example as a test - do not use assert directly * Review and a few more touches * Improvement for docs/cudax/stf.rst Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com> --------- Co-authored-by: Andrei Alexandrescu <andrei@erdani.com> Co-authored-by: Bernhard Manfred Gruber <bernhardmgruber@gmail.com>
1 parent 1403479 commit 29ed855

File tree

22 files changed

+369
-136
lines changed

22 files changed

+369
-136
lines changed

cudax/examples/stf/linear_algebra/07-cholesky.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -659,14 +659,14 @@ int main(int argc, char** argv)
659659
return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row);
660660
};
661661

662-
ctx.dot_push_section("fillA");
662+
auto s = ctx.dot_section("fillA");
663663
if (check_result)
664664
{
665665
Aref.fill(hilbert);
666666
}
667667

668668
A.fill(hilbert);
669-
ctx.dot_pop_section();
669+
s.end();
670670

671671
/* Right-hand side */
672672
matrix<double> B_potrs(N, 1, NB, 1, false, "B");
@@ -693,9 +693,9 @@ int main(int argc, char** argv)
693693
cudaEvent_t startEvent_pdpotrf, stopEvent_pdpotrf;
694694
float milliseconds_pdpotrf = 0;
695695

696-
// for (int row = 0; row < A.mt; row++)
696+
// for (size_t row = 0; row < A.mt; row++)
697697
// {
698-
// for (int col = 0; col <= row; col++)
698+
// for (size_t col = 0; col <= row; col++)
699699
// {
700700
// cuda_safe_call(cudaSetDevice(A.get_preferred_devid(row, col)));
701701
// NOOP(A, row, col);

cudax/examples/stf/linear_algebra/07-potri.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,17 +197,17 @@ public:
197197
void print()
198198
{
199199
// print blocks by blocks
200-
for (int colb = 0; colb < nt; colb++)
200+
for (size_t colb = 0; colb < nt; colb++)
201201
{
202202
int low_rowb = sym_matrix ? colb : 0;
203-
for (int rowb = low_rowb; rowb < mt; rowb++)
203+
for (size_t rowb = low_rowb; rowb < mt; rowb++)
204204
{
205205
// Each task fills a block
206206
ctx.host_launch(get_handle(rowb, colb).read())->*[=](auto sA) {
207-
for (int lcol = 0; lcol < sA.extent(1); lcol++)
207+
for (size_t lcol = 0; lcol < sA.extent(1); lcol++)
208208
{
209209
size_t col = lcol + colb * sA.extent(1);
210-
for (int lrow = 0; lrow < sA.extent(0); lrow++)
210+
for (size_t lrow = 0; lrow < sA.extent(0); lrow++)
211211
{
212212
size_t row = lrow + rowb * sA.extent(0);
213213

cudax/examples/stf/linear_algebra/cg_csr.cu

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ public:
5151
static void copy_vector(const vector& from, vector& to)
5252
{
5353
to.ctx.parallel_for(to.handle.shape(), to.handle.write(), from.handle.read()).set_symbol("copy_vector")
54-
->*[] _CCCL_DEVICE(size_t i, slice<double> dto, slice<double> dfrom) {
54+
->*[] _CCCL_DEVICE(size_t i, slice<double> dto, slice<const double> dfrom) {
5555
dto(i) = dfrom(i);
5656
};
5757
}
@@ -116,6 +116,13 @@ public:
116116
copy_scalar(a, *this);
117117
}
118118

119+
scalar& operator=(scalar&& a)
120+
{
121+
handle = mv(a.handle);
122+
ctx = mv(a.ctx);
123+
return *this;
124+
}
125+
119126
scalar operator/(scalar const& rhs) const
120127
{
121128
// Submit a task that computes this/rhs

cudax/include/cuda/experimental/__stf/internal/backend_ctx.cuh

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -955,16 +955,6 @@ public:
955955
reserved::per_ctx_dot::set_parent_ctx(parent_ctx.get_dot(), get_dot());
956956
}
957957

958-
void dot_push_section(::std::string symbol) const
959-
{
960-
reserved::dot::section::push(mv(symbol));
961-
}
962-
963-
void dot_pop_section() const
964-
{
965-
reserved::dot::section::pop();
966-
}
967-
968958
auto dot_section(::std::string symbol) const
969959
{
970960
return reserved::dot::section::guard(mv(symbol));

cudax/include/cuda/experimental/__stf/internal/dot.cuh

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ public:
163163
static int get_current_section_id();
164164

165165
template <typename task_type, typename data_type>
166-
void add_vertex(task_type t)
166+
void add_vertex(const task_type& t)
167167
{
168168
// Do this work outside the critical section
169169
const auto remove_deps = getenv("CUDASTF_DOT_REMOVE_DATA_DEPS");
@@ -208,7 +208,7 @@ public:
208208
}
209209

210210
template <typename task_type>
211-
void add_vertex_timing(task_type t, float time_ms, int device = -1)
211+
void add_vertex_timing(const task_type& t, float time_ms, int device = -1)
212212
{
213213
::std::lock_guard<::std::mutex> guard(mtx);
214214

@@ -286,7 +286,7 @@ public:
286286
::std::shared_ptr<per_ctx_dot> parent;
287287
::std::vector<::std::shared_ptr<per_ctx_dot>> children;
288288

289-
const ::std::string get_ctx_symbol() const
289+
const ::std::string& get_ctx_symbol() const
290290
{
291291
return ctx_symbol;
292292
}
@@ -352,7 +352,10 @@ public:
352352
// Constructor to initialize symbol and children
353353
section(::std::string sym)
354354
: symbol(mv(sym))
355-
{}
355+
{
356+
static_assert(::std::is_move_constructible_v<section>, "section must be move constructible");
357+
static_assert(::std::is_move_assignable_v<section>, "section must be move assignable");
358+
}
356359

357360
class guard
358361
{
@@ -362,10 +365,24 @@ public:
362365
section::push(mv(symbol));
363366
}
364367

365-
~guard()
368+
void end()
366369
{
370+
_CCCL_ASSERT(active, "Attempting to end the same section twice.");
367371
section::pop();
372+
active = false;
373+
}
374+
375+
~guard()
376+
{
377+
if (active)
378+
{
379+
section::pop();
380+
}
368381
}
382+
383+
private:
384+
// Have we called end() ?
385+
bool active = true;
369386
};
370387

371388
static auto& current()
@@ -380,7 +397,7 @@ public:
380397
auto sec = ::std::make_shared<section>(mv(symbol));
381398
int id = sec->get_id();
382399

383-
int parent_id = current().size() == 0 ? 0 : current().top();
400+
int parent_id = current().empty() ? 0 : current().top();
384401
sec->parent_id = parent_id;
385402

386403
// Save the section in the map
@@ -416,7 +433,7 @@ public:
416433
return 1 + int(id);
417434
}
418435

419-
const ::std::string get_symbol() const
436+
const ::std::string& get_symbol() const
420437
{
421438
return symbol;
422439
}
@@ -431,7 +448,7 @@ public:
431448
::std::vector<int> children_ids;
432449

433450
private:
434-
int depth;
451+
int depth = ::std::numeric_limits<int>::min();
435452

436453
::std::string symbol;
437454

cudax/include/cuda/experimental/stf.cuh

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -635,40 +635,14 @@ public:
635635
payload);
636636
}
637637

638-
/**
639-
* @brief Start a new section in the DOT file identified by its symbol
640-
*/
641-
void dot_push_section(::std::string symbol) const
642-
{
643-
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
644-
::std::visit(
645-
[symbol = mv(symbol)](auto& self) {
646-
self.dot_push_section(symbol);
647-
},
648-
payload);
649-
}
650-
651-
/**
652-
* @brief Ends current dot section
653-
*/
654-
void dot_pop_section() const
655-
{
656-
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
657-
::std::visit(
658-
[](auto& self) {
659-
self.dot_pop_section();
660-
},
661-
payload);
662-
}
663-
664638
/**
665639
* @brief RAII-style description of a new section in the DOT file identified by its symbol
666640
*/
667641
auto dot_section(::std::string symbol) const
668642
{
669643
_CCCL_ASSERT(payload.index() != ::std::variant_npos, "Context is not initialized");
670644
return ::std::visit(
671-
[symbol = mv(symbol)](auto& self) {
645+
[&symbol](auto& self) {
672646
return self.dot_section(symbol);
673647
},
674648
payload);

cudax/test/stf/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ set(stf_test_sources
88
dot/basic.cu
99
dot/graph_print_to_dot.cu
1010
dot/sections.cu
11+
dot/sections_2.cu
1112
dot/with_events.cu
1213
error_checks/ctx_mismatch.cu
1314
error_checks/data_interface_mismatch.cu

cudax/test/stf/cuda-samples/3_CUDA_Features/jacobiCudaGraphs/jacobi.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ static __global__ void finalError(double* x, double* g_sum)
228228
double JacobiMethodGpuCudaGraphExecKernelSetParams(
229229
const float* A,
230230
const double* b,
231-
const float conv_threshold,
231+
float conv_threshold,
232232
const int max_iter,
233233
double* x,
234234
double* x_new,

cudax/test/stf/dot/sections_2.cu

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Part of CUDASTF in CUDA C++ Core Libraries,
4+
// under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
/**
12+
* @file
13+
* @brief This test makes sure we can generate a dot file with sections
14+
*/
15+
16+
#include <cuda/experimental/stf.cuh>
17+
18+
using namespace cuda::experimental::stf;
19+
20+
int main()
21+
{
22+
// TODO (miscco): Make it work for windows
23+
#if !_CCCL_COMPILER(MSVC)
24+
context ctx;
25+
auto lA = ctx.logical_token().set_symbol("A");
26+
auto lB = ctx.logical_token().set_symbol("B");
27+
auto lC = ctx.logical_token().set_symbol("C");
28+
29+
// Begin a top-level section named "foo"
30+
auto s_foo = ctx.dot_section("foo");
31+
for (size_t i = 0; i < 2; i++)
32+
{
33+
// Section named "bar" using RAII
34+
auto s_bar = ctx.dot_section("bar");
35+
ctx.task(lA.read(), lB.rw()).set_symbol("t1")->*[](cudaStream_t, auto, auto) {};
36+
for (size_t j = 0; j < 2; j++)
37+
{
38+
// Section named "baz" using RAII
39+
auto s_bar = ctx.dot_section("baz");
40+
ctx.task(lA.read(), lC.rw()).set_symbol("t2")->*[](cudaStream_t, auto, auto) {};
41+
ctx.task(lB.read(), lC.read(), lA.rw()).set_symbol("t3")->*[](cudaStream_t, auto, auto, auto) {};
42+
// Implicit end of section "baz"
43+
}
44+
// Implicit end of section "bar"
45+
}
46+
s_foo.end(); // Explicit end of section "foo"
47+
ctx.finalize();
48+
#endif // !_CCCL_COMPILER(MSVC)
49+
}

0 commit comments

Comments
 (0)