11/*
2- * Copyright (c) 2019-2020 , NVIDIA CORPORATION.
2+ * Copyright (c) 2019-2021 , NVIDIA CORPORATION.
33 *
44 * Licensed under the Apache License, Version 2.0 (the "License");
55 * you may not use this file except in compliance with the License.
@@ -71,12 +71,13 @@ struct SCC_Data {
7171 p_d_r_o_ (p_d_r_o),
7272 p_d_c_i_ (p_d_c_i),
7373 d_C (nrows * nrows, 0 ),
74- d_Cprev (nrows * nrows, 0 )
74+ d_Cprev (nrows * nrows, 0 ),
75+ p_d_C_ (d_C.data().get())
7576 {
7677 init ();
7778 }
7879
79- const thrust::device_vector< ByteT>& get_C (void ) const { return d_C ; }
80+ ByteT const * get_Cptr (void ) const { return p_d_C_ ; }
8081
8182 size_t nrows (void ) const { return nrows_; }
8283
@@ -100,13 +101,12 @@ struct SCC_Data {
100101
101102 void get_labels (IndexT* d_labels) const
102103 {
103- auto * p_d_C = d_C.data ().get ();
104- size_t n = nrows_; // for lambda capture, since I cannot capture `this` (host), or `nrows_`
104+ size_t n = nrows_; // for lambda capture, since I cannot capture `this` (host), or `nrows_`
105105 thrust::transform (thrust::device,
106106 thrust::make_counting_iterator<IndexT>(0 ),
107107 thrust::make_counting_iterator<IndexT>(nrows_),
108108 d_labels,
109- [n, p_d_C] __device__ (IndexT k) {
109+ [n, p_d_C = p_d_C_ ] __device__ (IndexT k) {
110110 auto begin = p_d_C + k * n;
111111 auto end = begin + n;
112112 ByteT one{1 };
@@ -124,7 +124,6 @@ struct SCC_Data {
124124 size_t nrows = nrows_;
125125 size_t count = 0 ;
126126
127- ByteT* p_d_C = d_C.data ().get ();
128127 ByteT* p_d_Cprev = get_Cprev ().data ().get ();
129128
130129 size_t n2 = nrows * nrows;
@@ -136,57 +135,60 @@ struct SCC_Data {
136135 do {
137136 flag.set (0 );
138137
139- thrust::for_each (thrust::device,
140- thrust::make_counting_iterator< size_t >( 0 ) ,
141- thrust::make_counting_iterator<size_t >(n2 ),
142- [nrows, p_d_C, p_d_Cprev, p_d_flag, p_d_ro, p_d_ci] __device__ ( size_t indx) {
143- ByteT one{ 1 };
144-
145- auto i = indx / nrows;
146- auto j = indx % nrows;
147-
148- if ((i == j) || (p_d_Cprev[indx] == one))
149- p_d_C [indx] = one;
150- else {
151- // this is where a hash-map could help:
152- // only need hashmap[(i,j)]={0,1} (`1` for "hit");
153- // and only for new entries!
154- // already existent entries are covered by
155- // the `if`-branch above!
156- // Hence, hashmap[] can use limited space:
157- // M = max_l{number(new `1` entries)}, where
158- // l = #iterations in the do-loop!
159- // M ~ new `1` entries between A^k and A^{k+1},
160- // k=1,2,...
161- // Might M actually be M ~ nnz(A) = |E| ?!
162- // Probably, because the primitive hash
163- // (via find_if) uses a search space of nnz(A)
164- //
165- // But, what if more than 1 entry pops-up in a row?
166- // Not an issue! Because the hash key is (i,j), and no
167- // more than one entry can exist in position (i,j)!
168- //
169- // And remember, we only need to store the new (i,j) keys
170- // that an iteration produces wrt to the previous iteration!
171- //
172- auto begin = p_d_ci + p_d_ro[i];
173- auto end = p_d_ci + p_d_ro[i + 1 ];
174- auto pos = thrust::find_if (
175- thrust::seq, begin, end, [one, j, nrows, p_d_Cprev, p_d_ci](IndexT k) {
176- return (p_d_Cprev[k * nrows + j] == one );
177- });
178-
179- if (pos != end) p_d_C[indx] = one;
180- }
181-
182- if (p_d_C[indx] != p_d_Cprev[indx])
183- *p_d_flag = 1 ; // race-condition: harmless, worst case many threads
184- // write the same value
185- });
138+ thrust::for_each (
139+ thrust::device ,
140+ thrust::make_counting_iterator<size_t >(0 ),
141+ thrust::make_counting_iterator< size_t >(n2),
142+ [nrows, p_d_C = p_d_C_, p_d_Cprev, p_d_flag, p_d_ro, p_d_ci] __device__ ( size_t indx) {
143+ ByteT one{ 1 };
144+
145+ auto i = indx / nrows;
146+ auto j = indx % nrows;
147+
148+ if ((i == j) || (p_d_Cprev [indx] == one)) {
149+ p_d_C[indx] = one;
150+ } else {
151+ // this ammounts to A (^,v) B
152+ // (where A = adjacency matrix defined by (p_ro, p_ci),
153+ // B := p_d_Cprev; (^,v) := (*,+) semiring);
154+ // Here's why:
155+ // (A (^,v) B)[i][j] := A[i][.] (^,v) B[j][.]
156+ // (where X[i][.] := i-th row of X;
157+ // X[.][j] := j-th column of X);
158+ // which is:
159+ // 1, iff A[i][.] and B[j][.] have a 1 in the same location,
160+ // 0, otherwise;
161+ //
162+ // i.e., corresponfing entry in p_d_C is 1
163+ // if B[k][j] == 1 for any column k in A's i-th row;
164+ // hence, for each column k of row A[i][.],
165+ // which is the set:
166+ // k \in {p_ci + p_ro[i], ..., p_ci + p_ro[i+1] - 1},
167+ // check if (B[k][j] == 1),
168+ // i.e., p_d_Cprev[k*nrows + j]) == 1:
169+ //
170+ auto begin = p_d_ci + p_d_ro[i];
171+ auto end = p_d_ci + p_d_ro[i + 1 ];
172+ auto pos = thrust::find_if (
173+ thrust::seq, begin, end, [one, j, nrows, p_d_Cprev, p_d_ci](IndexT k) {
174+ return (p_d_Cprev[k * nrows + j] == one);
175+ } );
176+
177+ if (pos != end) p_d_C[indx] = one;
178+ }
179+
180+ if (p_d_C[indx] != p_d_Cprev[indx])
181+ *p_d_flag = 1 ; // race-condition: harmless,
182+ // worst case many threads
183+ // write the _same_ value
184+ });
186185 ++count;
187186 cudaDeviceSynchronize ();
188187
189- std::swap (p_d_C, p_d_Cprev);
188+ std::swap (p_d_C_, p_d_Cprev); // Note 1: this swap makes `p_d_Cprev` the
189+ // most recently updated matrix pointer
190+ // at the end of this loop
191+ // (see `Note 2` why this matters);
190192 } while (flag.is_set ());
191193
192194 // C & Ct:
@@ -196,11 +198,13 @@ struct SCC_Data {
196198 thrust::for_each (thrust::device,
197199 thrust::make_counting_iterator<size_t >(0 ),
198200 thrust::make_counting_iterator<size_t >(n2),
199- [nrows, p_d_C, p_d_Cprev] __device__ (size_t indx) {
201+ [nrows, p_d_C = p_d_C_ , p_d_Cprev] __device__ (size_t indx) {
200202 auto i = indx / nrows;
201203 auto j = indx % nrows;
202204 auto tindx = j * nrows + i;
203205
206+ // Note 2: per Note 1, p_d_Cprev is latest:
207+ //
204208 p_d_C[indx] = (p_d_Cprev[indx]) & (p_d_Cprev[tindx]);
205209 });
206210
@@ -215,6 +219,9 @@ struct SCC_Data {
215219 const IndexT* p_d_c_i_; // column indices
216220 thrust::device_vector<ByteT> d_C;
217221 thrust::device_vector<ByteT> d_Cprev;
222+ ByteT* p_d_C_{nullptr }; // holds the most recent update,
223+ // which can have storage in any of d_C or d_Cprev,
224+ // because the pointers get swapped!
218225
219226 thrust::device_vector<ByteT>& get_Cprev (void ) { return d_Cprev; }
220227};
0 commit comments