Skip to content

Commit 1e85ee5

Browse files
Debug MG egonet issues (#2926)
@jnke2016 identified some issues while creating the Python bindings for egonet. This PR addresses issues found while testing MG egonet in different MG configurations. Authors: - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Seunghwa Kang (https://github.com/seunghwak) - Joseph Nke (https://github.com/jnke2016) URL: #2926
1 parent 403ee20 commit 1e85ee5

File tree

4 files changed

+94
-90
lines changed

4 files changed

+94
-90
lines changed

cpp/src/community/egonet_impl.cuh

Lines changed: 46 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,7 @@
2121
#include <cugraph/graph_functions.hpp>
2222
#include <cugraph/graph_view.hpp>
2323
#include <cugraph/utilities/error.hpp>
24-
25-
#include <cstddef>
26-
#include <cugraph/algorithms.hpp>
27-
#include <memory>
28-
#include <tuple>
29-
#include <utility>
24+
#include <cugraph/utilities/host_scalar_comm.hpp>
3025

3126
#include <rmm/cuda_stream_view.hpp>
3227
#include <rmm/device_vector.hpp>
@@ -42,6 +37,13 @@
4237

4338
#include <utilities/high_res_timer.hpp>
4439

40+
#include <cstddef>
41+
#include <cugraph/algorithms.hpp>
42+
#include <memory>
43+
#include <numeric>
44+
#include <tuple>
45+
#include <utility>
46+
4547
namespace {
4648

4749
/*
@@ -69,20 +71,25 @@ extract(raft::handle_t const& handle,
6971
bool do_expensive_check)
7072
{
7173
auto user_stream_view = handle.get_stream();
72-
rmm::device_uvector<size_t> neighbors_offsets(source_vertex.size() + 1, user_stream_view);
74+
75+
size_t num_sources = source_vertex.size();
76+
[[maybe_unused]] std::vector<size_t> source_start{{0}};
77+
78+
if constexpr (multi_gpu) {
79+
source_start =
80+
cugraph::host_scalar_allgather(handle.get_comms(), num_sources, handle.get_stream());
81+
num_sources = std::reduce(source_start.begin(), source_start.end());
82+
std::exclusive_scan(source_start.begin(), source_start.end(), source_start.begin(), size_t{0});
83+
}
84+
85+
rmm::device_uvector<size_t> neighbors_offsets(num_sources + 1, user_stream_view);
7386
rmm::device_uvector<vertex_t> neighbors(0, user_stream_view);
7487

75-
std::vector<size_t> h_neighbors_offsets(source_vertex.size() + 1);
88+
std::vector<size_t> h_neighbors_offsets(num_sources + 1);
7689

7790
// Streams will allocate concurrently later
7891
std::vector<rmm::device_uvector<vertex_t>> reached{};
79-
reached.reserve(source_vertex.size());
80-
for (size_t i = 0; i < source_vertex.size(); i++) {
81-
// Allocations and operations are attached to the worker stream
82-
rmm::device_uvector<vertex_t> local_reach(graph_view.local_vertex_partition_range_size(),
83-
handle.get_next_usable_stream(i));
84-
reached.push_back(std::move(local_reach));
85-
}
92+
reached.reserve(num_sources);
8693

8794
user_stream_view.synchronize();
8895
#ifdef TIMING
@@ -96,11 +103,15 @@ extract(raft::handle_t const& handle,
96103
// the vertices and search for matches until the frontiers
97104
// are large enough to use this approach?
98105

99-
for (size_t i = 0; i < source_vertex.size(); i++) {
106+
for (size_t i = 0; i < num_sources; i++) {
100107
// get light handle from worker pool
101108
raft::handle_t light_handle(handle.get_next_usable_stream(i));
102109
auto worker_stream_view = multi_gpu ? handle.get_stream() : light_handle.get_stream();
103110

111+
rmm::device_uvector<vertex_t> local_reach(graph_view.local_vertex_partition_range_size(),
112+
worker_stream_view);
113+
reached.push_back(std::move(local_reach));
114+
104115
// BFS with cutoff
105116
// consider adding a device API to BFS (i.e., accept source on the device)
106117
bool direction_optimizing = false;
@@ -109,12 +120,24 @@ extract(raft::handle_t const& handle,
109120
reached[i].end(),
110121
std::numeric_limits<vertex_t>::max());
111122

123+
raft::device_span<vertex_t const> source{source_vertex.data() + i, 1};
124+
125+
if constexpr (multi_gpu) {
126+
auto it = std::upper_bound(source_start.begin(), source_start.end(), i);
127+
auto gpu = thrust::distance(source_start.begin(), it) - 1;
128+
129+
if (gpu == handle.get_comms().get_rank())
130+
source = raft::device_span<vertex_t const>{source_vertex.data() + i - source_start[gpu], 1};
131+
else
132+
source = raft::device_span<vertex_t const>{source_vertex.data(), size_t{0}};
133+
}
134+
112135
cugraph::bfs<vertex_t, edge_t, weight_t, multi_gpu>(multi_gpu ? handle : light_handle,
113136
graph_view,
114137
reached[i].data(),
115138
nullptr,
116-
source_vertex.data() + i,
117-
1,
139+
source.data(),
140+
source.size(),
118141
direction_optimizing,
119142
radius,
120143
do_expensive_check);
@@ -146,18 +169,16 @@ extract(raft::handle_t const& handle,
146169

147170
// Construct neighbors offsets (just a scan on neighborhood vector sizes)
148171
h_neighbors_offsets[0] = 0;
149-
for (size_t i = 0; i < source_vertex.size(); i++) {
172+
for (size_t i = 0; i < num_sources; i++) {
150173
h_neighbors_offsets[i + 1] = h_neighbors_offsets[i] + reached[i].size();
151174
}
152-
raft::update_device(neighbors_offsets.data(),
153-
&h_neighbors_offsets[0],
154-
source_vertex.size() + 1,
155-
user_stream_view.value());
156-
neighbors.resize(h_neighbors_offsets[source_vertex.size()], user_stream_view.value());
175+
raft::update_device(
176+
neighbors_offsets.data(), &h_neighbors_offsets[0], num_sources + 1, user_stream_view.value());
177+
neighbors.resize(h_neighbors_offsets[num_sources], user_stream_view.value());
157178
user_stream_view.synchronize();
158179

159180
// Construct the neighbors list concurrently
160-
for (size_t i = 0; i < source_vertex.size(); i++) {
181+
for (size_t i = 0; i < num_sources; i++) {
161182
auto worker_stream_view = handle.get_next_usable_stream(i);
162183
thrust::copy(rmm::exec_policy(worker_stream_view),
163184
reached[i].begin(),
@@ -226,9 +247,6 @@ extract_ego(raft::handle_t const& handle,
226247
vertex_t radius,
227248
bool do_expensive_check)
228249
{
229-
CUGRAPH_EXPECTS(source_vertex.size() > 0, "Need at least one source to extract the egonet from");
230-
CUGRAPH_EXPECTS(source_vertex.size() < static_cast<size_t>(graph_view.number_of_vertices()),
231-
"Can't have more sources to extract from than vertices in the graph");
232250
CUGRAPH_EXPECTS(radius > 0, "Radius should be at least 1");
233251
CUGRAPH_EXPECTS(radius < graph_view.number_of_vertices(), "radius is too large");
234252

cpp/src/structure/decompress_to_edgelist_impl.cuh

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,12 +59,11 @@ decompress_to_edgelist_impl(
5959
std::optional<raft::device_span<vertex_t const>> renumber_map,
6060
bool do_expensive_check)
6161
{
62-
CUGRAPH_EXPECTS(renumber_map.has_value(),
63-
"Invalid input arguments: renumber_map.has_value() should be true if multi-GPU.");
64-
CUGRAPH_EXPECTS(
65-
(*renumber_map).size() == static_cast<size_t>(graph_view.local_vertex_partition_range_size()),
66-
"Invalid input arguments: (*renumber_map).size() should match with the local "
67-
"vertex partition range size.");
62+
CUGRAPH_EXPECTS(!renumber_map.has_value() ||
63+
((*renumber_map).size() ==
64+
static_cast<size_t>(graph_view.local_vertex_partition_range_size())),
65+
"Invalid input arguments: (*renumber_map).size() should match with the local "
66+
"vertex partition range size.");
6867

6968
if (do_expensive_check) { /* currently, nothing to do */
7069
}

cpp/tests/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ if(BUILD_CUGRAPH_MG_TESTS)
476476

477477
###############################################################################################
478478
# - MG EGONET tests ---------------------------------------------------------------------------
479-
ConfigureTestMG(MG_EGO_TEST community/mg_egonet_test.cpp)
479+
ConfigureTestMG(MG_EGO_TEST community/mg_egonet_test.cu)
480480

481481
###############################################################################################
482482
# - MG WEAKLY CONNECTED COMPONENTS tests ------------------------------------------------------
Lines changed: 42 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616

1717
#include "egonet_validate.hpp"
1818

19+
#include <structure/detail/structure_utils.cuh>
20+
1921
#include <utilities/base_fixture.hpp>
2022
#include <utilities/device_comm_wrapper.hpp>
2123
#include <utilities/high_res_clock.h>
@@ -34,6 +36,7 @@
3436
#include <thrust/execution_policy.h>
3537
#include <thrust/iterator/counting_iterator.h>
3638
#include <thrust/sequence.h>
39+
#include <thrust/sort.h>
3740

3841
#include <gtest/gtest.h>
3942

@@ -126,7 +129,7 @@ class Tests_MGEgonet
126129
cugraph::extract_ego(
127130
*handle_,
128131
mg_graph_view,
129-
raft::device_span<vertex_t const>{d_ego_sources.data(), egonet_usecase.ego_sources_.size()},
132+
raft::device_span<vertex_t const>{d_ego_sources.data(), d_ego_sources.size()},
130133
static_cast<vertex_t>(egonet_usecase.radius_));
131134

132135
if (cugraph::test::g_perf) {
@@ -138,11 +141,6 @@ class Tests_MGEgonet
138141
}
139142

140143
if (egonet_usecase.check_correctness_) {
141-
*d_renumber_map_labels = cugraph::test::device_gatherv(
142-
*handle_,
143-
raft::device_span<vertex_t const>(d_renumber_map_labels->data(),
144-
d_renumber_map_labels->size()));
145-
146144
d_ego_edgelist_src = cugraph::test::device_gatherv(
147145
*handle_,
148146
raft::device_span<vertex_t const>(d_ego_edgelist_src.data(), d_ego_edgelist_src.size()));
@@ -157,54 +155,48 @@ class Tests_MGEgonet
157155
d_ego_edgelist_wgt->data(), d_ego_edgelist_wgt->size()));
158156
}
159157

160-
d_ego_edgelist_offsets = cugraph::test::device_gatherv(
161-
*handle_,
158+
size_t offsets_size = d_ego_edgelist_offsets.size();
159+
160+
auto graph_ids_v = cugraph::detail::expand_sparse_offsets(
162161
raft::device_span<size_t const>(d_ego_edgelist_offsets.data(),
163-
d_ego_edgelist_offsets.size()));
162+
d_ego_edgelist_offsets.size()),
163+
vertex_t{0},
164+
handle_->get_stream());
164165

165-
auto [sg_graph, sg_number_map] =
166-
cugraph::test::mg_graph_to_sg_graph(*handle_, mg_graph_view, d_renumber_map_labels, false);
166+
graph_ids_v = cugraph::test::device_gatherv(
167+
*handle_, raft::device_span<vertex_t const>(graph_ids_v.data(), graph_ids_v.size()));
167168

168-
if (my_rank == 0) {
169-
cugraph::unrenumber_int_vertices<vertex_t, false>(
170-
*handle_,
171-
d_ego_edgelist_src.data(),
172-
d_ego_edgelist_src.size(),
173-
d_renumber_map_labels->data(),
174-
std::vector<vertex_t>{mg_graph_view.number_of_vertices()});
175-
176-
cugraph::unrenumber_int_vertices<vertex_t, false>(
177-
*handle_,
178-
d_ego_edgelist_dst.data(),
179-
d_ego_edgelist_dst.size(),
180-
d_renumber_map_labels->data(),
181-
std::vector<vertex_t>{mg_graph_view.number_of_vertices()});
182-
183-
rmm::device_uvector<vertex_t> d_sg_ego_sources(egonet_usecase.ego_sources_.size(),
184-
handle_->get_stream());
185-
186-
if constexpr (std::is_same<int32_t, vertex_t>::value) {
187-
raft::update_device(d_sg_ego_sources.data(),
188-
egonet_usecase.ego_sources_.data(),
189-
egonet_usecase.ego_sources_.size(),
190-
handle_->get_stream());
191-
} else {
192-
std::vector<vertex_t> h_ego_sources(d_sg_ego_sources.size());
193-
std::transform(egonet_usecase.ego_sources_.begin(),
194-
egonet_usecase.ego_sources_.end(),
195-
h_ego_sources.begin(),
196-
[](auto v) { return static_cast<vertex_t>(v); });
197-
raft::update_device(d_sg_ego_sources.data(),
198-
h_ego_sources.data(),
199-
h_ego_sources.size(),
200-
handle_->get_stream());
201-
}
169+
if (d_ego_edgelist_wgt) {
170+
thrust::sort_by_key(
171+
handle_->get_thrust_policy(),
172+
thrust::make_zip_iterator(
173+
graph_ids_v.begin(), d_ego_edgelist_src.begin(), d_ego_edgelist_dst.begin()),
174+
thrust::make_zip_iterator(
175+
graph_ids_v.end(), d_ego_edgelist_src.end(), d_ego_edgelist_dst.end()),
176+
d_ego_edgelist_wgt->begin());
177+
} else {
178+
thrust::sort(handle_->get_thrust_policy(),
179+
thrust::make_zip_iterator(
180+
graph_ids_v.begin(), d_ego_edgelist_src.begin(), d_ego_edgelist_dst.begin()),
181+
thrust::make_zip_iterator(
182+
graph_ids_v.end(), d_ego_edgelist_src.end(), d_ego_edgelist_dst.end()));
183+
}
184+
185+
d_ego_edgelist_offsets = cugraph::detail::compute_sparse_offsets<size_t>(
186+
graph_ids_v.begin(), graph_ids_v.end(), size_t{0}, offsets_size - 1, handle_->get_stream());
202187

188+
auto [sg_graph, sg_number_map] = cugraph::test::mg_graph_to_sg_graph(
189+
*handle_, mg_graph_view, std::optional<rmm::device_uvector<vertex_t>>{std::nullopt}, false);
190+
191+
d_ego_sources = cugraph::test::device_gatherv(
192+
*handle_, raft::device_span<vertex_t const>(d_ego_sources.data(), d_ego_sources.size()));
193+
194+
if (my_rank == 0) {
203195
auto [d_reference_src, d_reference_dst, d_reference_wgt, d_reference_offsets] =
204196
cugraph::extract_ego(
205197
*handle_,
206198
sg_graph.view(),
207-
raft::device_span<vertex_t const>{d_sg_ego_sources.data(), d_sg_ego_sources.size()},
199+
raft::device_span<vertex_t const>{d_ego_sources.data(), d_ego_sources.size()},
208200
static_cast<vertex_t>(egonet_usecase.radius_));
209201

210202
cugraph::test::egonet_validate(*handle_,
@@ -289,7 +281,7 @@ INSTANTIATE_TEST_SUITE_P(
289281
Tests_MGEgonet_File,
290282
::testing::Combine(
291283
// disable correctness checks for large graphs
292-
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, true}),
284+
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, false}),
293285
::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
294286

295287
INSTANTIATE_TEST_SUITE_P(
@@ -301,7 +293,7 @@ INSTANTIATE_TEST_SUITE_P(
301293
Tests_MGEgonet_File64,
302294
::testing::Combine(
303295
// disable correctness checks for large graphs
304-
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, true}),
296+
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, false}),
305297
::testing::Values(cugraph::test::File_Usecase("test/datasets/karate.mtx"))));
306298

307299
INSTANTIATE_TEST_SUITE_P(
@@ -313,12 +305,7 @@ INSTANTIATE_TEST_SUITE_P(
313305
Tests_MGEgonet_Rmat,
314306
::testing::Combine(
315307
// disable correctness checks for large graphs
316-
::testing::Values(Egonet_Usecase{std::vector<int32_t>{0}, 1, false, true},
317-
Egonet_Usecase{std::vector<int32_t>{0}, 2, false, true},
318-
Egonet_Usecase{std::vector<int32_t>{0}, 3, false, true},
319-
Egonet_Usecase{std::vector<int32_t>{10, 0, 5}, 2, false, true},
320-
Egonet_Usecase{std::vector<int32_t>{9, 3, 10}, 2, false, true},
321-
Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, true}),
308+
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, false}),
322309
::testing::Values(
323310
cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false, 0, true))));
324311

@@ -331,7 +318,7 @@ INSTANTIATE_TEST_SUITE_P(
331318
Tests_MGEgonet_Rmat64,
332319
::testing::Combine(
333320
// disable correctness checks for large graphs
334-
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, true}),
321+
::testing::Values(Egonet_Usecase{std::vector<int32_t>{5, 9, 3, 10, 12, 13}, 2, true, false}),
335322
::testing::Values(
336323
cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false, 0, true))));
337324

0 commit comments

Comments
 (0)