Skip to content

Commit a88372d

Browse files
authored
Merge 0828ee9 into 9bcfa14
2 parents 9bcfa14 + 0828ee9 commit a88372d

File tree

7 files changed

+532
-148
lines changed

7 files changed

+532
-148
lines changed

cpp/include/cugraph/experimental/graph_view.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ namespace detail {
221221
// FIXME: threshold values require tuning
222222
// use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller
223223
// than col_comm_size * hypersparse_threshold_ratio, should be less than 1.0
224-
double constexpr hypersparse_threshold_ratio = 0.0;
224+
double constexpr hypersparse_threshold_ratio = 0.5;
225225
size_t constexpr low_degree_threshold{raft::warp_size()};
226226
size_t constexpr mid_degree_threshold{1024};
227227
size_t constexpr num_sparse_segments_per_vertex_partition{3};

cpp/include/cugraph/matrix_partition_view.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ class matrix_partition_view_t<vertex_t, edge_t, weight_t, multi_gpu, std::enable
7979
major_first_(major_first),
8080
major_last_(major_last),
8181
minor_first_(minor_first),
82-
minor_last_(minor_first),
82+
minor_last_(minor_last),
8383
major_value_start_offset_(major_value_start_offset)
8484
{
8585
}

cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh

Lines changed: 153 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ namespace experimental {
3838

3939
namespace detail {
4040

41+
// FIXME: block size requires tuning
42+
int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 1024;
43+
4144
// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
4245
template <typename VertexIterator>
4346
struct minor_to_key_t {
@@ -50,6 +53,151 @@ struct minor_to_key_t {
5053
}
5154
};
5255

56+
// Fills majors[] for the vertices in [major_first, major_last) using one warp per
// vertex (intended for mid-degree vertices). Each lane writes every warp_size-th
// entry of its vertex's edge list, then the warp strides to the next vertex by the
// total number of warps in the grid (grid-stride over vertices at warp granularity).
// Launch: 1D grid; block size must be a multiple of raft::warp_size().
template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
__global__ void for_all_major_for_all_nbr_mid_degree(
  matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
  vertex_t major_first,
  vertex_t major_last,
  vertex_t* majors)
{
  static_assert(
    copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size % raft::warp_size() == 0);
  auto const tid     = threadIdx.x + blockIdx.x * blockDim.x;
  auto const lane_id = tid % raft::warp_size();
  auto major_start_offset = static_cast<size_t>(major_first - matrix_partition.get_major_first());
  size_t idx              = static_cast<size_t>(tid / raft::warp_size());

  while (idx < static_cast<size_t>(major_last - major_first)) {
    auto major_offset = major_start_offset + idx;
    auto major =
      matrix_partition.get_major_from_major_offset_nocheck(static_cast<vertex_t>(major_offset));
    vertex_t const* indices{nullptr};
    // nullopt (not {nullptr}): an unweighted partition should be a disengaged
    // optional, not an engaged optional wrapping a null pointer
    thrust::optional<weight_t const*> weights{thrust::nullopt};
    edge_t local_degree{};
    thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset);
    auto local_offset = matrix_partition.get_local_offset(major_offset);
    // warp-cooperative fill: lane i writes edges i, i + warp_size, i + 2 * warp_size, ...
    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
      majors[local_offset + i] = major;
    }
    idx += gridDim.x * (blockDim.x / raft::warp_size());  // advance by total warps in the grid
  }
}
85+
86+
// Fills majors[] for the vertices in [major_first, major_last) using one thread
// block per vertex (intended for high-degree vertices). The threads of a block
// stride over the vertex's edge list; blocks stride over the vertex range by
// gridDim.x. Launch: 1D grid.
template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
__global__ void for_all_major_for_all_nbr_high_degree(
  matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
  vertex_t major_first,
  vertex_t major_last,
  vertex_t* majors)
{
  auto const range_size = static_cast<size_t>(major_last - major_first);
  auto const base_offset =
    static_cast<size_t>(major_first - matrix_partition.get_major_first());

  for (size_t i = static_cast<size_t>(blockIdx.x); i < range_size; i += gridDim.x) {
    auto const major_offset = base_offset + i;
    auto const major =
      matrix_partition.get_major_from_major_offset_nocheck(static_cast<vertex_t>(major_offset));
    vertex_t const* indices{nullptr};
    thrust::optional<weight_t const*> weights{nullptr};
    edge_t local_degree{};
    thrust::tie(indices, weights, local_degree) =
      matrix_partition.get_local_edges(static_cast<vertex_t>(major_offset));
    auto const local_offset = matrix_partition.get_local_offset(major_offset);
    // block-cooperative fill: thread t writes edges t, t + blockDim.x, ...
    for (edge_t e = threadIdx.x; e < local_degree; e += blockDim.x) {
      majors[local_offset + e] = major;
    }
  }
}
112+
113+
// Writes, for every stored edge of this matrix partition, the edge's major vertex
// (source for CSR, destination for CSC) into majors[] in edge-list order — i.e. it
// materializes the "majors" column of the decompressed edge list.
//
// When per-vertex-degree segment offsets are provided, each segment is processed
// with a parallelism shape matched to its degree range:
//   segment 0 (high degree):  one thread block per vertex
//   segment 1 (mid degree):   one warp per vertex
//   segment 2 (low degree):   one thread per vertex (thrust)
//   hypersparse (DCSR/DCSC):  one thread per stored non-zero-degree vertex (thrust)
// Without segment offsets, a single thread-per-vertex pass covers the partition.
template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
void decompress_matrix_partition_to_fill_edgelist_majors(
  raft::handle_t const& handle,
  matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
  vertex_t* majors,
  std::optional<std::vector<vertex_t>> const& segment_offsets)
{
  if (!segment_offsets) {
    // unsegmented partition: one thread per vertex over the whole major range
    thrust::for_each(
      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
      thrust::make_counting_iterator(matrix_partition.get_major_first()),
      thrust::make_counting_iterator(matrix_partition.get_major_first()) +
        matrix_partition.get_major_size(),
      [matrix_partition, majors] __device__(auto major) {
        auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
        auto local_degree = matrix_partition.get_local_degree(major_offset);
        auto local_offset = matrix_partition.get_local_offset(major_offset);
        thrust::fill(
          thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
      });
    return;
  }

  // FIXME: we may further improve performance by 1) concurrently running kernels on different
  // segments; 2) individually tuning block sizes for different segments; and 3) adding one more
  // segment for very high degree vertices and running segmented reduction
  static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
  auto const& offsets    = *segment_offsets;
  auto const major_first = matrix_partition.get_major_first();

  if (offsets[1] > 0) {
    // high-degree segment: block per vertex
    raft::grid_1d_block_t update_grid(
      offsets[1],
      detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
      handle.get_device_properties().maxGridSize[0]);

    detail::for_all_major_for_all_nbr_high_degree<<<update_grid.num_blocks,
                                                    update_grid.block_size,
                                                    0,
                                                    handle.get_stream()>>>(
      matrix_partition, major_first, major_first + offsets[1], majors);
  }
  if (offsets[2] - offsets[1] > 0) {
    // mid-degree segment: warp per vertex
    raft::grid_1d_warp_t update_grid(
      offsets[2] - offsets[1],
      detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
      handle.get_device_properties().maxGridSize[0]);

    detail::for_all_major_for_all_nbr_mid_degree<<<update_grid.num_blocks,
                                                   update_grid.block_size,
                                                   0,
                                                   handle.get_stream()>>>(
      matrix_partition, major_first + offsets[1], major_first + offsets[2], majors);
  }
  if (offsets[3] - offsets[2] > 0) {
    // low-degree segment: thread per vertex
    thrust::for_each(
      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
      thrust::make_counting_iterator(major_first) + offsets[2],
      thrust::make_counting_iterator(major_first) + offsets[3],
      [matrix_partition, majors] __device__(auto major) {
        auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
        auto local_degree = matrix_partition.get_local_degree(major_offset);
        auto local_offset = matrix_partition.get_local_offset(major_offset);
        thrust::fill(
          thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
      });
  }
  if (matrix_partition.get_dcs_nzd_vertex_count() &&
      (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) {
    // hypersparse region: iterate over stored non-zero-degree vertices only
    thrust::for_each(
      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
      thrust::make_counting_iterator(vertex_t{0}),
      thrust::make_counting_iterator(*(matrix_partition.get_dcs_nzd_vertex_count())),
      [matrix_partition, major_start_offset = offsets[3], majors] __device__(auto idx) {
        auto major = *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck(idx));
        auto major_idx =
          major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
        auto local_degree = matrix_partition.get_local_degree(major_idx);
        auto local_offset = matrix_partition.get_local_offset(major_idx);
        thrust::fill(
          thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
      });
  }
}
200+
53201
} // namespace detail
54202

55203
/**
@@ -283,23 +431,11 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
283431
*(matrix_partition.get_weights()) + matrix_partition.get_number_of_edges(),
284432
tmp_key_aggregated_edge_weights.begin());
285433
}
286-
// FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber
287-
// vertices to insure that rows within a partition are sorted by their out-degree in
288-
// decreasing order, we will apply this kernel only to low out-degree vertices.
289-
thrust::for_each(
290-
rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
291-
thrust::make_counting_iterator(matrix_partition.get_major_first()),
292-
thrust::make_counting_iterator(matrix_partition.get_major_first()) +
293-
matrix_partition.get_major_size(),
294-
[matrix_partition, tmp_major_vertices = tmp_major_vertices.begin()] __device__(auto major) {
295-
auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
296-
auto local_degree = matrix_partition.get_local_degree(major_offset);
297-
auto local_offset = matrix_partition.get_local_offset(major_offset);
298-
thrust::fill(thrust::seq,
299-
tmp_major_vertices + local_offset,
300-
tmp_major_vertices + local_offset + local_degree,
301-
major);
302-
});
434+
detail::decompress_matrix_partition_to_fill_edgelist_majors(
435+
handle,
436+
matrix_partition,
437+
tmp_major_vertices.data(),
438+
graph_view.get_local_adj_matrix_partition_segment_offsets(i));
303439
rmm::device_uvector<vertex_t> reduced_major_vertices(tmp_major_vertices.size(),
304440
handle.get_stream());
305441
rmm::device_uvector<vertex_t> reduced_minor_keys(reduced_major_vertices.size(),

0 commit comments

Comments
 (0)