@@ -38,6 +38,9 @@ namespace experimental {
3838
3939namespace detail {
4040
41+ // FIXME: block size requires tuning
42+ int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 1024 ;
43+
4144// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
4245template <typename VertexIterator>
4346struct minor_to_key_t {
@@ -50,6 +53,151 @@ struct minor_to_key_t {
5053 }
5154};
5255
56+ template <typename vertex_t , typename edge_t , typename weight_t , bool multi_gpu>
57+ __global__ void for_all_major_for_all_nbr_mid_degree (
58+ matrix_partition_device_view_t <vertex_t , edge_t , weight_t , multi_gpu> matrix_partition,
59+ vertex_t major_first,
60+ vertex_t major_last,
61+ vertex_t * majors)
62+ {
63+ auto const tid = threadIdx .x + blockIdx .x * blockDim .x ;
64+ static_assert (
65+ copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size % raft::warp_size () == 0 );
66+ auto const lane_id = tid % raft::warp_size ();
67+ auto major_start_offset = static_cast <size_t >(major_first - matrix_partition.get_major_first ());
68+ size_t idx = static_cast <size_t >(tid / raft::warp_size ());
69+
70+ while (idx < static_cast <size_t >(major_last - major_first)) {
71+ auto major_offset = major_start_offset + idx;
72+ auto major =
73+ matrix_partition.get_major_from_major_offset_nocheck (static_cast <vertex_t >(major_offset));
74+ vertex_t const * indices{nullptr };
75+ thrust::optional<weight_t const *> weights{nullptr };
76+ edge_t local_degree{};
77+ thrust::tie (indices, weights, local_degree) = matrix_partition.get_local_edges (major_offset);
78+ auto local_offset = matrix_partition.get_local_offset (major_offset);
79+ for (edge_t i = lane_id; i < local_degree; i += raft::warp_size ()) {
80+ majors[local_offset + i] = major;
81+ }
82+ idx += gridDim .x * (blockDim .x / raft::warp_size ());
83+ }
84+ }
85+
86+ template <typename vertex_t , typename edge_t , typename weight_t , bool multi_gpu>
87+ __global__ void for_all_major_for_all_nbr_high_degree (
88+ matrix_partition_device_view_t <vertex_t , edge_t , weight_t , multi_gpu> matrix_partition,
89+ vertex_t major_first,
90+ vertex_t major_last,
91+ vertex_t * majors)
92+ {
93+ auto major_start_offset = static_cast <size_t >(major_first - matrix_partition.get_major_first ());
94+ size_t idx = static_cast <size_t >(blockIdx .x );
95+
96+ while (idx < static_cast <size_t >(major_last - major_first)) {
97+ auto major_offset = major_start_offset + idx;
98+ auto major =
99+ matrix_partition.get_major_from_major_offset_nocheck (static_cast <vertex_t >(major_offset));
100+ vertex_t const * indices{nullptr };
101+ thrust::optional<weight_t const *> weights{nullptr };
102+ edge_t local_degree{};
103+ thrust::tie (indices, weights, local_degree) =
104+ matrix_partition.get_local_edges (static_cast <vertex_t >(major_offset));
105+ auto local_offset = matrix_partition.get_local_offset (major_offset);
106+ for (edge_t i = threadIdx .x ; i < local_degree; i += blockDim .x ) {
107+ majors[local_offset + i] = major;
108+ }
109+ idx += gridDim .x ;
110+ }
111+ }
112+
113+ template <typename vertex_t , typename edge_t , typename weight_t , bool multi_gpu>
114+ void decompress_matrix_partition_to_fill_edgelist_majors (
115+ raft::handle_t const & handle,
116+ matrix_partition_device_view_t <vertex_t , edge_t , weight_t , multi_gpu> matrix_partition,
117+ vertex_t * majors,
118+ std::optional<std::vector<vertex_t >> const & segment_offsets)
119+ {
120+ if (segment_offsets) {
121+ // FIXME: we may further improve performance by 1) concurrently running kernels on different
122+ // segments; 2) individually tuning block sizes for different segments; and 3) adding one more
123+ // segment for very high degree vertices and running segmented reduction
124+ static_assert (detail::num_sparse_segments_per_vertex_partition == 3 );
125+ if ((*segment_offsets)[1 ] > 0 ) {
126+ raft::grid_1d_block_t update_grid (
127+ (*segment_offsets)[1 ],
128+ detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
129+ handle.get_device_properties ().maxGridSize [0 ]);
130+
131+ detail::for_all_major_for_all_nbr_high_degree<<<update_grid.num_blocks,
132+ update_grid.block_size,
133+ 0 ,
134+ handle.get_stream()>>> (
135+ matrix_partition,
136+ matrix_partition.get_major_first (),
137+ matrix_partition.get_major_first () + (*segment_offsets)[1 ],
138+ majors);
139+ }
140+ if ((*segment_offsets)[2 ] - (*segment_offsets)[1 ] > 0 ) {
141+ raft::grid_1d_warp_t update_grid (
142+ (*segment_offsets)[2 ] - (*segment_offsets)[1 ],
143+ detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
144+ handle.get_device_properties ().maxGridSize [0 ]);
145+
146+ detail::for_all_major_for_all_nbr_mid_degree<<<update_grid.num_blocks,
147+ update_grid.block_size,
148+ 0 ,
149+ handle.get_stream()>>> (
150+ matrix_partition,
151+ matrix_partition.get_major_first () + (*segment_offsets)[1 ],
152+ matrix_partition.get_major_first () + (*segment_offsets)[2 ],
153+ majors);
154+ }
155+ if ((*segment_offsets)[3 ] - (*segment_offsets)[2 ] > 0 ) {
156+ thrust::for_each (
157+ rmm::exec_policy (handle.get_stream ())->on (handle.get_stream ()),
158+ thrust::make_counting_iterator (matrix_partition.get_major_first ()) + (*segment_offsets)[2 ],
159+ thrust::make_counting_iterator (matrix_partition.get_major_first ()) + (*segment_offsets)[3 ],
160+ [matrix_partition, majors] __device__ (auto major) {
161+ auto major_offset = matrix_partition.get_major_offset_from_major_nocheck (major);
162+ auto local_degree = matrix_partition.get_local_degree (major_offset);
163+ auto local_offset = matrix_partition.get_local_offset (major_offset);
164+ thrust::fill (
165+ thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
166+ });
167+ }
168+ if (matrix_partition.get_dcs_nzd_vertex_count () &&
169+ (*(matrix_partition.get_dcs_nzd_vertex_count ()) > 0 )) {
170+ thrust::for_each (
171+ rmm::exec_policy (handle.get_stream ())->on (handle.get_stream ()),
172+ thrust::make_counting_iterator (vertex_t {0 }),
173+ thrust::make_counting_iterator (*(matrix_partition.get_dcs_nzd_vertex_count ())),
174+ [matrix_partition, major_start_offset = (*segment_offsets)[3 ], majors] __device__ (
175+ auto idx) {
176+ auto major = *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck (idx));
177+ auto major_idx =
178+ major_start_offset + idx; // major_offset != major_idx in the hypersparse region
179+ auto local_degree = matrix_partition.get_local_degree (major_idx);
180+ auto local_offset = matrix_partition.get_local_offset (major_idx);
181+ thrust::fill (
182+ thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
183+ });
184+ }
185+ } else {
186+ thrust::for_each (
187+ rmm::exec_policy (handle.get_stream ())->on (handle.get_stream ()),
188+ thrust::make_counting_iterator (matrix_partition.get_major_first ()),
189+ thrust::make_counting_iterator (matrix_partition.get_major_first ()) +
190+ matrix_partition.get_major_size (),
191+ [matrix_partition, majors] __device__ (auto major) {
192+ auto major_offset = matrix_partition.get_major_offset_from_major_nocheck (major);
193+ auto local_degree = matrix_partition.get_local_degree (major_offset);
194+ auto local_offset = matrix_partition.get_local_offset (major_offset);
195+ thrust::fill (
196+ thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
197+ });
198+ }
199+ }
200+
53201} // namespace detail
54202
55203/* *
@@ -283,23 +431,11 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
283431 *(matrix_partition.get_weights ()) + matrix_partition.get_number_of_edges (),
284432 tmp_key_aggregated_edge_weights.begin ());
285433 }
286- // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber
287- // vertices to insure that rows within a partition are sorted by their out-degree in
288- // decreasing order, we will apply this kernel only to low out-degree vertices.
289- thrust::for_each (
290- rmm::exec_policy (handle.get_stream ())->on (handle.get_stream ()),
291- thrust::make_counting_iterator (matrix_partition.get_major_first ()),
292- thrust::make_counting_iterator (matrix_partition.get_major_first ()) +
293- matrix_partition.get_major_size (),
294- [matrix_partition, tmp_major_vertices = tmp_major_vertices.begin ()] __device__ (auto major) {
295- auto major_offset = matrix_partition.get_major_offset_from_major_nocheck (major);
296- auto local_degree = matrix_partition.get_local_degree (major_offset);
297- auto local_offset = matrix_partition.get_local_offset (major_offset);
298- thrust::fill (thrust::seq,
299- tmp_major_vertices + local_offset,
300- tmp_major_vertices + local_offset + local_degree,
301- major);
302- });
434+ detail::decompress_matrix_partition_to_fill_edgelist_majors (
435+ handle,
436+ matrix_partition,
437+ tmp_major_vertices.data (),
438+ graph_view.get_local_adj_matrix_partition_segment_offsets (i));
303439 rmm::device_uvector<vertex_t > reduced_major_vertices (tmp_major_vertices.size (),
304440 handle.get_stream ());
305441 rmm::device_uvector<vertex_t > reduced_minor_keys (reduced_major_vertices.size (),
0 commit comments