rapidsai
diff --git a/‎cpp/include/cugraph/experimental/graph_view.hpp‎
Lines changed: 1 addition & 1 deletion b/‎cpp/include/cugraph/experimental/graph_view.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/include/cugraph/matrix_partition_view.hpp‎
Lines changed: 1 addition & 1 deletion b/‎cpp/include/cugraph/matrix_partition_view.hpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh‎
Lines changed: 153 additions & 17 deletions b/‎cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh‎
Lines changed: 153 additions & 17 deletions
@@ -221,7 +221,7 @@ namespace detail {
 // FIXME: threshold values require tuning
 // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller
 // than col_comm_size * hypersparse_threshold_ratio, should be less than 1.0
-double constexpr hypersparse_threshold_ratio = 0.0;
+double constexpr hypersparse_threshold_ratio = 0.5;
 size_t constexpr low_degree_threshold{raft::warp_size()};
 size_t constexpr mid_degree_threshold{1024};
 size_t constexpr num_sparse_segments_per_vertex_partition{3};
 
@@ -79,7 +79,7 @@ class matrix_partition_view_t<vertex_t, edge_t, weight_t, multi_gpu, std::enable
       major_first_(major_first),
       major_last_(major_last),
       minor_first_(minor_first),
-      minor_last_(minor_first),
+      minor_last_(minor_last),
       major_value_start_offset_(major_value_start_offset)
   {
   }
 
@@ -38,6 +38,9 @@ namespace experimental {
 
 namespace detail {
 
+// FIXME: block size requires tuning
+int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 1024;
+
 // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used
 template <typename VertexIterator>
 struct minor_to_key_t {
@@ -50,6 +53,151 @@ struct minor_to_key_t {
   }
 };
 
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+__global__ void for_all_major_for_all_nbr_mid_degree(
+  matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
+  vertex_t major_first,
+  vertex_t major_last,
+  vertex_t* majors)
+{
+  auto const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  static_assert(
+    copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size % raft::warp_size() == 0);
+  auto const lane_id      = tid % raft::warp_size();
+  auto major_start_offset = static_cast<size_t>(major_first - matrix_partition.get_major_first());
+  size_t idx              = static_cast<size_t>(tid / raft::warp_size());
+
+  while (idx < static_cast<size_t>(major_last - major_first)) {
+    auto major_offset = major_start_offset + idx;
+    auto major =
+      matrix_partition.get_major_from_major_offset_nocheck(static_cast<vertex_t>(major_offset));
+    vertex_t const* indices{nullptr};
+    thrust::optional<weight_t const*> weights{nullptr};
+    edge_t local_degree{};
+    thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(major_offset);
+    auto local_offset                           = matrix_partition.get_local_offset(major_offset);
+    for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) {
+      majors[local_offset + i] = major;
+    }
+    idx += gridDim.x * (blockDim.x / raft::warp_size());
+  }
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+__global__ void for_all_major_for_all_nbr_high_degree(
+  matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
+  vertex_t major_first,
+  vertex_t major_last,
+  vertex_t* majors)
+{
+  auto major_start_offset = static_cast<size_t>(major_first - matrix_partition.get_major_first());
+  size_t idx              = static_cast<size_t>(blockIdx.x);
+
+  while (idx < static_cast<size_t>(major_last - major_first)) {
+    auto major_offset = major_start_offset + idx;
+    auto major =
+      matrix_partition.get_major_from_major_offset_nocheck(static_cast<vertex_t>(major_offset));
+    vertex_t const* indices{nullptr};
+    thrust::optional<weight_t const*> weights{nullptr};
+    edge_t local_degree{};
+    thrust::tie(indices, weights, local_degree) =
+      matrix_partition.get_local_edges(static_cast<vertex_t>(major_offset));
+    auto local_offset = matrix_partition.get_local_offset(major_offset);
+    for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) {
+      majors[local_offset + i] = major;
+    }
+    idx += gridDim.x;
+  }
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, bool multi_gpu>
+void decompress_matrix_partition_to_fill_edgelist_majors(
+  raft::handle_t const& handle,
+  matrix_partition_device_view_t<vertex_t, edge_t, weight_t, multi_gpu> matrix_partition,
+  vertex_t* majors,
+  std::optional<std::vector<vertex_t>> const& segment_offsets)
+{
+  if (segment_offsets) {
+    // FIXME: we may further improve performance by 1) concurrently running kernels on different
+    // segments; 2) individually tuning block sizes for different segments; and 3) adding one more
+    // segment for very high degree vertices and running segmented reduction
+    static_assert(detail::num_sparse_segments_per_vertex_partition == 3);
+    if ((*segment_offsets)[1] > 0) {
+      raft::grid_1d_block_t update_grid(
+        (*segment_offsets)[1],
+        detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
+        handle.get_device_properties().maxGridSize[0]);
+
+      detail::for_all_major_for_all_nbr_high_degree<<<update_grid.num_blocks,
+                                                      update_grid.block_size,
+                                                      0,
+                                                      handle.get_stream()>>>(
+        matrix_partition,
+        matrix_partition.get_major_first(),
+        matrix_partition.get_major_first() + (*segment_offsets)[1],
+        majors);
+    }
+    if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) {
+      raft::grid_1d_warp_t update_grid(
+        (*segment_offsets)[2] - (*segment_offsets)[1],
+        detail::copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size,
+        handle.get_device_properties().maxGridSize[0]);
+
+      detail::for_all_major_for_all_nbr_mid_degree<<<update_grid.num_blocks,
+                                                     update_grid.block_size,
+                                                     0,
+                                                     handle.get_stream()>>>(
+        matrix_partition,
+        matrix_partition.get_major_first() + (*segment_offsets)[1],
+        matrix_partition.get_major_first() + (*segment_offsets)[2],
+        majors);
+    }
+    if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) {
+      thrust::for_each(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        thrust::make_counting_iterator(matrix_partition.get_major_first()) + (*segment_offsets)[2],
+        thrust::make_counting_iterator(matrix_partition.get_major_first()) + (*segment_offsets)[3],
+        [matrix_partition, majors] __device__(auto major) {
+          auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
+          auto local_degree = matrix_partition.get_local_degree(major_offset);
+          auto local_offset = matrix_partition.get_local_offset(major_offset);
+          thrust::fill(
+            thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
+        });
+    }
+    if (matrix_partition.get_dcs_nzd_vertex_count() &&
+        (*(matrix_partition.get_dcs_nzd_vertex_count()) > 0)) {
+      thrust::for_each(
+        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+        thrust::make_counting_iterator(vertex_t{0}),
+        thrust::make_counting_iterator(*(matrix_partition.get_dcs_nzd_vertex_count())),
+        [matrix_partition, major_start_offset = (*segment_offsets)[3], majors] __device__(
+          auto idx) {
+          auto major = *(matrix_partition.get_major_from_major_hypersparse_idx_nocheck(idx));
+          auto major_idx =
+            major_start_offset + idx;  // major_offset != major_idx in the hypersparse region
+          auto local_degree = matrix_partition.get_local_degree(major_idx);
+          auto local_offset = matrix_partition.get_local_offset(major_idx);
+          thrust::fill(
+            thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
+        });
+    }
+  } else {
+    thrust::for_each(
+      rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+      thrust::make_counting_iterator(matrix_partition.get_major_first()),
+      thrust::make_counting_iterator(matrix_partition.get_major_first()) +
+        matrix_partition.get_major_size(),
+      [matrix_partition, majors] __device__(auto major) {
+        auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
+        auto local_degree = matrix_partition.get_local_degree(major_offset);
+        auto local_offset = matrix_partition.get_local_offset(major_offset);
+        thrust::fill(
+          thrust::seq, majors + local_offset, majors + local_offset + local_degree, major);
+      });
+  }
+}
+
 }  // namespace detail
 
 /**
@@ -283,23 +431,11 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
                      *(matrix_partition.get_weights()) + matrix_partition.get_number_of_edges(),
                      tmp_key_aggregated_edge_weights.begin());
       }
-      // FIXME: This is highly inefficient for graphs with high-degree vertices. If we renumber
-      // vertices to insure that rows within a partition are sorted by their out-degree in
-      // decreasing order, we will apply this kernel only to low out-degree vertices.
-      thrust::for_each(
-        rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
-        thrust::make_counting_iterator(matrix_partition.get_major_first()),
-        thrust::make_counting_iterator(matrix_partition.get_major_first()) +
-          matrix_partition.get_major_size(),
-        [matrix_partition, tmp_major_vertices = tmp_major_vertices.begin()] __device__(auto major) {
-          auto major_offset = matrix_partition.get_major_offset_from_major_nocheck(major);
-          auto local_degree = matrix_partition.get_local_degree(major_offset);
-          auto local_offset = matrix_partition.get_local_offset(major_offset);
-          thrust::fill(thrust::seq,
-                       tmp_major_vertices + local_offset,
-                       tmp_major_vertices + local_offset + local_degree,
-                       major);
-        });
+      detail::decompress_matrix_partition_to_fill_edgelist_majors(
+        handle,
+        matrix_partition,
+        tmp_major_vertices.data(),
+        graph_view.get_local_adj_matrix_partition_segment_offsets(i));
       rmm::device_uvector<vertex_t> reduced_major_vertices(tmp_major_vertices.size(),
                                                            handle.get_stream());
       rmm::device_uvector<vertex_t> reduced_minor_keys(reduced_major_vertices.size(),
Original file line number	Diff line number	Diff line change
`@@ -79,7 +79,7 @@ class matrix_partition_view_t<vertex_t, edge_t, weight_t, multi_gpu, std::enable`
`79`	`79`	`major_first_(major_first),`
`80`	`80`	`major_last_(major_last),`
`81`	`81`	`minor_first_(minor_first),`
`82`		`- minor_last_(minor_first),`
	`82`	`+ minor_last_(minor_last),`
`83`	`83`	`major_value_start_offset_(major_value_start_offset)`
`84`	`84`	`{`
`85`	`85`	`}`