From a80cec551eb0b9ff64eda46705b411e230da4bd6 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 14 Oct 2021 15:54:41 -0500
Subject: [PATCH 01/25] Preliminary node2vec selector skeleton.

---
 cpp/src/sampling/rw_traversals.hpp | 75 ++++++++++++++++++++++++++++--
 1 file changed, 70 insertions(+), 5 deletions(-)
diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 625f7074c7f..a94ac3185db 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -260,10 +260,6 @@ struct visitor_aggregate_weights_t : visitors::visitor_t {
 
 // Biased RW selection logic:
 //
-// FIXME:
-// 1. move sum weights calculation into selector;
-// 2. pass graph_view to constructor;
-//
 template <typename graph_type, typename real_t>
 struct biased_selector_t {
   using vertex_t = typename graph_type::vertex_type;
@@ -338,6 +334,75 @@ struct biased_selector_t {
   sampler_t sampler_;
 };
 
+// node2vec RW selection logic:
+//
+template <typename graph_type, typename real_t>
+struct node2vec_selector_t {
+  using vertex_t = typename graph_type::vertex_type;
+  using edge_t   = typename graph_type::edge_type;
+  using weight_t = typename graph_type::weight_type;
+
+  struct sampler_t {
+    sampler_t(edge_t const* ro,
+              vertex_t const* ci,
+              weight_t const* w,
+              vertex_t max_degree,
+              edge_t num_paths,
+              weight_t* ptr_alpha)
+      : row_offsets_(ro),
+        col_indices_(ci),
+        values_(w),
+        coalesced_alpha_{(max_degree > 0) && (num_paths > 0) && (ptr_alpha != nullptr)
+                           ? thrust::make_tuple(max_degree, num_paths, ptr_alpha)
+                           : thrust::nullopt}
+    {
+    }
+
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(vertex_t src_v,
+                                                                              real_t rnd_val) const
+    {
+    }
+
+   private:
+    edge_t const* row_offsets_;
+    vertex_t const* col_indices_;
+    weight_t const* values_;
+
+    // alpha scaling coalesced buffers per path:
+    // (use as cache since the per-path alpha-buffer
+    //  is used twice for each node transition: (1) for computing sum_scaled weights; (2) for using
+    //  scaled_weights for the biased next vertex selection)
+    //
+    thrust::optional<thrust::tuple<vertex_t, edge_t, weight_t*>>
+      coalesced_alpha_;  // tuple<max_vertex_degree,
+                         // num_paths, alpha_buffer[max_vertex_degree*num_paths]>
+  };
+
+  using sampler_type = sampler_t;
+
+  node2vec_selector_t(raft::handle_t const& handle,
+                      graph_type const& graph,
+                      real_t tag,
+                      vertex_t max_deg  = 0,
+                      edge_t num_paths  = 0,
+                      weight_t* p_alpha = nullptr)
+    : sampler_{graph.get_matrix_partition_view().get_offsets(),
+               graph.get_matrix_partition_view().get_indices(),
+               graph.get_matrix_partition_view().get_weights()
+                 ? *(graph.get_matrix_partition_view().get_weights())
+                 : static_cast<weight_t*>(nullptr),
+               max_deg,
+               num_paths,
+               p_alpha}
+  {
+  }
+
+  sampler_t const& get_strategy(void) const { return sampler_; }
+
+ private:
+  sampler_t sampler_;
+};
+
 // classes abstracting the way the random walks path are generated:
 //
 
@@ -502,7 +567,7 @@ struct horizontal_traversal_t {
  private:
   size_t num_paths_;
   size_t max_depth_;
-};  // namespace detail
+};
 
 }  // namespace detail
 }  // namespace cugraph

From c73fab82991501b1118825b1cb8bc3a709357747 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 14 Oct 2021 17:41:09 -0500
Subject: [PATCH 02/25] Simplified node2vec selector interface.

---
 cpp/src/sampling/rw_traversals.hpp | 39 ++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index a94ac3185db..af36d802987 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -368,12 +368,15 @@ struct node2vec_selector_t {
     vertex_t const* col_indices_;
     weight_t const* values_;
 
-    // alpha scaling coalesced buffers per path:
+    // alpha scaling coalesced buffer (per path):
     // (use as cache since the per-path alpha-buffer
-    //  is used twice for each node transition: (1) for computing sum_scaled weights; (2) for using
-    //  scaled_weights for the biased next vertex selection)
+    //  is used twice for each node transition:
+    //  (1) for computing sum_scaled weights;
+    //  (2) for using scaled_weights for the biased next vertex selection)
+    // this is information related to a scratchpad buffer, used as cache, hence mutable;
+    // (necessary, because get_strategy() is const)
     //
-    thrust::optional<thrust::tuple<vertex_t, edge_t, weight_t*>>
+    mutable thrust::optional<thrust::tuple<vertex_t, edge_t, weight_t*>>
       coalesced_alpha_;  // tuple<max_vertex_degree,
                          // num_paths, alpha_buffer[max_vertex_degree*num_paths]>
   };
@@ -383,23 +386,39 @@ struct node2vec_selector_t {
   node2vec_selector_t(raft::handle_t const& handle,
                       graph_type const& graph,
                       real_t tag,
-                      vertex_t max_deg  = 0,
-                      edge_t num_paths  = 0,
-                      weight_t* p_alpha = nullptr)
-    : sampler_{graph.get_matrix_partition_view().get_offsets(),
+                      edge_t num_paths = 0)
+    : max_out_degree_{num_paths > 0 ? get_max_out_degree(handle, graph) : 0},
+      d_coalesced_alpha_{max_out_degree_ * num_paths, handle.get_stream()},
+      sampler_{graph.get_matrix_partition_view().get_offsets(),
                graph.get_matrix_partition_view().get_indices(),
                graph.get_matrix_partition_view().get_weights()
                  ? *(graph.get_matrix_partition_view().get_weights())
                  : static_cast<weight_t*>(nullptr),
-               max_deg,
+               max_out_degree_,
                num_paths,
-               p_alpha}
+               raw_ptr(d_coalesced_alpha_)}
   {
   }
 
   sampler_t const& get_strategy(void) const { return sampler_; }
 
+  static size_t get_max_out_degree(raft::handle_t const& handle, graph_type const& graph)
+  {
+    // TODO:
+    //
+    return 0;  // for now
+  }
+
  private:
+  size_t max_out_degree_{0};
+
+  // alpha scaling coalesced buffer (per path):
+  // (use as cache since the per-path alpha-buffer
+  //  is used twice for each node transition:
+  //  (1) for computing sum_scaled weights;
+  //  (2) for using scaled_weights for the biased next vertex selection)
+  //
+  rmm::device_uvector<weight_t> d_coalesced_alpha_;
   sampler_t sampler_;
 };
 

From 55b32e9455af044cbf86ba54c6efd95e65537796 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 15 Oct 2021 11:22:32 -0500
Subject: [PATCH 03/25] Max out-degree computation for alpha scratchpad.

---
 cpp/src/sampling/rw_traversals.hpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index af36d802987..d2ec683d519 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -404,9 +404,15 @@ struct node2vec_selector_t {
 
   static size_t get_max_out_degree(raft::handle_t const& handle, graph_type const& graph)
   {
-    // TODO:
-    //
-    return 0;  // for now
+    using edge_t = node2vec_selector_t::edge_t;
+
+    auto&& d_out_degs = graph.compute_out_degrees(handle);
+
+    return thrust::reduce(handle.get_thrust_policy(),
+                          d_out_degs.begin(),
+                          d_out_degs.end(),
+                          edge_t{0},
+                          thrust::maximum<edge_t>{});
   }
 
  private:
@@ -418,7 +424,7 @@ struct node2vec_selector_t {
   //  (1) for computing sum_scaled weights;
   //  (2) for using scaled_weights for the biased next vertex selection)
   //
-  rmm::device_uvector<weight_t> d_coalesced_alpha_;
+  device_vec_t<weight_t> d_coalesced_alpha_;
   sampler_t sampler_;
 };
 

From 02eb9280554207b1d7a37509b439fe919d93628a Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 15 Oct 2021 15:48:31 -0500
Subject: [PATCH 04/25] Core node2vec sampling algorithm preliminaries.

---
 cpp/src/sampling/rw_traversals.hpp | 68 +++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index d2ec683d519..007b9c245b3 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -346,21 +346,78 @@ struct node2vec_selector_t {
     sampler_t(edge_t const* ro,
               vertex_t const* ci,
               weight_t const* w,
+              weight_t p,
+              weight_t q,
               vertex_t max_degree,
               edge_t num_paths,
               weight_t* ptr_alpha)
       : row_offsets_(ro),
         col_indices_(ci),
         values_(w),
+        p_(p),
+        q_(q),
         coalesced_alpha_{(max_degree > 0) && (num_paths > 0) && (ptr_alpha != nullptr)
                            ? thrust::make_tuple(max_degree, num_paths, ptr_alpha)
                            : thrust::nullopt}
     {
     }
 
-    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(vertex_t src_v,
-                                                                              real_t rnd_val) const
+    // node2vec alpha scaling logic:
+    // pre-condition: assume column_indices[] is seg-sorted;
+    // (each row has column_indices[] sorted)
+    //
+    __device__ weight_t get_alpha(vertex_t prev_v, vertex_t src_v, vertex_t next_v) const
+    {
+      if (next_v == prev_v) {
+        return 1.0 / p_;
+      } else {
+        // binary-search `next_v` in the adj(prev_v)
+        //
+        auto prev_indx_begin = row_offsets_[prev_v];
+        auto prev_indx_end   = row_offsets_[prev_v + 1];
+
+        auto found_next_from_prev = thrust::binary_search(
+          thrust::seq, col_indices_ + prev_indx_begin, col_indices_ + prev_indx_end, next_v);
+
+        if (found_next_from_prev) {
+          return 1;
+        } else {
+          return 1.0 / q_;
+        }
+      }
+    }
+
+    // FIXME: alpha[] requires `index_path`;
+    //
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
+      vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index) const
     {
+      auto col_indx_begin = row_offsets_[src_v];
+      auto col_indx_end   = row_offsets_[src_v + 1];
+      if (col_indx_begin == col_indx_end) return thrust::nullopt;  // src_v is a sink
+
+      if (coalesced_alpha_.has_value()) {
+        auto&& [max_out_deg, num_paths, ptr_d_alpha] = *coalesced_alpha_;
+
+        weight_t sum_scaled_weights{0};
+
+        auto col_indx      = col_indx_begin;
+        auto prev_col_indx = col_indx;
+
+        for (; col_indx < col_indx_end; ++col_indx) {
+          auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[col_indx]);
+
+          // if caching is available cache the alpha's for next step
+          // (the actual sampling step);
+          //
+          ptr_d_alpha[max_out_deg * path_index + col_indx] = crt_alpha;
+
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[col_indx]);
+
+          sum_scaled_weights += crt_weight * crt_alpha;
+        }
+      } else {
+      }
     }
 
    private:
@@ -368,6 +425,9 @@ struct node2vec_selector_t {
     vertex_t const* col_indices_;
     weight_t const* values_;
 
+    weight_t const p_;
+    weight_t const q_;
+
     // alpha scaling coalesced buffer (per path):
     // (use as cache since the per-path alpha-buffer
     //  is used twice for each node transition:
@@ -386,6 +446,8 @@ struct node2vec_selector_t {
   node2vec_selector_t(raft::handle_t const& handle,
                       graph_type const& graph,
                       real_t tag,
+                      weight_t p,
+                      weight_t q,
                       edge_t num_paths = 0)
     : max_out_degree_{num_paths > 0 ? get_max_out_degree(handle, graph) : 0},
       d_coalesced_alpha_{max_out_degree_ * num_paths, handle.get_stream()},
@@ -394,6 +456,8 @@ struct node2vec_selector_t {
                graph.get_matrix_partition_view().get_weights()
                  ? *(graph.get_matrix_partition_view().get_weights())
                  : static_cast<weight_t*>(nullptr),
+               p,
+               q,
                max_out_degree_,
                num_paths,
                raw_ptr(d_coalesced_alpha_)}

From e903322a8d97226355d28745449cb4fad52c875d Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 15 Oct 2021 17:30:19 -0500
Subject: [PATCH 05/25] Core node2vec sampling algorithm; the cached option.

---
 cpp/src/sampling/rw_traversals.hpp | 41 +++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 7 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 007b9c245b3..7f8865194a2 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -335,6 +335,13 @@ struct biased_selector_t {
 };
 
 // node2vec RW selection logic:
+// uses biased selector on scaled weights,
+// to be computed (and possibly cached) according to
+// `node2vec` logic (see `get_alpha()`);
+// works on unweighted graphs (for which unscaled weights are 1.0);
+//
+// TODO: need to decide logic on very 1st step of traversal
+//       (which has no `prev_v` vertex);
 //
 template <typename graph_type, typename real_t>
 struct node2vec_selector_t {
@@ -397,25 +404,45 @@ struct node2vec_selector_t {
       if (col_indx_begin == col_indx_end) return thrust::nullopt;  // src_v is a sink
 
       if (coalesced_alpha_.has_value()) {
-        auto&& [max_out_deg, num_paths, ptr_d_alpha] = *coalesced_alpha_;
+        auto&& [max_out_deg, num_paths, ptr_d_scaled_weights] = *coalesced_alpha_;
 
         weight_t sum_scaled_weights{0};
 
-        auto col_indx      = col_indx_begin;
-        auto prev_col_indx = col_indx;
+        auto col_indx = col_indx_begin;
 
+        // sum-scaled-weights reduction loop:
+        //
         for (; col_indx < col_indx_end; ++col_indx) {
           auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[col_indx]);
 
-          // if caching is available cache the alpha's for next step
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[col_indx]);
+
+          auto scaled_weight = crt_weight * crt_alpha;
+
+          // caching is available, hence cache the alpha's for next step
           // (the actual sampling step);
           //
-          ptr_d_alpha[max_out_deg * path_index + col_indx] = crt_alpha;
+          ptr_d_scaled_weights[max_out_deg * path_index + col_indx] = scaled_weight;
 
-          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[col_indx]);
+          sum_scaled_weights += scaled_weight;
+        }
+
+        weight_t run_sum_w{0};
+        auto rnd_sum_weights = rnd_val * sum_scaled_weights;
+        col_indx             = col_indx_begin;
+        auto prev_col_indx   = col_indx;
+
+        // biased sampling selection loop:
+        //
+        for (; col_indx < col_indx_end; ++col_indx) {
+          if (rnd_sum_weights < run_sum_w) break;
 
-          sum_scaled_weights += crt_weight * crt_alpha;
+          run_sum_w += ptr_d_scaled_weights[max_out_deg * path_index + col_indx];
+          prev_col_indx = col_indx;
         }
+        return thrust::optional{thrust::make_tuple(
+          col_indices_[prev_col_indx], values_ == nullptr ? weight_t{1} : values_[prev_col_indx])};
+
       } else {
       }
     }

From 75412ec1390b4b5f7e29d1d8035b8ba6cce3b3ca Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Mon, 18 Oct 2021 10:31:36 -0500
Subject: [PATCH 06/25] Fixed alpha cache indexing.

---
 cpp/src/sampling/rw_traversals.hpp | 34 ++++++++++++++++--------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 7f8865194a2..992249dc826 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -399,49 +399,51 @@ struct node2vec_selector_t {
     __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
       vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index) const
     {
-      auto col_indx_begin = row_offsets_[src_v];
-      auto col_indx_end   = row_offsets_[src_v + 1];
-      if (col_indx_begin == col_indx_end) return thrust::nullopt;  // src_v is a sink
+      auto offset_indx_begin = row_offsets_[src_v];
+      auto offset_indx_end   = row_offsets_[src_v + 1];
+      if (offset_indx_begin == offset_indx_end) return thrust::nullopt;  // src_v is a sink
 
       if (coalesced_alpha_.has_value()) {
         auto&& [max_out_deg, num_paths, ptr_d_scaled_weights] = *coalesced_alpha_;
 
         weight_t sum_scaled_weights{0};
 
-        auto col_indx = col_indx_begin;
+        auto offset_indx = offset_indx_begin;
 
         // sum-scaled-weights reduction loop:
         //
-        for (; col_indx < col_indx_end; ++col_indx) {
-          auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[col_indx]);
+        auto const start_alpha_offset = max_out_deg * path_index;
+        for (vertex_t nghbr_indx = 0; offset_indx < offset_indx_end; ++offset_indx, ++nghbr_indx) {
+          auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
 
-          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[col_indx]);
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
 
           auto scaled_weight = crt_weight * crt_alpha;
 
           // caching is available, hence cache the alpha's for next step
           // (the actual sampling step);
           //
-          ptr_d_scaled_weights[max_out_deg * path_index + col_indx] = scaled_weight;
+          ptr_d_scaled_weights[start_alpha_offset + nghbr_indx] = scaled_weight;
 
           sum_scaled_weights += scaled_weight;
         }
 
         weight_t run_sum_w{0};
-        auto rnd_sum_weights = rnd_val * sum_scaled_weights;
-        col_indx             = col_indx_begin;
-        auto prev_col_indx   = col_indx;
+        auto rnd_sum_weights  = rnd_val * sum_scaled_weights;
+        offset_indx           = offset_indx_begin;
+        auto prev_offset_indx = offset_indx;
 
         // biased sampling selection loop:
         //
-        for (; col_indx < col_indx_end; ++col_indx) {
+        for (vertex_t nghbr_indx = 0; offset_indx < offset_indx_end; ++offset_indx, ++nghbr_indx) {
           if (rnd_sum_weights < run_sum_w) break;
 
-          run_sum_w += ptr_d_scaled_weights[max_out_deg * path_index + col_indx];
-          prev_col_indx = col_indx;
+          run_sum_w += ptr_d_scaled_weights[start_alpha_offset + nghbr_indx];
+          prev_offset_indx = offset_indx;
         }
-        return thrust::optional{thrust::make_tuple(
-          col_indices_[prev_col_indx], values_ == nullptr ? weight_t{1} : values_[prev_col_indx])};
+        return thrust::optional{
+          thrust::make_tuple(col_indices_[prev_offset_indx],
+                             values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
 
       } else {
       }

From 523e7b1304718623fede1955ffb3873260c1c6ca Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Mon, 18 Oct 2021 11:07:08 -0500
Subject: [PATCH 07/25] Functionality without alpha caching.

---
 cpp/src/sampling/rw_traversals.hpp | 53 +++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 992249dc826..cac7586cb9c 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -399,26 +399,26 @@ struct node2vec_selector_t {
     __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
       vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index) const
     {
-      auto offset_indx_begin = row_offsets_[src_v];
-      auto offset_indx_end   = row_offsets_[src_v + 1];
+      auto const offset_indx_begin = row_offsets_[src_v];
+      auto const offset_indx_end   = row_offsets_[src_v + 1];
+
+      weight_t sum_scaled_weights{0};
+      auto offset_indx = offset_indx_begin;
+
       if (offset_indx_begin == offset_indx_end) return thrust::nullopt;  // src_v is a sink
 
+      // cached solution, for increased performance, but memory expensive:
+      //
       if (coalesced_alpha_.has_value()) {
         auto&& [max_out_deg, num_paths, ptr_d_scaled_weights] = *coalesced_alpha_;
 
-        weight_t sum_scaled_weights{0};
-
-        auto offset_indx = offset_indx_begin;
-
         // sum-scaled-weights reduction loop:
         //
         auto const start_alpha_offset = max_out_deg * path_index;
         for (vertex_t nghbr_indx = 0; offset_indx < offset_indx_end; ++offset_indx, ++nghbr_indx) {
-          auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
-
+          auto crt_alpha      = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
           weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
-
-          auto scaled_weight = crt_weight * crt_alpha;
+          auto scaled_weight  = crt_weight * crt_alpha;
 
           // caching is available, hence cache the alpha's for next step
           // (the actual sampling step);
@@ -445,7 +445,38 @@ struct node2vec_selector_t {
           thrust::make_tuple(col_indices_[prev_offset_indx],
                              values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
 
-      } else {
+      } else {  // uncached solution, with much lower memory footprint but not as efficient
+
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
+
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+
+          auto scaled_weight = crt_weight * crt_alpha;
+          sum_scaled_weights += scaled_weight;
+        }
+
+        weight_t run_sum_w{0};
+        auto rnd_sum_weights  = rnd_val * sum_scaled_weights;
+        offset_indx           = offset_indx_begin;
+        auto prev_offset_indx = offset_indx;
+
+        // biased sampling selection loop:
+        // (Note: re-compute `scaled_weight`, since no cache is available);
+        //
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          if (rnd_sum_weights < run_sum_w) break;
+
+          auto crt_alpha      = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+          auto scaled_weight  = crt_weight * crt_alpha;
+
+          run_sum_w += scaled_weight;
+          prev_offset_indx = offset_indx;
+        }
+        return thrust::optional{
+          thrust::make_tuple(col_indices_[prev_offset_indx],
+                             values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
       }
     }
 

From 06e6ac8f10481884a53c8273d202846b443adf14 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Mon, 18 Oct 2021 15:57:46 -0500
Subject: [PATCH 08/25] First step in path for node2vec.

---
 cpp/src/sampling/rw_traversals.hpp | 33 +++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index cac7586cb9c..412c32f41fd 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -394,10 +394,8 @@ struct node2vec_selector_t {
       }
     }
 
-    // FIXME: alpha[] requires `index_path`;
-    //
     __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
-      vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index) const
+      vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index, bool start_path) const
     {
       auto const offset_indx_begin = row_offsets_[src_v];
       auto const offset_indx_end   = row_offsets_[src_v + 1];
@@ -407,6 +405,35 @@ struct node2vec_selector_t {
 
       if (offset_indx_begin == offset_indx_end) return thrust::nullopt;  // src_v is a sink
 
+      // for 1st vertex in path just use biased random selection:
+      //
+      if (start_path) {  // `src_v` is starting vertex in path
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+
+          sum_scaled_weights += crt_weight;
+        }
+
+        weight_t run_sum_w{0};
+        auto rnd_sum_weights  = rnd_val * sum_scaled_weights;
+        offset_indx           = offset_indx_begin;
+        auto prev_offset_indx = offset_indx;
+
+        // biased sampling selection loop:
+        // (Note: re-compute `scaled_weight`, since no cache is available);
+        //
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          if (rnd_sum_weights < run_sum_w) break;
+
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+          run_sum_w += crt_weight;
+          prev_offset_indx = offset_indx;
+        }
+        return thrust::optional{
+          thrust::make_tuple(col_indices_[prev_offset_indx],
+                             values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
+      }
+
       // cached solution, for increased performance, but memory expensive:
       //
       if (coalesced_alpha_.has_value()) {

From 5ad4e82a437aecc62f5ba0c3e4ca9c1aae1c4f67 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Mon, 18 Oct 2021 16:36:17 -0500
Subject: [PATCH 09/25] Sampling interface change to accommodate node2vec.

---
 cpp/src/sampling/random_walks.cuh  | 16 +++++++++++++---
 cpp/src/sampling/rw_traversals.hpp | 24 +++++++++++++++++++-----
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 72eb326fbe9..134d55d4d0c 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -278,9 +278,19 @@ struct col_indx_extract_t<graph_t, index_t, std::enable_if_t<graph_t::is_multi_g
                        auto start_v_pos  = chunk_offset + delta;
                        auto start_w_pos  = chunk_offset - path_indx + delta;
 
-                       auto src_v         = ptr_coalesced_v[start_v_pos];
-                       auto rnd_val       = ptr_d_random[path_indx];
-                       auto opt_tpl_vn_wn = sampler(src_v, rnd_val);
+                       auto src_v   = ptr_coalesced_v[start_v_pos];
+                       auto rnd_val = ptr_d_random[path_indx];
+
+                       // `node2vec` info:
+                       //
+                       bool start_path = true;
+                       auto prev_v     = src_v;
+                       if (delta > 0) {
+                         start_path = false;
+                         prev_v     = ptr_coalesced_v[start_v_pos - 1];
+                       }
+
+                       auto opt_tpl_vn_wn = sampler(src_v, rnd_val, prev_v, path_indx, start_path);
 
                        if (opt_tpl_vn_wn.has_value()) {
                          auto src_vertex = thrust::get<0>(*opt_tpl_vn_wn);
diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 412c32f41fd..aec42817567 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -119,8 +119,12 @@ struct uniform_selector_t {
     {
     }
 
-    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(vertex_t src_v,
-                                                                              real_t rnd_val) const
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
+      vertex_t src_v,
+      real_t rnd_val,
+      vertex_t prev_v   = 0 /* not used*/,
+      edge_t path_index = 0 /* not used*/,
+      bool start_path   = false /* not used*/) const
     {
       auto crt_out_deg = ptr_d_cache_out_degs_[src_v];
       if (crt_out_deg == 0) return thrust::nullopt;  // src_v is a sink
@@ -280,8 +284,12 @@ struct biased_selector_t {
     // Sum(weights(neighborhood(src_v))) are pre-computed and
     // stored in ptr_d_sum_weights_ (too expensive to check, here);
     //
-    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(vertex_t src_v,
-                                                                              real_t rnd_val) const
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
+      vertex_t src_v,
+      real_t rnd_val,
+      vertex_t prev_v   = 0 /* not used*/,
+      edge_t path_index = 0 /* not used*/,
+      bool start_path   = false /* not used*/) const
     {
       weight_t run_sum_w{0};
       auto rnd_sum_weights = rnd_val * ptr_d_sum_weights_[src_v];
@@ -711,6 +719,8 @@ struct horizontal_traversal_t {
                       sampler] __device__(auto path_index) {
                        auto chunk_offset   = path_index * max_depth;
                        vertex_t src_vertex = ptr_coalesced_v[chunk_offset];
+                       auto prev_v         = src_vertex;
+                       bool start_path     = true;
 
                        for (index_t step_indx = 1; step_indx < max_depth; ++step_indx) {
                          // indexing into coalesced arrays of size num_paths x (max_depth -1):
@@ -720,9 +730,13 @@ struct horizontal_traversal_t {
 
                          auto real_rnd_indx = ptr_d_random[stepping_index];
 
-                         auto opt_tpl_vn_wn = sampler(src_vertex, real_rnd_indx);
+                         auto opt_tpl_vn_wn =
+                           sampler(src_vertex, real_rnd_indx, prev_v, path_index, start_path);
                          if (!opt_tpl_vn_wn.has_value()) break;
 
+                         prev_v     = src_vertex;
+                         start_path = false;
+
                          src_vertex      = thrust::get<0>(*opt_tpl_vn_wn);
                          auto crt_weight = thrust::get<1>(*opt_tpl_vn_wn);
 

From 12bfcffdaa93c54ee7c2dc0e06579028c0357133 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Tue, 19 Oct 2021 18:07:06 -0500
Subject: [PATCH 10/25] Testing node2vec: check logic.

---
 cpp/tests/sampling/rw_low_level_test.cu | 55 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index 3711fb3f98f..a4bff59350b 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -37,10 +37,12 @@
 #include <iostream>
 #include <iterator>
 #include <limits>
+#include <map>
 #include <numeric>
+#include <optional>
 #include <utilities/high_res_timer.hpp>
 #include <vector>
-
+//
 template <typename value_t>
 using vector_test_t = cugraph::detail::device_vec_t<value_t>;  // for debug purposes
 
@@ -84,6 +86,57 @@ void next_biased(raft::handle_t const& handle,
                     });
 }
 
+template <typename vertex_t, typename edge_t, typename weight_t>
+std::map<vertex_t, std::vector<std::optional<weight_t>>> alpha_node2vec(
+  std::vector<edge_t> const& row_offsets,
+  std::vector<vertex_t> const& col_indices,
+  std::vector<std::optional<vertex_t>> const& v_pred,
+  std::vector<vertex_t> const& v_crt,
+  weight_t p,
+  weight_t q)
+{
+  std::map<vertex_t, std::vector<std::optional<weight_t>>> map_v2alpha;
+
+  auto num_vs = v_crt.size();
+  for (size_t indx = 0; indx < num_vs; ++indx) {
+    std::vector<std::optional<weight_t>> v_p1q;
+
+    auto src_v = v_crt[indx];
+
+    size_t num_neighbors = row_offsets[src_v + 1] - row_offsets[src_v];
+    v_p1q.reserve(num_neighbors);
+
+    if (v_pred[indx].has_value()) {
+      auto pred_v = *(v_pred[indx]);
+
+      for (auto offset_indx = row_offsets[src_v]; offset_indx < row_offsets[src_v + 1];
+           ++offset_indx) {
+        auto next_v = col_indices[offset_indx];
+
+        if (next_v == pred_v) {
+          v_p1q.push_back(1.0 / p);
+        } else {
+          auto const* begin = col_indices + row_offsets[pred_v];
+          auto const* end   = col_indices + row_offsets[pred_v + 1];
+          auto it_found     = std::find(begin, end, next_v);
+
+          if (it_found != end) {
+            v_p1q.push_back(1.0);
+          } else {
+            v_p1q.push_back(1.0 / q);
+          }
+        }
+      }
+    } else {
+      v_p1q.push_back(std::nullopt);
+    }
+
+    map_v2alpha[src_v] = v_p1q;
+  }
+
+  return map_v2alpha;
+}
+
 }  // namespace
 
 // FIXME (per rlratzel request):

From 71c7e3c51a13208a36b5f952c1ebfd635f3d2d02 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Tue, 19 Oct 2021 19:54:58 -0500
Subject: [PATCH 11/25] Testing node2vec: low level test.

---
 cpp/src/sampling/rw_traversals.hpp      |  10 +-
 cpp/tests/sampling/rw_low_level_test.cu | 140 +++++++++++++++++++-----
 2 files changed, 121 insertions(+), 29 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index aec42817567..20fa1d33b2c 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -371,9 +371,11 @@ struct node2vec_selector_t {
         values_(w),
         p_(p),
         q_(q),
-        coalesced_alpha_{(max_degree > 0) && (num_paths > 0) && (ptr_alpha != nullptr)
-                           ? thrust::make_tuple(max_degree, num_paths, ptr_alpha)
-                           : thrust::nullopt}
+        coalesced_alpha_{
+          (max_degree > 0) && (num_paths > 0) && (ptr_alpha != nullptr)
+            ? thrust::optional<thrust::tuple<vertex_t, edge_t, weight_t*>>{thrust::make_tuple(
+                max_degree, num_paths, ptr_alpha)}
+            : thrust::nullopt}
     {
     }
 
@@ -553,7 +555,7 @@ struct node2vec_selector_t {
                  : static_cast<weight_t*>(nullptr),
                p,
                q,
-               max_out_degree_,
+               static_cast<vertex_t>(max_out_degree_),
                num_paths,
                raw_ptr(d_coalesced_alpha_)}
   {
diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index a4bff59350b..71598bf3464 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -42,7 +42,7 @@
 #include <optional>
 #include <utilities/high_res_timer.hpp>
 #include <vector>
-//
+
 template <typename value_t>
 using vector_test_t = cugraph::detail::device_vec_t<value_t>;  // for debug purposes
 
@@ -87,24 +87,21 @@ void next_biased(raft::handle_t const& handle,
 }
 
 template <typename vertex_t, typename edge_t, typename weight_t>
-std::map<vertex_t, std::vector<std::optional<weight_t>>> alpha_node2vec(
-  std::vector<edge_t> const& row_offsets,
-  std::vector<vertex_t> const& col_indices,
-  std::vector<std::optional<vertex_t>> const& v_pred,
-  std::vector<vertex_t> const& v_crt,
-  weight_t p,
-  weight_t q)
+void alpha_node2vec(std::vector<edge_t> const& row_offsets,
+                    std::vector<vertex_t> const& col_indices,
+                    std::vector<weight_t>& weights,  // to be scaled!
+                    std::vector<std::optional<vertex_t>> const& v_pred,
+                    std::vector<vertex_t> const& v_crt,
+                    weight_t p,
+                    weight_t q)
 {
-  std::map<vertex_t, std::vector<std::optional<weight_t>>> map_v2alpha;
-
   auto num_vs = v_crt.size();
   for (size_t indx = 0; indx < num_vs; ++indx) {
-    std::vector<std::optional<weight_t>> v_p1q;
-
     auto src_v = v_crt[indx];
 
     size_t num_neighbors = row_offsets[src_v + 1] - row_offsets[src_v];
-    v_p1q.reserve(num_neighbors);
+
+    if (num_neighbors == 0) { continue; }
 
     if (v_pred[indx].has_value()) {
       auto pred_v = *(v_pred[indx]);
@@ -113,28 +110,26 @@ std::map<vertex_t, std::vector<std::optional<weight_t>>> alpha_node2vec(
            ++offset_indx) {
         auto next_v = col_indices[offset_indx];
 
+        weight_t alpha{0};
+
         if (next_v == pred_v) {
-          v_p1q.push_back(1.0 / p);
+          alpha = 1.0 / p;
         } else {
-          auto const* begin = col_indices + row_offsets[pred_v];
-          auto const* end   = col_indices + row_offsets[pred_v + 1];
-          auto it_found     = std::find(begin, end, next_v);
+          auto begin    = col_indices.begin() + row_offsets[pred_v];
+          auto end      = col_indices.begin() + row_offsets[pred_v + 1];
+          auto it_found = std::find(begin, end, next_v);
 
           if (it_found != end) {
-            v_p1q.push_back(1.0);
+            alpha = 1.0;
           } else {
-            v_p1q.push_back(1.0 / q);
+            alpha = 1.0 / q;
           }
         }
+
+        weights[offset_indx] *= alpha;  // scale weights
       }
-    } else {
-      v_p1q.push_back(std::nullopt);
     }
-
-    map_v2alpha[src_v] = v_p1q;
   }
-
-  return map_v2alpha;
 }
 
 }  // namespace
@@ -1270,3 +1265,98 @@ TEST(BiasedRandomWalks, SelectorSmallGraph)
 
   EXPECT_EQ(v_next_v, h_next_v);
 }
+
+TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
+{
+  namespace topo = cugraph::topology;
+
+  raft::handle_t handle{};
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+  using real_t   = weight_t;
+
+  weight_t p = 2.0;
+  weight_t q = 4.0;
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  /*
+    0 --(.1)--> 1 --(1.1)--> 4
+   /|\       /\ |            |
+    |       /   |            |
+   (5.1) (3.1)(2.1)        (3.2)
+    |   /       |            |
+    | /        \|/          \|/
+    2 --(4.1)-->3 --(7.2)--> 5
+   */
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+
+  auto graph = cugraph::test::make_graph(
+    handle, v_src, v_dst, std::optional<std::vector<weight_t>>{v_w}, num_vertices, num_edges);
+
+  std::vector<real_t> v_rnd{0.2, 0.5, 1.0, 0.1, 0.8};
+  std::vector<vertex_t> v_src_v{0, 1, 3, 4, 5};
+  std::vector<std::optional<vertex_t>> v_pred_v{2, 0, 1, 1, 4};
+
+  vector_test_t<real_t> d_rnd(v_rnd.size(), handle.get_stream());
+  vector_test_t<vertex_t> d_src_v(v_src_v.size(), handle.get_stream());
+
+  EXPECT_EQ(d_rnd.size(), d_src_v.size());
+
+  raft::update_device(d_rnd.data(), v_rnd.data(), d_rnd.size(), handle.get_stream());
+  raft::update_device(d_src_v.data(), v_src_v.data(), d_src_v.size(), handle.get_stream());
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets();
+
+  vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices();
+
+  weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights());
+
+  // TODO: do a `next_node2vec()` with `node2vec` selector on `graph_view`, v_crt_v, v_prev_v,
+  // and compare the resulting next_v with the one below resulting from
+  // `next_biased()` using biased selector on `node2vec` alpha scaled weights;
+
+  cugraph::detail::node2vec_selector_t n2v_selector{handle, graph_view, 0.0f, p, q};
+
+  std::vector<weight_t> scaled_weights(v_w);
+  std::vector<edge_t> row_offsets(num_vertices + 1);
+  std::vector<vertex_t> col_indices(num_edges);
+
+  raft::update_host(
+    row_offsets.data(), offsets, static_cast<size_t>(num_vertices + 1), handle.get_stream());
+
+  raft::update_host(
+    col_indices.data(), indices, static_cast<size_t>(num_edges), handle.get_stream());
+
+  alpha_node2vec(row_offsets, col_indices, scaled_weights, v_pred_v, v_src_v, p, q);
+
+  auto scaled_graph =
+    cugraph::test::make_graph(handle,
+                              v_src,
+                              v_dst,
+                              std::optional<std::vector<weight_t>>{scaled_weights},
+                              num_vertices,
+                              num_edges);
+
+  auto scaled_graph_view = scaled_graph.view();
+
+  cugraph::detail::biased_selector_t selector{handle, scaled_graph_view, 0.0f};
+
+  vector_test_t<vertex_t> d_next_v(v_src_v.size(), handle.get_stream());
+
+  next_biased(handle, d_src_v, d_rnd, d_next_v, selector);
+
+  std::vector<edge_t> v_next_v(v_src_v.size());
+
+  raft::update_host(v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+
+  // EXPECT_EQ(v_next_v, h_next_v);
+}

From 0b94613fbb3e37833f2a94d03c21c01b31454276 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Wed, 20 Oct 2021 12:37:59 -0500
Subject: [PATCH 12/25] Testing node2vec: fix due to lack of structured
 bindings support for thrust::tuple.

---
 cpp/src/sampling/rw_traversals.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 20fa1d33b2c..6aa80b67950 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -447,7 +447,11 @@ struct node2vec_selector_t {
       // cached solution, for increased performance, but memory expensive:
       //
       if (coalesced_alpha_.has_value()) {
-        auto&& [max_out_deg, num_paths, ptr_d_scaled_weights] = *coalesced_alpha_;
+        auto&& tpl = *coalesced_alpha_;
+
+        auto max_out_deg           = thrust::get<0>(tpl);
+        auto num_paths             = thrust::get<1>(tpl);
+        auto* ptr_d_scaled_weights = thrust::get<2>(tpl);
 
         // sum-scaled-weights reduction loop:
         //

From 63ae779342dc5ae4092cf876a48aca4e79c65439 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Wed, 20 Oct 2021 12:59:09 -0500
Subject: [PATCH 13/25] Testing node2vec: low level comparison against biased +
 alpha scaled weights.

---
 cpp/tests/sampling/rw_low_level_test.cu | 74 ++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 13 deletions(-)

diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index 71598bf3464..344ff93a109 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -86,11 +86,41 @@ void next_biased(raft::handle_t const& handle,
                     });
 }
 
+template <typename vertex_t, typename real_t, typename selector_t>
+void next_node2vec(raft::handle_t const& handle,
+                   vector_test_t<vertex_t> const& d_src_v,
+                   vector_test_t<thrust::optional<vertex_t>> const& d_prev_v,
+                   vector_test_t<real_t> const& d_rnd,
+                   vector_test_t<vertex_t>& d_next_v,
+                   selector_t const& selector)
+{
+  auto begin = thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_prev_v.begin()));
+  auto end   = thrust::make_zip_iterator(thrust::make_tuple(d_src_v.end(), d_prev_v.end()));
+
+  thrust::transform(handle.get_thrust_policy(),
+                    begin,
+                    end,
+                    d_rnd.begin(),
+                    d_next_v.begin(),
+                    [sampler = selector.get_strategy()] __device__(auto tpl, auto rnd_val) {
+                      vertex_t src_v = thrust::get<0>(tpl);
+
+                      if (thrust::get<1>(tpl) != thrust::nullopt) {
+                        vertex_t prev_v = *thrust::get<1>(tpl);
+
+                        auto next_vw = sampler(src_v, rnd_val, prev_v, 0, false);
+                        return (next_vw.has_value() ? thrust::get<0>(*next_vw) : src_v);
+                      } else {
+                        return src_v;
+                      }
+                    });
+}
+
 template <typename vertex_t, typename edge_t, typename weight_t>
 void alpha_node2vec(std::vector<edge_t> const& row_offsets,
                     std::vector<vertex_t> const& col_indices,
                     std::vector<weight_t>& weights,  // to be scaled!
-                    std::vector<std::optional<vertex_t>> const& v_pred,
+                    std::vector<thrust::optional<vertex_t>> const& v_pred,
                     std::vector<vertex_t> const& v_crt,
                     weight_t p,
                     weight_t q)
@@ -1284,6 +1314,8 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
   edge_t num_edges      = 8;
   vertex_t num_vertices = 6;
 
+  // Step 1: graph construction:
+  //
   /*
     0 --(.1)--> 1 --(1.1)--> 4
    /|\       /\ |            |
@@ -1295,14 +1327,14 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
    */
   std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
   std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
-  std::vector<weight_t> v_w{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  std::vector<weight_t> v_w(1.0, num_edges);  //{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
 
   auto graph = cugraph::test::make_graph(
     handle, v_src, v_dst, std::optional<std::vector<weight_t>>{v_w}, num_vertices, num_edges);
 
   std::vector<real_t> v_rnd{0.2, 0.5, 1.0, 0.1, 0.8};
   std::vector<vertex_t> v_src_v{0, 1, 3, 4, 5};
-  std::vector<std::optional<vertex_t>> v_pred_v{2, 0, 1, 1, 4};
+  std::vector<thrust::optional<vertex_t>> v_pred_v{2, 0, 1, 1, 4};
 
   vector_test_t<real_t> d_rnd(v_rnd.size(), handle.get_stream());
   vector_test_t<vertex_t> d_src_v(v_src_v.size(), handle.get_stream());
@@ -1320,12 +1352,26 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
 
   weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights());
 
-  // TODO: do a `next_node2vec()` with `node2vec` selector on `graph_view`, v_crt_v, v_prev_v,
-  // and compare the resulting next_v with the one below resulting from
-  // `next_biased()` using biased selector on `node2vec` alpha scaled weights;
-
+  // Step 2: `node2vec` selection on original graph:
+  //
   cugraph::detail::node2vec_selector_t n2v_selector{handle, graph_view, 0.0f, p, q};
 
+  vector_test_t<thrust::optional<vertex_t>> d_pred_v(v_pred_v.size(), handle.get_stream());
+
+  raft::update_device(d_pred_v.data(), v_pred_v.data(), v_pred_v.size(), handle.get_stream());
+
+  vector_test_t<vertex_t> d_next_v(v_src_v.size(), handle.get_stream());
+
+  // `node2vec` stepping:
+  //
+  next_node2vec(handle, d_src_v, d_pred_v, d_rnd, d_next_v, n2v_selector);
+
+  std::vector<vertex_t> n2v_next_v(v_src_v.size());
+  raft::update_host(n2v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+
+  // Step 3: construct similar graph, just with
+  //         alpha scaled weights;
+  //
   std::vector<weight_t> scaled_weights(v_w);
   std::vector<edge_t> row_offsets(num_vertices + 1);
   std::vector<vertex_t> col_indices(num_edges);
@@ -1348,15 +1394,17 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
 
   auto scaled_graph_view = scaled_graph.view();
 
+  // Step 4: biased selection on alpha scaled graph:
+  //
   cugraph::detail::biased_selector_t selector{handle, scaled_graph_view, 0.0f};
 
-  vector_test_t<vertex_t> d_next_v(v_src_v.size(), handle.get_stream());
-
   next_biased(handle, d_src_v, d_rnd, d_next_v, selector);
 
-  std::vector<edge_t> v_next_v(v_src_v.size());
+  std::vector<vertex_t> biased_next_v(v_src_v.size());
+  raft::update_host(biased_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
 
-  raft::update_host(v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
-
-  // EXPECT_EQ(v_next_v, h_next_v);
+  // Step 5: compare `node2vec` on original graph
+  //         with biased on graph with alpha scaled weights:
+  //
+  EXPECT_EQ(biased_next_v, n2v_next_v);
 }

From 46fbc8734dc34696b6e2beeec83d5d6f804e1112 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Wed, 20 Oct 2021 13:48:02 -0500
Subject: [PATCH 14/25] Testing node2vec: fix for low level comparison against
 biased + alpha scaled weights.

---
 cpp/tests/sampling/rw_low_level_test.cu | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index 344ff93a109..439742ace7f 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -1296,7 +1296,7 @@ TEST(BiasedRandomWalks, SelectorSmallGraph)
   EXPECT_EQ(v_next_v, h_next_v);
 }
 
-TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
+TEST(Node2VecRandomWalks, Node2VecSmallGraph)
 {
   namespace topo = cugraph::topology;
 
@@ -1327,7 +1327,7 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
    */
   std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
   std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
-  std::vector<weight_t> v_w(1.0, num_edges);  //{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+  std::vector<weight_t> v_w(num_edges, 1.0);  //{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
 
   auto graph = cugraph::test::make_graph(
     handle, v_src, v_dst, std::optional<std::vector<weight_t>>{v_w}, num_vertices, num_edges);
@@ -1369,6 +1369,8 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
   std::vector<vertex_t> n2v_next_v(v_src_v.size());
   raft::update_host(n2v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
 
+  EXPECT_EQ(n2v_next_v.size(), d_src_v.size());
+
   // Step 3: construct similar graph, just with
   //         alpha scaled weights;
   //
@@ -1382,6 +1384,13 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
   raft::update_host(
     col_indices.data(), indices, static_cast<size_t>(num_edges), handle.get_stream());
 
+  std::vector<edge_t> v_ro{0, 1, 3, 6, 7, 8, 8};
+  std::vector<vertex_t> v_ci{1, 3, 4, 0, 1, 3, 5, 5};
+
+  EXPECT_EQ(row_offsets, v_ro);
+  EXPECT_EQ(col_indices, v_ci);
+  EXPECT_EQ(scaled_weights.size(), static_cast<size_t>(num_edges));
+
   alpha_node2vec(row_offsets, col_indices, scaled_weights, v_pred_v, v_src_v, p, q);
 
   auto scaled_graph =
@@ -1403,6 +1412,8 @@ TEST(Node2VecRandomWalks, DISABLED_Node2VecSmallGraph)
   std::vector<vertex_t> biased_next_v(v_src_v.size());
   raft::update_host(biased_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
 
+  EXPECT_EQ(biased_next_v.size(), d_src_v.size());
+
   // Step 5: compare `node2vec` on original graph
   //         with biased on graph with alpha scaled weights:
   //

From ffc12e807db4cf62fe08ad3db290476aa1328b1d Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 21 Oct 2021 13:44:18 -0500
Subject: [PATCH 15/25] Testing node2vec: alpha buffer.

---
 cpp/src/sampling/rw_traversals.hpp      |  10 +-
 cpp/tests/sampling/rw_low_level_test.cu | 163 +++++++++++++++++++++++-
 2 files changed, 166 insertions(+), 7 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 6aa80b67950..def64477c54 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -449,9 +449,9 @@ struct node2vec_selector_t {
       if (coalesced_alpha_.has_value()) {
         auto&& tpl = *coalesced_alpha_;
 
-        auto max_out_deg           = thrust::get<0>(tpl);
-        auto num_paths             = thrust::get<1>(tpl);
-        auto* ptr_d_scaled_weights = thrust::get<2>(tpl);
+        auto max_out_deg               = thrust::get<0>(tpl);
+        auto num_paths                 = thrust::get<1>(tpl);
+        weight_t* ptr_d_scaled_weights = thrust::get<2>(tpl);
 
         // sum-scaled-weights reduction loop:
         //
@@ -521,6 +521,8 @@ struct node2vec_selector_t {
       }
     }
 
+    decltype(auto) get_alpha_buffer(void) const { return coalesced_alpha_; }
+
    private:
     edge_t const* row_offsets_;
     vertex_t const* col_indices_;
@@ -580,6 +582,8 @@ struct node2vec_selector_t {
                           thrust::maximum<edge_t>{});
   }
 
+  device_vec_t<weight_t> const& get_alpha_cache(void) const { return d_coalesced_alpha_; }
+
  private:
   size_t max_out_degree_{0};
 
diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index 439742ace7f..93a8e859024 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -86,6 +86,9 @@ void next_biased(raft::handle_t const& handle,
                     });
 }
 
+// simulates max_depth==1 traversal of multiple paths,
+// where num_paths = distance(begin, end), below:
+//
 template <typename vertex_t, typename real_t, typename selector_t>
 void next_node2vec(raft::handle_t const& handle,
                    vector_test_t<vertex_t> const& d_src_v,
@@ -94,8 +97,11 @@ void next_node2vec(raft::handle_t const& handle,
                    vector_test_t<vertex_t>& d_next_v,
                    selector_t const& selector)
 {
-  auto begin = thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_prev_v.begin()));
-  auto end   = thrust::make_zip_iterator(thrust::make_tuple(d_src_v.end(), d_prev_v.end()));
+  size_t num_paths{d_src_v.size()};
+  auto begin = thrust::make_zip_iterator(thrust::make_tuple(
+    d_src_v.begin(), d_prev_v.begin(), thrust::make_counting_iterator<size_t>(0)));
+  auto end   = thrust::make_zip_iterator(thrust::make_tuple(
+    d_src_v.end(), d_prev_v.end(), thrust::make_counting_iterator<size_t>(num_paths)));
 
   thrust::transform(handle.get_thrust_policy(),
                     begin,
@@ -105,10 +111,12 @@ void next_node2vec(raft::handle_t const& handle,
                     [sampler = selector.get_strategy()] __device__(auto tpl, auto rnd_val) {
                       vertex_t src_v = thrust::get<0>(tpl);
 
+                      size_t path_index = thrust::get<2>(tpl);
+
                       if (thrust::get<1>(tpl) != thrust::nullopt) {
                         vertex_t prev_v = *thrust::get<1>(tpl);
 
-                        auto next_vw = sampler(src_v, rnd_val, prev_v, 0, false);
+                        auto next_vw = sampler(src_v, rnd_val, prev_v, path_index, false);
                         return (next_vw.has_value() ? thrust::get<0>(*next_vw) : src_v);
                       } else {
                         return src_v;
@@ -1412,7 +1420,154 @@ TEST(Node2VecRandomWalks, Node2VecSmallGraph)
   std::vector<vertex_t> biased_next_v(v_src_v.size());
   raft::update_host(biased_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
 
-  EXPECT_EQ(biased_next_v.size(), d_src_v.size());
+  // Step 5: compare `node2vec` on original graph
+  //         with biased on graph with alpha scaled weights:
+  //
+  EXPECT_EQ(biased_next_v, n2v_next_v);
+}
+
+TEST(Node2VecRandomWalks, CachedNode2VecSmallGraph)
+{
+  namespace topo = cugraph::topology;
+
+  raft::handle_t handle{};
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+  using real_t   = weight_t;
+
+  weight_t p = 2.0;
+  weight_t q = 4.0;
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  // Step 1: graph construction:
+  //
+  /*
+    0 --(.1)--> 1 --(1.1)--> 4
+   /|\       /\ |            |
+    |       /   |            |
+   (5.1) (3.1)(2.1)        (3.2)
+    |   /       |            |
+    | /        \|/          \|/
+    2 --(4.1)-->3 --(7.2)--> 5
+   */
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w(num_edges, 1.0);  //{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+
+  auto graph = cugraph::test::make_graph(
+    handle, v_src, v_dst, std::optional<std::vector<weight_t>>{v_w}, num_vertices, num_edges);
+
+  std::vector<real_t> v_rnd{0.2, 0.5, 1.0, 0.1, 0.8};
+  std::vector<vertex_t> v_src_v{0, 1, 3, 4, 5};
+  std::vector<thrust::optional<vertex_t>> v_pred_v{2, 0, 1, 1, 4};
+
+  vector_test_t<real_t> d_rnd(v_rnd.size(), handle.get_stream());
+  vector_test_t<vertex_t> d_src_v(v_src_v.size(), handle.get_stream());
+
+  EXPECT_EQ(d_rnd.size(), d_src_v.size());
+
+  raft::update_device(d_rnd.data(), v_rnd.data(), d_rnd.size(), handle.get_stream());
+  raft::update_device(d_src_v.data(), v_src_v.data(), d_src_v.size(), handle.get_stream());
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets();
+
+  vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices();
+
+  weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights());
+
+  // Step 2: `node2vec` selection on original graph:
+  //
+  // CAVEAT: next_node2vec(), steps in parallel, so it simulates
+  //         traversing multiple paths (of size max_depth == 1);
+  //         if ignored, this creates a data race on the cached
+  //         alpha buffer!
+  //
+  edge_t num_paths(d_src_v.size());
+  cugraph::detail::node2vec_selector_t n2v_selector{
+    handle, graph_view, 0.0f, p, q, num_paths};  // use cached approach
+
+  auto const& d_cached_alpha = n2v_selector.get_alpha_cache();
+
+  size_t expected_max_degree{3};
+  EXPECT_EQ(d_cached_alpha.size(), expected_max_degree * num_paths);
+
+  auto&& coalesced_alpha = n2v_selector.get_strategy().get_alpha_buffer();
+
+  ASSERT_TRUE(coalesced_alpha != thrust::nullopt);
+
+  EXPECT_EQ(static_cast<size_t>(thrust::get<0>(*coalesced_alpha)), expected_max_degree);
+  EXPECT_EQ(thrust::get<1>(*coalesced_alpha), num_paths);
+  EXPECT_EQ(thrust::get<2>(*coalesced_alpha), d_cached_alpha.data());
+
+  vector_test_t<thrust::optional<vertex_t>> d_pred_v(v_pred_v.size(), handle.get_stream());
+
+  raft::update_device(d_pred_v.data(), v_pred_v.data(), v_pred_v.size(), handle.get_stream());
+
+  vector_test_t<vertex_t> d_next_v(v_src_v.size(), handle.get_stream());
+
+  // `node2vec` stepping:
+  //
+  // CAVEAT: next_node2vec(), steps in parallel, so it simulates
+  //         traversing multiple paths (of size max_depth == 1);
+  //         if ignored, this creates a data race on the cached
+  //         alpha buffer!
+  //
+  // <- FIXME: PROBLEM here: BUG to be fixed for cached alpha testing
+  //
+  next_node2vec(handle, d_src_v, d_pred_v, d_rnd, d_next_v, n2v_selector);
+
+  std::vector<vertex_t> n2v_next_v(v_src_v.size());
+  raft::update_host(n2v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+
+  EXPECT_EQ(n2v_next_v.size(), d_src_v.size());
+
+  // Step 3: construct similar graph, just with
+  //         alpha scaled weights;
+  //
+  std::vector<weight_t> scaled_weights(v_w);
+  std::vector<edge_t> row_offsets(num_vertices + 1);
+  std::vector<vertex_t> col_indices(num_edges);
+
+  raft::update_host(
+    row_offsets.data(), offsets, static_cast<size_t>(num_vertices + 1), handle.get_stream());
+
+  raft::update_host(
+    col_indices.data(), indices, static_cast<size_t>(num_edges), handle.get_stream());
+
+  std::vector<edge_t> v_ro{0, 1, 3, 6, 7, 8, 8};
+  std::vector<vertex_t> v_ci{1, 3, 4, 0, 1, 3, 5, 5};
+
+  EXPECT_EQ(row_offsets, v_ro);
+  EXPECT_EQ(col_indices, v_ci);
+  EXPECT_EQ(scaled_weights.size(), static_cast<size_t>(num_edges));
+
+  alpha_node2vec(row_offsets, col_indices, scaled_weights, v_pred_v, v_src_v, p, q);
+
+  auto scaled_graph =
+    cugraph::test::make_graph(handle,
+                              v_src,
+                              v_dst,
+                              std::optional<std::vector<weight_t>>{scaled_weights},
+                              num_vertices,
+                              num_edges);
+
+  auto scaled_graph_view = scaled_graph.view();
+
+  // Step 4: biased selection on alpha scaled graph:
+  //
+  cugraph::detail::biased_selector_t selector{handle, scaled_graph_view, 0.0f};
+
+  next_biased(handle, d_src_v, d_rnd, d_next_v, selector);
+
+  std::vector<vertex_t> biased_next_v(v_src_v.size());
+  raft::update_host(biased_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
 
   // Step 5: compare `node2vec` on original graph
   //         with biased on graph with alpha scaled weights:

From 8326b0bf0e7d8d4892f17a0c9faf28f324c90aba Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 21 Oct 2021 16:39:33 -0500
Subject: [PATCH 16/25] node2vec sampling exposed to RW.

---
 cpp/src/sampling/random_walks.cuh       | 31 ++++++++++++++++++++++++-
 cpp/tests/sampling/rw_low_level_test.cu |  2 --
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 134d55d4d0c..685afae9f50 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -1128,6 +1128,21 @@ random_walks(raft::handle_t const& handle,
     if (selector_type == static_cast<int>(sampling_strategy_t::BIASED)) {
       detail::biased_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}};
 
+      auto quad_tuple =
+        detail::random_walks_impl<graph_t, decltype(selector), detail::vertical_traversal_t>(
+          handle, graph, d_v_start, max_depth, selector, use_padding);
+      // ignore last element of the quad, seed,
+      // since it's meant for testing / debugging, only:
+      //
+      return std::make_tuple(std::move(std::get<0>(quad_tuple)),
+                             std::move(std::get<1>(quad_tuple)),
+                             std::move(std::get<2>(quad_tuple)));
+    } else if (selector_type == static_cast<int>(sampling_strategy_t::NODE2VEC)) {
+      weight_t p(sampling_strategy->p_);
+      weight_t q(sampling_strategy->q_);
+
+      detail::node2vec_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}, p, q};
+
       auto quad_tuple =
         detail::random_walks_impl<graph_t, decltype(selector), detail::vertical_traversal_t>(
           handle, graph, d_v_start, max_depth, selector, use_padding);
@@ -1150,10 +1165,24 @@ random_walks(raft::handle_t const& handle,
                              std::move(std::get<1>(quad_tuple)),
                              std::move(std::get<2>(quad_tuple)));
     }
-  } else {
+  } else {  // horizontal traversal strategy
     if (selector_type == static_cast<int>(sampling_strategy_t::BIASED)) {
       detail::biased_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}};
 
+      auto quad_tuple =
+        detail::random_walks_impl(handle, graph, d_v_start, max_depth, selector, use_padding);
+      // ignore last element of the quad, seed,
+      // since it's meant for testing / debugging, only:
+      //
+      return std::make_tuple(std::move(std::get<0>(quad_tuple)),
+                             std::move(std::get<1>(quad_tuple)),
+                             std::move(std::get<2>(quad_tuple)));
+    } else if (selector_type == static_cast<int>(sampling_strategy_t::NODE2VEC)) {
+      weight_t p(sampling_strategy->p_);
+      weight_t q(sampling_strategy->q_);
+
+      detail::node2vec_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}, p, q};
+
       auto quad_tuple =
         detail::random_walks_impl(handle, graph, d_v_start, max_depth, selector, use_padding);
       // ignore last element of the quad, seed,
diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index 93a8e859024..224a6283657 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -1519,8 +1519,6 @@ TEST(Node2VecRandomWalks, CachedNode2VecSmallGraph)
   //         if ignored, this creates a data race on the cached
   //         alpha buffer!
   //
-  // <- FIXME: PROBLEM here: BUG to be fixed for cached alpha testing
-  //
   next_node2vec(handle, d_src_v, d_pred_v, d_rnd, d_next_v, n2v_selector);
 
   std::vector<vertex_t> n2v_next_v(v_src_v.size());

From 40f1e02605ec46b3fd79178748a0f1c8d2ab7ffd Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 21 Oct 2021 18:03:17 -0500
Subject: [PATCH 17/25] Added some pre-conditions for running RW with node2vec.

---
 cpp/src/sampling/random_walks.cuh | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 685afae9f50..81e70542717 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -49,6 +49,8 @@
 #include <cassert>
 #include <cstdlib>  // FIXME: requirement for temporary std::getenv()
 #include <ctime>
+#include <limits>
+//
 #include <optional>
 #include <tuple>
 #include <type_traits>
@@ -1141,6 +1143,11 @@ random_walks(raft::handle_t const& handle,
       weight_t p(sampling_strategy->p_);
       weight_t q(sampling_strategy->q_);
 
+      weight_t roundoff = std::numeric_limits<weight_t>::epsilon();
+      CUGRAPH_EXPECTS(p > roundoff, "node2vec p parameter is too small.");
+
+      CUGRAPH_EXPECTS(q > roundoff, "node2vec q parameter is too small.");
+
       detail::node2vec_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}, p, q};
 
       auto quad_tuple =
@@ -1181,6 +1188,11 @@ random_walks(raft::handle_t const& handle,
       weight_t p(sampling_strategy->p_);
       weight_t q(sampling_strategy->q_);
 
+      weight_t roundoff = std::numeric_limits<weight_t>::epsilon();
+      CUGRAPH_EXPECTS(p > roundoff, "node2vec p parameter is too small.");
+
+      CUGRAPH_EXPECTS(q > roundoff, "node2vec q parameter is too small.");
+
       detail::node2vec_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}, p, q};
 
       auto quad_tuple =

From 16efb7afd04949f63be0b40297bf89ef39990fe2 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Thu, 21 Oct 2021 18:16:46 -0500
Subject: [PATCH 18/25] Test RW with node2vec.

---
 cpp/tests/sampling/random_walks_test.cu | 35 ++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/cpp/tests/sampling/random_walks_test.cu b/cpp/tests/sampling/random_walks_test.cu
index 10a417c921d..0a680c1c379 100644
--- a/cpp/tests/sampling/random_walks_test.cu
+++ b/cpp/tests/sampling/random_walks_test.cu
@@ -132,6 +132,10 @@ class Tests_RandomWalks
                                                                           num_paths};
 
     edge_t max_depth{10};
+
+    weight_t p{4};
+    weight_t q{8};
+
     if (trv_id == traversal_id_t::HORIZONTAL) {
       auto ret_tuple =
         cugraph::random_walks(handle,
@@ -140,7 +144,7 @@ class Tests_RandomWalks
                               num_paths,
                               max_depth,
                               false,
-                              std::make_unique<cugraph::sampling_params_t>(sampling_id));
+                              std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q));
 
       // check results:
       //
@@ -173,9 +177,34 @@ class Tests_RandomWalks
           std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n';
 
         ASSERT_TRUE(test_all_paths);
-      } else {
+      } else if (sampling_id == 1) {
         impl_details::biased_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
 
+        auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                         decltype(selector),
+                                                         impl_details::vertical_traversal_t>(
+          handle,  // required to prevent clang-format to separate function name from its namespace
+          graph_view,
+          d_start_view,
+          max_depth,
+          selector);
+
+        // check results:
+        //
+        bool test_all_paths = cugraph::test::host_check_rw_paths(handle,
+                                                                 graph_view,
+                                                                 std::get<0>(ret_tuple),
+                                                                 std::get<1>(ret_tuple),
+                                                                 std::get<2>(ret_tuple));
+
+        if (!test_all_paths)
+          std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n';
+
+        ASSERT_TRUE(test_all_paths);
+      } else {
+        impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+          handle, graph_view, real_t{0}, p, q};
+
         auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                          decltype(selector),
                                                          impl_details::vertical_traversal_t>(
@@ -211,7 +240,7 @@ INSTANTIATE_TEST_SUITE_P(
   simple_test,
   Tests_RandomWalks,
   ::testing::Combine(::testing::Values(traversal_id_t::HORIZONTAL, traversal_id_t::VERTICAL),
-                     ::testing::Values(int{0}, int{1}),
+                     ::testing::Values(int{0}, int{1}, int{2}),
                      ::testing::Values(RandomWalks_Usecase("test/datasets/karate.mtx", true),
                                        RandomWalks_Usecase("test/datasets/web-Google.mtx", true),
                                        RandomWalks_Usecase("test/datasets/ljournal-2008.mtx", true),

From 5f7084bb9dd1835084e513ec48fac4df13a6ab60 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 22 Oct 2021 11:43:29 -0500
Subject: [PATCH 19/25] Added node2vec constraint: requires floating point type
 for weights.

---
 cpp/src/sampling/random_walks.cuh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 81e70542717..772e7eee0ef 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -1126,6 +1126,13 @@ random_walks(raft::handle_t const& handle,
   int selector_type{0};
   if (sampling_strategy) selector_type = static_cast<int>(sampling_strategy->sampling_type_);
 
+  // node2vec is only possible for weight_t being a floating-point type:
+  //
+  if constexpr (!std::is_floating_point_v<weight_t>) {
+    CUGRAPH_EXPECTS(selector_type != static_cast<int>(sampling_strategy_t::NODE2VEC),
+                    "node2vec requires floating point type for weights.");
+  }
+
   if (use_vertical_strategy) {
     if (selector_type == static_cast<int>(sampling_strategy_t::BIASED)) {
       detail::biased_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}};

From fbc8898e3b13a6d28cce8e6359fcb41513a241d6 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 22 Oct 2021 12:41:35 -0500
Subject: [PATCH 20/25] Added node2vec profiling: alpha cache brings about 30%
 speed-up compared to non-cached.

---
 cpp/tests/sampling/random_walks_profiling.cu | 63 +++++++++++++++++---
 1 file changed, 55 insertions(+), 8 deletions(-)

diff --git a/cpp/tests/sampling/random_walks_profiling.cu b/cpp/tests/sampling/random_walks_profiling.cu
index b5aa787ec28..dbbb959baeb 100644
--- a/cpp/tests/sampling/random_walks_profiling.cu
+++ b/cpp/tests/sampling/random_walks_profiling.cu
@@ -87,6 +87,9 @@ void output_random_walks_time(graph_vt const& graph_view,
 
   edge_t max_depth{10};
 
+  weight_t p{4};
+  weight_t q{8};
+
   HighResTimer hr_timer;
   std::string label{};
 
@@ -109,13 +112,32 @@ void output_random_walks_time(graph_vt const& graph_view,
 
       cudaProfilerStop();
       hr_timer.stop();
-    } else {
+    } else if (sampling_id == 1) {
       label = std::string("RandomWalks; Horizontal traversal; biased sampling - ");
       impl_details::biased_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
 
       hr_timer.start(label);
       cudaProfilerStart();
 
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::horizontal_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else {
+      label = std::string("RandomWalks; Horizontal traversal; node2vec sampling - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q, num_paths};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
       auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                        decltype(selector),
                                                        impl_details::horizontal_traversal_t>(
@@ -146,12 +168,31 @@ void output_random_walks_time(graph_vt const& graph_view,
 
       cudaProfilerStop();
       hr_timer.stop();
-    } else {
+    } else if (sampling_id == 1) {
       label = std::string("RandomWalks; Vertical traversal; biased sampling - ");
       impl_details::biased_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
       hr_timer.start(label);
       cudaProfilerStart();
 
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::vertical_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else {
+      label = std::string("RandomWalks; Vertical traversal; node2vec sampling - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q, num_paths};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
       auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                        decltype(selector),
                                                        impl_details::vertical_traversal_t>(
@@ -287,22 +328,28 @@ int main(int argc, char** argv)
   // Run benchmarks
   std::cout << "Using dataset: " << dataset << std::endl;
 
-  std::cout << "# Horizontal traversal strategy:\n";
+  std::cout << "##### Horizontal traversal strategy:\n";
 
-  std::cout << "## Uniform sampling strategy:\n";
+  std::cout << "### Uniform sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 0);
 
-  std::cout << "## Biased sampling strategy:\n";
+  std::cout << "### Biased sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 1);
 
-  std::cout << "# Vertical traversal strategy:\n";
+  std::cout << "### Node2Vec sampling strategy:\n";
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 2);
 
-  std::cout << "## Uniform sampling strategy:\n";
+  std::cout << "##### Vertical traversal strategy:\n";
+
+  std::cout << "### Uniform sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 0);
 
-  std::cout << "## Biased sampling strategy:\n";
+  std::cout << "### Biased sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 1);
 
+  std::cout << "### Node2Vec sampling strategy:\n";
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 2);
+
   // FIXME: consider returning non-zero for situations that warrant it (eg. if
   // the algo ran but the results are invalid, if a benchmark threshold is
   // exceeded, etc.)

From ab235dbaab03df6db6236bb3bdc5a8f2304e9c41 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Fri, 22 Oct 2021 15:36:31 -0500
Subject: [PATCH 21/25] Added node2vec flag for selecting alpha cache.

---
 cpp/include/cugraph/api_helpers.hpp     |  8 ++++++--
 cpp/src/sampling/random_walks.cuh       | 10 ++++++++--
 cpp/tests/sampling/random_walks_test.cu | 18 +++++++++---------
 3 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/cpp/include/cugraph/api_helpers.hpp b/cpp/include/cugraph/api_helpers.hpp
index 9aa82b03847..549313abb90 100644
--- a/cpp/include/cugraph/api_helpers.hpp
+++ b/cpp/include/cugraph/api_helpers.hpp
@@ -28,8 +28,11 @@ enum class sampling_strategy_t : int { UNIFORM = 0, BIASED, NODE2VEC };
 struct sampling_params_t {
   sampling_params_t(void) {}
 
-  sampling_params_t(int sampling_type, double p = 1.0, double q = 1.0)
-    : sampling_type_(static_cast<sampling_strategy_t>(sampling_type)), p_(p), q_(q)
+  sampling_params_t(int sampling_type, double p = 1.0, double q = 1.0, bool use_alpha_cache = false)
+    : sampling_type_(static_cast<sampling_strategy_t>(sampling_type)),
+      p_(p),
+      q_(q),
+      use_alpha_cache_(use_alpha_cache)
   {
   }
 
@@ -39,5 +42,6 @@ struct sampling_params_t {
   //
   double p_;
   double q_;
+  bool use_alpha_cache_{false};
 };
 }  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 772e7eee0ef..dfcd589e14e 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -1150,12 +1150,15 @@ random_walks(raft::handle_t const& handle,
       weight_t p(sampling_strategy->p_);
       weight_t q(sampling_strategy->q_);
 
+      edge_t alpha_num_paths = sampling_strategy->use_alpha_cache_ ? num_paths : 0;
+
       weight_t roundoff = std::numeric_limits<weight_t>::epsilon();
       CUGRAPH_EXPECTS(p > roundoff, "node2vec p parameter is too small.");
 
       CUGRAPH_EXPECTS(q > roundoff, "node2vec q parameter is too small.");
 
-      detail::node2vec_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}, p, q};
+      detail::node2vec_selector_t<graph_t, real_t> selector{
+        handle, graph, real_t{0}, p, q, alpha_num_paths};
 
       auto quad_tuple =
         detail::random_walks_impl<graph_t, decltype(selector), detail::vertical_traversal_t>(
@@ -1195,12 +1198,15 @@ random_walks(raft::handle_t const& handle,
       weight_t p(sampling_strategy->p_);
       weight_t q(sampling_strategy->q_);
 
+      edge_t alpha_num_paths = sampling_strategy->use_alpha_cache_ ? num_paths : 0;
+
       weight_t roundoff = std::numeric_limits<weight_t>::epsilon();
       CUGRAPH_EXPECTS(p > roundoff, "node2vec p parameter is too small.");
 
       CUGRAPH_EXPECTS(q > roundoff, "node2vec q parameter is too small.");
 
-      detail::node2vec_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}, p, q};
+      detail::node2vec_selector_t<graph_t, real_t> selector{
+        handle, graph, real_t{0}, p, q, alpha_num_paths};
 
       auto quad_tuple =
         detail::random_walks_impl(handle, graph, d_v_start, max_depth, selector, use_padding);
diff --git a/cpp/tests/sampling/random_walks_test.cu b/cpp/tests/sampling/random_walks_test.cu
index 0a680c1c379..5ab19587f11 100644
--- a/cpp/tests/sampling/random_walks_test.cu
+++ b/cpp/tests/sampling/random_walks_test.cu
@@ -137,14 +137,14 @@ class Tests_RandomWalks
     weight_t q{8};
 
     if (trv_id == traversal_id_t::HORIZONTAL) {
-      auto ret_tuple =
-        cugraph::random_walks(handle,
-                              graph_view,
-                              d_start_view.begin(),
-                              num_paths,
-                              max_depth,
-                              false,
-                              std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q));
+      auto ret_tuple = cugraph::random_walks(
+        handle,
+        graph_view,
+        d_start_view.begin(),
+        num_paths,
+        max_depth,
+        false,
+        std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q, true));
 
       // check results:
       //
@@ -203,7 +203,7 @@ class Tests_RandomWalks
         ASSERT_TRUE(test_all_paths);
       } else {
         impl_details::node2vec_selector_t<graph_vt, real_t> selector{
-          handle, graph_view, real_t{0}, p, q};
+          handle, graph_view, real_t{0}, p, q, num_paths};
 
         auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                          decltype(selector),

From bee6048a753c7ad5dd1d2d90737005634b206c47 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Mon, 25 Oct 2021 11:22:57 -0500
Subject: [PATCH 22/25] Tests for node2vec with and without alpha cache.

---
 cpp/tests/sampling/random_walks_test.cu | 58 +++++++++++++++++++------
 1 file changed, 45 insertions(+), 13 deletions(-)

diff --git a/cpp/tests/sampling/random_walks_test.cu b/cpp/tests/sampling/random_walks_test.cu
index 5ab19587f11..4e1ee4719b9 100644
--- a/cpp/tests/sampling/random_walks_test.cu
+++ b/cpp/tests/sampling/random_walks_test.cu
@@ -137,21 +137,53 @@ class Tests_RandomWalks
     weight_t q{8};
 
     if (trv_id == traversal_id_t::HORIZONTAL) {
-      auto ret_tuple = cugraph::random_walks(
-        handle,
-        graph_view,
-        d_start_view.begin(),
-        num_paths,
-        max_depth,
-        false,
-        std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q, true));
-
-      // check results:
+      // `node2vec` without alpha buffer:
       //
-      bool test_all_paths = cugraph::test::host_check_rw_paths(
-        handle, graph_view, std::get<0>(ret_tuple), std::get<1>(ret_tuple), std::get<2>(ret_tuple));
+      if (sampling_id == 2) {
+        auto ret_tuple = cugraph::random_walks(
+          handle,
+          graph_view,
+          d_start_view.begin(),
+          num_paths,
+          max_depth,
+          false,
+          std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q, false));
+
+        // check results:
+        //
+        bool test_all_paths = cugraph::test::host_check_rw_paths(handle,
+                                                                 graph_view,
+                                                                 std::get<0>(ret_tuple),
+                                                                 std::get<1>(ret_tuple),
+                                                                 std::get<2>(ret_tuple));
 
-      ASSERT_TRUE(test_all_paths);
+        ASSERT_TRUE(test_all_paths);
+      }
+
+      // the alpha buffer case must also be tested for `node2vec`;
+      // for the other sampling methods the flag is irrelevant,
+      // so this block runs for every sampling method:
+      //
+      {
+        auto ret_tuple = cugraph::random_walks(
+          handle,
+          graph_view,
+          d_start_view.begin(),
+          num_paths,
+          max_depth,
+          false,
+          std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q, true));
+
+        // check results:
+        //
+        bool test_all_paths = cugraph::test::host_check_rw_paths(handle,
+                                                                 graph_view,
+                                                                 std::get<0>(ret_tuple),
+                                                                 std::get<1>(ret_tuple),
+                                                                 std::get<2>(ret_tuple));
+
+        ASSERT_TRUE(test_all_paths);
+      }
     } else {  // VERTICAL: needs to be force-called via detail
       if (sampling_id == 0) {
         impl_details::uniform_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};

From b80b7593975836cb58167a69e087174c4469163f Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Mon, 25 Oct 2021 11:49:03 -0500
Subject: [PATCH 23/25] Profiling for node2vec with and without alpha cache.

---
 cpp/tests/sampling/random_walks_profiling.cu | 51 ++++++++++++++++++--
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/cpp/tests/sampling/random_walks_profiling.cu b/cpp/tests/sampling/random_walks_profiling.cu
index dbbb959baeb..b97264ce9f7 100644
--- a/cpp/tests/sampling/random_walks_profiling.cu
+++ b/cpp/tests/sampling/random_walks_profiling.cu
@@ -130,14 +130,35 @@ void output_random_walks_time(graph_vt const& graph_view,
 
       cudaProfilerStop();
       hr_timer.stop();
-    } else {
-      label = std::string("RandomWalks; Horizontal traversal; node2vec sampling - ");
+    } else if (sampling_id == 2) {
+      label =
+        std::string("RandomWalks; Horizontal traversal; node2vec sampling with alpha cache - ");
       impl_details::node2vec_selector_t<graph_vt, real_t> selector{
         handle, graph_view, real_t{0}, p, q, num_paths};
 
       hr_timer.start(label);
       cudaProfilerStart();
 
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::horizontal_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else {
+      label =
+        std::string("RandomWalks; Horizontal traversal; node2vec sampling without alpha cache - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
       auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                        decltype(selector),
                                                        impl_details::horizontal_traversal_t>(
@@ -185,14 +206,34 @@ void output_random_walks_time(graph_vt const& graph_view,
 
       cudaProfilerStop();
       hr_timer.stop();
-    } else {
-      label = std::string("RandomWalks; Vertical traversal; node2vec sampling - ");
+    } else if (sampling_id == 2) {
+      label = std::string("RandomWalks; Vertical traversal; node2vec sampling with alpha cache - ");
       impl_details::node2vec_selector_t<graph_vt, real_t> selector{
         handle, graph_view, real_t{0}, p, q, num_paths};
 
       hr_timer.start(label);
       cudaProfilerStart();
 
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::vertical_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else {
+      label =
+        std::string("RandomWalks; Vertical traversal; node2vec sampling without alpha cache - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
       auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                        decltype(selector),
                                                        impl_details::vertical_traversal_t>(
@@ -338,6 +379,7 @@ int main(int argc, char** argv)
 
   std::cout << "### Node2Vec sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 2);
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 3);
 
   std::cout << "##### Vertical traversal strategy:\n";
 
@@ -349,6 +391,7 @@ int main(int argc, char** argv)
 
   std::cout << "### Node2Vec sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 2);
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 3);
 
   // FIXME: consider returning non-zero for situations that warrant it (eg. if
   // the algo ran but the results are invalid, if a benchmark threshold is

From 8ff3cdf7b90675da79a0dfa7e01c201a8198e147 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Wed, 27 Oct 2021 16:29:28 -0500
Subject: [PATCH 24/25] Addressed review on compute_max_degree().

---
 cpp/src/sampling/rw_traversals.hpp | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index def64477c54..ad16184447f 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -552,7 +552,7 @@ struct node2vec_selector_t {
                       weight_t p,
                       weight_t q,
                       edge_t num_paths = 0)
-    : max_out_degree_{num_paths > 0 ? get_max_out_degree(handle, graph) : 0},
+    : max_out_degree_(num_paths > 0 ? graph.compute_max_out_degree(handle) : 0),
       d_coalesced_alpha_{max_out_degree_ * num_paths, handle.get_stream()},
       sampler_{graph.get_matrix_partition_view().get_offsets(),
                graph.get_matrix_partition_view().get_indices(),
@@ -569,19 +569,6 @@ struct node2vec_selector_t {
 
   sampler_t const& get_strategy(void) const { return sampler_; }
 
-  static size_t get_max_out_degree(raft::handle_t const& handle, graph_type const& graph)
-  {
-    using edge_t = node2vec_selector_t::edge_t;
-
-    auto&& d_out_degs = graph.compute_out_degrees(handle);
-
-    return thrust::reduce(handle.get_thrust_policy(),
-                          d_out_degs.begin(),
-                          d_out_degs.end(),
-                          edge_t{0},
-                          thrust::maximum<edge_t>{});
-  }
-
   device_vec_t<weight_t> const& get_alpha_cache(void) const { return d_coalesced_alpha_; }
 
  private:

From a9abf93728da129442e868006dcc1f79dd32cead Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <aschaffer@nvidia.com>
Date: Wed, 27 Oct 2021 18:09:46 -0500
Subject: [PATCH 25/25] Addressed review on unused args warnings.

---
 cpp/src/sampling/rw_traversals.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index ad16184447f..2d964542b97 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -122,9 +122,9 @@ struct uniform_selector_t {
     __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
       vertex_t src_v,
       real_t rnd_val,
-      vertex_t prev_v   = 0 /* not used*/,
-      edge_t path_index = 0 /* not used*/,
-      bool start_path   = false /* not used*/) const
+      vertex_t = 0 /* not used*/,
+      edge_t   = 0 /* not used*/,
+      bool     = false /* not used*/) const
     {
       auto crt_out_deg = ptr_d_cache_out_degs_[src_v];
       if (crt_out_deg == 0) return thrust::nullopt;  // src_v is a sink
@@ -287,9 +287,9 @@ struct biased_selector_t {
     __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
       vertex_t src_v,
       real_t rnd_val,
-      vertex_t prev_v   = 0 /* not used*/,
-      edge_t path_index = 0 /* not used*/,
-      bool start_path   = false /* not used*/) const
+      vertex_t = 0 /* not used*/,
+      edge_t   = 0 /* not used*/,
+      bool     = false /* not used*/) const
     {
       weight_t run_sum_w{0};
       auto rnd_sum_weights = rnd_val * ptr_d_sum_weights_[src_v];