diff --git a/cpp/include/cugraph/api_helpers.hpp b/cpp/include/cugraph/api_helpers.hpp
index 9aa82b03847..549313abb90 100644
--- a/cpp/include/cugraph/api_helpers.hpp
+++ b/cpp/include/cugraph/api_helpers.hpp
@@ -28,8 +28,11 @@ enum class sampling_strategy_t : int { UNIFORM = 0, BIASED, NODE2VEC };
 struct sampling_params_t {
   sampling_params_t(void) {}
 
-  sampling_params_t(int sampling_type, double p = 1.0, double q = 1.0)
-    : sampling_type_(static_cast<sampling_strategy_t>(sampling_type)), p_(p), q_(q)
+  sampling_params_t(int sampling_type, double p = 1.0, double q = 1.0, bool use_alpha_cache = false)
+    : sampling_type_(static_cast<sampling_strategy_t>(sampling_type)),
+      p_(p),
+      q_(q),
+      use_alpha_cache_(use_alpha_cache)
   {
   }
 
@@ -39,5 +42,6 @@ struct sampling_params_t {
   //
   double p_;
   double q_;
+  bool use_alpha_cache_{false};
 };
 }  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
index 72eb326fbe9..dfcd589e14e 100644
--- a/cpp/src/sampling/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -49,6 +49,8 @@
 #include <cassert>
 #include <cstdlib>  // FIXME: requirement for temporary std::getenv()
 #include <ctime>
+#include <limits>
+//
 #include <optional>
 #include <tuple>
 #include <type_traits>
@@ -278,9 +280,19 @@ struct col_indx_extract_t<graph_t, index_t, std::enable_if_t<graph_t::is_multi_g
                        auto start_v_pos  = chunk_offset + delta;
                        auto start_w_pos  = chunk_offset - path_indx + delta;
 
-                       auto src_v         = ptr_coalesced_v[start_v_pos];
-                       auto rnd_val       = ptr_d_random[path_indx];
-                       auto opt_tpl_vn_wn = sampler(src_v, rnd_val);
+                       auto src_v   = ptr_coalesced_v[start_v_pos];
+                       auto rnd_val = ptr_d_random[path_indx];
+
+                       // `node2vec` info:
+                       //
+                       bool start_path = true;
+                       auto prev_v     = src_v;
+                       if (delta > 0) {
+                         start_path = false;
+                         prev_v     = ptr_coalesced_v[start_v_pos - 1];
+                       }
+
+                       auto opt_tpl_vn_wn = sampler(src_v, rnd_val, prev_v, path_indx, start_path);
 
                        if (opt_tpl_vn_wn.has_value()) {
                          auto src_vertex = thrust::get<0>(*opt_tpl_vn_wn);
@@ -1114,10 +1126,40 @@ random_walks(raft::handle_t const& handle,
   int selector_type{0};
   if (sampling_strategy) selector_type = static_cast<int>(sampling_strategy->sampling_type_);
 
+  // node2vec is only possible for weight_t being a floating-point type:
+  //
+  if constexpr (!std::is_floating_point_v<weight_t>) {
+    CUGRAPH_EXPECTS(selector_type != static_cast<int>(sampling_strategy_t::NODE2VEC),
+                    "node2vec requires floating point type for weights.");
+  }
+
   if (use_vertical_strategy) {
     if (selector_type == static_cast<int>(sampling_strategy_t::BIASED)) {
       detail::biased_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}};
 
+      auto quad_tuple =
+        detail::random_walks_impl<graph_t, decltype(selector), detail::vertical_traversal_t>(
+          handle, graph, d_v_start, max_depth, selector, use_padding);
+      // ignore last element of the quad, seed,
+      // since it's meant for testing / debugging, only:
+      //
+      return std::make_tuple(std::move(std::get<0>(quad_tuple)),
+                             std::move(std::get<1>(quad_tuple)),
+                             std::move(std::get<2>(quad_tuple)));
+    } else if (selector_type == static_cast<int>(sampling_strategy_t::NODE2VEC)) {
+      weight_t p(sampling_strategy->p_);
+      weight_t q(sampling_strategy->q_);
+
+      edge_t alpha_num_paths = sampling_strategy->use_alpha_cache_ ? num_paths : 0;
+
+      weight_t roundoff = std::numeric_limits<weight_t>::epsilon();
+      CUGRAPH_EXPECTS(p > roundoff, "node2vec p parameter is too small.");
+
+      CUGRAPH_EXPECTS(q > roundoff, "node2vec q parameter is too small.");
+
+      detail::node2vec_selector_t<graph_t, real_t> selector{
+        handle, graph, real_t{0}, p, q, alpha_num_paths};
+
       auto quad_tuple =
         detail::random_walks_impl<graph_t, decltype(selector), detail::vertical_traversal_t>(
           handle, graph, d_v_start, max_depth, selector, use_padding);
@@ -1140,10 +1182,32 @@ random_walks(raft::handle_t const& handle,
                              std::move(std::get<1>(quad_tuple)),
                              std::move(std::get<2>(quad_tuple)));
     }
-  } else {
+  } else {  // horizontal traversal strategy
     if (selector_type == static_cast<int>(sampling_strategy_t::BIASED)) {
       detail::biased_selector_t<graph_t, real_t> selector{handle, graph, real_t{0}};
 
+      auto quad_tuple =
+        detail::random_walks_impl(handle, graph, d_v_start, max_depth, selector, use_padding);
+      // ignore last element of the quad, seed,
+      // since it's meant for testing / debugging, only:
+      //
+      return std::make_tuple(std::move(std::get<0>(quad_tuple)),
+                             std::move(std::get<1>(quad_tuple)),
+                             std::move(std::get<2>(quad_tuple)));
+    } else if (selector_type == static_cast<int>(sampling_strategy_t::NODE2VEC)) {
+      weight_t p(sampling_strategy->p_);
+      weight_t q(sampling_strategy->q_);
+
+      edge_t alpha_num_paths = sampling_strategy->use_alpha_cache_ ? num_paths : 0;
+
+      weight_t roundoff = std::numeric_limits<weight_t>::epsilon();
+      CUGRAPH_EXPECTS(p > roundoff, "node2vec p parameter is too small.");
+
+      CUGRAPH_EXPECTS(q > roundoff, "node2vec q parameter is too small.");
+
+      detail::node2vec_selector_t<graph_t, real_t> selector{
+        handle, graph, real_t{0}, p, q, alpha_num_paths};
+
       auto quad_tuple =
         detail::random_walks_impl(handle, graph, d_v_start, max_depth, selector, use_padding);
       // ignore last element of the quad, seed,
diff --git a/cpp/src/sampling/rw_traversals.hpp b/cpp/src/sampling/rw_traversals.hpp
index 625f7074c7f..2d964542b97 100644
--- a/cpp/src/sampling/rw_traversals.hpp
+++ b/cpp/src/sampling/rw_traversals.hpp
@@ -119,8 +119,12 @@ struct uniform_selector_t {
     {
     }
 
-    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(vertex_t src_v,
-                                                                              real_t rnd_val) const
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
+      vertex_t src_v,
+      real_t rnd_val,
+      vertex_t = 0 /* not used*/,
+      edge_t   = 0 /* not used*/,
+      bool     = false /* not used*/) const
     {
       auto crt_out_deg = ptr_d_cache_out_degs_[src_v];
       if (crt_out_deg == 0) return thrust::nullopt;  // src_v is a sink
@@ -260,10 +264,6 @@ struct visitor_aggregate_weights_t : visitors::visitor_t {
 
 // Biased RW selection logic:
 //
-// FIXME:
-// 1. move sum weights calculation into selector;
-// 2. pass graph_view to constructor;
-//
 template <typename graph_type, typename real_t>
 struct biased_selector_t {
   using vertex_t = typename graph_type::vertex_type;
@@ -284,8 +284,12 @@ struct biased_selector_t {
     // Sum(weights(neighborhood(src_v))) are pre-computed and
     // stored in ptr_d_sum_weights_ (too expensive to check, here);
     //
-    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(vertex_t src_v,
-                                                                              real_t rnd_val) const
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
+      vertex_t src_v,
+      real_t rnd_val,
+      vertex_t = 0 /* not used*/,
+      edge_t   = 0 /* not used*/,
+      bool     = false /* not used*/) const
     {
       weight_t run_sum_w{0};
       auto rnd_sum_weights = rnd_val * ptr_d_sum_weights_[src_v];
@@ -338,6 +342,248 @@ struct biased_selector_t {
   sampler_t sampler_;
 };
 
+// node2vec RW selection logic:
+// uses biased selection on scaled weights,
+// to be computed (and possibly cached) according to
+// `node2vec` logic (see `get_alpha()`);
+// works on unweighted graphs (for which unscaled weights are 1.0);
+//
+// TODO: revisit the logic for the very 1st step of a traversal
+//       (no `prev_v` vertex; currently plain biased selection is used);
+//
+template <typename graph_type, typename real_t>
+struct node2vec_selector_t {
+  using vertex_t = typename graph_type::vertex_type;
+  using edge_t   = typename graph_type::edge_type;
+  using weight_t = typename graph_type::weight_type;
+
+  struct sampler_t {
+    sampler_t(edge_t const* ro,
+              vertex_t const* ci,
+              weight_t const* w,
+              weight_t p,
+              weight_t q,
+              vertex_t max_degree,
+              edge_t num_paths,
+              weight_t* ptr_alpha)
+      : row_offsets_(ro),
+        col_indices_(ci),
+        values_(w),
+        p_(p),
+        q_(q),
+        coalesced_alpha_{
+          (max_degree > 0) && (num_paths > 0) && (ptr_alpha != nullptr)
+            ? thrust::optional<thrust::tuple<vertex_t, edge_t, weight_t*>>{thrust::make_tuple(
+                max_degree, num_paths, ptr_alpha)}
+            : thrust::nullopt}
+    {
+    }
+
+    // node2vec alpha scaling factor for the transition src_v -> next_v, given prev_v:
+    // 1/p when returning to prev_v; 1 when next_v is adjacent to prev_v; 1/q otherwise;
+    // pre-condition: each row of column_indices[] is sorted (needed by the binary search);
+    //
+    __device__ weight_t get_alpha(vertex_t prev_v, vertex_t src_v, vertex_t next_v) const
+    {
+      if (next_v == prev_v) {
+        return weight_t{1} / p_;  // stay in weight_t: avoids fp64 math when weight_t is float
+      } else {
+        // binary-search `next_v` in the adj(prev_v)
+        //
+        auto prev_indx_begin = row_offsets_[prev_v];
+        auto prev_indx_end   = row_offsets_[prev_v + 1];
+
+        auto found_next_from_prev = thrust::binary_search(
+          thrust::seq, col_indices_ + prev_indx_begin, col_indices_ + prev_indx_end, next_v);
+
+        if (found_next_from_prev) {
+          return weight_t{1};
+        } else {
+          return weight_t{1} / q_;
+        }
+      }
+    }
+
+    // samples a neighbor of `src_v` by cumulative (alpha-scaled) weights driven by `rnd_val`;
+    // returns (neighbor, unscaled edge weight), or nullopt when `src_v` is a sink;
+    __device__ thrust::optional<thrust::tuple<vertex_t, weight_t>> operator()(
+      vertex_t src_v, real_t rnd_val, vertex_t prev_v, edge_t path_index, bool start_path) const
+    {
+      auto const offset_indx_begin = row_offsets_[src_v];
+      auto const offset_indx_end   = row_offsets_[src_v + 1];
+
+      weight_t sum_scaled_weights{0};
+      auto offset_indx = offset_indx_begin;
+
+      if (offset_indx_begin == offset_indx_end) return thrust::nullopt;  // src_v is a sink
+
+      // for 1st vertex in path just use biased random selection:
+      //
+      if (start_path) {  // `src_v` is starting vertex in path
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+
+          sum_scaled_weights += crt_weight;
+        }
+
+        weight_t run_sum_w{0};
+        auto rnd_sum_weights  = rnd_val * sum_scaled_weights;
+        offset_indx           = offset_indx_begin;
+        auto prev_offset_indx = offset_indx;
+
+        // biased sampling selection loop
+        // (plain weights: no alpha scaling is applied on the 1st step);
+        //
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          if (rnd_sum_weights < run_sum_w) break;
+
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+          run_sum_w += crt_weight;
+          prev_offset_indx = offset_indx;
+        }
+        return thrust::optional{
+          thrust::make_tuple(col_indices_[prev_offset_indx],
+                             values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
+      }
+
+      // cached solution, for increased performance, but memory expensive:
+      //
+      if (coalesced_alpha_.has_value()) {
+        auto&& tpl = *coalesced_alpha_;
+
+        auto max_out_deg               = thrust::get<0>(tpl);
+        auto num_paths                 = thrust::get<1>(tpl);
+        weight_t* ptr_d_scaled_weights = thrust::get<2>(tpl);
+
+        // sum-scaled-weights reduction loop
+        // (cache offset in size_t: max_out_deg * path_index can exceed the 32-bit range):
+        size_t const start_alpha_offset = static_cast<size_t>(max_out_deg) * path_index;
+        for (vertex_t nghbr_indx = 0; offset_indx < offset_indx_end; ++offset_indx, ++nghbr_indx) {
+          auto crt_alpha      = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+          auto scaled_weight  = crt_weight * crt_alpha;
+
+          // caching is available, hence cache the alpha's for next step
+          // (the actual sampling step);
+          //
+          ptr_d_scaled_weights[start_alpha_offset + nghbr_indx] = scaled_weight;
+
+          sum_scaled_weights += scaled_weight;
+        }
+
+        weight_t run_sum_w{0};
+        auto rnd_sum_weights  = rnd_val * sum_scaled_weights;
+        offset_indx           = offset_indx_begin;
+        auto prev_offset_indx = offset_indx;
+
+        // biased sampling selection loop:
+        //
+        for (vertex_t nghbr_indx = 0; offset_indx < offset_indx_end; ++offset_indx, ++nghbr_indx) {
+          if (rnd_sum_weights < run_sum_w) break;
+
+          run_sum_w += ptr_d_scaled_weights[start_alpha_offset + nghbr_indx];
+          prev_offset_indx = offset_indx;
+        }
+        return thrust::optional{
+          thrust::make_tuple(col_indices_[prev_offset_indx],
+                             values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
+
+      } else {  // uncached solution, with much lower memory footprint but not as efficient
+
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          auto crt_alpha = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+          auto scaled_weight = crt_weight * crt_alpha;
+          sum_scaled_weights += scaled_weight;
+        }
+
+        weight_t run_sum_w{0};
+        auto rnd_sum_weights  = rnd_val * sum_scaled_weights;
+        offset_indx           = offset_indx_begin;
+        auto prev_offset_indx = offset_indx;
+
+        // biased sampling selection loop:
+        // (Note: re-compute `scaled_weight`, since no cache is available);
+        //
+        for (; offset_indx < offset_indx_end; ++offset_indx) {
+          if (rnd_sum_weights < run_sum_w) break;
+
+          auto crt_alpha      = get_alpha(prev_v, src_v, col_indices_[offset_indx]);
+          weight_t crt_weight = (values_ == nullptr ? weight_t{1} : values_[offset_indx]);
+          auto scaled_weight  = crt_weight * crt_alpha;
+
+          run_sum_w += scaled_weight;
+          prev_offset_indx = offset_indx;
+        }
+        return thrust::optional{
+          thrust::make_tuple(col_indices_[prev_offset_indx],
+                             values_ == nullptr ? weight_t{1} : values_[prev_offset_indx])};
+      }
+    }
+
+    decltype(auto) get_alpha_buffer(void) const { return coalesced_alpha_; }
+
+   private:
+    edge_t const* row_offsets_;
+    vertex_t const* col_indices_;
+    weight_t const* values_;
+
+    weight_t const p_;
+    weight_t const q_;
+
+    // alpha scaling coalesced buffer (per path):
+    // (use as cache since the per-path alpha-buffer
+    //  is used twice for each node transition:
+    //  (1) for computing sum_scaled weights;
+    //  (2) for using scaled_weights for the biased next vertex selection)
+    // this is information related to a scratchpad buffer, used as cache, hence mutable;
+    // (necessary, because get_strategy() is const)
+    //
+    mutable thrust::optional<thrust::tuple<vertex_t, edge_t, weight_t*>>
+      coalesced_alpha_;  // tuple<max_vertex_degree,
+                         // num_paths, alpha_buffer[max_vertex_degree*num_paths]>
+  };
+
+  using sampler_type = sampler_t;
+
+  node2vec_selector_t(raft::handle_t const& handle,
+                      graph_type const& graph,
+                      real_t tag,
+                      weight_t p,
+                      weight_t q,
+                      edge_t num_paths = 0)
+    : max_out_degree_(num_paths > 0 ? graph.compute_max_out_degree(handle) : 0),
+      d_coalesced_alpha_{max_out_degree_ * num_paths, handle.get_stream()},
+      sampler_{graph.get_matrix_partition_view().get_offsets(),
+               graph.get_matrix_partition_view().get_indices(),
+               graph.get_matrix_partition_view().get_weights()
+                 ? *(graph.get_matrix_partition_view().get_weights())
+                 : static_cast<weight_t*>(nullptr),
+               p,
+               q,
+               static_cast<vertex_t>(max_out_degree_),
+               num_paths,
+               raw_ptr(d_coalesced_alpha_)}
+  {
+  }
+
+  sampler_t const& get_strategy(void) const { return sampler_; }
+
+  device_vec_t<weight_t> const& get_alpha_cache(void) const { return d_coalesced_alpha_; }
+
+ private:
+  size_t max_out_degree_{0};
+
+  // alpha scaling coalesced buffer (per path):
+  // (use as cache since the per-path alpha-buffer
+  //  is used twice for each node transition:
+  //  (1) for computing sum_scaled weights;
+  //  (2) for using scaled_weights for the biased next vertex selection)
+  //
+  device_vec_t<weight_t> d_coalesced_alpha_;
+  sampler_t sampler_;
+};
+
 // classes abstracting the way the random walks path are generated:
 //
 
@@ -470,6 +716,8 @@ struct horizontal_traversal_t {
                       sampler] __device__(auto path_index) {
                        auto chunk_offset   = path_index * max_depth;
                        vertex_t src_vertex = ptr_coalesced_v[chunk_offset];
+                       auto prev_v         = src_vertex;
+                       bool start_path     = true;
 
                        for (index_t step_indx = 1; step_indx < max_depth; ++step_indx) {
                          // indexing into coalesced arrays of size num_paths x (max_depth -1):
@@ -479,9 +727,13 @@ struct horizontal_traversal_t {
 
                          auto real_rnd_indx = ptr_d_random[stepping_index];
 
-                         auto opt_tpl_vn_wn = sampler(src_vertex, real_rnd_indx);
+                         auto opt_tpl_vn_wn =
+                           sampler(src_vertex, real_rnd_indx, prev_v, path_index, start_path);
                          if (!opt_tpl_vn_wn.has_value()) break;
 
+                         prev_v     = src_vertex;
+                         start_path = false;
+
                          src_vertex      = thrust::get<0>(*opt_tpl_vn_wn);
                          auto crt_weight = thrust::get<1>(*opt_tpl_vn_wn);
 
@@ -502,7 +754,7 @@ struct horizontal_traversal_t {
  private:
   size_t num_paths_;
   size_t max_depth_;
-};  // namespace detail
+};
 
 }  // namespace detail
 }  // namespace cugraph
diff --git a/cpp/tests/sampling/random_walks_profiling.cu b/cpp/tests/sampling/random_walks_profiling.cu
index b5aa787ec28..b97264ce9f7 100644
--- a/cpp/tests/sampling/random_walks_profiling.cu
+++ b/cpp/tests/sampling/random_walks_profiling.cu
@@ -87,6 +87,9 @@ void output_random_walks_time(graph_vt const& graph_view,
 
   edge_t max_depth{10};
 
+  weight_t p{4};
+  weight_t q{8};
+
   HighResTimer hr_timer;
   std::string label{};
 
@@ -109,13 +112,53 @@ void output_random_walks_time(graph_vt const& graph_view,
 
       cudaProfilerStop();
       hr_timer.stop();
-    } else {
+    } else if (sampling_id == 1) {
       label = std::string("RandomWalks; Horizontal traversal; biased sampling - ");
       impl_details::biased_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
 
       hr_timer.start(label);
       cudaProfilerStart();
 
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::horizontal_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else if (sampling_id == 2) {
+      label =
+        std::string("RandomWalks; Horizontal traversal; node2vec sampling with alpha cache - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q, num_paths};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::horizontal_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else {
+      label =
+        std::string("RandomWalks; Horizontal traversal; node2vec sampling without alpha cache - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
       auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                        decltype(selector),
                                                        impl_details::horizontal_traversal_t>(
@@ -146,12 +189,51 @@ void output_random_walks_time(graph_vt const& graph_view,
 
       cudaProfilerStop();
       hr_timer.stop();
-    } else {
+    } else if (sampling_id == 1) {
       label = std::string("RandomWalks; Vertical traversal; biased sampling - ");
       impl_details::biased_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
       hr_timer.start(label);
       cudaProfilerStart();
 
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::vertical_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else if (sampling_id == 2) {
+      label = std::string("RandomWalks; Vertical traversal; node2vec sampling with alpha cache - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q, num_paths};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
+      auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                       decltype(selector),
+                                                       impl_details::vertical_traversal_t>(
+        handle,  // prevent clang-format to separate function name from its namespace
+        graph_view,
+        d_start_view,
+        max_depth,
+        selector);
+
+      cudaProfilerStop();
+      hr_timer.stop();
+    } else {
+      label =
+        std::string("RandomWalks; Vertical traversal; node2vec sampling without alpha cache - ");
+      impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+        handle, graph_view, real_t{0}, p, q};
+
+      hr_timer.start(label);
+      cudaProfilerStart();
+
       auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                        decltype(selector),
                                                        impl_details::vertical_traversal_t>(
@@ -287,22 +369,30 @@ int main(int argc, char** argv)
   // Run benchmarks
   std::cout << "Using dataset: " << dataset << std::endl;
 
-  std::cout << "# Horizontal traversal strategy:\n";
+  std::cout << "##### Horizontal traversal strategy:\n";
 
-  std::cout << "## Uniform sampling strategy:\n";
+  std::cout << "### Uniform sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 0);
 
-  std::cout << "## Biased sampling strategy:\n";
+  std::cout << "### Biased sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 1);
 
-  std::cout << "# Vertical traversal strategy:\n";
+  std::cout << "### Node2Vec sampling strategy:\n";
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 2);
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::HORIZONTAL, 3);
 
-  std::cout << "## Uniform sampling strategy:\n";
+  std::cout << "##### Vertical traversal strategy:\n";
+
+  std::cout << "### Uniform sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 0);
 
-  std::cout << "## Biased sampling strategy:\n";
+  std::cout << "### Biased sampling strategy:\n";
   run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 1);
 
+  std::cout << "### Node2Vec sampling strategy:\n";
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 2);
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset, true), traversal_id_t::VERTICAL, 3);
+
   // FIXME: consider returning non-zero for situations that warrant it (eg. if
   // the algo ran but the results are invalid, if a benchmark threshold is
   // exceeded, etc.)
diff --git a/cpp/tests/sampling/random_walks_test.cu b/cpp/tests/sampling/random_walks_test.cu
index 10a417c921d..4e1ee4719b9 100644
--- a/cpp/tests/sampling/random_walks_test.cu
+++ b/cpp/tests/sampling/random_walks_test.cu
@@ -132,22 +132,58 @@ class Tests_RandomWalks
                                                                           num_paths};
 
     edge_t max_depth{10};
+
+    weight_t p{4};
+    weight_t q{8};
+
     if (trv_id == traversal_id_t::HORIZONTAL) {
-      auto ret_tuple =
-        cugraph::random_walks(handle,
-                              graph_view,
-                              d_start_view.begin(),
-                              num_paths,
-                              max_depth,
-                              false,
-                              std::make_unique<cugraph::sampling_params_t>(sampling_id));
-
-      // check results:
+      // `node2vec` without alpha buffer:
       //
-      bool test_all_paths = cugraph::test::host_check_rw_paths(
-        handle, graph_view, std::get<0>(ret_tuple), std::get<1>(ret_tuple), std::get<2>(ret_tuple));
+      if (sampling_id == 2) {
+        auto ret_tuple = cugraph::random_walks(
+          handle,
+          graph_view,
+          d_start_view.begin(),
+          num_paths,
+          max_depth,
+          false,
+          std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q, false));
+
+        // check results:
+        //
+        bool test_all_paths = cugraph::test::host_check_rw_paths(handle,
+                                                                 graph_view,
+                                                                 std::get<0>(ret_tuple),
+                                                                 std::get<1>(ret_tuple),
+                                                                 std::get<2>(ret_tuple));
+
+        ASSERT_TRUE(test_all_paths);
+      }
+
+      // always run with the alpha buffer enabled: for `node2vec` this
+      // covers the cached case, while for the other sampling methods the
+      // flag is irrelevant, so this block is their one and only run:
+      //
+      {
+        auto ret_tuple = cugraph::random_walks(
+          handle,
+          graph_view,
+          d_start_view.begin(),
+          num_paths,
+          max_depth,
+          false,
+          std::make_unique<cugraph::sampling_params_t>(sampling_id, p, q, true));
+
+        // check results:
+        //
+        bool test_all_paths = cugraph::test::host_check_rw_paths(handle,
+                                                                 graph_view,
+                                                                 std::get<0>(ret_tuple),
+                                                                 std::get<1>(ret_tuple),
+                                                                 std::get<2>(ret_tuple));
 
-      ASSERT_TRUE(test_all_paths);
+        ASSERT_TRUE(test_all_paths);
+      }
     } else {  // VERTICAL: needs to be force-called via detail
       if (sampling_id == 0) {
         impl_details::uniform_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
@@ -173,9 +209,34 @@ class Tests_RandomWalks
           std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n';
 
         ASSERT_TRUE(test_all_paths);
-      } else {
+      } else if (sampling_id == 1) {
         impl_details::biased_selector_t<graph_vt, real_t> selector{handle, graph_view, real_t{0}};
 
+        auto ret_tuple = impl_details::random_walks_impl<graph_vt,
+                                                         decltype(selector),
+                                                         impl_details::vertical_traversal_t>(
+          handle,  // required to prevent clang-format to separate function name from its namespace
+          graph_view,
+          d_start_view,
+          max_depth,
+          selector);
+
+        // check results:
+        //
+        bool test_all_paths = cugraph::test::host_check_rw_paths(handle,
+                                                                 graph_view,
+                                                                 std::get<0>(ret_tuple),
+                                                                 std::get<1>(ret_tuple),
+                                                                 std::get<2>(ret_tuple));
+
+        if (!test_all_paths)
+          std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n';
+
+        ASSERT_TRUE(test_all_paths);
+      } else {
+        impl_details::node2vec_selector_t<graph_vt, real_t> selector{
+          handle, graph_view, real_t{0}, p, q, num_paths};
+
         auto ret_tuple = impl_details::random_walks_impl<graph_vt,
                                                          decltype(selector),
                                                          impl_details::vertical_traversal_t>(
@@ -211,7 +272,7 @@ INSTANTIATE_TEST_SUITE_P(
   simple_test,
   Tests_RandomWalks,
   ::testing::Combine(::testing::Values(traversal_id_t::HORIZONTAL, traversal_id_t::VERTICAL),
-                     ::testing::Values(int{0}, int{1}),
+                     ::testing::Values(int{0}, int{1}, int{2}),
                      ::testing::Values(RandomWalks_Usecase("test/datasets/karate.mtx", true),
                                        RandomWalks_Usecase("test/datasets/web-Google.mtx", true),
                                        RandomWalks_Usecase("test/datasets/ljournal-2008.mtx", true),
diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
index 3711fb3f98f..224a6283657 100644
--- a/cpp/tests/sampling/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -37,7 +37,9 @@
 #include <iostream>
 #include <iterator>
 #include <limits>
+#include <map>
 #include <numeric>
+#include <optional>
 #include <utilities/high_res_timer.hpp>
 #include <vector>
 
@@ -84,6 +86,90 @@ void next_biased(raft::handle_t const& handle,
                     });
 }
 
+// Test helper: simulates a max_depth == 1 traversal of num_paths = d_src_v.size() paths; path i
+// steps from d_src_v[i] using d_rnd[i] and predecessor d_prev_v[i]. d_next_v[i] receives the first
+// element of the sampler's result, or d_src_v[i] if there is no predecessor or no result.
+template <typename vertex_t, typename real_t, typename selector_t>
+void next_node2vec(raft::handle_t const& handle,
+                   vector_test_t<vertex_t> const& d_src_v,
+                   vector_test_t<thrust::optional<vertex_t>> const& d_prev_v,
+                   vector_test_t<real_t> const& d_rnd,
+                   vector_test_t<vertex_t>& d_next_v,
+                   selector_t const& selector)
+{
+  size_t num_paths{d_src_v.size()};
+  auto begin = thrust::make_zip_iterator(thrust::make_tuple(
+    d_src_v.begin(), d_prev_v.begin(), thrust::make_counting_iterator<size_t>(0)));
+  auto end   = thrust::make_zip_iterator(thrust::make_tuple(
+    d_src_v.end(), d_prev_v.end(), thrust::make_counting_iterator<size_t>(num_paths)));
+  // zipped (src, optional prev, path index) is combined element-wise with d_rnd into d_next_v:
+  thrust::transform(handle.get_thrust_policy(),
+                    begin,
+                    end,
+                    d_rnd.begin(),
+                    d_next_v.begin(),
+                    [sampler = selector.get_strategy()] __device__(auto tpl, auto rnd_val) {
+                      vertex_t src_v = thrust::get<0>(tpl);
+
+                      size_t path_index = thrust::get<2>(tpl);
+                      // a sampling step is taken only when a predecessor vertex exists:
+                      if (thrust::get<1>(tpl) != thrust::nullopt) {
+                        vertex_t prev_v = *thrust::get<1>(tpl);
+                        // trailing `false`: presumably the start-of-path flag; TODO confirm
+                        auto next_vw = sampler(src_v, rnd_val, prev_v, path_index, false);
+                        return (next_vw.has_value() ? thrust::get<0>(*next_vw) : src_v);
+                      } else {
+                        return src_v;
+                      }
+                    });
+}
+// Host reference: scales in place the weight of every edge leaving v_crt[i] by its node2vec alpha.
+template <typename vertex_t, typename edge_t, typename weight_t>
+void alpha_node2vec(std::vector<edge_t> const& row_offsets,
+                    std::vector<vertex_t> const& col_indices,
+                    std::vector<weight_t>& weights,  // to be scaled!
+                    std::vector<thrust::optional<vertex_t>> const& v_pred,
+                    std::vector<vertex_t> const& v_crt,
+                    weight_t p,
+                    weight_t q)
+{
+  auto num_vs = v_crt.size();
+  for (size_t indx = 0; indx < num_vs; ++indx) {
+    auto src_v = v_crt[indx];
+    // NOTE: scaling is in place per entry, so a vertex repeated in v_crt is scaled repeatedly.
+    size_t num_neighbors = row_offsets[src_v + 1] - row_offsets[src_v];
+
+    if (num_neighbors == 0) { continue; }  // no outgoing edges: nothing to scale
+    // entries without a predecessor leave their weights untouched:
+    if (v_pred[indx].has_value()) {
+      auto pred_v = *(v_pred[indx]);
+
+      for (auto offset_indx = row_offsets[src_v]; offset_indx < row_offsets[src_v + 1];
+           ++offset_indx) {
+        auto next_v = col_indices[offset_indx];
+        // alpha: 1/p if next_v is pred_v; 1 if next_v is in pred_v's row of col_indices; else 1/q
+        weight_t alpha{0};
+
+        if (next_v == pred_v) {
+          alpha = 1.0 / p;
+        } else {
+          auto begin    = col_indices.begin() + row_offsets[pred_v];
+          auto end      = col_indices.begin() + row_offsets[pred_v + 1];
+          auto it_found = std::find(begin, end, next_v);
+
+          if (it_found != end) {
+            alpha = 1.0;
+          } else {
+            alpha = 1.0 / q;
+          }
+        }
+
+        weights[offset_indx] *= alpha;  // scale weights
+      }
+    }
+  }
+}
+
 }  // namespace
 
 // FIXME (per rlratzel request):
@@ -1217,3 +1303,272 @@ TEST(BiasedRandomWalks, SelectorSmallGraph)
 
   EXPECT_EQ(v_next_v, h_next_v);
 }
+// Verifies one `node2vec` step against a biased step on a copy of the graph with scaled weights.
+TEST(Node2VecRandomWalks, Node2VecSmallGraph)
+{
+  namespace topo = cugraph::topology;  // NOTE(review): alias is not used in this test
+
+  raft::handle_t handle{};
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;  // NOTE(review): not used in this test
+  using real_t   = weight_t;
+
+  weight_t p = 2.0;  // alpha = 1/p when the next vertex is the predecessor (see alpha_node2vec)
+  weight_t q = 4.0;  // alpha = 1/q when the next vertex is not in the predecessor's row
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  // Step 1: graph construction:
+  // NOTE: the sketch's edge labels are not the weights used here; v_w sets every weight to 1.0.
+  /*
+    0 --(.1)--> 1 --(1.1)--> 4
+   /|\       /\ |            |
+    |       /   |            |
+   (5.1) (3.1)(2.1)        (3.2)
+    |   /       |            |
+    | /        \|/          \|/
+    2 --(4.1)-->3 --(7.2)--> 5
+   */
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w(num_edges, 1.0);  //{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+
+  auto graph = cugraph::test::make_graph(
+    handle, v_src, v_dst, std::optional<std::vector<weight_t>>{v_w}, num_vertices, num_edges);
+  // one entry per simulated path: random number, current vertex and predecessor vertex:
+  std::vector<real_t> v_rnd{0.2, 0.5, 1.0, 0.1, 0.8};
+  std::vector<vertex_t> v_src_v{0, 1, 3, 4, 5};
+  std::vector<thrust::optional<vertex_t>> v_pred_v{2, 0, 1, 1, 4};
+
+  vector_test_t<real_t> d_rnd(v_rnd.size(), handle.get_stream());
+  vector_test_t<vertex_t> d_src_v(v_src_v.size(), handle.get_stream());
+
+  EXPECT_EQ(d_rnd.size(), d_src_v.size());
+
+  raft::update_device(d_rnd.data(), v_rnd.data(), d_rnd.size(), handle.get_stream());
+  raft::update_device(d_src_v.data(), v_src_v.data(), d_src_v.size(), handle.get_stream());
+
+  auto graph_view = graph.view();
+  // raw offsets / indices / weights pointers of the graph's matrix partition:
+  edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets();
+
+  vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices();
+  // NOTE(review): `values` is not read anywhere below.
+  weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights());
+
+  // Step 2: `node2vec` selection on original graph:
+  // (no num_paths argument here; presumably this selects the non-cached variant; TODO confirm)
+  cugraph::detail::node2vec_selector_t n2v_selector{handle, graph_view, 0.0f, p, q};
+
+  vector_test_t<thrust::optional<vertex_t>> d_pred_v(v_pred_v.size(), handle.get_stream());
+
+  raft::update_device(d_pred_v.data(), v_pred_v.data(), v_pred_v.size(), handle.get_stream());
+
+  vector_test_t<vertex_t> d_next_v(v_src_v.size(), handle.get_stream());
+
+  // `node2vec` stepping:
+  // (one step per entry of d_src_v; picks are written to d_next_v)
+  next_node2vec(handle, d_src_v, d_pred_v, d_rnd, d_next_v, n2v_selector);
+
+  std::vector<vertex_t> n2v_next_v(v_src_v.size());
+  raft::update_host(n2v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+  // NOTE(review): copy is issued on handle's stream; confirm it completed before Step 5 reads it.
+  EXPECT_EQ(n2v_next_v.size(), d_src_v.size());
+
+  // Step 3: construct similar graph, just with
+  //         alpha scaled weights;
+  // (the host copies of offsets / indices are first checked against the expected CSR layout)
+  std::vector<weight_t> scaled_weights(v_w);
+  std::vector<edge_t> row_offsets(num_vertices + 1);
+  std::vector<vertex_t> col_indices(num_edges);
+
+  raft::update_host(
+    row_offsets.data(), offsets, static_cast<size_t>(num_vertices + 1), handle.get_stream());
+
+  raft::update_host(
+    col_indices.data(), indices, static_cast<size_t>(num_edges), handle.get_stream());
+  // expected CSR arrays for the 8 edges listed in v_src / v_dst:
+  std::vector<edge_t> v_ro{0, 1, 3, 6, 7, 8, 8};
+  std::vector<vertex_t> v_ci{1, 3, 4, 0, 1, 3, 5, 5};
+
+  EXPECT_EQ(row_offsets, v_ro);
+  EXPECT_EQ(col_indices, v_ci);
+  EXPECT_EQ(scaled_weights.size(), static_cast<size_t>(num_edges));
+  // apply the reference alpha scaling on the host:
+  alpha_node2vec(row_offsets, col_indices, scaled_weights, v_pred_v, v_src_v, p, q);
+
+  auto scaled_graph =
+    cugraph::test::make_graph(handle,
+                              v_src,
+                              v_dst,
+                              std::optional<std::vector<weight_t>>{scaled_weights},
+                              num_vertices,
+                              num_edges);
+
+  auto scaled_graph_view = scaled_graph.view();
+
+  // Step 4: biased selection on alpha scaled graph:
+  // (reuses d_src_v / d_rnd; d_next_v is overwritten with the biased picks)
+  cugraph::detail::biased_selector_t selector{handle, scaled_graph_view, 0.0f};
+
+  next_biased(handle, d_src_v, d_rnd, d_next_v, selector);
+
+  std::vector<vertex_t> biased_next_v(v_src_v.size());
+  raft::update_host(biased_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+
+  // Step 5: compare `node2vec` on original graph
+  //         with biased on graph with alpha scaled weights:
+  // (the two host vectors must be equal element by element)
+  EXPECT_EQ(biased_next_v, n2v_next_v);
+}
+// Same check as Node2VecSmallGraph, but the selector is built with num_paths (cached alpha).
+TEST(Node2VecRandomWalks, CachedNode2VecSmallGraph)
+{
+  namespace topo = cugraph::topology;  // NOTE(review): alias is not used in this test
+
+  raft::handle_t handle{};
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;  // NOTE(review): not used in this test
+  using real_t   = weight_t;
+
+  weight_t p = 2.0;  // alpha = 1/p when the next vertex is the predecessor (see alpha_node2vec)
+  weight_t q = 4.0;  // alpha = 1/q when the next vertex is not in the predecessor's row
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  // Step 1: graph construction:
+  // NOTE: the sketch's edge labels are not the weights used here; v_w sets every weight to 1.0.
+  /*
+    0 --(.1)--> 1 --(1.1)--> 4
+   /|\       /\ |            |
+    |       /   |            |
+   (5.1) (3.1)(2.1)        (3.2)
+    |   /       |            |
+    | /        \|/          \|/
+    2 --(4.1)-->3 --(7.2)--> 5
+   */
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w(num_edges, 1.0);  //{0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f};
+
+  auto graph = cugraph::test::make_graph(
+    handle, v_src, v_dst, std::optional<std::vector<weight_t>>{v_w}, num_vertices, num_edges);
+  // one entry per simulated path: random number, current vertex and predecessor vertex:
+  std::vector<real_t> v_rnd{0.2, 0.5, 1.0, 0.1, 0.8};
+  std::vector<vertex_t> v_src_v{0, 1, 3, 4, 5};
+  std::vector<thrust::optional<vertex_t>> v_pred_v{2, 0, 1, 1, 4};
+
+  vector_test_t<real_t> d_rnd(v_rnd.size(), handle.get_stream());
+  vector_test_t<vertex_t> d_src_v(v_src_v.size(), handle.get_stream());
+
+  EXPECT_EQ(d_rnd.size(), d_src_v.size());
+
+  raft::update_device(d_rnd.data(), v_rnd.data(), d_rnd.size(), handle.get_stream());
+  raft::update_device(d_src_v.data(), v_src_v.data(), d_src_v.size(), handle.get_stream());
+
+  auto graph_view = graph.view();
+  // raw offsets / indices / weights pointers of the graph's matrix partition:
+  edge_t const* offsets = graph_view.get_matrix_partition_view().get_offsets();
+
+  vertex_t const* indices = graph_view.get_matrix_partition_view().get_indices();
+  // NOTE(review): `values` is not read anywhere below.
+  weight_t const* values = *(graph_view.get_matrix_partition_view().get_weights());
+
+  // Step 2: `node2vec` selection on original graph:
+  //
+  // CAVEAT: next_node2vec() steps all entries in parallel, so it simulates
+  //         traversing multiple paths (each of size max_depth == 1);
+  //         the selector is therefore given num_paths; if this is ignored,
+  //         the paths create a data race on the cached alpha buffer!
+  //
+  edge_t num_paths(d_src_v.size());
+  cugraph::detail::node2vec_selector_t n2v_selector{
+    handle, graph_view, 0.0f, p, q, num_paths};  // use cached approach
+  // the cache is expected to hold max_degree * num_paths entries:
+  auto const& d_cached_alpha = n2v_selector.get_alpha_cache();
+  // largest number of outgoing edges in this graph is 3 (vertex 2: row offsets 3..6):
+  size_t expected_max_degree{3};
+  EXPECT_EQ(d_cached_alpha.size(), expected_max_degree * num_paths);
+  // the strategy must expose the same buffer as (max_degree, num_paths, data pointer):
+  auto&& coalesced_alpha = n2v_selector.get_strategy().get_alpha_buffer();
+
+  ASSERT_TRUE(coalesced_alpha != thrust::nullopt);
+
+  EXPECT_EQ(static_cast<size_t>(thrust::get<0>(*coalesced_alpha)), expected_max_degree);
+  EXPECT_EQ(thrust::get<1>(*coalesced_alpha), num_paths);
+  EXPECT_EQ(thrust::get<2>(*coalesced_alpha), d_cached_alpha.data());
+
+  vector_test_t<thrust::optional<vertex_t>> d_pred_v(v_pred_v.size(), handle.get_stream());
+
+  raft::update_device(d_pred_v.data(), v_pred_v.data(), v_pred_v.size(), handle.get_stream());
+
+  vector_test_t<vertex_t> d_next_v(v_src_v.size(), handle.get_stream());
+
+  // `node2vec` stepping:
+  //
+  // CAVEAT: next_node2vec() steps all entries in parallel, so it simulates
+  //         traversing multiple paths (each of size max_depth == 1);
+  //         this is why the selector above was built with num_paths; if ignored,
+  //         the paths create a data race on the cached alpha buffer!
+  //
+  next_node2vec(handle, d_src_v, d_pred_v, d_rnd, d_next_v, n2v_selector);
+
+  std::vector<vertex_t> n2v_next_v(v_src_v.size());
+  raft::update_host(n2v_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+  // NOTE(review): copy is issued on handle's stream; confirm it completed before Step 5 reads it.
+  EXPECT_EQ(n2v_next_v.size(), d_src_v.size());
+
+  // Step 3: construct similar graph, just with
+  //         alpha scaled weights;
+  // (the host copies of offsets / indices are first checked against the expected CSR layout)
+  std::vector<weight_t> scaled_weights(v_w);
+  std::vector<edge_t> row_offsets(num_vertices + 1);
+  std::vector<vertex_t> col_indices(num_edges);
+
+  raft::update_host(
+    row_offsets.data(), offsets, static_cast<size_t>(num_vertices + 1), handle.get_stream());
+
+  raft::update_host(
+    col_indices.data(), indices, static_cast<size_t>(num_edges), handle.get_stream());
+  // expected CSR arrays for the 8 edges listed in v_src / v_dst:
+  std::vector<edge_t> v_ro{0, 1, 3, 6, 7, 8, 8};
+  std::vector<vertex_t> v_ci{1, 3, 4, 0, 1, 3, 5, 5};
+
+  EXPECT_EQ(row_offsets, v_ro);
+  EXPECT_EQ(col_indices, v_ci);
+  EXPECT_EQ(scaled_weights.size(), static_cast<size_t>(num_edges));
+  // apply the reference alpha scaling on the host:
+  alpha_node2vec(row_offsets, col_indices, scaled_weights, v_pred_v, v_src_v, p, q);
+
+  auto scaled_graph =
+    cugraph::test::make_graph(handle,
+                              v_src,
+                              v_dst,
+                              std::optional<std::vector<weight_t>>{scaled_weights},
+                              num_vertices,
+                              num_edges);
+
+  auto scaled_graph_view = scaled_graph.view();
+
+  // Step 4: biased selection on alpha scaled graph:
+  // (reuses d_src_v / d_rnd; d_next_v is overwritten with the biased picks)
+  cugraph::detail::biased_selector_t selector{handle, scaled_graph_view, 0.0f};
+
+  next_biased(handle, d_src_v, d_rnd, d_next_v, selector);
+
+  std::vector<vertex_t> biased_next_v(v_src_v.size());
+  raft::update_host(biased_next_v.data(), d_next_v.data(), v_src_v.size(), handle.get_stream());
+
+  // Step 5: compare `node2vec` on original graph
+  //         with biased on graph with alpha scaled weights:
+  // (the two host vectors must be equal element by element)
+  EXPECT_EQ(biased_next_v, n2v_next_v);
+}