Fix Fanout -1 (#2358)

VibhuJawa · web-flow · commit df82b7498709 · 2022-06-16T20:46:52.000Z
This PR fixes sampling for the default fanout value of -1, which is used by the call here: https://github.com/rapidsai/cugraph/blob/92f6ba451b2d9e6c3f60dbccfa05bbf3c480e43a/python/cugraph/cugraph/gnn/graph_store.py#L129-L133 In workflows, this call is made during inference, so it must work for inference to work with CuGraphStorage. Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Alex Barghi (https://github.com/alexbarghi-nv) URL: #2358
diff --git a/python/cugraph/cugraph/gnn/graph_store.py b/python/cugraph/cugraph/gnn/graph_store.py
@@ -85,6 +85,8 @@ def sample_neighbors(self,
             Node IDs to sample neighbors from.
         fanout : int
             The number of edges to be sampled for each node on each edge type.
+            If -1 is given, all the neighboring edges for each node on
+            each edge type will be selected.
         edge_dir : str {"in" or "out"}
             Determines whether to sample inbound or outbound edges.
             Can take either in for inbound edges or out for outbound edges.
diff --git a/python/cugraph/cugraph/tests/test_graph_store.py b/python/cugraph/cugraph/tests/test_graph_store.py
@@ -164,6 +164,29 @@ def test_sample_neighbors(graph_file):
     assert len(parents_list) > 0
 
 
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_sample_neighbor_neg_one_fanout(graph_file):
+    cu_M = utils.read_csv_file(graph_file)
+
+    g = cugraph.Graph(directed=True)
+    g.from_cudf_edgelist(cu_M, source='0', destination='1', renumber=True)
+
+    pg = PropertyGraph()
+    pg.add_edge_data(cu_M,
+                     type_name="edge",
+                     vertex_col_names=("0", "1"),
+                     property_columns=["2"])
+
+    gstore = cugraph.gnn.CuGraphStore(graph=pg)
+
+    nodes = gstore.get_vertex_ids()
+    sampled_nodes = nodes[:5]
+    # fanout of -1 (the default) selects all neighboring edges
+    parents_list, children_list = gstore.sample_neighbors(sampled_nodes, -1)
+
+    assert len(parents_list) > 0
+
+
 @pytest.mark.parametrize("graph_file", utils.DATASETS)
 def test_n_data(graph_file):
     cu_M = utils.read_csv_file(graph_file)
diff --git a/python/cugraph/cugraph/utilities/utils.py b/python/cugraph/cugraph/utilities/utils.py
@@ -479,14 +479,17 @@ def create_random_bipartite(v1, v2, size, dtype):
 
 
 def sample_groups(df, by, n_samples):
-    # Sample n_samples in the df frm by column
+    # Sample n_samples in the df using the by column
 
     # Step 1
     # first, shuffle the dataframe and reset its index,
     # so that the ordering of values within each group
     # is made random:
     df = df.sample(frac=1).reset_index(drop=True)
 
+    # n_samples == -1 means keep every row: return the shuffled df as-is
+    if n_samples == -1:
+        return df
     # Step 2
     # add an integer-encoded version of the "by" column,
     # since the rank aggregation seems not to work for