Skip to content

Commit addb9d7

Browse files
authored
Refactor MG neighborhood sampling and add SG implementation (#2285)
A new implementation of neighborhood sampling meeting the new C API was merged. This PR 1. Moves the old implementation to the dir `proto` 2. Refactor MG neighborhood sampling bindings based on the new implementation 3. Add bindings for SG neighborhood sampling 4. Refactor tests DO NOT MERGE YET: PENDING fix for missing vertices in `start_list` closes #2272 Authors: - Joseph Nke (https://github.com/jnke2016) - Chuck Hastings (https://github.com/ChuckHastings) Approvers: - Chuck Hastings (https://github.com/ChuckHastings) - Seunghwa Kang (https://github.com/seunghwak) - https://github.com/betochimas - Rick Ratzel (https://github.com/rlratzel) URL: #2285
1 parent 5d147ef commit addb9d7

File tree

19 files changed

+698
-236
lines changed

19 files changed

+698
-236
lines changed

python/cugraph/cugraph/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,13 @@
109109
from raft import raft_include_test
110110
from cugraph.dask.comms import comms
111111

112-
from cugraph.sampling import random_walks, rw_path, node2vec
112+
from cugraph.sampling import (
113+
random_walks,
114+
rw_path,
115+
node2vec,
116+
uniform_neighbor_sample,
117+
)
118+
113119

114120
from cugraph import experimental
115121

python/cugraph/cugraph/dask/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,5 @@
2020
from .community.triangle_count import triangle_count
2121
from .centrality.katz_centrality import katz_centrality
2222
from .components.connectivity import weakly_connected_components
23+
from .sampling.uniform_neighbor_sample import uniform_neighbor_sample
2324
from .centrality.eigenvector_centrality import eigenvector_centrality

python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py renamed to python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py

Lines changed: 46 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
import dask_cudf
1919
import cudf
2020

21-
from pylibcugraph.experimental import (MGGraph,
22-
ResourceHandle,
23-
GraphProperties,
24-
uniform_neighborhood_sampling,
25-
)
21+
from pylibcugraph import (ResourceHandle,
22+
GraphProperties,
23+
MGGraph
24+
)
25+
26+
from pylibcugraph import \
27+
uniform_neighbor_sample as pylibcugraph_uniform_neighbor_sample
28+
2629
from cugraph.dask.common.input_utils import get_distributed_data
2730
from cugraph.dask.comms import comms as Comms
2831

@@ -34,7 +37,6 @@ def call_nbr_sampling(sID,
3437
num_edges,
3538
do_expensive_check,
3639
start_list,
37-
info_list,
3840
h_fan_out,
3941
with_replacement):
4042

@@ -59,36 +61,38 @@ def call_nbr_sampling(sID,
5961
num_edges,
6062
do_expensive_check)
6163

62-
ret_val = uniform_neighborhood_sampling(handle,
63-
mg,
64-
start_list,
65-
info_list,
66-
h_fan_out,
67-
with_replacement,
68-
do_expensive_check)
64+
ret_val = pylibcugraph_uniform_neighbor_sample(handle,
65+
mg,
66+
start_list,
67+
h_fan_out,
68+
with_replacement,
69+
do_expensive_check)
6970
return ret_val
7071

7172

72-
def convert_to_cudf(cp_arrays):
73+
def convert_to_cudf(cp_arrays, weight_t):
7374
"""
7475
Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper
7576
"""
76-
cupy_sources, cupy_destinations, cupy_labels, cupy_indices = cp_arrays
77-
# cupy_sources, cupy_destinations, cupy_labels, cupy_indices,
78-
# cupy_counts = cp_arrays
77+
cupy_sources, cupy_destinations, cupy_indices = cp_arrays
78+
7979
df = cudf.DataFrame()
8080
df["sources"] = cupy_sources
8181
df["destinations"] = cupy_destinations
82-
df["labels"] = cupy_labels
8382
df["indices"] = cupy_indices
84-
# df["counts"] = cupy_counts
83+
84+
if weight_t == "int32":
85+
df.indices = df.indices.astype("int32")
86+
elif weight_t == "int64":
87+
df.indices = df.indices.astype("int64")
88+
8589
return df
8690

8791

88-
def EXPERIMENTAL__uniform_neighborhood(input_graph,
89-
start_info_list,
90-
fanout_vals,
91-
with_replacement=True):
92+
def uniform_neighbor_sample(input_graph,
93+
start_list,
94+
fanout_vals,
95+
with_replacement=True):
9296
"""
9397
Does neighborhood sampling, which samples nodes from a graph based on the
9498
current node's neighbors, with a corresponding fanout value at each hop.
@@ -99,10 +103,8 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
99103
cuGraph graph, which contains connectivity information as dask cudf
100104
edge list dataframe
101105
102-
start_info_list : tuple of list or cudf.Series (int32)
103-
Tuple of a list of starting vertices for sampling, along with a
104-
corresponding list of label for reorganizing results after sending
105-
the input to different callers.
106+
start_list : list or cudf.Series (int32)
107+
a list of starting vertices for sampling
106108
107109
fanout_vals : list (int32)
108110
List of branching out (fan-out) degrees per starting vertex for each
@@ -111,6 +113,7 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
111113
with_replacement: bool, optional (default=True)
112114
Flag to specify if the random sampling is done with replacement
113115
116+
114117
Returns
115118
-------
116119
result : dask_cudf.DataFrame
@@ -120,8 +123,6 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
120123
Contains the source vertices from the sampling result
121124
ddf['destinations']: dask_cudf.Series
122125
Contains the destination vertices from the sampling result
123-
ddf['labels']: dask_cudf.Series
124-
Contains the start labels from the sampling result
125126
ddf['indices']: dask_cudf.Series
126127
Contains the indices from the sampling result for path
127128
reconstruction
@@ -135,18 +136,15 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
135136
input_graph.compute_renumber_edge_list(
136137
transposed=False, legacy_renum_only=True)
137138

138-
start_list, info_list = start_info_list
139+
if isinstance(start_list, int):
140+
start_list = [start_list]
139141

140142
if isinstance(start_list, list):
141143
start_list = cudf.Series(start_list)
142-
if start_list.dtype != 'int32':
144+
if start_list.dtype != "int32":
143145
raise ValueError(f"'start_list' must have int32 values, "
144146
f"got: {start_list.dtype}")
145-
if isinstance(info_list, list):
146-
info_list = cudf.Series(info_list)
147-
if info_list.dtype != 'int32':
148-
raise ValueError(f"'info_list' must have int32 values, "
149-
f"got: {info_list.dtype}")
147+
150148
# fanout_vals must be a host array!
151149
# FIXME: ensure other sequence types (eg. cudf Series) can be handled.
152150
if isinstance(fanout_vals, list):
@@ -156,12 +154,18 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
156154
f"got: {type(fanout_vals)}")
157155

158156
ddf = input_graph.edgelist.edgelist_df
159-
num_edges = len(ddf)
160-
data = get_distributed_data(ddf)
161-
162157
src_col_name = input_graph.renumber_map.renumbered_src_col_name
163158
dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
164159

160+
weight_t = ddf["value"].dtype
161+
if weight_t == "int32":
162+
ddf = ddf.astype({"value": "float32"})
163+
elif weight_t == "int64":
164+
ddf = ddf.astype({"value": "float64"})
165+
166+
num_edges = len(ddf)
167+
data = get_distributed_data(ddf)
168+
165169
# start_list uses "external" vertex IDs, but if the graph has been
166170
# renumbered, the start vertex IDs must also be renumbered.
167171
if input_graph.renumbered:
@@ -177,7 +181,6 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
177181
num_edges,
178182
do_expensive_check,
179183
start_list,
180-
info_list,
181184
fanout_vals,
182185
with_replacement,
183186
workers=[wf[0]])
@@ -186,14 +189,14 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
186189
wait(result)
187190

188191
cudf_result = [client.submit(convert_to_cudf,
189-
cp_arrays)
192+
cp_arrays, weight_t)
190193
for cp_arrays in result]
191194

192195
wait(cudf_result)
193196

194197
ddf = dask_cudf.from_delayed(cudf_result)
195198
if input_graph.renumbered:
196-
ddf = input_graph.unrenumber(ddf, "sources")
197-
ddf = input_graph.unrenumber(ddf, "destinations")
199+
ddf = input_graph.unrenumber(ddf, "sources", preserve_order=True)
200+
ddf = input_graph.unrenumber(ddf, "destinations", preserve_order=True)
198201

199202
return ddf

python/cugraph/cugraph/experimental/dask/__init__.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,3 @@
1010
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1111
# See the License for the specific language governing permissions and
1212
# limitations under the License.
13-
14-
from cugraph.utilities.api_tools import experimental_warning_wrapper
15-
16-
from cugraph.dask.sampling.neighborhood_sampling import \
17-
EXPERIMENTAL__uniform_neighborhood
18-
uniform_neighborhood_sampling = \
19-
experimental_warning_wrapper(EXPERIMENTAL__uniform_neighborhood)

python/cugraph/cugraph/proto/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019-2021, NVIDIA CORPORATION.
1+
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at

python/cugraph/cugraph/sampling/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,5 @@
1313

1414
from cugraph.sampling.random_walks import random_walks, rw_path
1515
from cugraph.sampling.node2vec import node2vec
16+
from cugraph.sampling.uniform_neighbor_sample import \
17+
uniform_neighbor_sample
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
from pylibcugraph import (ResourceHandle,
15+
GraphProperties,
16+
SGGraph,
17+
)
18+
from pylibcugraph import uniform_neighbor_sample as \
19+
pylibcugraph_uniform_neighbor_sample
20+
21+
import numpy
22+
23+
import cudf
24+
25+
26+
def uniform_neighbor_sample(G,
27+
start_list,
28+
fanout_vals,
29+
with_replacement=True,
30+
is_edge_ids=False):
31+
"""
32+
Does neighborhood sampling, which samples nodes from a graph based on the
33+
current node's neighbors, with a corresponding fanout value at each hop.
34+
35+
Parameters
36+
----------
37+
G : cugraph.Graph
38+
cuGraph graph, which contains connectivity information as dask cudf
39+
edge list dataframe
40+
41+
start_list : list or cudf.Series (int32)
42+
a list of starting vertices for sampling
43+
44+
fanout_vals : list (int32)
45+
List of branching out (fan-out) degrees per starting vertex for each
46+
hop level.
47+
48+
with_replacement: bool, optional (default=True)
49+
Flag to specify if the random sampling is done with replacement
50+
51+
Returns
52+
-------
53+
result : cudf.DataFrame
54+
GPU data frame containing two cudf.Series
55+
56+
df['sources']: cudf.Series
57+
Contains the source vertices from the sampling result
58+
df['destinations']: cudf.Series
59+
Contains the destination vertices from the sampling result
60+
df['indices']: cudf.Series
61+
Contains the indices from the sampling result for path
62+
reconstruction
63+
"""
64+
65+
if isinstance(start_list, int):
66+
start_list = [start_list]
67+
68+
if isinstance(start_list, list):
69+
start_list = cudf.Series(start_list, dtype="int32")
70+
if start_list.dtype != "int32":
71+
raise ValueError(f"'start_list' must have int32 values, "
72+
f"got: {start_list.dtype}")
73+
74+
# fanout_vals must be a host array!
75+
# FIXME: ensure other sequence types (eg. cudf Series) can be handled.
76+
if isinstance(fanout_vals, list):
77+
fanout_vals = numpy.asarray(fanout_vals, dtype="int32")
78+
else:
79+
raise TypeError("fanout_vals must be a list, "
80+
f"got: {type(fanout_vals)}")
81+
82+
if G.renumbered is True:
83+
if isinstance(start_list, cudf.DataFrame):
84+
start_list = G.lookup_internal_vertex_id(
85+
start_list, start_list.columns)
86+
else:
87+
start_list = G.lookup_internal_vertex_id(start_list)
88+
89+
srcs = G.edgelist.edgelist_df['src']
90+
dsts = G.edgelist.edgelist_df['dst']
91+
weights = G.edgelist.edgelist_df['weights']
92+
weight_t = weights.dtype
93+
94+
if weight_t == "int32":
95+
weights = weights.astype("float32")
96+
if weight_t == "int64":
97+
weights = weights.astype("float64")
98+
99+
if srcs.dtype != 'int32':
100+
raise ValueError(f"Graph vertices must have int32 values, "
101+
f"got: {srcs.dtype}")
102+
103+
resource_handle = ResourceHandle()
104+
graph_props = GraphProperties(is_multigraph=G.is_multigraph())
105+
store_transposed = False
106+
renumber = False
107+
do_expensive_check = False
108+
109+
sg = SGGraph(resource_handle, graph_props, srcs, dsts, weights,
110+
store_transposed, renumber, do_expensive_check)
111+
112+
sources, destinations, indices = \
113+
pylibcugraph_uniform_neighbor_sample(resource_handle, sg, start_list,
114+
fanout_vals, with_replacement,
115+
do_expensive_check)
116+
117+
df = cudf.DataFrame()
118+
df["sources"] = sources
119+
df["destinations"] = destinations
120+
df["indices"] = indices
121+
if weight_t == "int32":
122+
df["indices"] = indices.astype("int32")
123+
elif weight_t == "int64":
124+
df["indices"] = indices.astype("int64")
125+
else:
126+
df["indices"] = indices
127+
128+
if G.renumbered:
129+
df = G.unrenumber(df, "sources", preserve_order=True)
130+
df = G.unrenumber(df, "destinations", preserve_order=True)
131+
132+
return df

python/cugraph/cugraph/structure/graph_classes.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,8 @@ def from_cudf_edgelist(
9898
source="source",
9999
destination="destination",
100100
edge_attr=None,
101-
renumber=True
101+
renumber=True,
102+
legacy_renum_only=False
102103
):
103104
"""
104105
Initialize a graph from the edge list. It is an error to call this
@@ -150,11 +151,13 @@ def from_cudf_edgelist(
150151
elif (self._Impl.edgelist is not None or
151152
self._Impl.adjlist is not None):
152153
raise RuntimeError("Graph already has values")
153-
self._Impl._simpleGraphImpl__from_edgelist(input_df,
154-
source=source,
155-
destination=destination,
156-
edge_attr=edge_attr,
157-
renumber=renumber)
154+
self._Impl._simpleGraphImpl__from_edgelist(
155+
input_df,
156+
source=source,
157+
destination=destination,
158+
edge_attr=edge_attr,
159+
renumber=renumber,
160+
legacy_renum_only=legacy_renum_only)
158161

159162
def from_cudf_adjlist(self, offset_col, index_col, value_col=None):
160163
"""

0 commit comments

Comments
 (0)