Skip to content

Commit 50ba399

Browse files
authored
Fix out of index errors encountered with sampling on out of index samples (#2825)
THIS PR does the following - [x] Ensure we don't sample on out-of-range values Issue: #2828 - [x] Add tests for the sampling error - [x] Ensure all the DGL examples here pass https://github.com/rapidsai/dgl/blob/6ece904c69687adcd35a5ea41d1f5ca4ea01c0e2/examples/cugraph-pytorch/cugraph-local/rgcn-hetero/README.MD - [x] Factor out the non-class-specific utilities in preparation for the DGL graph service class Authors: - Vibhu Jawa (https://github.com/VibhuJawa) Approvers: - Rick Ratzel (https://github.com/rlratzel) - Joseph Nke (https://github.com/jnke2016) URL: #2825
1 parent 74ead42 commit 50ba399

File tree

9 files changed

+468
-273
lines changed

9 files changed

+468
-273
lines changed

python/cugraph/cugraph/gnn/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@
1212
# limitations under the License.
1313

1414
from .graph_store import CuGraphStore
15-
from .graph_store import CuFeatureStorage
15+
from .dgl_extensions.feature_storage import CuFeatureStorage

python/cugraph/cugraph/gnn/dgl_extensions/__init__.py

Whitespace-only changes.
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
15+
import cudf
16+
import dask_cudf
17+
import cupy as cp
18+
from cugraph.experimental import MGPropertyGraph
19+
20+
21+
class CuFeatureStorage:
    """
    Storage for node/edge feature data backed by a cuGraph PropertyGraph.

    Exposes the columns of a (MG)PropertyGraph through a ``fetch`` call
    that returns a backend tensor (PyTorch / TensorFlow / CuPy).
    """

    def __init__(
        self, pg, columns, storage_type, backend_lib="torch", indices_offset=0
    ):
        # pg: PropertyGraph or MGPropertyGraph holding the feature columns.
        self.pg = pg
        # columns: list of column names that make up this feature.
        self.columns = columns

        # Resolve the backend's DLPack converter up front so fetch() is
        # backend-agnostic.
        if backend_lib == "torch":
            from torch.utils.dlpack import from_dlpack
        elif backend_lib == "tf":
            from tensorflow.experimental.dlpack import from_dlpack
        elif backend_lib == "cupy":
            from cupy import from_dlpack
        else:
            raise NotImplementedError(
                f"Only PyTorch ('torch'), TensorFlow ('tf'), and CuPy ('cupy') "
                f"backends are currently supported, got {backend_lib=}"
            )
        if storage_type not in ["edge", "node"]:
            raise NotImplementedError("Only edge and node storage is supported")

        self.storage_type = storage_type
        self.from_dlpack = from_dlpack
        # Offset added to every requested id before the PropertyGraph lookup.
        self.indices_offset = indices_offset

    def fetch(self, indices, device=None, pin_memory=False, **kwargs):
        """Fetch the features of the given node/edge IDs to the
        given device.

        Parameters
        ----------
        indices : Tensor
            Node or edge IDs.
        device : Device
            Device context.
        pin_memory :

        Returns
        -------
        Tensor
            Feature data stored in PyTorch Tensor.
        """
        # Default implementation uses synchronous fetch.
        ids = cp.asarray(indices)
        if isinstance(self.pg, MGPropertyGraph):
            # dask_cudf loc breaks if we provide cudf series/cupy array
            # https://github.com/rapidsai/cudf/issues/11877
            ids = ids.get()
        else:
            ids = cudf.Series(ids)

        ids = ids + self.indices_offset

        if self.storage_type == "node":
            subset_df = self.pg.get_vertex_data(vertex_ids=ids, columns=self.columns)
        else:
            subset_df = self.pg.get_edge_data(edge_ids=ids, columns=self.columns)

        # Keep only the feature columns, in the configured order.
        subset_df = subset_df[self.columns]
        if isinstance(subset_df, dask_cudf.DataFrame):
            subset_df = subset_df.compute()

        if len(subset_df) == 0:
            raise ValueError(f"indices = {ids} not found in FeatureStorage")

        capsule = subset_df.to_dlpack()
        tensor = self.from_dlpack(capsule)
        del capsule

        if device:
            if not isinstance(tensor, cp.ndarray):
                # Cant transfer to different device for cupy
                tensor = tensor.to(device)
        return tensor
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
# Utils for adding data to cugraph graphstore objects
15+
16+
17+
def _update_feature_map(
18+
pg_feature_map, feat_name_obj, contains_vector_features, columns
19+
):
20+
"""
21+
Update the existing feature map `pg_feature_map` based on `feat_name_obj`
22+
"""
23+
if contains_vector_features:
24+
if feat_name_obj is None:
25+
raise ValueError(
26+
"feature name must be provided when wrapping"
27+
+ " multiple columns under a single feature name"
28+
+ " or a feature map"
29+
)
30+
31+
if isinstance(feat_name_obj, str):
32+
pg_feature_map[feat_name_obj] = columns
33+
34+
elif isinstance(feat_name_obj, dict):
35+
covered_columns = []
36+
for col in feat_name_obj.keys():
37+
current_cols = feat_name_obj[col]
38+
# Handle strings too
39+
if isinstance(current_cols, str):
40+
current_cols = [current_cols]
41+
covered_columns = covered_columns + current_cols
42+
43+
if set(covered_columns) != set(columns):
44+
raise ValueError(
45+
f"All the columns {columns} not covered in {covered_columns} "
46+
f"Please check the feature_map {feat_name_obj} provided"
47+
)
48+
49+
for key, cols in feat_name_obj.items():
50+
if isinstance(cols, str):
51+
cols = [cols]
52+
pg_feature_map[key] = cols
53+
else:
54+
raise ValueError(f"{feat_name_obj} should be str or dict")
55+
else:
56+
if feat_name_obj:
57+
raise ValueError(
58+
f"feat_name {feat_name_obj} is only valid when "
59+
"wrapping multiple columns under feature names"
60+
)
61+
for col in columns:
62+
pg_feature_map[col] = [col]
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
15+
# Utils for sampling on graphstore like objects
16+
import cugraph
17+
import cudf
18+
import cupy as cp
19+
import dask_cudf
20+
from cugraph.experimental import PropertyGraph
21+
22+
# Short aliases for the canonical PropertyGraph column names used
# throughout this module's edge lists and sampling results.
src_n = PropertyGraph.src_col_name
dst_n = PropertyGraph.dst_col_name
type_n = PropertyGraph.type_col_name
eid_n = PropertyGraph.edge_id_col_name
vid_n = PropertyGraph.vertex_col_name
27+
28+
29+
def get_subgraph_and_src_range_from_edgelist(edge_list, is_mg, reverse_edges=False):
    """Build a directed MultiGraph from ``edge_list`` and return it together
    with the (min, max) range of source-vertex ids present in the edge list.

    Parameters
    ----------
    edge_list : cudf / dask_cudf DataFrame with src/dst/edge-id columns.
    is_mg : bool
        True for a multi-GPU (dask_cudf) edge list.
    reverse_edges : bool
        When True, swap source and destination before building the graph.
    """
    if reverse_edges:
        edge_list = edge_list.rename(columns={src_n: dst_n, dst_n: src_n})

    graph = cugraph.MultiGraph(directed=True)
    if is_mg:
        edge_list = edge_list.persist()
        src_col = edge_list[src_n]
        src_range = (src_col.min().compute(), src_col.max().compute())
        # FIXME: Can not switch to renumber = False
        # For MNMG Algos
        # Remove when https://github.com/rapidsai/cugraph/issues/2437
        # lands
        graph.from_dask_cudf_edgelist(
            edge_list,
            source=src_n,
            destination=dst_n,
            edge_attr=eid_n,
            renumber=True,
            legacy_renum_only=True,
        )
    else:
        src_col = edge_list[src_n]
        src_range = (src_col.min(), src_col.max())
        # Note: We have to keep renumber = False
        # to handle cases when the seed_nodes is not present in subgraph
        graph.from_cudf_edgelist(
            edge_list,
            source=src_n,
            destination=dst_n,
            edge_attr=eid_n,
            renumber=False,
            legacy_renum_only=True,
        )

    return graph, src_range
62+
63+
64+
def sample_multiple_sgs(
    sgs,
    sample_f,
    start_list_d,
    start_list_dtype,
    edge_dir,
    fanout,
    with_replacement,
):
    """Sample from every per-edge-type subgraph whose relevant endpoint type
    has seeds in ``start_list_d``; concatenate the per-subgraph results.

    ``sgs`` maps a stringified canonical edge type to a
    (subgraph, src_range) pair. Returns an empty int32 DataFrame when no
    edge type matches the provided seed types.
    """
    seed_types = list(start_list_d.keys())
    sampled = []
    for etype_s, (sg, start_list_range) in sgs.items():
        can_etype = _convert_can_etype_s_to_tup(etype_s)
        if not _edge_types_contains_canonical_etype(can_etype, seed_types, edge_dir):
            continue
        # "in" samples toward the destination type; otherwise the source type.
        if edge_dir == "in":
            seed_type = can_etype[2]
        else:
            seed_type = can_etype[0]
        sampled.append(
            sample_single_sg(
                sg,
                sample_f,
                start_list_d[seed_type],
                start_list_dtype,
                start_list_range,
                fanout,
                with_replacement,
            )
        )

    if not sampled:
        empty_df = cudf.DataFrame({"sources": [], "destinations": [], "indices": []})
        return empty_df.astype(cp.int32)

    # Use the concat that matches the frame type of the results.
    if isinstance(sampled[0], dask_cudf.DataFrame):
        return dask_cudf.concat(sampled, ignore_index=True)
    return cudf.concat(sampled, ignore_index=True)
100+
101+
102+
def sample_single_sg(
    sg,
    sample_f,
    start_list,
    start_list_dtype,
    start_list_range,
    fanout,
    with_replacement,
):
    """Run ``sample_f`` on one subgraph, first restricting the seeds to the
    vertex-id range actually present in ``sg``.

    ``start_list`` may be a dict of per-type series, which is flattened
    before sampling.
    """
    if isinstance(start_list, dict):
        start_list = cudf.concat(list(start_list.values()))

    # Uniform sampling fails when the dtype
    # of the seed dtype is not same as the node dtype
    start_list = start_list.astype(start_list_dtype)

    # Filter the start list by range to ensure every seed is a valid index;
    # out-of-range seeds trigger out-of-index errors in the C++ primitive:
    # https://github.com/rapidsai/cugraph/blob/branch-22.12/cpp/src/prims/per_v_random_select_transform_outgoing_e.cuh
    low, high = start_list_range
    start_list = start_list[(start_list >= low) & (start_list <= high)]

    return sample_f(
        sg,
        start_list=start_list,
        fanout_vals=[fanout],
        with_replacement=with_replacement,
    )
132+
133+
134+
def _edge_types_contains_canonical_etype(can_etype, edge_types, edge_dir):
135+
src_type, _, dst_type = can_etype
136+
if edge_dir == "in":
137+
return dst_type in edge_types
138+
else:
139+
return src_type in edge_types
140+
141+
142+
def _convert_can_etype_s_to_tup(canonical_etype_s):
143+
src_type, etype, dst_type = canonical_etype_s.split(",")
144+
src_type = src_type[2:-1]
145+
dst_type = dst_type[2:-2]
146+
etype = etype[2:-1]
147+
return (src_type, etype, dst_type)
148+
149+
150+
def create_dlpack_d(d):
    """Convert each edge DataFrame in ``d`` to a triple of DLPack capsules
    (sources, destinations, edge ids); empty frames map to
    (None, None, None) so callers can skip them."""
    dlpack_d = {}
    for key, df in d.items():
        if len(df):
            dlpack_d[key] = tuple(df[col].to_dlpack() for col in (src_n, dst_n, eid_n))
        else:
            dlpack_d[key] = (None, None, None)
    return dlpack_d
163+
164+
165+
def get_underlying_dtype_from_sg(sg):
    """
    Returns the underlying dtype of the subgraph's source-vertex column.
    """
    # FIXME: Remove after we have consistent naming
    # https://github.com/rapidsai/cugraph/issues/2618
    edgelist_df = sg.edgelist.edgelist_df
    if "src" in edgelist_df.columns:
        # src for single node graph
        return edgelist_df["src"].dtype
    if src_n in edgelist_df.columns:
        # _SRC_ for multi-node graphs
        return edgelist_df[src_n].dtype
    raise ValueError(f"Source column {src_n} not found in the subgraph")

0 commit comments

Comments
 (0)