Skip to content

Commit 59b0eb7

Browse files
authored
[BUG] Unsupported graph for similarity algos (#3710)
This PR updates the docstrings and raises an error when running any similarity algorithm on a graph whose vertices are not renumbered. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Brad Rees (https://github.com/BradReesWork) URL: #3710
1 parent b8de24c commit 59b0eb7

File tree

12 files changed

+234
-12
lines changed

12 files changed

+234
-12
lines changed

python/cugraph/cugraph/link_prediction/jaccard.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
1+
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -20,7 +20,7 @@
2020
)
2121

2222

23-
def jaccard(input_graph, vertex_pair=None):
23+
def jaccard(input_graph, vertex_pair=None, do_expensive_check=True):
2424
"""
2525
Compute the Jaccard similarity between each pair of vertices connected by
2626
an edge, or between arbitrary pairs of vertices specified by the user.
@@ -36,6 +36,10 @@ def jaccard(input_graph, vertex_pair=None):
3636
of cugraph.jaccard is different from the behavior of
3737
networkx.jaccard_coefficient.
3838
39+
This algorithm doesn't currently support datasets with vertices that
40+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
41+
vertices as this creates isolated vertices.
42+
3943
cugraph.jaccard, in the absence of a specified vertex pair list, will
4044
use the edges of the graph to construct a vertex pair list and will
4145
return the jaccard coefficient for those vertex pairs.
@@ -80,6 +84,10 @@ def jaccard(input_graph, vertex_pair=None):
8084
current implementation computes the jaccard coefficient for all
8185
adjacent vertices in the graph.
8286
87+
do_expensive_check: bool (default=True)
88+
When set to True, check if the vertices in the graph are (re)numbered
89+
from 0 to V-1 where V is the total number of vertices.
90+
8391
Returns
8492
-------
8593
df : cudf.DataFrame
@@ -104,6 +112,22 @@ def jaccard(input_graph, vertex_pair=None):
104112
>>> df = cugraph.jaccard(G)
105113
106114
"""
115+
if do_expensive_check:
116+
if not input_graph.renumbered:
117+
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
118+
max_vertex = input_df.max().max()
119+
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
120+
input_df.dtypes[0]
121+
)
122+
nodes = (
123+
cudf.concat([input_df["src"], input_df["dst"]])
124+
.unique()
125+
.sort_values()
126+
.reset_index(drop=True)
127+
)
128+
if not expected_nodes.equals(nodes):
129+
raise ValueError("Unrenumbered vertices are not supported.")
130+
107131
if input_graph.is_directed():
108132
raise ValueError("Input must be an undirected Graph.")
109133
if type(vertex_pair) == cudf.DataFrame:
@@ -120,10 +144,14 @@ def jaccard(input_graph, vertex_pair=None):
120144
return df
121145

122146

123-
def jaccard_coefficient(G, ebunch=None):
147+
def jaccard_coefficient(G, ebunch=None, do_expensive_check=True):
124148
"""
125149
For NetworkX Compatibility. See `jaccard`
126150
151+
NOTE: This algorithm doesn't currently support datasets with vertices that
152+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
153+
vertices as this creates isolated vertices.
154+
127155
Parameters
128156
----------
129157
graph : cugraph.Graph

python/cugraph/cugraph/link_prediction/overlap.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
1+
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -20,10 +20,14 @@
2020
)
2121

2222

23-
def overlap_coefficient(G, ebunch=None):
23+
def overlap_coefficient(G, ebunch=None, do_expensive_check=True):
2424
"""
2525
For NetworkX Compatibility. See `overlap`
2626
27+
NOTE: This algorithm doesn't currently support datasets with vertices that
28+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
29+
vertices as this creates isolated vertices.
30+
2731
"""
2832
vertex_pair = None
2933

@@ -42,7 +46,7 @@ def overlap_coefficient(G, ebunch=None):
4246
return df
4347

4448

45-
def overlap(input_graph, vertex_pair=None):
49+
def overlap(input_graph, vertex_pair=None, do_expensive_check=True):
4650
"""
4751
Compute the Overlap Coefficient between each pair of vertices connected by
4852
an edge, or between arbitrary pairs of vertices specified by the user.
@@ -54,6 +58,10 @@ def overlap(input_graph, vertex_pair=None):
5458
neighbors. If first is specified but second is not, or vice versa, an
5559
exception will be thrown.
5660
61+
NOTE: This algorithm doesn't currently support datasets with vertices that
62+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
63+
vertices as this creates isolated vertices.
64+
5765
Parameters
5866
----------
5967
input_graph : cugraph.Graph
@@ -66,6 +74,10 @@ def overlap(input_graph, vertex_pair=None):
6674
vertices. If provided, the overlap coefficient is computed for the
6775
given vertex pairs, else, it is computed for all vertex pairs.
6876
77+
do_expensive_check: bool (default=True)
78+
When set to True, check if the vertices in the graph are (re)numbered
79+
from 0 to V-1 where V is the total number of vertices.
80+
6981
Returns
7082
-------
7183
df : cudf.DataFrame
@@ -90,6 +102,21 @@ def overlap(input_graph, vertex_pair=None):
90102
>>> df = cugraph.overlap(G)
91103
92104
"""
105+
if do_expensive_check:
106+
if not input_graph.renumbered:
107+
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
108+
max_vertex = input_df.max().max()
109+
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
110+
input_df.dtypes[0]
111+
)
112+
nodes = (
113+
cudf.concat([input_df["src"], input_df["dst"]])
114+
.unique()
115+
.sort_values()
116+
.reset_index(drop=True)
117+
)
118+
if not expected_nodes.equals(nodes):
119+
raise ValueError("Unrenumbered vertices are not supported.")
93120

94121
if type(vertex_pair) == cudf.DataFrame:
95122
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

python/cugraph/cugraph/link_prediction/sorensen.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
)
2222

2323

24-
def sorensen(input_graph, vertex_pair=None):
24+
def sorensen(input_graph, vertex_pair=None, do_expensive_check=True):
2525
"""
2626
Compute the Sorensen coefficient between each pair of vertices connected by
2727
an edge, or between arbitrary pairs of vertices specified by the user.
@@ -30,6 +30,10 @@ def sorensen(input_graph, vertex_pair=None):
3030
If first is specified but second is not, or vice versa, an exception will
3131
be thrown.
3232
33+
NOTE: This algorithm doesn't currently support datasets with vertices that
34+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
35+
vertices as this creates isolated vertices.
36+
3337
cugraph.sorensen, in the absence of a specified vertex pair list, will
3438
use the edges of the graph to construct a vertex pair list and will
3539
return the sorensen coefficient for those vertex pairs.
@@ -50,6 +54,10 @@ def sorensen(input_graph, vertex_pair=None):
5054
current implementation computes the Sorensen coefficient for all
5155
adjacent vertices in the graph.
5256
57+
do_expensive_check: bool (default=True)
58+
When set to True, check if the vertices in the graph are (re)numbered
59+
from 0 to V-1 where V is the total number of vertices.
60+
5361
Returns
5462
-------
5563
df : cudf.DataFrame
@@ -76,6 +84,22 @@ def sorensen(input_graph, vertex_pair=None):
7684
>>> df = cugraph.sorensen(G)
7785
7886
"""
87+
if do_expensive_check:
88+
if not input_graph.renumbered:
89+
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
90+
max_vertex = input_df.max().max()
91+
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
92+
input_df.dtypes[0]
93+
)
94+
nodes = (
95+
cudf.concat([input_df["src"], input_df["dst"]])
96+
.unique()
97+
.sort_values()
98+
.reset_index(drop=True)
99+
)
100+
if not expected_nodes.equals(nodes):
101+
raise ValueError("Unrenumbered vertices are not supported.")
102+
79103
if type(input_graph) is not Graph:
80104
raise TypeError("input graph must a Graph")
81105

@@ -94,10 +118,14 @@ def sorensen(input_graph, vertex_pair=None):
94118
return df
95119

96120

97-
def sorensen_coefficient(G, ebunch=None):
121+
def sorensen_coefficient(G, ebunch=None, do_expensive_check=True):
98122
"""
99123
For NetworkX Compatibility. See `sorensen`
100124
125+
NOTE: This algorithm doesn't currently support datasets with vertices that
126+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
127+
vertices as this creates isolated vertices.
128+
101129
Parameters
102130
----------
103131
G : cugraph.Graph

python/cugraph/cugraph/link_prediction/wjaccard.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2019-2022, NVIDIA CORPORATION.
1+
# Copyright (c) 2019-2023, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -17,7 +17,7 @@
1717
from cugraph.utilities import renumber_vertex_pair
1818

1919

20-
def jaccard_w(input_graph, weights, vertex_pair=None):
20+
def jaccard_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
2121
"""
2222
Compute the weighted Jaccard similarity between each pair of vertices
2323
connected by an edge, or between arbitrary pairs of vertices specified by
@@ -29,6 +29,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
2929
neighbors. If first is specified but second is not, or vice versa, an
3030
exception will be thrown.
3131
32+
NOTE: This algorithm doesn't currently support datasets with vertices that
33+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
34+
vertices as this creates isolated vertices.
35+
3236
Parameters
3337
----------
3438
input_graph : cugraph.Graph
@@ -51,6 +55,10 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
5155
vertices. If provided, the jaccard coefficient is computed for the
5256
given vertex pairs, else, it is computed for all vertex pairs.
5357
58+
do_expensive_check: bool (default=True)
59+
When set to True, check if the vertices in the graph are (re)numbered
60+
from 0 to V-1 where V is the total number of vertices.
61+
5462
Returns
5563
-------
5664
df : cudf.DataFrame
@@ -87,6 +95,22 @@ def jaccard_w(input_graph, weights, vertex_pair=None):
8795
>>> df = cugraph.jaccard_w(G, weights)
8896
8997
"""
98+
if do_expensive_check:
99+
if not input_graph.renumbered:
100+
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
101+
max_vertex = input_df.max().max()
102+
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
103+
input_df.dtypes[0]
104+
)
105+
nodes = (
106+
cudf.concat([input_df["src"], input_df["dst"]])
107+
.unique()
108+
.sort_values()
109+
.reset_index(drop=True)
110+
)
111+
if not expected_nodes.equals(nodes):
112+
raise ValueError("Unrenumbered vertices are not supported.")
113+
90114
if type(input_graph) is not Graph:
91115
raise TypeError("input graph must a Graph")
92116

python/cugraph/cugraph/link_prediction/woverlap.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from cugraph.utilities import renumber_vertex_pair
1717

1818

19-
def overlap_w(input_graph, weights, vertex_pair=None):
19+
def overlap_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
2020
"""
2121
Compute the weighted Overlap Coefficient between each pair of vertices
2222
connected by an edge, or between arbitrary pairs of vertices specified by
@@ -28,6 +28,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
2828
neighbors. If first is specified but second is not, or vice versa, an
2929
exception will be thrown.
3030
31+
NOTE: This algorithm doesn't currently support datasets with vertices that
32+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
33+
vertices as this creates isolated vertices.
34+
3135
Parameters
3236
----------
3337
input_graph : cugraph.Graph
@@ -51,6 +55,10 @@ def overlap_w(input_graph, weights, vertex_pair=None):
5155
vertices. If provided, the overlap coefficient is computed for the
5256
given vertex pairs, else, it is computed for all vertex pairs.
5357
58+
do_expensive_check: bool (default=True)
59+
When set to True, check if the vertices in the graph are (re)numbered
60+
from 0 to V-1 where V is the total number of vertices.
61+
5462
Returns
5563
-------
5664
df : cudf.DataFrame
@@ -88,6 +96,21 @@ def overlap_w(input_graph, weights, vertex_pair=None):
8896
... len(weights['vertex']))]
8997
>>> df = cugraph.overlap_w(G, weights)
9098
"""
99+
if do_expensive_check:
100+
if not input_graph.renumbered:
101+
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
102+
max_vertex = input_df.max().max()
103+
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
104+
input_df.dtypes[0]
105+
)
106+
nodes = (
107+
cudf.concat([input_df["src"], input_df["dst"]])
108+
.unique()
109+
.sort_values()
110+
.reset_index(drop=True)
111+
)
112+
if not expected_nodes.equals(nodes):
113+
raise ValueError("Unrenumbered vertices are not supported.")
91114

92115
if type(vertex_pair) == cudf.DataFrame:
93116
vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

python/cugraph/cugraph/link_prediction/wsorensen.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,17 @@
1717
from cugraph.utilities import renumber_vertex_pair
1818

1919

20-
def sorensen_w(input_graph, weights, vertex_pair=None):
20+
def sorensen_w(input_graph, weights, vertex_pair=None, do_expensive_check=True):
2121
"""
2222
Compute the weighted Sorensen similarity between each pair of vertices
2323
connected by an edge, or between arbitrary pairs of vertices specified by
2424
the user. Sorensen coefficient is defined between two sets as the ratio of
2525
twice the volume of their intersection divided by the volume of each set.
2626
27+
NOTE: This algorithm doesn't currently support datasets with vertices that
28+
are not (re)numbered vertices from 0 to V-1 where V is the total number of
29+
vertices as this creates isolated vertices.
30+
2731
Parameters
2832
----------
2933
input_graph : cugraph.Graph
@@ -47,6 +51,10 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
4751
vertices. If provided, the sorensen coefficient is computed for the
4852
given vertex pairs, else, it is computed for all vertex pairs.
4953
54+
do_expensive_check: bool (default=True)
55+
When set to True, check if the vertices in the graph are (re)numbered
56+
from 0 to V-1 where V is the total number of vertices.
57+
5058
Returns
5159
-------
5260
df : cudf.DataFrame
@@ -85,6 +93,22 @@ def sorensen_w(input_graph, weights, vertex_pair=None):
8593
>>> df = cugraph.sorensen_w(G, weights)
8694
8795
"""
96+
if do_expensive_check:
97+
if not input_graph.renumbered:
98+
input_df = input_graph.edgelist.edgelist_df[["src", "dst"]]
99+
max_vertex = input_df.max().max()
100+
expected_nodes = cudf.Series(range(0, max_vertex + 1, 1)).astype(
101+
input_df.dtypes[0]
102+
)
103+
nodes = (
104+
cudf.concat([input_df["src"], input_df["dst"]])
105+
.unique()
106+
.sort_values()
107+
.reset_index(drop=True)
108+
)
109+
if not expected_nodes.equals(nodes):
110+
raise ValueError("Unrenumbered vertices are not supported.")
111+
88112
if type(input_graph) is not Graph:
89113
raise TypeError("input graph must a Graph")
90114

0 commit comments

Comments
 (0)