Skip to content

Commit 2b8395a

Browse files
authored
Raise a warning for certain algorithms (#2756)
Certain algorithms (Katz, HITS, PageRank, Eigenvector centrality) require the flag `store_transposed` to be set to `True` for optimal performance. Although the CAPI internally transposes the graph when the algorithm is called if it wasn't transposed at graph creation, this adds extra overhead. This PR raises a warning if the user doesn't set the flag to `True` at graph creation. Closes #2742. Authors: - Joseph Nke (https://github.com/jnke2016) Approvers: - Alex Barghi (https://github.com/alexbarghi-nv) - Rick Ratzel (https://github.com/rlratzel) URL: #2756
1 parent 6a7ea66 commit 2b8395a

21 files changed

+294
-25
lines changed

python/cugraph/cugraph/centrality/eigenvector_centrality.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
df_score_to_dictionary,
1919
)
2020
import cudf
21+
import warnings
2122

2223

2324
def eigenvector_centrality(
@@ -77,6 +78,11 @@ def eigenvector_centrality(
7778
raise ValueError(f"'tol' must be a positive float, got: {tol}")
7879

7980
G, isNx = ensure_cugraph_obj_for_nx(G)
81+
if G.store_transposed is False:
82+
warning_msg = ("Eigenvector centrality expects the 'store_transposed' "
83+
"flag to be set to 'True' for optimal performance "
84+
"during the graph creation")
85+
warnings.warn(warning_msg, UserWarning)
8086

8187
vertices, values = \
8288
pylib_eigen(

python/cugraph/cugraph/centrality/katz_centrality.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
df_score_to_dictionary,
1919
)
2020
import cudf
21+
import warnings
2122

2223

2324
def katz_centrality(
@@ -112,6 +113,12 @@ def katz_centrality(
112113
"""
113114
G, isNx = ensure_cugraph_obj_for_nx(G)
114115

116+
if G.store_transposed is False:
117+
warning_msg = ("Katz centrality expects the 'store_transposed' flag "
118+
"to be set to 'True' for optimal performance during "
119+
"the graph creation")
120+
warnings.warn(warning_msg, UserWarning)
121+
115122
if alpha is None:
116123
degree_max = G.degree()['degree'].max()
117124
alpha = 1 / (degree_max)

python/cugraph/cugraph/dask/centrality/eigenvector_centrality.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import cugraph.dask.comms.comms as Comms
2222
import dask_cudf
2323
import cudf
24+
import warnings
2425

2526

2627
def _call_plc_eigenvector_centrality(sID,
@@ -115,6 +116,12 @@ def eigenvector_centrality(
115116
"""
116117
client = input_graph._client
117118

119+
if input_graph.store_transposed is False:
120+
warning_msg = ("Eigenvector centrality expects the 'store_transposed' "
121+
"flag to be set to 'True' for optimal performance "
122+
"during the graph creation")
123+
warnings.warn(warning_msg, UserWarning)
124+
118125
# FIXME: should we add this parameter as an option?
119126
do_expensive_check = False
120127

python/cugraph/cugraph/dask/centrality/katz_centrality.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import cugraph.dask.comms.comms as Comms
2121
import dask_cudf
2222
import cudf
23+
import warnings
2324

2425

2526
def _call_plc_katz_centrality(sID,
@@ -146,6 +147,12 @@ def katz_centrality(
146147
"""
147148
client = input_graph._client
148149

150+
if input_graph.store_transposed is False:
151+
warning_msg = ("Katz centrality expects the 'store_transposed' flag "
152+
"to be set to 'True' for optimal performance during "
153+
"the graph creation")
154+
warnings.warn(warning_msg, UserWarning)
155+
149156
if alpha is None:
150157
degree_max = input_graph.degree()['degree'].max().compute()
151158
alpha = 1 / (degree_max)

python/cugraph/cugraph/dask/common/part_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ def persist_distributed_data(dask_df, client):
8181
async def _extract_partitions(dask_obj, client=None, batch_enabled=False):
8282
client = default_client() if client is None else client
8383
worker_list = Comms.get_workers()
84+
85+
# repartition the 'dask_obj' to get as many partitions as there
86+
# are workers
87+
dask_obj = dask_obj.repartition(npartitions=len(worker_list))
8488
# dask.dataframe or dask.array
8589
if isinstance(dask_obj, (daskDataFrame, daskArray, daskSeries)):
8690
# parts = persist_distributed_data(dask_obj, client)

python/cugraph/cugraph/dask/link_analysis/hits.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import cugraph.dask.comms.comms as Comms
1919
import dask_cudf
2020
import cudf
21+
import warnings
2122

2223
from pylibcugraph import (ResourceHandle,
2324
hits as pylibcugraph_hits
@@ -131,6 +132,12 @@ def hits(input_graph, tol=1.0e-5, max_iter=100, nstart=None, normalized=True):
131132

132133
client = input_graph._client
133134

135+
if input_graph.store_transposed is False:
136+
warning_msg = ("HITS expects the 'store_transposed' flag "
137+
"to be set to 'True' for optimal performance during "
138+
"the graph creation")
139+
warnings.warn(warning_msg, UserWarning)
140+
134141
do_expensive_check = False
135142
initial_hubs_guess_vertices = None
136143
initial_hubs_guess_values = None

python/cugraph/cugraph/dask/link_analysis/pagerank.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,12 @@ def pagerank(input_graph,
233233
# Initialize dask client
234234
client = input_graph._client
235235

236+
if input_graph.store_transposed is False:
237+
warning_msg = ("Pagerank expects the 'store_transposed' flag "
238+
"to be set to 'True' for optimal performance during "
239+
"the graph creation")
240+
warnings.warn(warning_msg, UserWarning)
241+
236242
initial_guess_vertices = None
237243
initial_guess_values = None
238244
precomputed_vertex_out_weight_vertices = None

python/cugraph/cugraph/link_analysis/hits.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
hits as pylibcugraph_hits
2121
)
2222
import cudf
23+
import warnings
2324

2425

2526
def hits(
@@ -84,6 +85,11 @@ def hits(
8485
"""
8586

8687
G, isNx = ensure_cugraph_obj_for_nx(G)
88+
if G.store_transposed is False:
89+
warning_msg = ("HITS expects the 'store_transposed' flag "
90+
"to be set to 'True' for optimal performance during "
91+
"the graph creation")
92+
warnings.warn(warning_msg, UserWarning)
8793

8894
do_expensive_check = False
8995
init_hubs_guess_vertices = None

python/cugraph/cugraph/link_analysis/pagerank.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,12 @@ def pagerank(
164164
pre_vtx_o_wgt_sums = None
165165

166166
G, isNx = ensure_cugraph_obj_for_nx(G, weight)
167+
if G.store_transposed is False:
168+
warning_msg = ("Pagerank expects the 'store_transposed' flag "
169+
"to be set to 'True' for optimal performance during "
170+
"the graph creation")
171+
warnings.warn(warning_msg, UserWarning)
172+
167173
do_expensive_check = False
168174

169175
if nstart is not None:

python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,10 @@ def in_degree(self, vertex_subset=None):
408408
nodes.columns = vertex_col_names
409409

410410
df["degree"] = 1
411-
in_degree = df.groupby(dst_col_name).degree.count().reset_index()
411+
412+
# FIXME: leverage the C++ in_degree for optimal performance
413+
in_degree = df.groupby(dst_col_name).degree.count(
414+
split_out=df.npartitions).reset_index()
412415

413416
# Add vertices with zero in_degree
414417
in_degree = nodes.merge(in_degree, how='outer').fillna(0)
@@ -494,7 +497,9 @@ def out_degree(self, vertex_subset=None):
494497
nodes.columns = vertex_col_names
495498

496499
df["degree"] = 1
497-
out_degree = df.groupby(src_col_name).degree.count().reset_index()
500+
# FIXME: leverage the C++ out_degree for optimal performance
501+
out_degree = df.groupby(src_col_name).degree.count(
502+
split_out=df.npartitions).reset_index()
498503

499504
# Add vertices with zero out_degree
500505
out_degree = nodes.merge(out_degree, how='outer').fillna(0)
@@ -560,8 +565,10 @@ def degree(self, vertex_subset=None):
560565

561566
vertex_in_degree = self.in_degree(vertex_subset)
562567
vertex_out_degree = self.out_degree(vertex_subset)
568+
# FIXME: leverage the C++ degree for optimal performance
563569
vertex_degree = dask_cudf.concat([vertex_in_degree, vertex_out_degree])
564-
vertex_degree = vertex_degree.groupby(['vertex'], as_index=False).sum()
570+
vertex_degree = vertex_degree.groupby(['vertex'], as_index=False).sum(
571+
split_out=self.input_df.npartitions)
565572

566573
return vertex_degree
567574

0 commit comments

Comments
 (0)