Skip to content

Commit 93c88e1

Browse files
authored
Merge 445d019 into e906c98
2 parents e906c98 + 445d019 commit 93c88e1

File tree

7 files changed

+163
-86
lines changed

7 files changed

+163
-86
lines changed

benchmarks/python_e2e/benchmark.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021, NVIDIA CORPORATION.
1+
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -88,9 +88,14 @@ def __init__(self,
8888

8989
#add starting node to algos: BFS and SSSP
9090
for i, algo in enumerate (algo_func_param_list):
91-
if benchmark(algo).name in ["bfs", "sssp"]:
91+
if benchmark(algo).name in ["bfs", "sssp", "neighborhood_sampling"]:
9292
param={}
9393
param["start"]=self.input_dataframe['src'].head()[0]
94+
if benchmark(algo).name in ["neighborhood_sampling"]:
95+
start = [param.pop("start")]
96+
labels = [0]
97+
param["start_info_list"] = (start, labels)
98+
param["fanout_vals"] = [1]
9499
algo_func_param_list[i]=(algo,)+(param,)
95100

96101
self.algos = []
@@ -125,7 +130,8 @@ def run(self):
125130
self.results.append(result)
126131

127132
#algos with transposed=True : PageRank, Katz
128-
#algos with transposed=False: BFS, SSSP, Louvain
133+
#algos with transposed=False: BFS, SSSP, Louvain, HITS, Neighborhood_sampling
134+
#algos supporting the legacy_renum_only: HITS, Neighborhood_sampling
129135
for i in range(len(self.algos)):
130136
if self.algos[i][0].name in ["pagerank", "katz"]: #set transpose=True when renumbering
131137
if self.algos[i][0].name == "katz" and self.construct_graph.name == "from_dask_cudf_edgelist":
@@ -141,6 +147,9 @@ def run(self):
141147
self.algos[i][1]["alpha"] = katz_alpha
142148
if hasattr(G, "compute_renumber_edge_list"):
143149
G.compute_renumber_edge_list(transposed=True)
150+
elif self.algos[i][0].name in ["neighborhood_sampling", "hits"]:
151+
if hasattr(G, "compute_renumber_edge_list"):
152+
G.compute_renumber_edge_list(transposed=False, legacy_renum_only=True)
144153
else: #set transpose=False when renumbering
145154
self.__log("running compute_renumber_edge_list...", end="")
146155
if hasattr(G, "compute_renumber_edge_list"):

benchmarks/python_e2e/cugraph_dask_funcs.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021, NVIDIA CORPORATION.
1+
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -18,6 +18,8 @@
1818
from cugraph.structure.symmetrize import symmetrize_ddf
1919
from cugraph.dask.common.mg_utils import get_visible_devices
2020
from dask_cuda.initialize import initialize
21+
from cugraph.experimental.dask import uniform_neighborhood_sampling
22+
import cudf
2123

2224
import cugraph
2325
from cugraph.comms import comms as Comms
@@ -151,6 +153,18 @@ def katz(G, alpha=None):
151153
print(alpha)
152154
return cugraph.dask.katz_centrality(G, alpha)
153155

156+
def hits(G):
157+
return cugraph.dask.hits(G)
158+
159+
def neighborhood_sampling(G, start_info_list=None, fanout_vals=None):
160+
# convert list to cudf.Series
161+
start_info_list = (
162+
cudf.Series(start_info_list[0], dtype="int32"),
163+
cudf.Series(start_info_list[1], dtype="int32"),
164+
)
165+
166+
return uniform_neighborhood_sampling(
167+
G, start_info_list=start_info_list, fanout_vals=fanout_vals)
154168

155169
################################################################################
156170
# Session-wide setup and teardown

benchmarks/python_e2e/main.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2021, NVIDIA CORPORATION.
1+
# Copyright (c) 2021-2022, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -87,6 +87,8 @@ def run(algos,
8787
"wcc": funcs.wcc,
8888
"katz": funcs.katz,
8989
"wcc": funcs.wcc,
90+
"hits": funcs.hits,
91+
"neighborhood_sampling": funcs.neighborhood_sampling,
9092
}
9193

9294
if algos:

python/cugraph/cugraph/dask/link_analysis/hits.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,9 +154,12 @@ def hits(input_graph, tol=1.0e-5, max_iter=100, nstart=None, normalized=True):
154154

155155
client = default_client()
156156

157-
# FIXME Still compute renumbering at this layer in case str
158-
# vertex ID are passed
159-
input_graph.compute_renumber_edge_list(transposed=False)
157+
# FIXME: 'legacy_renum_only' will not trigger the C++ renumbering
158+
# In the future, once all the algos follow the C/Pylibcugraph path,
159+
# compute_renumber_edge_list will only be used for multicolumn and
160+
# string vertices since the renumbering will be done in pylibcugraph
161+
input_graph.compute_renumber_edge_list(
162+
transposed=False, legacy_renum_only=True)
160163
ddf = input_graph.edgelist.edgelist_df
161164

162165
graph_properties = GraphProperties(
@@ -205,6 +208,7 @@ def hits(input_graph, tol=1.0e-5, max_iter=100, nstart=None, normalized=True):
205208
wait(cudf_result)
206209

207210
ddf = dask_cudf.from_delayed(cudf_result)
211+
208212
if input_graph.renumbered:
209213
return input_graph.unrenumber(ddf, 'vertex')
210214

python/cugraph/cugraph/dask/sampling/neighborhood_sampling.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,12 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
128128
"""
129129
# Initialize dask client
130130
client = default_client()
131-
# Important for handling renumbering
132-
input_graph.compute_renumber_edge_list(transposed=False)
131+
# FIXME: 'legacy_renum_only' will not trigger the C++ renumbering
132+
# In the future, once all the algos follow the C/Pylibcugraph path,
133+
# compute_renumber_edge_list will only be used for multicolumn and
134+
# string vertices since the renumbering will be done in pylibcugraph
135+
input_graph.compute_renumber_edge_list(
136+
transposed=False, legacy_renum_only=True)
133137

134138
start_list, info_list = start_info_list
135139

@@ -158,9 +162,11 @@ def EXPERIMENTAL__uniform_neighborhood(input_graph,
158162
src_col_name = input_graph.renumber_map.renumbered_src_col_name
159163
dst_col_name = input_graph.renumber_map.renumbered_dst_col_name
160164

161-
# start_list uses "external" vertex IDs, but since the graph has been
165+
# start_list uses "external" vertex IDs, but if the graph has been
162166
# renumbered, the start vertex IDs must also be renumbered.
163-
start_list = input_graph.lookup_internal_vertex_id(start_list).compute()
167+
if input_graph.renumbered:
168+
start_list = input_graph.lookup_internal_vertex_id(
169+
start_list).compute()
164170
do_expensive_check = True
165171

166172
result = [client.submit(call_nbr_sampling,

python/cugraph/cugraph/structure/graph_implementation/simpleDistributedGraph.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -124,10 +124,23 @@ def __from_edgelist(
124124
#
125125

126126
# FIXME: Edge Attribute not handled
127+
# FIXME: the parameter below is no longer used for unrenumbering
127128
self.properties.renumbered = renumber
128129
self.source_columns = source
129130
self.destination_columns = destination
130131

132+
@property
133+
def renumbered(self):
134+
# This property is now used to determine if a dataframe was renumbered
135+
# by checking the column name. Only the renumbered dataframes will have
136+
# their column names renamed to 'renumbered_src' and 'renumbered_dst'
137+
renumbered_vertex_col_names = ["renumbered_src", "renumbered_dst"]
138+
if self.edgelist.edgelist_df is not None and not (
139+
set(renumbered_vertex_col_names).issubset(
140+
set(self.edgelist.edgelist_df.columns))):
141+
return False
142+
return True
143+
131144
def view_edge_list(self):
132145
"""
133146
Display the edge list. Compute it if needed.
@@ -464,7 +477,9 @@ def neighbors(self, n):
464477
ddf = self.edgelist.edgelist_df
465478
return ddf[ddf["src"] == n]["dst"].reset_index(drop=True)
466479

467-
def compute_renumber_edge_list(self, transposed=False):
480+
def compute_renumber_edge_list(self,
481+
transposed=False,
482+
legacy_renum_only=False):
468483
"""
469484
Compute a renumbered edge list
470485
This function works in the MNMG pipeline and will transform
@@ -486,13 +501,12 @@ def compute_renumber_edge_list(self, transposed=False):
486501
If True, renumber with the intent to make a CSC-like
487502
structure. If False, renumber with the intent to make
488503
a CSR-like structure. Defaults to False.
489-
"""
490-
# FIXME: What to do about edge_attr???
491-
# currently ignored for MNMG
492504
493-
# FIXME: this is confusing - in the code below,
494-
# self.properties.renumbered needs to be interpreted as "needs to be
495-
# renumbered", everywhere else it means "has been renumbered".
505+
legacy_renum_only : (optional) bool
506+
if True, The C++ renumbering will not be triggered.
507+
This parameter is added for new algos following the
508+
C/Pylibcugraph path
509+
"""
496510
if not self.properties.renumbered:
497511
self.edgelist = self.EdgeList(self.input_df)
498512
self.renumber_map = None
@@ -507,10 +521,13 @@ def compute_renumber_edge_list(self, transposed=False):
507521
del self.edgelist
508522

509523
renumbered_ddf, number_map, aggregate_segment_offsets = \
510-
NumberMap.renumber_and_segment(self.input_df,
511-
self.source_columns,
512-
self.destination_columns,
513-
store_transposed=transposed)
524+
NumberMap.renumber_and_segment(
525+
self.input_df,
526+
self.source_columns,
527+
self.destination_columns,
528+
store_transposed=transposed,
529+
legacy_renum_only=legacy_renum_only)
530+
514531
self.edgelist = self.EdgeList(renumbered_ddf)
515532
self.renumber_map = number_map
516533
self.aggregate_segment_offsets = aggregate_segment_offsets

python/cugraph/cugraph/structure/number_map.py

Lines changed: 88 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -499,16 +499,27 @@ def from_internal_vertex_id(
499499
@staticmethod
500500
def renumber_and_segment(
501501
df, src_col_names, dst_col_names, preserve_order=False,
502-
store_transposed=False
502+
store_transposed=False, legacy_renum_only=False
503503
):
504+
# FIXME: Drop the renumber_type 'experimental' once all the
505+
# algos follow the C/Pylibcugraph path
506+
507+
# The renumber_type 'legacy' runs both the python and the
508+
# C++ renumbering.
504509
if isinstance(src_col_names, list):
505510
renumber_type = 'legacy'
506511
elif not (df[src_col_names].dtype == np.int32 or
507512
df[src_col_names].dtype == np.int64):
508513
renumber_type = 'legacy'
509514
else:
515+
# The renumber_type 'experimental' only runs the C++
516+
# renumbering
510517
renumber_type = 'experimental'
511518

519+
if legacy_renum_only and renumber_type == 'experimental':
520+
# The original dataframe will be returned.
521+
renumber_type = 'skip_renumbering'
522+
512523
renumber_map = NumberMap()
513524
if not isinstance(src_col_names, list):
514525
src_col_names = [src_col_names]
@@ -547,6 +558,12 @@ def renumber_and_segment(
547558
df, renumber_map.renumbered_dst_col_name, dst_col_names,
548559
drop=True, preserve_order=preserve_order
549560
)
561+
elif renumber_type == 'skip_renumbering':
562+
# Update the renumbered source and destination column name
563+
# with the original input's source and destination name
564+
renumber_map.renumbered_src_col_name = src_col_names[0]
565+
renumber_map.renumbered_dst_col_name = dst_col_names[0]
566+
550567
else:
551568
df = df.rename(
552569
columns={src_col_names[0]:
@@ -562,69 +579,77 @@ def renumber_and_segment(
562579
is_mnmg = False
563580

564581
if is_mnmg:
565-
client = default_client()
566-
data = get_distributed_data(df)
567-
result = [(client.submit(call_renumber,
568-
Comms.get_session_id(),
569-
wf[1],
570-
renumber_map.renumbered_src_col_name,
571-
renumber_map.renumbered_dst_col_name,
572-
num_edges,
573-
is_mnmg,
574-
store_transposed,
575-
workers=[wf[0]]), wf[0])
576-
for idx, wf in enumerate(data.worker_to_parts.items())]
577-
wait(result)
578-
579-
def get_renumber_map(id_type, data):
580-
return data[0].astype(id_type)
581-
582-
def get_segment_offsets(data):
583-
return data[1]
584-
585-
def get_renumbered_df(id_type, data):
586-
data[2][renumber_map.renumbered_src_col_name] = \
587-
data[2][renumber_map.renumbered_src_col_name]\
588-
.astype(id_type)
589-
data[2][renumber_map.renumbered_dst_col_name] = \
590-
data[2][renumber_map.renumbered_dst_col_name]\
591-
.astype(id_type)
592-
return data[2]
593-
594-
renumbering_map = dask_cudf.from_delayed(
595-
[client.submit(get_renumber_map,
596-
id_type,
597-
data,
598-
workers=[wf])
599-
for (data, wf) in result])
600-
601-
list_of_segment_offsets = client.gather(
602-
[client.submit(get_segment_offsets,
603-
data,
604-
workers=[wf])
605-
for (data, wf) in result])
606-
aggregate_segment_offsets = []
607-
for segment_offsets in list_of_segment_offsets:
608-
aggregate_segment_offsets.extend(segment_offsets)
609-
610-
renumbered_df = dask_cudf.from_delayed(
611-
[client.submit(get_renumbered_df,
612-
id_type,
613-
data,
614-
workers=[wf])
615-
for (data, wf) in result])
616-
if renumber_type == 'legacy':
617-
renumber_map.implementation.ddf = indirection_map.merge(
618-
renumbering_map,
619-
right_on='original_ids', left_on='global_id',
620-
how='right').\
621-
drop(columns=['global_id', 'original_ids'])\
622-
.rename(columns={'new_ids': 'global_id'})
582+
# Do not renumber the algos following the C/Pylibcugraph path
583+
if renumber_type in ['legacy', 'experimental']:
584+
client = default_client()
585+
data = get_distributed_data(df)
586+
result = [(client.submit(call_renumber,
587+
Comms.get_session_id(),
588+
wf[1],
589+
renumber_map.renumbered_src_col_name,
590+
renumber_map.renumbered_dst_col_name,
591+
num_edges,
592+
is_mnmg,
593+
store_transposed,
594+
workers=[wf[0]]), wf[0])
595+
for idx, wf in enumerate(
596+
data.worker_to_parts.items())]
597+
wait(result)
598+
599+
def get_renumber_map(id_type, data):
600+
return data[0].astype(id_type)
601+
602+
def get_segment_offsets(data):
603+
return data[1]
604+
605+
def get_renumbered_df(id_type, data):
606+
data[2][renumber_map.renumbered_src_col_name] = \
607+
data[2][renumber_map.renumbered_src_col_name]\
608+
.astype(id_type)
609+
data[2][renumber_map.renumbered_dst_col_name] = \
610+
data[2][renumber_map.renumbered_dst_col_name]\
611+
.astype(id_type)
612+
return data[2]
613+
614+
renumbering_map = dask_cudf.from_delayed(
615+
[client.submit(get_renumber_map,
616+
id_type,
617+
data,
618+
workers=[wf])
619+
for (data, wf) in result])
620+
621+
list_of_segment_offsets = client.gather(
622+
[client.submit(get_segment_offsets,
623+
data,
624+
workers=[wf])
625+
for (data, wf) in result])
626+
aggregate_segment_offsets = []
627+
for segment_offsets in list_of_segment_offsets:
628+
aggregate_segment_offsets.extend(segment_offsets)
629+
630+
renumbered_df = dask_cudf.from_delayed(
631+
[client.submit(get_renumbered_df,
632+
id_type,
633+
data,
634+
workers=[wf])
635+
for (data, wf) in result])
636+
if renumber_type == 'legacy':
637+
renumber_map.implementation.ddf = indirection_map.merge(
638+
renumbering_map,
639+
right_on='original_ids', left_on='global_id',
640+
how='right').\
641+
drop(columns=['global_id', 'original_ids'])\
642+
.rename(columns={'new_ids': 'global_id'})
643+
else:
644+
renumber_map.implementation.ddf = renumbering_map.rename(
645+
columns={'original_ids': '0', 'new_ids': 'global_id'})
646+
renumber_map.implementation.numbered = True
647+
return renumbered_df, renumber_map, aggregate_segment_offsets
648+
623649
else:
624-
renumber_map.implementation.ddf = renumbering_map.rename(
625-
columns={'original_ids': '0', 'new_ids': 'global_id'})
626-
renumber_map.implementation.numbered = True
627-
return renumbered_df, renumber_map, aggregate_segment_offsets
650+
# There is no aggregate_segment_offsets since the
651+
# C++ renumbering is skipped
652+
return df, renumber_map, None
628653

629654
else:
630655
renumbering_map, segment_offsets, renumbered_df = \

0 commit comments

Comments
 (0)