From a25409b08b897a88da619354883c4d06f3030bcb Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 31 Jan 2022 15:40:25 -0500 Subject: [PATCH 01/14] Add RMM dashboard --- distributed/dashboard/components/rmm.py | 170 ++++++++++++++++++++++++ distributed/dashboard/scheduler.py | 2 + distributed/diagnostics/rmm.py | 29 ++++ distributed/worker.py | 12 +- 4 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 distributed/dashboard/components/rmm.py create mode 100644 distributed/diagnostics/rmm.py diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py new file mode 100644 index 00000000000..d0821611b3c --- /dev/null +++ b/distributed/dashboard/components/rmm.py @@ -0,0 +1,170 @@ +import math + +from bokeh.core.properties import without_property_validation +from bokeh.models import ( + BasicTicker, + ColumnDataSource, + HoverTool, + NumeralTickFormatter, + OpenURL, + TapTool, +) +from bokeh.plotting import figure +from tornado import escape + +from dask.utils import format_bytes + +from distributed.dashboard.components import DashboardComponent, add_periodic_callback +from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024, env +from distributed.dashboard.utils import update +from distributed.utils import log_errors + + +class RMMMemoryUsage(DashboardComponent): + """ + GPU memory usage plot that includes information about memory + managed by RMM. If an RMM pool is being used, shows the amount of + pool memory utilized. + """ + def __init__(self, scheduler, width=600, **kwargs): + with log_errors(): + self.last = 0 + self.scheduler = scheduler + self.source = ColumnDataSource( + { + "rmm-used": [1, 2], + "rmm-used-half": [0.5, 1], + "rmm-total": [2, 4], + "rmm-total-half": [1, 2], + "external-used": [2, 1], + "external-used-x": [3, 4.5], + "worker": ["a", "b"], + "gpu-index": [0, 0], + "y": [1, 2], + "escaped_worker": ["a", "b"], + } + ) + + memory = figure( + title="RMM Memory", + tools="", + id="bk-rmm-memory-worker-plot", + width=int(width / 2), + name="rmm_memory_histogram", + **kwargs, + ) + + rect = memory.rect( + source=self.source, + x="rmm-used-half", + y="y", + width="rmm-used", + height=1, + color="#7401FF", + ) + rect.nonselection_glyph = None + + rect = memory.rect( + source=self.source, + x="rmm-total-half", + y="y", + width="rmm-total", + height=1, + color="#7401FF", + alpha=0.5, + ) + rect.nonselection_glyph = None + + rect = memory.rect( + source=self.source, + x="external-used-x", + y="y", + width="external-used", + height=1, + color="#76B900", + ) + rect.nonselection_glyph = None + + memory.axis[0].ticker = BasicTicker(**TICKS_1024) + memory.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b") + memory.xaxis.major_label_orientation = -math.pi / 12 + memory.x_range.start = 0 + + for fig in [memory]: + fig.xaxis.minor_tick_line_alpha = 0 + fig.yaxis.visible = False + fig.ygrid.visible = False + + tap = TapTool( + callback=OpenURL(url="./info/worker/@escaped_worker.html") + ) + fig.add_tools(tap) + + fig.toolbar_location = None + fig.yaxis.visible = False + + hover = HoverTool() + hover.tooltips = "@worker : @memory_text" + hover.point_policy = "follow_mouse" + memory.add_tools(hover) + + self.memory_figure = memory + + @without_property_validation + def update(self): + with log_errors(): + workers = list(self.scheduler.workers.values()) + rmm_total = [] + rmm_used = [] + external_used = [] + gpu_index = [] + y = [] + worker = [] + external_used_x = [] + memory_max = 0 + memory_total = 0 + + for idx, ws in enumerate(workers): + rmm_metrics = ws.metrics["rmm"] + gpu_extra = ws.extra["gpu"] + gpu_metrics = ws.metrics["gpu"] + + rmm_total_worker = rmm_metrics["rmm-total"] + rmm_used_worker = rmm_metrics["rmm-used"] # RMMM memory only + gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory + external_used_worker = gpu_used_worker - rmm_total_worker + + rmm_total.append(rmm_total_worker) + rmm_used.append(rmm_used_worker) + external_used.append(external_used_worker) + external_used_x.append(rmm_total_worker + external_used_worker / 2) + worker.append(ws.address) + gpu_index.append(idx) + y.append(idx) + + memory_max = max(memory_max, gpu_used_worker) + memory_total = max(memory_total, gpu_extra["memory-total"]) + + result = { + "rmm-total": rmm_total, + "rmm-used": rmm_used, + "external-used": external_used, + "rmm-total-half": [m // 2 for m in rmm_total], + "rmm-used-half": [m // 2 for m in rmm_used], + "external-used-x": external_used_x, + "worker": worker, + "gpu-index": gpu_index, + "y": y, + "escaped_worker": [escape.url_escape(w) for w in worker] + } + self.memory_figure.x_range.end = memory_total + update(self.source, result) + + +def rmm_memory_doc(scheduler, extra, doc): + with log_errors(): + rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both") + rmm_load.update() + add_periodic_callback(doc, rmm_load, 100) + doc.add_root(rmm_load.memory_figure) + doc.theme = BOKEH_THEME diff --git a/distributed/dashboard/scheduler.py b/distributed/dashboard/scheduler.py index 98791acc9bc..a1461e6c82b 100644 --- a/distributed/dashboard/scheduler.py +++ b/distributed/dashboard/scheduler.py @@ -5,6 +5,7 @@ from .components.nvml import gpu_doc # noqa: 1708 from .components.nvml import NVML_ENABLED, gpu_memory_doc, gpu_utilization_doc +from .components.rmm import rmm_memory_doc from .components.scheduler import ( AggregateAction, BandwidthTypes, @@ -97,6 +98,7 @@ "/individual-profile-server": individual_profile_server_doc, "/individual-gpu-memory": gpu_memory_doc, "/individual-gpu-utilization": gpu_utilization_doc, + "/individual-rmm-memory": rmm_memory_doc } diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py new file mode 100644 index 00000000000..b9f001976d5 --- /dev/null +++ b/distributed/diagnostics/rmm.py @@ -0,0 +1,29 @@ +def _get_pool_size_for_mr(mr): + import rmm + + if not isinstance(mr, rmm.mr.PoolMemoryResource): + if hasattr(mr, "upstream_mr"): + return _get_pool_size_for_mr(mr.upstream_mr) + else: + return 0 + else: + pool_size = mr.pool_size() + return pool_size + +def real_time(): + import rmm + + mr = rmm.mr.get_current_device_resource() + rmm_pool_size = _get_pool_size_for_mr(mr) + + rmm_used = ( + mr.get_allocated_bytes() + if isinstance(mr, rmm.mr.TrackingResourceAdaptor) + else 0 + ) + + rmm_total = max(rmm_pool_size, rmm_used) + return { + "rmm-used": rmm_used, + "rmm-total": rmm_total + } diff --git a/distributed/worker.py b/distributed/worker.py index 25148958f9d..7c97cc5f259 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -57,7 +57,7 @@ pingpong, send_recv, ) -from .diagnostics import nvml +from .diagnostics import nvml, rmm from .diagnostics.plugin import _get_plugin_name from .diskutils import WorkDir, WorkSpace from .http import get_handlers @@ -4686,6 +4686,16 @@ def gpu_startup(worker): DEFAULT_STARTUP_INFORMATION["gpu"] = gpu_startup +try: + import rmm as _rmm +except (Exception, RuntimeError): + pass +else: + async def rmm_metric(worker): + result = await offload(rmm.real_time) + return result + DEFAULT_METRICS["rmm"] = rmm_metric + def print(*args, **kwargs): """Dask print function From 608169d73458d4c2268bd1e424ffd4e0bf07c485 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 3 Feb 2022 16:45:14 -0500 Subject: [PATCH 02/14] Change isinstance->hasattr --- distributed/diagnostics/rmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index b9f001976d5..92341621d57 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -18,7 +18,7 @@ def real_time(): rmm_used = ( mr.get_allocated_bytes() - if isinstance(mr, rmm.mr.TrackingResourceAdaptor) + if hasattr(mr, "get_allocated_bytes") else 0 ) From 25145660af5bb167b54a4a0425ada02e2011c058 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 3 Feb 2022 17:30:54 -0500 Subject: [PATCH 03/14] Changes --- distributed/diagnostics/rmm.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index 92341621d57..77d5b1ec11e 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -10,18 +10,23 @@ def _get_pool_size_for_mr(mr): pool_size = mr.pool_size() return pool_size +def _get_allocated_bytes_for_mr(mr): + import rmm_async + + if not hasattr(mr, "get_allocated_bytes"): + if hasattr(mr, "upstream_mr"): + return _get_allocated_bytes_for_mr(mr.upstream_mr) + else: + return 0 + else: + return mr.get_allocated_bytes() + def real_time(): import rmm mr = rmm.mr.get_current_device_resource() rmm_pool_size = _get_pool_size_for_mr(mr) - - rmm_used = ( - mr.get_allocated_bytes() - if hasattr(mr, "get_allocated_bytes") - else 0 - ) - + rmm_used = _get_allocated_bytes_for_mr(mr) rmm_total = max(rmm_pool_size, rmm_used) return { "rmm-used": rmm_used, From 6b5ea40bfdec0dbe5b4f443bf8801bab03def4af Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 3 Feb 2022 17:32:26 -0500 Subject: [PATCH 04/14] rmm_async -> rmm --- distributed/diagnostics/rmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index 77d5b1ec11e..71cebc735ac 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -11,7 +11,7 @@ def _get_pool_size_for_mr(mr): return pool_size def _get_allocated_bytes_for_mr(mr): - import rmm_async + import rmm if not hasattr(mr, "get_allocated_bytes"): if hasattr(mr, "upstream_mr"): From a2b4f27c02f6b51190c2bc30481c52f6a6274c45 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Fri, 11 Feb 2022 12:15:17 -0500 Subject: [PATCH 05/14] Small fixes --- distributed/diagnostics/rmm.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index 71cebc735ac..66652f435c8 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -1,4 +1,10 @@ -def _get_pool_size_for_mr(mr): +""" +Diagnostics for GPU memory managed by RMM (RAPIDS memory manager). +""" + +def _get_pool_size(mr): + # if the memory resource or any of its upstreams + # is a `PoolMemoryResource`, get its pool size import rmm if not isinstance(mr, rmm.mr.PoolMemoryResource): @@ -10,7 +16,7 @@ def _get_pool_size_for_mr(mr): pool_size = mr.pool_size() return pool_size -def _get_allocated_bytes_for_mr(mr): +def _get_allocated_bytes(mr): import rmm if not hasattr(mr, "get_allocated_bytes"): From ba241abf4a861fca590f0f6d87a812dffec7cd9d Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 14 Feb 2022 14:13:09 -0500 Subject: [PATCH 06/14] Style --- distributed/dashboard/components/rmm.py | 3 ++- distributed/dashboard/scheduler.py | 2 +- distributed/diagnostics/rmm.py | 8 ++++---- distributed/worker.py | 2 ++ 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index d0821611b3c..e6e1df2cb49 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -26,6 +26,7 @@ class RMMMemoryUsage(DashboardComponent): managed by RMM. If an RMM pool is being used, shows the amount of pool memory utilized. """ + def __init__(self, scheduler, width=600, **kwargs): with log_errors(): self.last = 0 @@ -155,7 +156,7 @@ def update(self): "worker": worker, "gpu-index": gpu_index, "y": y, - "escaped_worker": [escape.url_escape(w) for w in worker] + "escaped_worker": [escape.url_escape(w) for w in worker], } self.memory_figure.x_range.end = memory_total update(self.source, result) diff --git a/distributed/dashboard/scheduler.py b/distributed/dashboard/scheduler.py index a1461e6c82b..72637f445d7 100644 --- a/distributed/dashboard/scheduler.py +++ b/distributed/dashboard/scheduler.py @@ -98,7 +98,7 @@ "/individual-profile-server": individual_profile_server_doc, "/individual-gpu-memory": gpu_memory_doc, "/individual-gpu-utilization": gpu_utilization_doc, - "/individual-rmm-memory": rmm_memory_doc + "/individual-rmm-memory": rmm_memory_doc, } diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index 66652f435c8..4ee27d9e983 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -2,6 +2,7 @@ Diagnostics for GPU memory managed by RMM (RAPIDS memory manager). """ + def _get_pool_size(mr): # if the memory resource or any of its upstreams # is a `PoolMemoryResource`, get its pool size @@ -16,6 +17,7 @@ def _get_pool_size(mr): pool_size = mr.pool_size() return pool_size + def _get_allocated_bytes(mr): import rmm @@ -27,6 +29,7 @@ def _get_allocated_bytes(mr): else: return mr.get_allocated_bytes() + def real_time(): import rmm @@ -34,7 +37,4 @@ def real_time(): rmm_pool_size = _get_pool_size_for_mr(mr) rmm_used = _get_allocated_bytes_for_mr(mr) rmm_total = max(rmm_pool_size, rmm_used) - return { - "rmm-used": rmm_used, - "rmm-total": rmm_total - } + return {"rmm-used": rmm_used, "rmm-total": rmm_total} diff --git a/distributed/worker.py b/distributed/worker.py index 7c97cc5f259..73bafccd5b5 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -4691,9 +4691,11 @@ def gpu_startup(worker): except (Exception, RuntimeError): pass else: + async def rmm_metric(worker): result = await offload(rmm.real_time) return result + DEFAULT_METRICS["rmm"] = rmm_metric From fc697495679272f4f247b4c50a3aad30f8efe105 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 14 Feb 2022 14:21:00 -0500 Subject: [PATCH 07/14] More style --- distributed/dashboard/components/rmm.py | 4 +--- distributed/diagnostics/rmm.py | 10 ++++------ distributed/worker.py | 1 + 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index e6e1df2cb49..cd0349ed4d4 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -12,10 +12,8 @@ from bokeh.plotting import figure from tornado import escape -from dask.utils import format_bytes - from distributed.dashboard.components import DashboardComponent, add_periodic_callback -from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024, env +from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024 from distributed.dashboard.utils import update from distributed.utils import log_errors diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index 4ee27d9e983..294cf9b0f47 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -10,7 +10,7 @@ def _get_pool_size(mr): if not isinstance(mr, rmm.mr.PoolMemoryResource): if hasattr(mr, "upstream_mr"): - return _get_pool_size_for_mr(mr.upstream_mr) + return _get_pool_size(mr.upstream_mr) else: return 0 else: @@ -19,11 +19,9 @@ def _get_pool_size(mr): def _get_allocated_bytes(mr): - import rmm - if not hasattr(mr, "get_allocated_bytes"): if hasattr(mr, "upstream_mr"): - return _get_allocated_bytes_for_mr(mr.upstream_mr) + return _get_allocated_bytes(mr.upstream_mr) else: return 0 else: @@ -34,7 +32,7 @@ def real_time(): import rmm mr = rmm.mr.get_current_device_resource() - rmm_pool_size = _get_pool_size_for_mr(mr) - rmm_used = _get_allocated_bytes_for_mr(mr) + rmm_pool_size = _get_pool_size(mr) + rmm_used = _get_allocated_bytes(mr) rmm_total = max(rmm_pool_size, rmm_used) return {"rmm-used": rmm_used, "rmm-total": rmm_total} diff --git a/distributed/worker.py b/distributed/worker.py index 73bafccd5b5..124e2bc861d 100644 --- a/distributed/worker.py +++ b/distributed/worker.py @@ -4697,6 +4697,7 @@ async def rmm_metric(worker): return result DEFAULT_METRICS["rmm"] = rmm_metric + del _rmm def print(*args, **kwargs): From 2d040dcf0309067201f64f8b0df6626075094f88 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 15 Feb 2022 14:04:05 -0500 Subject: [PATCH 08/14] Protect against missing RMM --- distributed/dashboard/components/rmm.py | 10 ++++++---- distributed/diagnostics/rmm.py | 11 +++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index cd0349ed4d4..fb6b9133cbf 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -124,10 +124,12 @@ def update(self): memory_total = 0 for idx, ws in enumerate(workers): - rmm_metrics = ws.metrics["rmm"] - gpu_extra = ws.extra["gpu"] - gpu_metrics = ws.metrics["gpu"] - + try: + rmm_metrics = ws.metrics["rmm"] + gpu_extra = ws.extra["gpu"] + gpu_metrics = ws.metrics["gpu"] + except KeyError: + continue rmm_total_worker = rmm_metrics["rmm-total"] rmm_used_worker = rmm_metrics["rmm-used"] # RMMM memory only gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py index 294cf9b0f47..603924fab72 100644 --- a/distributed/diagnostics/rmm.py +++ b/distributed/diagnostics/rmm.py @@ -2,12 +2,15 @@ Diagnostics for GPU memory managed by RMM (RAPIDS memory manager). """ +try: + import rmm +except ImportError: + rmm = None + def _get_pool_size(mr): # if the memory resource or any of its upstreams # is a `PoolMemoryResource`, get its pool size - import rmm - if not isinstance(mr, rmm.mr.PoolMemoryResource): if hasattr(mr, "upstream_mr"): return _get_pool_size(mr.upstream_mr) @@ -29,8 +32,8 @@ def _get_allocated_bytes(mr): def real_time(): - import rmm - + if rmm is None: + return {"rmm-used": None, "rmm-total": None} mr = rmm.mr.get_current_device_resource() rmm_pool_size = _get_pool_size(mr) rmm_used = _get_allocated_bytes(mr) From 8db3ceb2f3441f9d299833364bd39d30a7687491 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Tue, 15 Feb 2022 17:25:07 -0500 Subject: [PATCH 09/14] Report GPU and RMM memory used in plot --- distributed/dashboard/components/rmm.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index fb6b9133cbf..1e1a5e8f2ed 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -1,4 +1,5 @@ import math +from textwrap import dedent from bokeh.core.properties import without_property_validation from bokeh.models import ( @@ -12,6 +13,8 @@ from bokeh.plotting import figure from tornado import escape +from dask.utils import format_bytes + from distributed.dashboard.components import DashboardComponent, add_periodic_callback from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024 from distributed.dashboard.utils import update @@ -144,7 +147,7 @@ def update(self): y.append(idx) memory_max = max(memory_max, gpu_used_worker) - memory_total = max(memory_total, gpu_extra["memory-total"]) + memory_total += gpu_extra["memory-total"] result = { "rmm-total": rmm_total, @@ -158,7 +161,19 @@ def update(self): "y": y, "escaped_worker": [escape.url_escape(w) for w in worker], } - self.memory_figure.x_range.end = memory_total + self.memory_figure.x_range.end = memory_max + + self.memory_figure.title.text = dedent( + """\ + RMM Utilization: {} / {}\n + GPU Memory: {} / {} + """.format( + format_bytes(sum(rmm_used)), + format_bytes(sum(rmm_total)), + format_bytes(sum([*rmm_total, *external_used])), + format_bytes(memory_total), + ) + ) update(self.source, result) From ccfff41aea4590975111473af3061594661d686e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Thu, 17 Feb 2022 13:28:04 -0500 Subject: [PATCH 10/14] Fix hover --- distributed/dashboard/components/rmm.py | 57 ++++++++++++++++--------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py index 1e1a5e8f2ed..971bda424f7 100644 --- a/distributed/dashboard/components/rmm.py +++ b/distributed/dashboard/components/rmm.py @@ -44,6 +44,10 @@ def __init__(self, scheduler, width=600, **kwargs): "gpu-index": [0, 0], "y": [1, 2], "escaped_worker": ["a", "b"], + "rmm_memory_text": [ + "RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B", + "RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B", + ], } ) @@ -106,7 +110,7 @@ def __init__(self, scheduler, width=600, **kwargs): fig.yaxis.visible = False hover = HoverTool() - hover.tooltips = "@worker : @memory_text" + hover.tooltips = "@worker : @rmm_memory_text" hover.point_policy = "follow_mouse" memory.add_tools(hover) @@ -124,30 +128,54 @@ def update(self): worker = [] external_used_x = [] memory_max = 0 - memory_total = 0 + gpu_total = [] + rmm_memory_text = [] for idx, ws in enumerate(workers): try: rmm_metrics = ws.metrics["rmm"] - gpu_extra = ws.extra["gpu"] gpu_metrics = ws.metrics["gpu"] + gpu_info = ws.extra["gpu"] except KeyError: continue - rmm_total_worker = rmm_metrics["rmm-total"] - rmm_used_worker = rmm_metrics["rmm-used"] # RMMM memory only - gpu_used_worker = gpu_metrics["memory-used"] # All GPU memory + rmm_total_worker = rmm_metrics["rmm-total"] # RMM memory only + rmm_used_worker = rmm_metrics["rmm-used"] + gpu_total_worker = gpu_info["memory-total"] # All GPU memory + gpu_used_worker = gpu_metrics["memory-used"] + external_used_worker = gpu_used_worker - rmm_total_worker rmm_total.append(rmm_total_worker) rmm_used.append(rmm_used_worker) + gpu_total.append(gpu_total_worker) external_used.append(external_used_worker) external_used_x.append(rmm_total_worker + external_used_worker / 2) worker.append(ws.address) gpu_index.append(idx) y.append(idx) - memory_max = max(memory_max, gpu_used_worker) - memory_total += gpu_extra["memory-total"] + memory_max = max(memory_max, gpu_total_worker) + + rmm_memory_text.append( + "RMM memory used: {}/{}\nTotal GPU memory used: {}/{}".format( + format_bytes(rmm_used_worker), + format_bytes(rmm_total_worker), + format_bytes(gpu_used_worker), + format_bytes(gpu_total_worker), + ) + ) + + self.memory_figure.title.text = dedent( + """\ + RMM Utilization: {} / {}\n + GPU Memory: {} / {} + """.format( + format_bytes(sum(rmm_used)), + format_bytes(sum(rmm_total)), + format_bytes(sum([*rmm_total, *external_used])), + format_bytes(sum(gpu_total)), + ) + ) result = { "rmm-total": rmm_total, @@ -160,20 +188,11 @@ def update(self): "gpu-index": gpu_index, "y": y, "escaped_worker": [escape.url_escape(w) for w in worker], + "rmm_memory_text": rmm_memory_text, } + self.memory_figure.x_range.end = memory_max - self.memory_figure.title.text = dedent( - """\ - RMM Utilization: {} / {}\n - GPU Memory: {} / {} - """.format( - format_bytes(sum(rmm_used)), - format_bytes(sum(rmm_total)), - format_bytes(sum([*rmm_total, *external_used])), - format_bytes(memory_total), - ) - ) update(self.source, result) From 597f6bc89bc0e20b6ce9f3e5c2320addeffd3a36 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 2 Mar 2022 14:16:53 -0500 Subject: [PATCH 11/14] Add a test for RMM dashboard --- distributed/diagnostics/tests/test_rmm.py | 28 +++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 distributed/diagnostics/tests/test_rmm.py diff --git a/distributed/diagnostics/tests/test_rmm.py b/distributed/diagnostics/tests/test_rmm.py new file mode 100644 index 00000000000..b39a79fd2b4 --- /dev/null +++ b/distributed/diagnostics/tests/test_rmm.py @@ -0,0 +1,28 @@ +import asyncio + +import pytest + +from dask import delayed +from distributed.utils_test import gen_cluster + + +pytestmark = pytest.mark.gpu + +dask_cuda = pytest.importorskip("dask_cuda") +rmm = pytest.importorskip("rmm") +pynvml = pytest.importorskip("pynvml") + +@gen_cluster( + client=True, + nthreads=[("127.0.0.1", 1)], + Worker=dask_cuda.CUDAWorker, + worker_kwargs={"rmm_track_allocations": True} +) +async def test_rmm_metrics(c, s, *workers): + w = list(s.workers.values())[0] + assert "rmm" in w.metrics + assert w.metrics["rmm"]["rmm-used"] == 0 + result = delayed(rmm.DeviceBuffer)(size=10) + result = result.persist() + await asyncio.sleep(1) + assert w.metrics["rmm"]["rmm-used"] != 0 From baa450ee38c905cf0816f825698d49e7ef32e986 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 7 Mar 2022 09:21:45 -0500 Subject: [PATCH 12/14] Add to test --- distributed/diagnostics/tests/test_rmm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/distributed/diagnostics/tests/test_rmm.py b/distributed/diagnostics/tests/test_rmm.py index b39a79fd2b4..43a3c85fb27 100644 --- a/distributed/diagnostics/tests/test_rmm.py +++ b/distributed/diagnostics/tests/test_rmm.py @@ -3,6 +3,7 @@ import pytest from dask import delayed +from dask.utils import parse_bytes from distributed.utils_test import gen_cluster @@ -16,13 +17,18 @@ client=True, nthreads=[("127.0.0.1", 1)], Worker=dask_cuda.CUDAWorker, - worker_kwargs={"rmm_track_allocations": True} + worker_kwargs={ + "rmm_pool_size": parse_bytes("10MiB"), + "rmm_track_allocations": True + } ) async def test_rmm_metrics(c, s, *workers): w = list(s.workers.values())[0] assert "rmm" in w.metrics assert w.metrics["rmm"]["rmm-used"] == 0 + assert w.metrics["rmm"]["rmm-total"] == parse_bytes("10MiB") result = delayed(rmm.DeviceBuffer)(size=10) result = result.persist() await asyncio.sleep(1) assert w.metrics["rmm"]["rmm-used"] != 0 + assert w.metrics["rmm"]["rmm-total"] == parse_bytes("10MiB") From 6d44aacc2c041c76f3ecf69f6b5933f94839bc5e Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Mon, 7 Mar 2022 09:59:48 -0500 Subject: [PATCH 13/14] Rename --- .../tests/{test_rmm.py => test_rmm_diagnostics.py} | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) rename distributed/diagnostics/tests/{test_rmm.py => test_rmm_diagnostics.py} (89%) diff --git a/distributed/diagnostics/tests/test_rmm.py b/distributed/diagnostics/tests/test_rmm_diagnostics.py similarity index 89% rename from distributed/diagnostics/tests/test_rmm.py rename to distributed/diagnostics/tests/test_rmm_diagnostics.py index 43a3c85fb27..b0458149085 100644 --- a/distributed/diagnostics/tests/test_rmm.py +++ b/distributed/diagnostics/tests/test_rmm_diagnostics.py @@ -4,8 +4,8 @@ from dask import delayed from dask.utils import parse_bytes -from distributed.utils_test import gen_cluster +from distributed.utils_test import gen_cluster pytestmark = pytest.mark.gpu @@ -13,14 +13,15 @@ rmm = pytest.importorskip("rmm") pynvml = pytest.importorskip("pynvml") + @gen_cluster( client=True, nthreads=[("127.0.0.1", 1)], Worker=dask_cuda.CUDAWorker, worker_kwargs={ "rmm_pool_size": parse_bytes("10MiB"), - "rmm_track_allocations": True - } + "rmm_track_allocations": True, + }, ) async def test_rmm_metrics(c, s, *workers): w = list(s.workers.values())[0] From 34a3cc7e3825161ec8e888a415456ca16600a893 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath Date: Wed, 16 Mar 2022 11:26:37 -0400 Subject: [PATCH 14/14] Replace Rmm with RMM in the HTML template --- distributed/http/templates/base.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/http/templates/base.html b/distributed/http/templates/base.html index 73e252f08e1..fb90cbfdd0b 100644 --- a/distributed/http/templates/base.html +++ b/distributed/http/templates/base.html @@ -40,7 +40,7 @@