From a25409b08b897a88da619354883c4d06f3030bcb Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 31 Jan 2022 15:40:25 -0500
Subject: [PATCH 01/14] Add RMM dashboard

---
 distributed/dashboard/components/rmm.py | 170 ++++++++++++++++++++++++
 distributed/dashboard/scheduler.py      |   2 +
 distributed/diagnostics/rmm.py          |  29 ++++
 distributed/worker.py                   |  12 +-
 4 files changed, 212 insertions(+), 1 deletion(-)
 create mode 100644 distributed/dashboard/components/rmm.py
 create mode 100644 distributed/diagnostics/rmm.py

diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py
new file mode 100644
index 00000000000..d0821611b3c
--- /dev/null
+++ b/distributed/dashboard/components/rmm.py
@@ -0,0 +1,170 @@
+import math
+
+from bokeh.core.properties import without_property_validation
+from bokeh.models import (
+    BasicTicker,
+    ColumnDataSource,
+    HoverTool,
+    NumeralTickFormatter,
+    OpenURL,
+    TapTool,
+)
+from bokeh.plotting import figure
+from tornado import escape
+
+from dask.utils import format_bytes
+
+from distributed.dashboard.components import DashboardComponent, add_periodic_callback
+from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024, env
+from distributed.dashboard.utils import update
+from distributed.utils import log_errors
+
+
+class RMMMemoryUsage(DashboardComponent):
+    """
+    GPU memory usage plot that includes information about memory
+    managed by RMM. If an RMM pool is being used, shows the amount of
+    pool memory utilized.
+    """
+    def __init__(self, scheduler, width=600, **kwargs):
+        with log_errors():
+            self.last = 0
+            self.scheduler = scheduler
+            self.source = ColumnDataSource(
+                {
+                    "rmm-used": [1, 2],
+                    "rmm-used-half": [0.5, 1],
+                    "rmm-total": [2, 4],
+                    "rmm-total-half": [1, 2],
+                    "external-used": [2, 1],
+                    "external-used-x": [3, 4.5],
+                    "worker": ["a", "b"],
+                    "gpu-index": [0, 0],
+                    "y": [1, 2],
+                    "escaped_worker": ["a", "b"],
+                }
+            )
+
+            memory = figure(
+                title="RMM Memory",
+                tools="",
+                id="bk-rmm-memory-worker-plot",
+                width=int(width / 2),
+                name="rmm_memory_histogram",
+                **kwargs,
+            )
+
+            rect = memory.rect(
+                source=self.source,
+                x="rmm-used-half",
+                y="y",
+                width="rmm-used",
+                height=1,
+                color="#7401FF",
+            )
+            rect.nonselection_glyph = None
+
+            rect = memory.rect(
+                source=self.source,
+                x="rmm-total-half",
+                y="y",
+                width="rmm-total",
+                height=1,
+                color="#7401FF",
+                alpha=0.5,
+            )
+            rect.nonselection_glyph = None
+
+            rect = memory.rect(
+                source=self.source,
+                x="external-used-x",
+                y="y",
+                width="external-used",
+                height=1,
+                color="#76B900",
+            )
+            rect.nonselection_glyph = None
+
+            memory.axis[0].ticker = BasicTicker(**TICKS_1024)
+            memory.xaxis[0].formatter = NumeralTickFormatter(format="0.0 b")
+            memory.xaxis.major_label_orientation = -math.pi / 12
+            memory.x_range.start = 0
+
+            for fig in [memory]:
+                fig.xaxis.minor_tick_line_alpha = 0
+                fig.yaxis.visible = False
+                fig.ygrid.visible = False
+
+                tap = TapTool(
+                    callback=OpenURL(url="./info/worker/@escaped_worker.html")
+                )
+                fig.add_tools(tap)
+
+                fig.toolbar_location = None
+                fig.yaxis.visible = False
+
+            hover = HoverTool()
+            hover.tooltips = "@worker : @memory_text"
+            hover.point_policy = "follow_mouse"
+            memory.add_tools(hover)
+
+            self.memory_figure = memory
+
+    @without_property_validation
+    def update(self):
+        with log_errors():
+            workers = list(self.scheduler.workers.values())
+            rmm_total = []
+            rmm_used = []
+            external_used = []
+            gpu_index = []
+            y = []
+            worker = []
+            external_used_x = []
+            memory_max = 0
+            memory_total = 0
+
+            for idx, ws in enumerate(workers):
+                rmm_metrics = ws.metrics["rmm"]
+                gpu_extra = ws.extra["gpu"]
+                gpu_metrics = ws.metrics["gpu"]
+
+                rmm_total_worker = rmm_metrics["rmm-total"]
+                rmm_used_worker = rmm_metrics["rmm-used"]  # RMMM memory only
+                gpu_used_worker = gpu_metrics["memory-used"]  # All GPU memory
+                external_used_worker = gpu_used_worker - rmm_total_worker
+
+                rmm_total.append(rmm_total_worker)
+                rmm_used.append(rmm_used_worker)
+                external_used.append(external_used_worker)
+                external_used_x.append(rmm_total_worker + external_used_worker / 2)
+                worker.append(ws.address)
+                gpu_index.append(idx)
+                y.append(idx)
+
+                memory_max = max(memory_max, gpu_used_worker)
+                memory_total = max(memory_total, gpu_extra["memory-total"])
+
+            result = {
+                "rmm-total": rmm_total,
+                "rmm-used": rmm_used,
+                "external-used": external_used,
+                "rmm-total-half": [m // 2 for m in rmm_total],
+                "rmm-used-half": [m // 2 for m in rmm_used],
+                "external-used-x": external_used_x,
+                "worker": worker,
+                "gpu-index": gpu_index,
+                "y": y,
+                "escaped_worker": [escape.url_escape(w) for w in worker]
+            }
+            self.memory_figure.x_range.end = memory_total
+            update(self.source, result)
+
+
+def rmm_memory_doc(scheduler, extra, doc):
+    with log_errors():
+        rmm_load = RMMMemoryUsage(scheduler, sizing_mode="stretch_both")
+        rmm_load.update()
+        add_periodic_callback(doc, rmm_load, 100)
+        doc.add_root(rmm_load.memory_figure)
+        doc.theme = BOKEH_THEME
diff --git a/distributed/dashboard/scheduler.py b/distributed/dashboard/scheduler.py
index 98791acc9bc..a1461e6c82b 100644
--- a/distributed/dashboard/scheduler.py
+++ b/distributed/dashboard/scheduler.py
@@ -5,6 +5,7 @@
 
 from .components.nvml import gpu_doc  # noqa: 1708
 from .components.nvml import NVML_ENABLED, gpu_memory_doc, gpu_utilization_doc
+from .components.rmm import rmm_memory_doc
 from .components.scheduler import (
     AggregateAction,
     BandwidthTypes,
@@ -97,6 +98,7 @@
     "/individual-profile-server": individual_profile_server_doc,
     "/individual-gpu-memory": gpu_memory_doc,
     "/individual-gpu-utilization": gpu_utilization_doc,
+    "/individual-rmm-memory": rmm_memory_doc
 }
 
 
diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
new file mode 100644
index 00000000000..b9f001976d5
--- /dev/null
+++ b/distributed/diagnostics/rmm.py
@@ -0,0 +1,29 @@
+def _get_pool_size_for_mr(mr):
+    import rmm
+
+    if not isinstance(mr, rmm.mr.PoolMemoryResource):
+        if hasattr(mr, "upstream_mr"):
+            return _get_pool_size_for_mr(mr.upstream_mr)
+        else:
+            return 0
+    else:
+        pool_size = mr.pool_size()
+        return pool_size
+
+def real_time():
+    import rmm
+
+    mr = rmm.mr.get_current_device_resource()
+    rmm_pool_size = _get_pool_size_for_mr(mr)
+
+    rmm_used = (
+        mr.get_allocated_bytes()
+        if isinstance(mr, rmm.mr.TrackingResourceAdaptor)
+        else 0
+    )
+
+    rmm_total = max(rmm_pool_size, rmm_used)
+    return {
+        "rmm-used": rmm_used,
+        "rmm-total": rmm_total
+    }
diff --git a/distributed/worker.py b/distributed/worker.py
index 25148958f9d..7c97cc5f259 100644
--- a/distributed/worker.py
+++ b/distributed/worker.py
@@ -57,7 +57,7 @@
     pingpong,
     send_recv,
 )
-from .diagnostics import nvml
+from .diagnostics import nvml, rmm
 from .diagnostics.plugin import _get_plugin_name
 from .diskutils import WorkDir, WorkSpace
 from .http import get_handlers
@@ -4686,6 +4686,16 @@ def gpu_startup(worker):
 
     DEFAULT_STARTUP_INFORMATION["gpu"] = gpu_startup
 
+try:
+    import rmm as _rmm
+except (Exception, RuntimeError):
+    pass
+else:
+    async def rmm_metric(worker):
+        result = await offload(rmm.real_time)
+        return result
+    DEFAULT_METRICS["rmm"] = rmm_metric
+
 
 def print(*args, **kwargs):
     """Dask print function

From 608169d73458d4c2268bd1e424ffd4e0bf07c485 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 3 Feb 2022 16:45:14 -0500
Subject: [PATCH 02/14] Change isinstance->hasattr

---
 distributed/diagnostics/rmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index b9f001976d5..92341621d57 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -18,7 +18,7 @@ def real_time():
 
     rmm_used = (
         mr.get_allocated_bytes()
-        if isinstance(mr, rmm.mr.TrackingResourceAdaptor)
+        if hasattr(mr, "get_allocated_bytes")
         else 0
     )
 

From 25145660af5bb167b54a4a0425ada02e2011c058 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 3 Feb 2022 17:30:54 -0500
Subject: [PATCH 03/14] Changes

---
 distributed/diagnostics/rmm.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index 92341621d57..77d5b1ec11e 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -10,18 +10,23 @@ def _get_pool_size_for_mr(mr):
         pool_size = mr.pool_size()
         return pool_size
 
+def _get_allocated_bytes_for_mr(mr):
+    import rmm_async
+
+    if not hasattr(mr, "get_allocated_bytes"):
+        if hasattr(mr, "upstream_mr"):
+            return _get_allocated_bytes_for_mr(mr.upstream_mr)
+        else:
+            return 0
+    else:
+        return mr.get_allocated_bytes()
+
 def real_time():
     import rmm
 
     mr = rmm.mr.get_current_device_resource()
     rmm_pool_size = _get_pool_size_for_mr(mr)
-
-    rmm_used = (
-        mr.get_allocated_bytes()
-        if hasattr(mr, "get_allocated_bytes")
-        else 0
-    )
-
+    rmm_used = _get_allocated_bytes_for_mr(mr)
     rmm_total = max(rmm_pool_size, rmm_used)
     return {
         "rmm-used": rmm_used,

From 6b5ea40bfdec0dbe5b4f443bf8801bab03def4af Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 3 Feb 2022 17:32:26 -0500
Subject: [PATCH 04/14] rmm_async -> rmm

---
 distributed/diagnostics/rmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index 77d5b1ec11e..71cebc735ac 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -11,7 +11,7 @@ def _get_pool_size_for_mr(mr):
         return pool_size
 
 def _get_allocated_bytes_for_mr(mr):
-    import rmm_async
+    import rmm
 
     if not hasattr(mr, "get_allocated_bytes"):
         if hasattr(mr, "upstream_mr"):

From a2b4f27c02f6b51190c2bc30481c52f6a6274c45 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Fri, 11 Feb 2022 12:15:17 -0500
Subject: [PATCH 05/14] Small fixes

---
 distributed/diagnostics/rmm.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index 71cebc735ac..66652f435c8 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -1,4 +1,10 @@
-def _get_pool_size_for_mr(mr):
+"""
+Diagnostics for GPU memory managed by RMM (RAPIDS memory manager).
+"""
+
+def _get_pool_size(mr):
+    # if the memory resource or any of its upstreams
+    # is a `PoolMemoryResource`, get its pool size
     import rmm
 
     if not isinstance(mr, rmm.mr.PoolMemoryResource):
@@ -10,7 +16,7 @@ def _get_pool_size_for_mr(mr):
         pool_size = mr.pool_size()
         return pool_size
 
-def _get_allocated_bytes_for_mr(mr):
+def _get_allocated_bytes(mr):
     import rmm
 
     if not hasattr(mr, "get_allocated_bytes"):

From ba241abf4a861fca590f0f6d87a812dffec7cd9d Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 14 Feb 2022 14:13:09 -0500
Subject: [PATCH 06/14] Style

---
 distributed/dashboard/components/rmm.py | 3 ++-
 distributed/dashboard/scheduler.py      | 2 +-
 distributed/diagnostics/rmm.py          | 8 ++++----
 distributed/worker.py                   | 2 ++
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py
index d0821611b3c..e6e1df2cb49 100644
--- a/distributed/dashboard/components/rmm.py
+++ b/distributed/dashboard/components/rmm.py
@@ -26,6 +26,7 @@ class RMMMemoryUsage(DashboardComponent):
     managed by RMM. If an RMM pool is being used, shows the amount of
     pool memory utilized.
     """
+
     def __init__(self, scheduler, width=600, **kwargs):
         with log_errors():
             self.last = 0
@@ -155,7 +156,7 @@ def update(self):
                 "worker": worker,
                 "gpu-index": gpu_index,
                 "y": y,
-                "escaped_worker": [escape.url_escape(w) for w in worker]
+                "escaped_worker": [escape.url_escape(w) for w in worker],
             }
             self.memory_figure.x_range.end = memory_total
             update(self.source, result)
diff --git a/distributed/dashboard/scheduler.py b/distributed/dashboard/scheduler.py
index a1461e6c82b..72637f445d7 100644
--- a/distributed/dashboard/scheduler.py
+++ b/distributed/dashboard/scheduler.py
@@ -98,7 +98,7 @@
     "/individual-profile-server": individual_profile_server_doc,
     "/individual-gpu-memory": gpu_memory_doc,
     "/individual-gpu-utilization": gpu_utilization_doc,
-    "/individual-rmm-memory": rmm_memory_doc
+    "/individual-rmm-memory": rmm_memory_doc,
 }
 
 
diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index 66652f435c8..4ee27d9e983 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -2,6 +2,7 @@
 Diagnostics for GPU memory managed by RMM (RAPIDS memory manager).
 """
 
+
 def _get_pool_size(mr):
     # if the memory resource or any of its upstreams
     # is a `PoolMemoryResource`, get its pool size
@@ -16,6 +17,7 @@ def _get_pool_size(mr):
         pool_size = mr.pool_size()
         return pool_size
 
+
 def _get_allocated_bytes(mr):
     import rmm
 
@@ -27,6 +29,7 @@ def _get_allocated_bytes(mr):
     else:
         return mr.get_allocated_bytes()
 
+
 def real_time():
     import rmm
 
@@ -34,7 +37,4 @@ def real_time():
     rmm_pool_size = _get_pool_size_for_mr(mr)
     rmm_used = _get_allocated_bytes_for_mr(mr)
     rmm_total = max(rmm_pool_size, rmm_used)
-    return {
-        "rmm-used": rmm_used,
-        "rmm-total": rmm_total
-    }
+    return {"rmm-used": rmm_used, "rmm-total": rmm_total}
diff --git a/distributed/worker.py b/distributed/worker.py
index 7c97cc5f259..73bafccd5b5 100644
--- a/distributed/worker.py
+++ b/distributed/worker.py
@@ -4691,9 +4691,11 @@ def gpu_startup(worker):
 except (Exception, RuntimeError):
     pass
 else:
+
     async def rmm_metric(worker):
         result = await offload(rmm.real_time)
         return result
+
     DEFAULT_METRICS["rmm"] = rmm_metric
 
 

From fc697495679272f4f247b4c50a3aad30f8efe105 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 14 Feb 2022 14:21:00 -0500
Subject: [PATCH 07/14] More style

---
 distributed/dashboard/components/rmm.py |  4 +---
 distributed/diagnostics/rmm.py          | 10 ++++------
 distributed/worker.py                   |  1 +
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py
index e6e1df2cb49..cd0349ed4d4 100644
--- a/distributed/dashboard/components/rmm.py
+++ b/distributed/dashboard/components/rmm.py
@@ -12,10 +12,8 @@
 from bokeh.plotting import figure
 from tornado import escape
 
-from dask.utils import format_bytes
-
 from distributed.dashboard.components import DashboardComponent, add_periodic_callback
-from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024, env
+from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024
 from distributed.dashboard.utils import update
 from distributed.utils import log_errors
 
diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index 4ee27d9e983..294cf9b0f47 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -10,7 +10,7 @@ def _get_pool_size(mr):
 
     if not isinstance(mr, rmm.mr.PoolMemoryResource):
         if hasattr(mr, "upstream_mr"):
-            return _get_pool_size_for_mr(mr.upstream_mr)
+            return _get_pool_size(mr.upstream_mr)
         else:
             return 0
     else:
@@ -19,11 +19,9 @@ def _get_pool_size(mr):
 
 
 def _get_allocated_bytes(mr):
-    import rmm
-
     if not hasattr(mr, "get_allocated_bytes"):
         if hasattr(mr, "upstream_mr"):
-            return _get_allocated_bytes_for_mr(mr.upstream_mr)
+            return _get_allocated_bytes(mr.upstream_mr)
         else:
             return 0
     else:
@@ -34,7 +32,7 @@ def real_time():
     import rmm
 
     mr = rmm.mr.get_current_device_resource()
-    rmm_pool_size = _get_pool_size_for_mr(mr)
-    rmm_used = _get_allocated_bytes_for_mr(mr)
+    rmm_pool_size = _get_pool_size(mr)
+    rmm_used = _get_allocated_bytes(mr)
     rmm_total = max(rmm_pool_size, rmm_used)
     return {"rmm-used": rmm_used, "rmm-total": rmm_total}
diff --git a/distributed/worker.py b/distributed/worker.py
index 73bafccd5b5..124e2bc861d 100644
--- a/distributed/worker.py
+++ b/distributed/worker.py
@@ -4697,6 +4697,7 @@ async def rmm_metric(worker):
         return result
 
     DEFAULT_METRICS["rmm"] = rmm_metric
+    del _rmm
 
 
 def print(*args, **kwargs):

From 2d040dcf0309067201f64f8b0df6626075094f88 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 15 Feb 2022 14:04:05 -0500
Subject: [PATCH 08/14] Protect against missing RMM

---
 distributed/dashboard/components/rmm.py | 10 ++++++----
 distributed/diagnostics/rmm.py          | 11 +++++++----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py
index cd0349ed4d4..fb6b9133cbf 100644
--- a/distributed/dashboard/components/rmm.py
+++ b/distributed/dashboard/components/rmm.py
@@ -124,10 +124,12 @@ def update(self):
             memory_total = 0
 
             for idx, ws in enumerate(workers):
-                rmm_metrics = ws.metrics["rmm"]
-                gpu_extra = ws.extra["gpu"]
-                gpu_metrics = ws.metrics["gpu"]
-
+                try:
+                    rmm_metrics = ws.metrics["rmm"]
+                    gpu_extra = ws.extra["gpu"]
+                    gpu_metrics = ws.metrics["gpu"]
+                except KeyError:
+                    continue
                 rmm_total_worker = rmm_metrics["rmm-total"]
                 rmm_used_worker = rmm_metrics["rmm-used"]  # RMMM memory only
                 gpu_used_worker = gpu_metrics["memory-used"]  # All GPU memory
diff --git a/distributed/diagnostics/rmm.py b/distributed/diagnostics/rmm.py
index 294cf9b0f47..603924fab72 100644
--- a/distributed/diagnostics/rmm.py
+++ b/distributed/diagnostics/rmm.py
@@ -2,12 +2,15 @@
 Diagnostics for GPU memory managed by RMM (RAPIDS memory manager).
 """
 
+try:
+    import rmm
+except ImportError:
+    rmm = None
+
 
 def _get_pool_size(mr):
     # if the memory resource or any of its upstreams
     # is a `PoolMemoryResource`, get its pool size
-    import rmm
-
     if not isinstance(mr, rmm.mr.PoolMemoryResource):
         if hasattr(mr, "upstream_mr"):
             return _get_pool_size(mr.upstream_mr)
@@ -29,8 +32,8 @@ def _get_allocated_bytes(mr):
 
 
 def real_time():
-    import rmm
-
+    if rmm is None:
+        return {"rmm-used": None, "rmm-total": None}
     mr = rmm.mr.get_current_device_resource()
     rmm_pool_size = _get_pool_size(mr)
     rmm_used = _get_allocated_bytes(mr)

From 8db3ceb2f3441f9d299833364bd39d30a7687491 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Tue, 15 Feb 2022 17:25:07 -0500
Subject: [PATCH 09/14] Report GPU and RMM memory used in plot

---
 distributed/dashboard/components/rmm.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py
index fb6b9133cbf..1e1a5e8f2ed 100644
--- a/distributed/dashboard/components/rmm.py
+++ b/distributed/dashboard/components/rmm.py
@@ -1,4 +1,5 @@
 import math
+from textwrap import dedent
 
 from bokeh.core.properties import without_property_validation
 from bokeh.models import (
@@ -12,6 +13,8 @@
 from bokeh.plotting import figure
 from tornado import escape
 
+from dask.utils import format_bytes
+
 from distributed.dashboard.components import DashboardComponent, add_periodic_callback
 from distributed.dashboard.components.scheduler import BOKEH_THEME, TICKS_1024
 from distributed.dashboard.utils import update
@@ -144,7 +147,7 @@ def update(self):
                 y.append(idx)
 
                 memory_max = max(memory_max, gpu_used_worker)
-                memory_total = max(memory_total, gpu_extra["memory-total"])
+                memory_total += gpu_extra["memory-total"]
 
             result = {
                 "rmm-total": rmm_total,
@@ -158,7 +161,19 @@ def update(self):
                 "y": y,
                 "escaped_worker": [escape.url_escape(w) for w in worker],
             }
-            self.memory_figure.x_range.end = memory_total
+            self.memory_figure.x_range.end = memory_max
+
+            self.memory_figure.title.text = dedent(
+                """\
+                RMM Utilization: {} / {}\n
+                GPU Memory: {} / {}
+                """.format(
+                    format_bytes(sum(rmm_used)),
+                    format_bytes(sum(rmm_total)),
+                    format_bytes(sum([*rmm_total, *external_used])),
+                    format_bytes(memory_total),
+                )
+            )
             update(self.source, result)
 
 

From ccfff41aea4590975111473af3061594661d686e Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Thu, 17 Feb 2022 13:28:04 -0500
Subject: [PATCH 10/14] Fix hover

---
 distributed/dashboard/components/rmm.py | 57 ++++++++++++++++---------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/distributed/dashboard/components/rmm.py b/distributed/dashboard/components/rmm.py
index 1e1a5e8f2ed..971bda424f7 100644
--- a/distributed/dashboard/components/rmm.py
+++ b/distributed/dashboard/components/rmm.py
@@ -44,6 +44,10 @@ def __init__(self, scheduler, width=600, **kwargs):
                     "gpu-index": [0, 0],
                     "y": [1, 2],
                     "escaped_worker": ["a", "b"],
+                    "rmm_memory_text": [
+                        "RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
+                        "RMM memory used: 1B/1B\nTotal GPU memory used: 1B/2B",
+                    ],
                 }
             )
 
@@ -106,7 +110,7 @@ def __init__(self, scheduler, width=600, **kwargs):
                 fig.yaxis.visible = False
 
             hover = HoverTool()
-            hover.tooltips = "@worker : @memory_text"
+            hover.tooltips = "@worker : @rmm_memory_text"
             hover.point_policy = "follow_mouse"
             memory.add_tools(hover)
 
@@ -124,30 +128,54 @@ def update(self):
             worker = []
             external_used_x = []
             memory_max = 0
-            memory_total = 0
+            gpu_total = []
+            rmm_memory_text = []
 
             for idx, ws in enumerate(workers):
                 try:
                     rmm_metrics = ws.metrics["rmm"]
-                    gpu_extra = ws.extra["gpu"]
                     gpu_metrics = ws.metrics["gpu"]
+                    gpu_info = ws.extra["gpu"]
                 except KeyError:
                     continue
-                rmm_total_worker = rmm_metrics["rmm-total"]
-                rmm_used_worker = rmm_metrics["rmm-used"]  # RMMM memory only
-                gpu_used_worker = gpu_metrics["memory-used"]  # All GPU memory
+                rmm_total_worker = rmm_metrics["rmm-total"]  # RMM memory only
+                rmm_used_worker = rmm_metrics["rmm-used"]
+                gpu_total_worker = gpu_info["memory-total"]  # All GPU memory
+                gpu_used_worker = gpu_metrics["memory-used"]
+
                 external_used_worker = gpu_used_worker - rmm_total_worker
 
                 rmm_total.append(rmm_total_worker)
                 rmm_used.append(rmm_used_worker)
+                gpu_total.append(gpu_total_worker)
                 external_used.append(external_used_worker)
                 external_used_x.append(rmm_total_worker + external_used_worker / 2)
                 worker.append(ws.address)
                 gpu_index.append(idx)
                 y.append(idx)
 
-                memory_max = max(memory_max, gpu_used_worker)
-                memory_total += gpu_extra["memory-total"]
+                memory_max = max(memory_max, gpu_total_worker)
+
+                rmm_memory_text.append(
+                    "RMM memory used: {}/{}\nTotal GPU memory used: {}/{}".format(
+                        format_bytes(rmm_used_worker),
+                        format_bytes(rmm_total_worker),
+                        format_bytes(gpu_used_worker),
+                        format_bytes(gpu_total_worker),
+                    )
+                )
+
+            self.memory_figure.title.text = dedent(
+                """\
+                RMM Utilization: {} / {}\n
+                GPU Memory: {} / {}
+                """.format(
+                    format_bytes(sum(rmm_used)),
+                    format_bytes(sum(rmm_total)),
+                    format_bytes(sum([*rmm_total, *external_used])),
+                    format_bytes(sum(gpu_total)),
+                )
+            )
 
             result = {
                 "rmm-total": rmm_total,
@@ -160,20 +188,11 @@ def update(self):
                 "gpu-index": gpu_index,
                 "y": y,
                 "escaped_worker": [escape.url_escape(w) for w in worker],
+                "rmm_memory_text": rmm_memory_text,
             }
+
             self.memory_figure.x_range.end = memory_max
 
-            self.memory_figure.title.text = dedent(
-                """\
-                RMM Utilization: {} / {}\n
-                GPU Memory: {} / {}
-                """.format(
-                    format_bytes(sum(rmm_used)),
-                    format_bytes(sum(rmm_total)),
-                    format_bytes(sum([*rmm_total, *external_used])),
-                    format_bytes(memory_total),
-                )
-            )
             update(self.source, result)
 
 

From 597f6bc89bc0e20b6ce9f3e5c2320addeffd3a36 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Wed, 2 Mar 2022 14:16:53 -0500
Subject: [PATCH 11/14] Add a test for RMM dashboard

---
 distributed/diagnostics/tests/test_rmm.py | 28 +++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 distributed/diagnostics/tests/test_rmm.py

diff --git a/distributed/diagnostics/tests/test_rmm.py b/distributed/diagnostics/tests/test_rmm.py
new file mode 100644
index 00000000000..b39a79fd2b4
--- /dev/null
+++ b/distributed/diagnostics/tests/test_rmm.py
@@ -0,0 +1,28 @@
+import asyncio
+
+import pytest
+
+from dask import delayed
+from distributed.utils_test  import gen_cluster
+
+
+pytestmark = pytest.mark.gpu
+
+dask_cuda = pytest.importorskip("dask_cuda")
+rmm = pytest.importorskip("rmm")
+pynvml = pytest.importorskip("pynvml")
+
+@gen_cluster(
+    client=True,
+    nthreads=[("127.0.0.1", 1)],
+    Worker=dask_cuda.CUDAWorker,
+    worker_kwargs={"rmm_track_allocations": True}
+)
+async def test_rmm_metrics(c, s, *workers):
+    w = list(s.workers.values())[0]
+    assert "rmm" in w.metrics
+    assert w.metrics["rmm"]["rmm-used"] == 0
+    result = delayed(rmm.DeviceBuffer)(size=10)
+    result = result.persist()
+    await asyncio.sleep(1)
+    assert w.metrics["rmm"]["rmm-used"] != 0

From baa450ee38c905cf0816f825698d49e7ef32e986 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 7 Mar 2022 09:21:45 -0500
Subject: [PATCH 12/14] Add to test

---
 distributed/diagnostics/tests/test_rmm.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/distributed/diagnostics/tests/test_rmm.py b/distributed/diagnostics/tests/test_rmm.py
index b39a79fd2b4..43a3c85fb27 100644
--- a/distributed/diagnostics/tests/test_rmm.py
+++ b/distributed/diagnostics/tests/test_rmm.py
@@ -3,6 +3,7 @@
 import pytest
 
 from dask import delayed
+from dask.utils import parse_bytes
 from distributed.utils_test  import gen_cluster
 
 
@@ -16,13 +17,18 @@
     client=True,
     nthreads=[("127.0.0.1", 1)],
     Worker=dask_cuda.CUDAWorker,
-    worker_kwargs={"rmm_track_allocations": True}
+    worker_kwargs={
+        "rmm_pool_size": parse_bytes("10MiB"),
+        "rmm_track_allocations": True
+    }
 )
 async def test_rmm_metrics(c, s, *workers):
     w = list(s.workers.values())[0]
     assert "rmm" in w.metrics
     assert w.metrics["rmm"]["rmm-used"] == 0
+    assert w.metrics["rmm"]["rmm-total"] == parse_bytes("10MiB")
     result = delayed(rmm.DeviceBuffer)(size=10)
     result = result.persist()
     await asyncio.sleep(1)
     assert w.metrics["rmm"]["rmm-used"] != 0
+    assert w.metrics["rmm"]["rmm-total"] == parse_bytes("10MiB")

From 6d44aacc2c041c76f3ecf69f6b5933f94839bc5e Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Mon, 7 Mar 2022 09:59:48 -0500
Subject: [PATCH 13/14] Rename

---
 .../tests/{test_rmm.py => test_rmm_diagnostics.py}         | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
 rename distributed/diagnostics/tests/{test_rmm.py => test_rmm_diagnostics.py} (89%)

diff --git a/distributed/diagnostics/tests/test_rmm.py b/distributed/diagnostics/tests/test_rmm_diagnostics.py
similarity index 89%
rename from distributed/diagnostics/tests/test_rmm.py
rename to distributed/diagnostics/tests/test_rmm_diagnostics.py
index 43a3c85fb27..b0458149085 100644
--- a/distributed/diagnostics/tests/test_rmm.py
+++ b/distributed/diagnostics/tests/test_rmm_diagnostics.py
@@ -4,8 +4,8 @@
 
 from dask import delayed
 from dask.utils import parse_bytes
-from distributed.utils_test  import gen_cluster
 
+from distributed.utils_test import gen_cluster
 
 pytestmark = pytest.mark.gpu
 
@@ -13,14 +13,15 @@
 rmm = pytest.importorskip("rmm")
 pynvml = pytest.importorskip("pynvml")
 
+
 @gen_cluster(
     client=True,
     nthreads=[("127.0.0.1", 1)],
     Worker=dask_cuda.CUDAWorker,
     worker_kwargs={
         "rmm_pool_size": parse_bytes("10MiB"),
-        "rmm_track_allocations": True
-    }
+        "rmm_track_allocations": True,
+    },
 )
 async def test_rmm_metrics(c, s, *workers):
     w = list(s.workers.values())[0]

From 34a3cc7e3825161ec8e888a415456ca16600a893 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <shwina@users.noreply.github.com>
Date: Wed, 16 Mar 2022 11:26:37 -0400
Subject: [PATCH 14/14] Replace Rmm with RMM in the HTML template

---
 distributed/http/templates/base.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/distributed/http/templates/base.html b/distributed/http/templates/base.html
index 73e252f08e1..fb90cbfdd0b 100644
--- a/distributed/http/templates/base.html
+++ b/distributed/http/templates/base.html
@@ -40,7 +40,7 @@
                     <ul>
                         {% for plot in plots %}
                         <li><a href="{{ plot }}">
-                                {{ " ".join(plot.split("-")[1:]).title().replace("Cpu", "CPU").replace("Gpu", "GPU") }}
+                                {{ " ".join(plot.split("-")[1:]).title().replace("Cpu", "CPU").replace("Gpu", "GPU").replace("Rmm", "RMM") }}
                             </a></li>
                         {% endfor %}
                     </ul>