Skip to content

Commit 1088403

Browse files
authored
Switch to pynvml_utils.smi for PyNVML 12 (#4863)
In the PyNVML 12 upgrade, the `pynvml` package now depends on `nvidia-ml-py`. However, `nvidia-ml-py` does not supply the `smi` submodule. To avoid clobbering that package, the `smi` module was moved from the `pynvml` namespace to `pynvml_utils`. As a result, the `import` statements here need the same tweak.

Authors:
- https://github.com/jakirkham
- Alex Barghi (https://github.com/alexbarghi-nv)

Approvers:
- Alex Barghi (https://github.com/alexbarghi-nv)

URL: #4863
1 parent 303a053 commit 1088403

File tree

3 files changed

+11
-48
lines changed

3 files changed

+11
-48
lines changed

benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
1+
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -36,9 +36,7 @@
3636
def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
3737
import cupy
3838
import rmm
39-
from pynvml.smi import nvidia_smi
4039

41-
smi = nvidia_smi.getInstance()
4240
pool_size = 16e9 # FIXME calculate this
4341

4442
rmm.reinitialize(

benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
1+
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -201,10 +201,6 @@ def train(self):
201201
)
202202
logger.info(f"total time: {total_time_iter}")
203203

204-
# from pynvml.smi import nvidia_smi
205-
# mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
206-
# logger.info(f"rank {self.rank} memory: {mem_info}")
207-
208204
y_true = data.y
209205
y_true = y_true.reshape((y_true.shape[0],))
210206
x = data.x.to(torch.float32)

python/utils/gpu_metric_poller.py

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
1+
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -31,7 +31,7 @@
3131
import os
3232
import sys
3333
import threading
34-
from pynvml import smi
34+
import pynvml
3535

3636

3737
class GPUMetricPoller(threading.Thread):
@@ -91,18 +91,18 @@ def __runChildLoop(self, readFileNo, writeFileNo):
9191
childReadPipe = os.fdopen(readFileNo)
9292
childWritePipe = os.fdopen(writeFileNo, "w")
9393

94-
smi.nvmlInit()
94+
pynvml.nvmlInit()
9595
# hack - get actual device ID somehow
96-
devObj = smi.nvmlDeviceGetHandleByIndex(0)
97-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
98-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
96+
devObj = pynvml.nvmlDeviceGetHandleByIndex(0)
97+
memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
98+
utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
9999
initialMemUsed = memObj.used
100100
initialGpuUtil = utilObj.gpu
101101

102102
controlStr = self.__waitForInput(childReadPipe)
103103
while True:
104-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
105-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
104+
memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
105+
utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
106106

107107
memUsed = memObj.used - initialMemUsed
108108
gpuUtil = utilObj.gpu - initialGpuUtil
@@ -113,7 +113,7 @@ def __runChildLoop(self, readFileNo, writeFileNo):
113113
break
114114
controlStr = self.__waitForInput(childReadPipe)
115115

116-
smi.nvmlShutdown()
116+
pynvml.nvmlShutdown()
117117
childReadPipe.close()
118118
childWritePipe.close()
119119

@@ -147,34 +147,3 @@ def startGpuMetricPolling():
147147
def stopGpuMetricPolling(gpuPollObj):
148148
gpuPollObj.stop()
149149
gpuPollObj.join() # consider using timeout and reporting errors
150-
151-
152-
"""
153-
smi.nvmlInit()
154-
# hack - get actual device ID somehow
155-
devObj = smi.nvmlDeviceGetHandleByIndex(0)
156-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
157-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
158-
initialMemUsed = memObj.used
159-
initialGpuUtil = utilObj.gpu
160-
161-
while not self.__stop:
162-
time.sleep(0.01)
163-
164-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
165-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
166-
167-
memUsed = memObj.used - initialMemUsed
168-
gpuUtil = utilObj.gpu - initialGpuUtil
169-
if memUsed > self.maxGpuMemUsed:
170-
self.maxGpuMemUsed = memUsed
171-
if gpuUtil > self.maxGpuUtil:
172-
self.maxGpuUtil = gpuUtil
173-
174-
smi.nvmlShutdown()
175-
"""
176-
177-
178-
# if __name__ == "__main__":
179-
# sto=stopGpuMetricPolling
180-
# po = startGpuMetricPolling()

0 commit comments

Comments (0)