Skip to content

Commit 1088403

Browse files
authored
Switch to pynvml_utils.smi for PyNVML 12 (#4863)
In the PyNVML 12 upgrade, the `pynvml` package now depends on `nvidia-ml-py`. However, `nvidia-ml-py` does not supply the `smi` submodule. To avoid clobbering that package, the `smi` module was moved from the `pynvml` namespace to `pynvml_utils`. As a result, the `import` statements here need the same tweak.

Authors:
- https://github.com/jakirkham
- Alex Barghi (https://github.com/alexbarghi-nv)

Approvers:
- Alex Barghi (https://github.com/alexbarghi-nv)

URL: #4863
1 parent 303a053 commit 1088403

File tree

3 files changed

+11
-48
lines changed

3 files changed

+11
-48
lines changed

benchmarks/cugraph/standalone/bulk_sampling/bench_cugraph_training.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
1+
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -36,9 +36,7 @@
3636
def init_pytorch_worker(rank: int, use_rmm_torch_allocator: bool = False) -> None:
3737
import cupy
3838
import rmm
39-
from pynvml.smi import nvidia_smi
4039

41-
smi = nvidia_smi.getInstance()
4240
pool_size = 16e9 # FIXME calculate this
4341

4442
rmm.reinitialize(

benchmarks/cugraph/standalone/bulk_sampling/trainers/pyg/trainers_pyg.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
1+
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -201,10 +201,6 @@ def train(self):
201201
)
202202
logger.info(f"total time: {total_time_iter}")
203203

204-
# from pynvml.smi import nvidia_smi
205-
# mem_info = nvidia_smi.getInstance().DeviceQuery('memory.free, memory.total')['gpu'][self.rank % 8]['fb_memory_usage']
206-
# logger.info(f"rank {self.rank} memory: {mem_info}")
207-
208204
y_true = data.y
209205
y_true = y_true.reshape((y_true.shape[0],))
210206
x = data.x.to(torch.float32)

python/utils/gpu_metric_poller.py

Lines changed: 9 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
1+
# Copyright (c) 2018-2025, NVIDIA CORPORATION.
22
# Licensed under the Apache License, Version 2.0 (the "License");
33
# you may not use this file except in compliance with the License.
44
# You may obtain a copy of the License at
@@ -31,7 +31,7 @@
3131
import os
3232
import sys
3333
import threading
34-
from pynvml import smi
34+
import pynvml
3535

3636

3737
class GPUMetricPoller(threading.Thread):
@@ -91,18 +91,18 @@ def __runChildLoop(self, readFileNo, writeFileNo):
9191
childReadPipe = os.fdopen(readFileNo)
9292
childWritePipe = os.fdopen(writeFileNo, "w")
9393

94-
smi.nvmlInit()
94+
pynvml.nvmlInit()
9595
# hack - get actual device ID somehow
96-
devObj = smi.nvmlDeviceGetHandleByIndex(0)
97-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
98-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
96+
devObj = pynvml.nvmlDeviceGetHandleByIndex(0)
97+
memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
98+
utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
9999
initialMemUsed = memObj.used
100100
initialGpuUtil = utilObj.gpu
101101

102102
controlStr = self.__waitForInput(childReadPipe)
103103
while True:
104-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
105-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
104+
memObj = pynvml.nvmlDeviceGetMemoryInfo(devObj)
105+
utilObj = pynvml.nvmlDeviceGetUtilizationRates(devObj)
106106

107107
memUsed = memObj.used - initialMemUsed
108108
gpuUtil = utilObj.gpu - initialGpuUtil
@@ -113,7 +113,7 @@ def __runChildLoop(self, readFileNo, writeFileNo):
113113
break
114114
controlStr = self.__waitForInput(childReadPipe)
115115

116-
smi.nvmlShutdown()
116+
pynvml.nvmlShutdown()
117117
childReadPipe.close()
118118
childWritePipe.close()
119119

@@ -147,34 +147,3 @@ def startGpuMetricPolling():
147147
def stopGpuMetricPolling(gpuPollObj):
148148
gpuPollObj.stop()
149149
gpuPollObj.join() # consider using timeout and reporting errors
150-
151-
152-
"""
153-
smi.nvmlInit()
154-
# hack - get actual device ID somehow
155-
devObj = smi.nvmlDeviceGetHandleByIndex(0)
156-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
157-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
158-
initialMemUsed = memObj.used
159-
initialGpuUtil = utilObj.gpu
160-
161-
while not self.__stop:
162-
time.sleep(0.01)
163-
164-
memObj = smi.nvmlDeviceGetMemoryInfo(devObj)
165-
utilObj = smi.nvmlDeviceGetUtilizationRates(devObj)
166-
167-
memUsed = memObj.used - initialMemUsed
168-
gpuUtil = utilObj.gpu - initialGpuUtil
169-
if memUsed > self.maxGpuMemUsed:
170-
self.maxGpuMemUsed = memUsed
171-
if gpuUtil > self.maxGpuUtil:
172-
self.maxGpuUtil = gpuUtil
173-
174-
smi.nvmlShutdown()
175-
"""
176-
177-
178-
# if __name__ == "__main__":
179-
# sto=stopGpuMetricPolling
180-
# po = startGpuMetricPolling()

0 commit comments

Comments (0)