Skip to content

Commit 3e3da45

Browse files
Support Dataset cpu-mode in environment with GPUs that have not been detected (#236)
* Run tests in GPU environment with no GPUs visible * Update TensorTable tests with checks for HAS_GPU * Remove unused `_HAS_GPU` variable from `test_utils` * Wrap cupy/cudf imports in HAS_GPU check in `compat` * Update tests to use HAS_GPU from compat module * Reformat test_tensor_table.py * Move HAS_GPU import to compat module * Add pynvml dependency * Update functions in `dispatch` to not use HAS_GPU * Raise RuntimeError in Dataset if we can't run on GPU when cpu=False * Update `convert_data` to handle unavailable cudf and dask_cudf * Remove use of `HAS_GPU` from dispatch * Keep cudf and cupy values representing presence of package * Revert changes to `dataset.py`. Now part of #243 * Revert changes to `dispatch.py`. Now part of #244 * Use branch-name action for branch selection * Remove unused ref_type variable * Extend reason in `test_tensor_column.py` Co-authored-by: Karl Higley <kmhigley@gmail.com> * Extend reason in `tests/unit/table/test_tensor_column.py` Co-authored-by: Karl Higley <kmhigley@gmail.com> * Remove cudf import from compat. Now unrelated to this PR * Remove use of branch-name action. `docker` not available in runner * Add HAS_GPU checks with cupy to support env without visible devices * Correct value of empty visible devices * Update deps for GPU envs to match others * Update get_lib to account for missing visible GPU * Check HAS_GPU in `make_df` to handle visible GPU devices * Update Dataset to handle default case when no visible GPUs are found * Update fixtures to handle cudf with no visible devices * Update tests to handle case of no visible GPUs --------- Co-authored-by: Karl Higley <kmhigley@gmail.com> Co-authored-by: Karl Higley <karlb@nvidia.com>
1 parent d4f1c6e commit 3e3da45

File tree

12 files changed

+93
-42
lines changed

12 files changed

+93
-42
lines changed

.github/workflows/gpu-ci.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,21 @@ jobs:
2828
branch=${raw/origin\/}
2929
fi
3030
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
31+
32+
gpu-ci-not-visible:
33+
runs-on: 1GPU
34+
35+
steps:
36+
- uses: actions/checkout@v3
37+
with:
38+
fetch-depth: 0
39+
- name: Run tests
40+
run: |
41+
ref_type=${{ github.ref_type }}
42+
branch=main
43+
if [[ $ref_type == "tag"* ]]
44+
then
45+
raw=$(git branch -r --contains ${{ github.ref_name }})
46+
branch=${raw/origin\/}
47+
fi
48+
cd ${{ github.workspace }}; tox -e test-gpu-not-visible -- $branch

merlin/core/dispatch.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def read_parquet_metadata(path):
129129

130130
def get_lib():
131131
"""Dispatch to the appropriate library (cudf or pandas) for the current environment"""
132-
return cudf or pd
132+
return cudf if (cudf and HAS_GPU) else pd
133133

134134

135135
def reinitialize(managed_memory=False):
@@ -540,7 +540,12 @@ def concat(objs, **kwargs):
540540

541541
def make_df(_like_df=None, device=None):
542542
"""Return a DataFrame with the same dtype as `_like_df`"""
543-
if not cudf or device == "cpu" or isinstance(_like_df, (pd.DataFrame, pd.Series)):
543+
if (
544+
not cudf
545+
or device == "cpu"
546+
or not HAS_GPU
547+
or isinstance(_like_df, (pd.DataFrame, pd.Series))
548+
):
544549
# move to pandas need it on CPU (host memory)
545550
# can be a cudf, cupy or numpy Series
546551
if cudf and isinstance(_like_df, (cudf.DataFrame, cudf.Series)):

merlin/io/dataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from fsspec.core import get_fs_token_paths
3131
from fsspec.utils import stringify_path
3232

33+
from merlin.core.compat import HAS_GPU
3334
from merlin.core.dispatch import (
3435
convert_data,
3536
hex_to_int,
@@ -246,7 +247,7 @@ def __init__(
246247
# Check if we are keeping data in cpu memory
247248
self.cpu = cpu
248249
if not self.cpu:
249-
self.cpu = cudf is None
250+
self.cpu = cudf is None or not HAS_GPU
250251

251252
# Keep track of base dataset (optional)
252253
self.base_dataset = base_dataset or self

tests/conftest.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import numpy as np
2323
import pandas as pd
2424

25+
from merlin.core.compat import HAS_GPU
26+
2527
try:
2628
import cudf
2729

@@ -107,8 +109,8 @@ def get_cuda_cluster():
107109

108110
@pytest.fixture(scope="session")
109111
def datasets(tmpdir_factory):
110-
_lib = cudf if cudf else pd
111-
_datalib = cudf if cudf else dask
112+
_lib = cudf if cudf and HAS_GPU else pd
113+
_datalib = cudf if cudf and HAS_GPU else dask
112114
df = _datalib.datasets.timeseries(
113115
start="2000-01-01",
114116
end="2000-01-04",
@@ -152,7 +154,7 @@ def datasets(tmpdir_factory):
152154
half = int(len(df) // 2)
153155

154156
# Write Parquet Dataset
155-
if cudf:
157+
if cudf and isinstance(df, cudf.DataFrame):
156158
df.iloc[:half].to_parquet(
157159
str(datadir["parquet"].join("dataset-0.parquet")), row_group_size_rows=5000
158160
)
@@ -191,7 +193,7 @@ def paths(engine, datasets):
191193

192194
@pytest.fixture(scope="function")
193195
def df(engine, paths):
194-
_lib = cudf if cudf else pd
196+
_lib = cudf if cudf and HAS_GPU else pd
195197
if engine == "parquet":
196198
df1 = _lib.read_parquet(paths[0])[mycols_pq]
197199
df2 = _lib.read_parquet(paths[1])[mycols_pq]

tests/unit/core/test_dispatch.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,9 @@
1717
import pandas as pd
1818
import pytest
1919

20-
from merlin.core.dispatch import HAS_GPU, concat_columns, is_list_dtype, list_val_dtype, make_df
21-
22-
try:
23-
import cupy as cp
24-
except ImportError:
25-
cp = None
26-
20+
from merlin.core.compat import HAS_GPU
21+
from merlin.core.compat import cupy as cp
22+
from merlin.core.dispatch import concat_columns, is_list_dtype, list_val_dtype, make_df
2723

2824
if HAS_GPU:
2925
_DEVICES = ["cpu", "gpu"]
@@ -53,7 +49,7 @@ def test_concat_columns(device):
5349
assert res.columns.to_list() == ["a", "b", "c"]
5450

5551

56-
@pytest.mark.skipif(not cp, reason="Cupy not available")
52+
@pytest.mark.skipif(not (cp and HAS_GPU), reason="Cupy not available")
5753
def test_pandas_cupy_combo():
5854
rand_cp_nd_arr = cp.random.uniform(0.0, 1.0, size=100)
5955
with pytest.raises(TypeError) as exc_info:

tests/unit/core/test_protocols.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,15 @@
1515
#
1616
import pytest
1717

18+
from merlin.core.compat import HAS_GPU, cudf
1819
from merlin.core.dispatch import make_df, make_series
1920
from merlin.core.protocols import DataFrameLike, DictLike, SeriesLike, Transformable
2021

22+
if HAS_GPU and cudf:
23+
_DEVICES = ["cpu", None]
24+
else:
25+
_DEVICES = ["cpu"]
26+
2127

2228
@pytest.mark.parametrize("protocol", [DictLike])
2329
def test_dictionary_is_dictlike(protocol):
@@ -26,15 +32,15 @@ def test_dictionary_is_dictlike(protocol):
2632
assert isinstance(obj, protocol)
2733

2834

29-
@pytest.mark.parametrize("device", [None, "cpu"])
35+
@pytest.mark.parametrize("device", _DEVICES)
3036
@pytest.mark.parametrize("protocol", [DictLike, DataFrameLike, Transformable])
3137
def test_dataframes_match_protocols(protocol, device):
3238
obj = make_df({}, device=device)
3339

3440
assert isinstance(obj, protocol)
3541

3642

37-
@pytest.mark.parametrize("device", [None, "cpu"])
43+
@pytest.mark.parametrize("device", _DEVICES)
3844
def test_series_are_serieslike(device):
3945
obj = make_series([], device=device)
4046

tests/unit/io/test_io.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import merlin.dtypes as md
3131
import merlin.io
3232
from merlin.core import dispatch
33+
from merlin.core.compat import HAS_GPU
3334
from merlin.io.parquet import GPUParquetWriter
3435
from merlin.schema.io.tensorflow_metadata import TensorflowMetadata
3536
from merlin.schema.tags import Tags, TagSet
@@ -38,6 +39,9 @@
3839
cudf = pytest.importorskip("cudf")
3940
dask_cudf = pytest.importorskip("dask_cudf")
4041

42+
if not HAS_GPU:
43+
pytestmark = pytest.mark.skip(reason="at least one visible CUDA GPU required.")
44+
4145

4246
def _check_partition_lens(ds):
4347
# Simple utility to check that the Parquet metadata

tests/unit/table/test_convert_column.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import pytest
1919

20+
from merlin.core.compat import HAS_GPU
2021
from merlin.core.compat import cupy as cp
2122
from merlin.core.compat import numpy as np
2223
from merlin.core.compat import tensorflow as tf
@@ -27,7 +28,7 @@
2728
source_cols: List[TensorColumn] = []
2829
output_col_types: List[Type] = []
2930

30-
if cp:
31+
if cp and HAS_GPU:
3132
cp_array = cp.asarray([1, 2, 3, 4])
3233

3334
source_cols.append(CupyColumn(values=cp_array, offsets=cp_array))
@@ -39,7 +40,7 @@
3940
source_cols.append(NumpyColumn(values=np_array, offsets=np_array))
4041
output_col_types.append(NumpyColumn)
4142

42-
if tf:
43+
if tf and HAS_GPU:
4344
with tf.device("/CPU"):
4445
tf_tensor = tf.convert_to_tensor(np.array([1, 2, 3, 4]))
4546
offsets_tensor = tf.convert_to_tensor(np.array([0, 1, 2, 3, 4]))
@@ -52,7 +53,7 @@
5253
source_cols.extend([cpu_tf_column, gpu_tf_column])
5354
output_col_types.append(TensorflowColumn)
5455

55-
if th:
56+
if th and HAS_GPU:
5657
th_tensor = th.tensor([1, 2, 3, 4])
5758
cpu_th_column = TorchColumn(values=th_tensor, offsets=th_tensor)
5859

tests/unit/table/test_tensor_column.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pytest
1919

2020
import merlin.dtypes as md
21+
from merlin.core.compat import HAS_GPU
2122
from merlin.core.compat import cupy as cp
2223
from merlin.core.compat import numpy as np
2324
from merlin.core.compat import tensorflow as tf
@@ -31,7 +32,7 @@
3132
if np:
3233
col_types.append(NumpyColumn)
3334

34-
if cp:
35+
if cp and HAS_GPU:
3536
col_types.append(CupyColumn)
3637

3738
if tf:
@@ -91,7 +92,7 @@ def test_equality():
9192
assert np_col != np_col_3
9293

9394

94-
@pytest.mark.skipif(cp is None, reason="requires GPU")
95+
@pytest.mark.skipif(not (cp and HAS_GPU), reason="requires CuPy and GPU")
9596
def test_cupy_cpu_transfer():
9697
values = cp.array([1, 2, 3])
9798
offsets = cp.array([0, 1, 3])
@@ -108,7 +109,7 @@ def test_cupy_cpu_transfer():
108109
assert isinstance(cpu_col_again, NumpyColumn)
109110

110111

111-
@pytest.mark.skipif(cp is None, reason="requires GPU")
112+
@pytest.mark.skipif(not (cp and HAS_GPU), reason="requires CuPy and GPU")
112113
def test_numpy_gpu_transfer():
113114
values = np.array([1, 2, 3])
114115
offsets = np.array([0, 1, 3])
@@ -125,7 +126,7 @@ def test_numpy_gpu_transfer():
125126
assert isinstance(gpu_col_again, CupyColumn)
126127

127128

128-
@pytest.mark.skipif(th is None, reason="requires Torch")
129+
@pytest.mark.skipif(not (HAS_GPU and th), reason="requires Torch and GPU")
129130
def test_torch_data_transfer():
130131
values = th.tensor([1, 2, 3])
131132
offsets = th.tensor([0, 1, 3])
@@ -138,7 +139,7 @@ def test_torch_data_transfer():
138139
assert cpu_col_again.device == Device.CPU
139140

140141

141-
@pytest.mark.skipif(tf is None, reason="requires Tensorflow")
142+
@pytest.mark.skipif(not (tf and HAS_GPU), reason="requires TensorFlow and GPU")
142143
def test_tf_data_transfer():
143144
values = tf.constant([1, 2, 3])
144145
offsets = tf.constant([0, 1, 3])

tests/unit/table/test_tensor_table.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717

1818
import pytest
1919

20+
from merlin.core.compat import HAS_GPU
2021
from merlin.core.compat import cupy as cp
2122
from merlin.core.compat import numpy as np
2223
from merlin.core.compat import tensorflow as tf
2324
from merlin.core.compat import torch as th
24-
from merlin.core.dispatch import HAS_GPU, df_from_dict, dict_from_df, make_df
25+
from merlin.core.dispatch import df_from_dict, dict_from_df, make_df
2526
from merlin.core.protocols import DictLike, Transformable
2627
from merlin.dag import BaseOperator, ColumnSelector
2728
from merlin.table import CupyColumn, Device, NumpyColumn, TensorflowColumn, TensorTable, TorchColumn
@@ -43,7 +44,7 @@
4344
cpu_target_packages.append((NumpyColumn, tensor_dict))
4445
cpu_source_col.append((NumpyColumn, np.array, np))
4546

46-
if cp:
47+
if cp and HAS_GPU:
4748
tensor_dict = {
4849
"a__values": cp.asarray([1, 2, 3]),
4950
"a__offsets": cp.asarray([0, 1, 3]),
@@ -52,7 +53,7 @@
5253
gpu_target_packages.append((CupyColumn, tensor_dict))
5354
gpu_source_col.append((CupyColumn, cp.asarray, cp))
5455

55-
if tf:
56+
if tf and HAS_GPU:
5657
with tf.device("/CPU"):
5758
tensor_dict_cpu = {
5859
"a__values": tf.convert_to_tensor(np.array([1, 2, 3])),
@@ -67,7 +68,7 @@
6768
gpu_target_packages.append((TensorflowColumn, tensor_dict_gpu))
6869
col_type.append(TensorflowColumn)
6970

70-
if th:
71+
if th and HAS_GPU:
7172
tensor_dict_cpu = {
7273
"a__values": th.tensor([1, 2, 3], dtype=th.int32),
7374
"a__offsets": th.tensor([0, 1, 3], dtype=th.int32),
@@ -132,7 +133,8 @@ def test_column_type_validation():
132133

133134

134135
@pytest.mark.skipif(
135-
tf is None, reason="Tensorflow is required for cross-framework validation tests"
136+
not (tf and HAS_GPU),
137+
reason="both TensorFlow and CUDA GPUs are required for cross-framework validation tests",
136138
)
137139
def test_column_device_validation():
138140
with tf.device("/CPU"):
@@ -283,7 +285,7 @@ def test_df_to_dict(device):
283285
assert_eq(df, roundtrip_df)
284286

285287

286-
@pytest.mark.skipif(cp is None, reason="requires GPU")
288+
@pytest.mark.skipif(cp is None or not HAS_GPU, reason="requires GPU and CuPy")
287289
def test_cpu_transfer():
288290
tensor_dict = {
289291
"a__values": cp.array([1, 2, 3]),
@@ -297,7 +299,7 @@ def test_cpu_transfer():
297299
assert isinstance(list(cpu_table.values())[0], NumpyColumn)
298300

299301

300-
@pytest.mark.skipif(cp is None, reason="requires GPU")
302+
@pytest.mark.skipif(cp is None or not HAS_GPU, reason="requires GPU and CuPy")
301303
def test_gpu_transfer():
302304
tensor_dict = {
303305
"a__values": np.array([1, 2, 3]),

Comments (0)