Skip to content

Commit 3e3da45

Browse files
Support Dataset cpu-mode in environment with GPUs that have not been detected (#236)
* Run tests in GPU environment with no GPUs visible * Update TensorTable tests with checks for HAS_GPU * Remove unused `_HAS_GPU` variable from `test_utils` * Wrap cupy/cudf imports in HAS_GPU check in `compat` * Update tests to use HAS_GPU from compat module * Reformat test_tensor_table.py * Move HAS_GPU import to compat module * Add pynvml dependency * Update functions in `dispatch` to not use HAS_GPU * Raise RuntimeError in Dataset if we can't run on GPU when cpu=False * Update `convert_data` to handle unavailable cudf and dask_cudf * Remove use of `HAS_GPU` from dispatch * Keep cudf and cupy values representing presence of package * Revert changes to `dataset.py`. Now part of #243 * Revert changes to `dispatch.py`. Now part of #244 * Use branch-name action for branch selection * Remove unused ref_type variable * Extend reason in `test_tensor_column.py` Co-authored-by: Karl Higley <kmhigley@gmail.com> * Extend reason in `tests/unit/table/test_tensor_column.py` Co-authored-by: Karl Higley <kmhigley@gmail.com> * Remove cudf import from compat. Now unrelated to this PR * Remove use of branch-name action. `docker` not available in runner * Add HAS_GPU checks with cupy to support env without visible devices * Correct value of empty visible devices * Update deps for GPU envs to match others * Update get_lib to account for missing visible GPU * Check HAS_GPU in `make_df` to handle visible GPU devices * Update Dataset to handle default case when no visible GPUs are found * Update fixtures to handle cudf with no visible devices * Update tests to handle case of no visible GPUs --------- Co-authored-by: Karl Higley <kmhigley@gmail.com> Co-authored-by: Karl Higley <karlb@nvidia.com>
1 parent d4f1c6e commit 3e3da45

File tree

12 files changed

+93
-42
lines changed

12 files changed

+93
-42
lines changed

.github/workflows/gpu-ci.yml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,21 @@ jobs:
2828
branch=${raw/origin\/}
2929
fi
3030
cd ${{ github.workspace }}; tox -e test-gpu -- $branch
31+
32+
gpu-ci-not-visible:
33+
runs-on: 1GPU
34+
35+
steps:
36+
- uses: actions/checkout@v3
37+
with:
38+
fetch-depth: 0
39+
- name: Run tests
40+
run: |
41+
ref_type=${{ github.ref_type }}
42+
branch=main
43+
if [[ $ref_type == "tag"* ]]
44+
then
45+
raw=$(git branch -r --contains ${{ github.ref_name }})
46+
branch=${raw/origin\/}
47+
fi
48+
cd ${{ github.workspace }}; tox -e test-gpu-not-visible -- $branch

merlin/core/dispatch.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def read_parquet_metadata(path):
129129

130130
def get_lib():
131131
"""Dispatch to the appropriate library (cudf or pandas) for the current environment"""
132-
return cudf or pd
132+
return cudf if (cudf and HAS_GPU) else pd
133133

134134

135135
def reinitialize(managed_memory=False):
@@ -540,7 +540,12 @@ def concat(objs, **kwargs):
540540

541541
def make_df(_like_df=None, device=None):
542542
"""Return a DataFrame with the same dtype as `_like_df`"""
543-
if not cudf or device == "cpu" or isinstance(_like_df, (pd.DataFrame, pd.Series)):
543+
if (
544+
not cudf
545+
or device == "cpu"
546+
or not HAS_GPU
547+
or isinstance(_like_df, (pd.DataFrame, pd.Series))
548+
):
544549
# move to pandas need it on CPU (host memory)
545550
# can be a cudf, cupy or numpy Series
546551
if cudf and isinstance(_like_df, (cudf.DataFrame, cudf.Series)):

merlin/io/dataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from fsspec.core import get_fs_token_paths
3131
from fsspec.utils import stringify_path
3232

33+
from merlin.core.compat import HAS_GPU
3334
from merlin.core.dispatch import (
3435
convert_data,
3536
hex_to_int,
@@ -246,7 +247,7 @@ def __init__(
246247
# Check if we are keeping data in cpu memory
247248
self.cpu = cpu
248249
if not self.cpu:
249-
self.cpu = cudf is None
250+
self.cpu = cudf is None or not HAS_GPU
250251

251252
# Keep track of base dataset (optional)
252253
self.base_dataset = base_dataset or self

tests/conftest.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import numpy as np
2323
import pandas as pd
2424

25+
from merlin.core.compat import HAS_GPU
26+
2527
try:
2628
import cudf
2729

@@ -107,8 +109,8 @@ def get_cuda_cluster():
107109

108110
@pytest.fixture(scope="session")
109111
def datasets(tmpdir_factory):
110-
_lib = cudf if cudf else pd
111-
_datalib = cudf if cudf else dask
112+
_lib = cudf if cudf and HAS_GPU else pd
113+
_datalib = cudf if cudf and HAS_GPU else dask
112114
df = _datalib.datasets.timeseries(
113115
start="2000-01-01",
114116
end="2000-01-04",
@@ -152,7 +154,7 @@ def datasets(tmpdir_factory):
152154
half = int(len(df) // 2)
153155

154156
# Write Parquet Dataset
155-
if cudf:
157+
if cudf and isinstance(df, cudf.DataFrame):
156158
df.iloc[:half].to_parquet(
157159
str(datadir["parquet"].join("dataset-0.parquet")), row_group_size_rows=5000
158160
)
@@ -191,7 +193,7 @@ def paths(engine, datasets):
191193

192194
@pytest.fixture(scope="function")
193195
def df(engine, paths):
194-
_lib = cudf if cudf else pd
196+
_lib = cudf if cudf and HAS_GPU else pd
195197
if engine == "parquet":
196198
df1 = _lib.read_parquet(paths[0])[mycols_pq]
197199
df2 = _lib.read_parquet(paths[1])[mycols_pq]

tests/unit/core/test_dispatch.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,9 @@
1717
import pandas as pd
1818
import pytest
1919

20-
from merlin.core.dispatch import HAS_GPU, concat_columns, is_list_dtype, list_val_dtype, make_df
21-
22-
try:
23-
import cupy as cp
24-
except ImportError:
25-
cp = None
26-
20+
from merlin.core.compat import HAS_GPU
21+
from merlin.core.compat import cupy as cp
22+
from merlin.core.dispatch import concat_columns, is_list_dtype, list_val_dtype, make_df
2723

2824
if HAS_GPU:
2925
_DEVICES = ["cpu", "gpu"]
@@ -53,7 +49,7 @@ def test_concat_columns(device):
5349
assert res.columns.to_list() == ["a", "b", "c"]
5450

5551

56-
@pytest.mark.skipif(not cp, reason="Cupy not available")
52+
@pytest.mark.skipif(not (cp and HAS_GPU), reason="Cupy not available")
5753
def test_pandas_cupy_combo():
5854
rand_cp_nd_arr = cp.random.uniform(0.0, 1.0, size=100)
5955
with pytest.raises(TypeError) as exc_info:

tests/unit/core/test_protocols.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,15 @@
1515
#
1616
import pytest
1717

18+
from merlin.core.compat import HAS_GPU, cudf
1819
from merlin.core.dispatch import make_df, make_series
1920
from merlin.core.protocols import DataFrameLike, DictLike, SeriesLike, Transformable
2021

22+
if HAS_GPU and cudf:
23+
_DEVICES = ["cpu", None]
24+
else:
25+
_DEVICES = ["cpu"]
26+
2127

2228
@pytest.mark.parametrize("protocol", [DictLike])
2329
def test_dictionary_is_dictlike(protocol):
@@ -26,15 +32,15 @@ def test_dictionary_is_dictlike(protocol):
2632
assert isinstance(obj, protocol)
2733

2834

29-
@pytest.mark.parametrize("device", [None, "cpu"])
35+
@pytest.mark.parametrize("device", _DEVICES)
3036
@pytest.mark.parametrize("protocol", [DictLike, DataFrameLike, Transformable])
3137
def test_dataframes_match_protocols(protocol, device):
3238
obj = make_df({}, device=device)
3339

3440
assert isinstance(obj, protocol)
3541

3642

37-
@pytest.mark.parametrize("device", [None, "cpu"])
43+
@pytest.mark.parametrize("device", _DEVICES)
3844
def test_series_are_serieslike(device):
3945
obj = make_series([], device=device)
4046

tests/unit/io/test_io.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import merlin.dtypes as md
3131
import merlin.io
3232
from merlin.core import dispatch
33+
from merlin.core.compat import HAS_GPU
3334
from merlin.io.parquet import GPUParquetWriter
3435
from merlin.schema.io.tensorflow_metadata import TensorflowMetadata
3536
from merlin.schema.tags import Tags, TagSet
@@ -38,6 +39,9 @@
3839
cudf = pytest.importorskip("cudf")
3940
dask_cudf = pytest.importorskip("dask_cudf")
4041

42+
if not HAS_GPU:
43+
pytestmark = pytest.mark.skip(reason="at least one visible CUDA GPU required.")
44+
4145

4246
def _check_partition_lens(ds):
4347
# Simple utility to check that the Parquet metadata

tests/unit/table/test_convert_column.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
import pytest
1919

20+
from merlin.core.compat import HAS_GPU
2021
from merlin.core.compat import cupy as cp
2122
from merlin.core.compat import numpy as np
2223
from merlin.core.compat import tensorflow as tf
@@ -27,7 +28,7 @@
2728
source_cols: List[TensorColumn] = []
2829
output_col_types: List[Type] = []
2930

30-
if cp:
31+
if cp and HAS_GPU:
3132
cp_array = cp.asarray([1, 2, 3, 4])
3233

3334
source_cols.append(CupyColumn(values=cp_array, offsets=cp_array))
@@ -39,7 +40,7 @@
3940
source_cols.append(NumpyColumn(values=np_array, offsets=np_array))
4041
output_col_types.append(NumpyColumn)
4142

42-
if tf:
43+
if tf and HAS_GPU:
4344
with tf.device("/CPU"):
4445
tf_tensor = tf.convert_to_tensor(np.array([1, 2, 3, 4]))
4546
offsets_tensor = tf.convert_to_tensor(np.array([0, 1, 2, 3, 4]))
@@ -52,7 +53,7 @@
5253
source_cols.extend([cpu_tf_column, gpu_tf_column])
5354
output_col_types.append(TensorflowColumn)
5455

55-
if th:
56+
if th and HAS_GPU:
5657
th_tensor = th.tensor([1, 2, 3, 4])
5758
cpu_th_column = TorchColumn(values=th_tensor, offsets=th_tensor)
5859

tests/unit/table/test_tensor_column.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
import pytest
1919

2020
import merlin.dtypes as md
21+
from merlin.core.compat import HAS_GPU
2122
from merlin.core.compat import cupy as cp
2223
from merlin.core.compat import numpy as np
2324
from merlin.core.compat import tensorflow as tf
@@ -31,7 +32,7 @@
3132
if np:
3233
col_types.append(NumpyColumn)
3334

34-
if cp:
35+
if cp and HAS_GPU:
3536
col_types.append(CupyColumn)
3637

3738
if tf:
@@ -91,7 +92,7 @@ def test_equality():
9192
assert np_col != np_col_3
9293

9394

94-
@pytest.mark.skipif(cp is None, reason="requires GPU")
95+
@pytest.mark.skipif(not (cp and HAS_GPU), reason="requires CuPy and GPU")
9596
def test_cupy_cpu_transfer():
9697
values = cp.array([1, 2, 3])
9798
offsets = cp.array([0, 1, 3])
@@ -108,7 +109,7 @@ def test_cupy_cpu_transfer():
108109
assert isinstance(cpu_col_again, NumpyColumn)
109110

110111

111-
@pytest.mark.skipif(cp is None, reason="requires GPU")
112+
@pytest.mark.skipif(not (cp and HAS_GPU), reason="requires CuPy and GPU")
112113
def test_numpy_gpu_transfer():
113114
values = np.array([1, 2, 3])
114115
offsets = np.array([0, 1, 3])
@@ -125,7 +126,7 @@ def test_numpy_gpu_transfer():
125126
assert isinstance(gpu_col_again, CupyColumn)
126127

127128

128-
@pytest.mark.skipif(th is None, reason="requires Torch")
129+
@pytest.mark.skipif(not (HAS_GPU and th), reason="requires Torch and GPU")
129130
def test_torch_data_transfer():
130131
values = th.tensor([1, 2, 3])
131132
offsets = th.tensor([0, 1, 3])
@@ -138,7 +139,7 @@ def test_torch_data_transfer():
138139
assert cpu_col_again.device == Device.CPU
139140

140141

141-
@pytest.mark.skipif(tf is None, reason="requires Tensorflow")
142+
@pytest.mark.skipif(not (tf and HAS_GPU), reason="requires TensorFlow and GPU")
142143
def test_tf_data_transfer():
143144
values = tf.constant([1, 2, 3])
144145
offsets = tf.constant([0, 1, 3])

tests/unit/table/test_tensor_table.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,12 @@
1717

1818
import pytest
1919

20+
from merlin.core.compat import HAS_GPU
2021
from merlin.core.compat import cupy as cp
2122
from merlin.core.compat import numpy as np
2223
from merlin.core.compat import tensorflow as tf
2324
from merlin.core.compat import torch as th
24-
from merlin.core.dispatch import HAS_GPU, df_from_dict, dict_from_df, make_df
25+
from merlin.core.dispatch import df_from_dict, dict_from_df, make_df
2526
from merlin.core.protocols import DictLike, Transformable
2627
from merlin.dag import BaseOperator, ColumnSelector
2728
from merlin.table import CupyColumn, Device, NumpyColumn, TensorflowColumn, TensorTable, TorchColumn
@@ -43,7 +44,7 @@
4344
cpu_target_packages.append((NumpyColumn, tensor_dict))
4445
cpu_source_col.append((NumpyColumn, np.array, np))
4546

46-
if cp:
47+
if cp and HAS_GPU:
4748
tensor_dict = {
4849
"a__values": cp.asarray([1, 2, 3]),
4950
"a__offsets": cp.asarray([0, 1, 3]),
@@ -52,7 +53,7 @@
5253
gpu_target_packages.append((CupyColumn, tensor_dict))
5354
gpu_source_col.append((CupyColumn, cp.asarray, cp))
5455

55-
if tf:
56+
if tf and HAS_GPU:
5657
with tf.device("/CPU"):
5758
tensor_dict_cpu = {
5859
"a__values": tf.convert_to_tensor(np.array([1, 2, 3])),
@@ -67,7 +68,7 @@
6768
gpu_target_packages.append((TensorflowColumn, tensor_dict_gpu))
6869
col_type.append(TensorflowColumn)
6970

70-
if th:
71+
if th and HAS_GPU:
7172
tensor_dict_cpu = {
7273
"a__values": th.tensor([1, 2, 3], dtype=th.int32),
7374
"a__offsets": th.tensor([0, 1, 3], dtype=th.int32),
@@ -132,7 +133,8 @@ def test_column_type_validation():
132133

133134

134135
@pytest.mark.skipif(
135-
tf is None, reason="Tensorflow is required for cross-framework validation tests"
136+
not (tf and HAS_GPU),
137+
reason="both TensorFlow and CUDA GPUs are required for cross-framework validation tests",
136138
)
137139
def test_column_device_validation():
138140
with tf.device("/CPU"):
@@ -283,7 +285,7 @@ def test_df_to_dict(device):
283285
assert_eq(df, roundtrip_df)
284286

285287

286-
@pytest.mark.skipif(cp is None, reason="requires GPU")
288+
@pytest.mark.skipif(cp is None or not HAS_GPU, reason="requires GPU and CuPy")
287289
def test_cpu_transfer():
288290
tensor_dict = {
289291
"a__values": cp.array([1, 2, 3]),
@@ -297,7 +299,7 @@ def test_cpu_transfer():
297299
assert isinstance(list(cpu_table.values())[0], NumpyColumn)
298300

299301

300-
@pytest.mark.skipif(cp is None, reason="requires GPU")
302+
@pytest.mark.skipif(cp is None or not HAS_GPU, reason="requires GPU and CuPy")
301303
def test_gpu_transfer():
302304
tensor_dict = {
303305
"a__values": np.array([1, 2, 3]),

Comments (0)