dask · phofl · Jul 27, 2023 · Jul 26, 2023 · Jul 26, 2023 · Jul 26, 2023
diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py
@@ -1051,6 +1051,7 @@ def new_collection(expr):
     """Create new collection from an expr"""
 
     meta = expr._meta
+    expr._name  # Ensure backend is imported
     if is_dataframe_like(meta):
         return DataFrame(expr)
     elif is_series_like(meta):

diff --git a/dask_expr/_expr.py b/dask_expr/_expr.py
@@ -151,6 +151,14 @@ def __getattr__(self, key):
         try:
             return object.__getattribute__(self, key)
         except AttributeError as err:
+            if key == "_meta":
+                # Avoid a recursive loop if/when `self._meta`
+                # produces an `AttributeError`
+                raise RuntimeError(
+                    f"Failed to generate metadata for {self}. "
+                    "This operation may not be supported by the current backend."
+                )
+
             # Allow operands to be accessed as attributes
             # as long as the keys are not already reserved
             # by existing methods/properties

diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py
@@ -1,19 +1,17 @@
-import importlib
 import os
 
 import dask.dataframe as dd
 import pytest
-from dask import config
 from dask.dataframe.utils import assert_eq
 
 from dask_expr import from_dask_dataframe, from_pandas, optimize, read_csv, read_parquet
 from dask_expr._expr import Expr, Lengths, Literal, Replace
 from dask_expr._reductions import Len
 from dask_expr.io import ReadParquet
+from dask_expr.tests._util import _backend_library
 
-# Import backend DataFrame library to test
-BACKEND = config.get("dataframe.backend", "pandas")
-lib = importlib.import_module(BACKEND)
+# Import the DataFrame library for the configured backend
+lib = _backend_library()
 
 
 def _make_file(dir, format="parquet", df=None):
@@ -215,8 +213,7 @@ def test_from_pandas_immutable():
 
 
 def test_parquet_complex_filters(tmpdir):
-    with config.set({"dataframe.backend": BACKEND}):
-        df = read_parquet(_make_file(tmpdir))
+    df = read_parquet(_make_file(tmpdir))
     pdf = df.compute()
     got = df["a"][df["b"] > df["b"].mean()]
     expect = pdf["a"][pdf["b"] > pdf["b"].mean()]

diff --git a/dask_expr/tests/_util.py b/dask_expr/tests/_util.py
@@ -0,0 +1,40 @@
+"""Helpers for running tests against the configured DataFrame backend."""
+
+import importlib
+from types import ModuleType
+
+import pytest
+from dask import config
+
+
+def _backend_name() -> str:
+    """Return the ``dataframe.backend`` config value (``"pandas"`` if unset)."""
+    return config.get("dataframe.backend", "pandas")
+
+
+def _backend_library() -> ModuleType:
+    """Import and return the module named by ``_backend_name()``.
+
+    Any exception raised by ``importlib.import_module`` (for example when
+    the backend library is not installed) propagates to the caller.
+    """
+    return importlib.import_module(_backend_name())
+
+
+def xfail_gpu(reason=None):
+    """Return a ``pytest.mark.xfail`` mark that is active only for cudf.
+
+    Parameters
+    ----------
+    reason : str, optional
+        Explanation attached to the mark. A falsy value (``None`` or ``""``)
+        falls back to a generic message.
+
+    Notes
+    -----
+    The backend name is read when this function is called, i.e. when the
+    decorator or ``pytest.param`` is evaluated, not when the test runs.
+    """
+    condition = _backend_name() == "cudf"
+    reason = reason or "Failure expected for cudf backend."
+    return pytest.mark.xfail(condition, reason=reason)
diff --git a/dask_expr/tests/test_categorical.py b/dask_expr/tests/test_categorical.py
@@ -1,13 +1,16 @@
-import pandas as pd
 import pytest
 from dask.dataframe import assert_eq
 
 from dask_expr import from_pandas
+from dask_expr.tests._util import _backend_library
+
+# Import the DataFrame library for the configured backend
+lib = _backend_library()
 
 
 @pytest.fixture
 def pdf():
-    pdf = pd.DataFrame({"x": [1, 2, 3, 4, 1, 2]}, dtype="category")
+    pdf = lib.DataFrame({"x": [1, 2, 3, 4, 1, 2]}, dtype="category")
     return pdf
 
 
@@ -22,7 +25,7 @@ def test_set_categories(df, pdf):
     ser = df.x.cat.as_unknown()
     assert not ser.cat.known
     ser = ser.cat.as_known()
-    assert_eq(ser.cat.categories, pd.Index([1, 2, 3, 4]))
+    assert_eq(ser.cat.categories, lib.Index([1, 2, 3, 4]))
     ser = ser.cat.set_categories([1, 2, 3, 5, 4])
-    assert_eq(ser.cat.categories, pd.Index([1, 2, 3, 5, 4]))
+    assert_eq(ser.cat.categories, lib.Index([1, 2, 3, 5, 4]))
     assert not ser.cat.ordered
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import importlib
 import operator
 import pickle
 
@@ -15,11 +14,10 @@
 from dask_expr._expr import are_co_aligned
 from dask_expr._reductions import Len
 from dask_expr.datasets import timeseries
+from dask_expr.tests._util import _backend_library, xfail_gpu
 
-# Import backend DataFrame library to test
-BACKEND = dask.config.get("dataframe.backend", "pandas")
-CUDF_BACKEND = BACKEND == "cudf"
-lib = importlib.import_module(BACKEND)
+# Import the DataFrame library for the configured backend
+lib = _backend_library()
 
 
 @pytest.fixture
@@ -53,15 +51,15 @@ def test_setitem(pdf, df):
     assert_eq(df, pdf)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="https://github.com/rapidsai/cudf/issues/10271")
+@xfail_gpu("https://github.com/rapidsai/cudf/issues/10271")
 def test_explode():
     pdf = lib.DataFrame({"a": [[1, 2], [3, 4]]})
     df = from_pandas(pdf)
     assert_eq(pdf.explode(column="a"), df.explode(column="a"))
     assert_eq(pdf.a.explode(), df.a.explode())
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="https://github.com/rapidsai/cudf/issues/10271")
+@xfail_gpu("https://github.com/rapidsai/cudf/issues/10271")
 def test_explode_simplify(pdf):
     pdf["z"] = 1
     df = from_pandas(pdf)
@@ -115,17 +113,19 @@ def test_dask(pdf, df):
         M.mean,
         M.std,
         M.var,
-        M.idxmin,
-        M.idxmax,
+        pytest.param(
+            M.idxmin, marks=xfail_gpu("https://github.com/rapidsai/cudf/issues/9602")
+        ),
+        pytest.param(
+            M.idxmax, marks=xfail_gpu("https://github.com/rapidsai/cudf/issues/9602")
+        ),
         pytest.param(
             lambda df: df.size,
             marks=pytest.mark.skip(reason="scalars don't work yet"),
         ),
     ],
 )
 def test_reductions(func, pdf, df):
-    if CUDF_BACKEND and func in [M.idxmin, M.idxmax]:
-        pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602")
     result = func(df)
     assert result.known_divisions
     assert_eq(result, func(pdf))
@@ -138,11 +138,17 @@ def test_reductions(func, pdf, df):
 
 
 @pytest.mark.parametrize("axis", [0, 1])
-@pytest.mark.parametrize("skipna", [True, False])
+@pytest.mark.parametrize(
+    "skipna",
+    [
+        True,
+        pytest.param(
+            False, marks=xfail_gpu("cudf requires skipna=True when nulls are present.")
+        ),
+    ],
+)
 @pytest.mark.parametrize("ddof", [1, 2])
 def test_std_kwargs(axis, skipna, ddof):
-    if CUDF_BACKEND and skipna is False:
-        pytest.xfail(reason="cudf requires skipna=True when nulls are present.")
     pdf = lib.DataFrame(
         {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15}
     )
@@ -153,7 +159,7 @@ def test_std_kwargs(axis, skipna, ddof):
     )
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="nbytes not supported by cudf")
+@xfail_gpu("nbytes not supported by cudf")
 def test_nbytes(pdf, df):
     with pytest.raises(NotImplementedError, match="nbytes is not implemented"):
         df.nbytes
@@ -280,7 +286,7 @@ def test_and_or(func, pdf, df):
     assert_eq(func(pdf), func(df), check_names=False)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="period_range not supported by cudf")
+@xfail_gpu("period_range not supported by cudf")
 @pytest.mark.parametrize("how", ["start", "end"])
 def test_to_timestamp(pdf, how):
     pdf.index = lib.period_range("2019-12-31", freq="D", periods=len(pdf))
@@ -321,7 +327,7 @@ def test_blockwise(func, pdf, df):
     assert_eq(func(pdf), func(df))
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="func not supported by cudf")
+@xfail_gpu("func not supported by cudf")
 @pytest.mark.parametrize(
     "func",
     [
@@ -353,7 +359,7 @@ def test_simplify_add_suffix_add_prefix(df, pdf):
     assert_eq(result, pdf.add_suffix("_2")["x_2"])
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="rename_axis not supported by cudf")
+@xfail_gpu("rename_axis not supported by cudf")
 def test_rename_axis(pdf):
     pdf.index.name = "a"
     pdf.columns.name = "b"
@@ -386,7 +392,7 @@ def test_repr(df):
     assert "sum(skipna=False)" in s
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="combine_first not supported by cudf")
+@xfail_gpu("combine_first not supported by cudf")
 def test_combine_first_simplify(pdf):
     df = from_pandas(pdf)
     pdf2 = pdf.rename(columns={"y": "z"})
@@ -681,7 +687,7 @@ def test_serialization(pdf, df):
     assert_eq(pickle.loads(before), pickle.loads(after))
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="Cannot apply lambda function in cudf")
+@xfail_gpu("Cannot apply lambda function in cudf")
 def test_size_optimized(df):
     expr = (df.x + 1).apply(lambda x: x).size
     out = optimize(expr)
@@ -697,10 +703,7 @@ def test_size_optimized(df):
 @pytest.mark.parametrize("fuse", [True, False])
 def test_tree_repr(fuse):
     s = from_pandas(lib.Series(range(10))).expr.tree_repr()
-    if BACKEND == "pandas":
-        assert "<pandas>" in s
-    else:
-        assert "<series>" in s
+    assert ("<pandas>" in s) or ("<series>" in s)
 
     df = timeseries()
     expr = ((df.x + 1).sum(skipna=False) + df.y.mean()).expr
@@ -963,7 +966,7 @@ def test_sample(df):
     assert_eq(result, expected)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf")
+@xfail_gpu("align not supported by cudf")
 def test_align(df, pdf):
     result_1, result_2 = df.align(df)
     pdf_result_1, pdf_result_2 = pdf.align(pdf)
@@ -976,7 +979,7 @@ def test_align(df, pdf):
     assert_eq(result_2, pdf_result_2)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf")
+@xfail_gpu("align not supported by cudf")
 def test_align_different_partitions():
     pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]})
     df = from_pandas(pdf, npartitions=2)
@@ -991,7 +994,7 @@ def test_align_different_partitions():
     assert_eq(result_2, pdf_result_2)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf")
+@xfail_gpu("align not supported by cudf")
 def test_align_unknown_partitions_same_root():
     pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)
@@ -1001,7 +1004,7 @@ def test_align_unknown_partitions_same_root():
     assert_eq(result_2, pdf_result_2)
 
 
-@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
+@xfail_gpu(reason="align not supported by cudf")
 def test_unknown_partitions_different_root():
     pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)
@@ -1011,7 +1014,7 @@ def test_unknown_partitions_different_root():
         df.align(df2)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="compute_hll_array doesn't work for cudf")
+@xfail_gpu("compute_hll_array doesn't work for cudf")
 def test_nunique_approx(df):
     result = df.nunique_approx().compute()
     assert 99 < result < 101
@@ -1050,7 +1053,7 @@ def test_assign_simplify_series(pdf):
     assert result._name == expected._name
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="assign function not supported by cudf")
+@xfail_gpu("assign function not supported by cudf")
 def test_assign_non_series_inputs(df, pdf):
     assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2))
     assert_eq(df.assign(a=2), pdf.assign(a=2))
@@ -1081,7 +1084,7 @@ def test_are_co_aligned(pdf, df):
     assert not are_co_aligned(merged_first.expr, df.expr)
 
 
-@pytest.mark.xfail(CUDF_BACKEND, reason="TODO")
+@xfail_gpu()
 def test_astype_categories(df):
     result = df.astype("category")
     assert_eq(result.x._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES]))