From 9875b2783c36716f957bdc91e9b946b8fefa4d8a Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Wed, 5 Jul 2023 13:37:05 -0700
Subject: [PATCH 01/18] basic cudf backend support

---
 dask_expr/io/parquet.py            |  4 ++
 dask_expr/tests/test_collection.py | 68 ++++++++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
index 0205a2f12..de116d7dc 100644
--- a/dask_expr/io/parquet.py
+++ b/dask_expr/io/parquet.py
@@ -417,6 +417,10 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO):
 
     @property
     def engine(self):
+        if dask.config.get("dataframe.backend", "pandas") == "cudf":
+            from dask_cudf.io.parquet import CudfEngine
+
+            return CudfEngine
         return get_engine("pyarrow")
 
     @property
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 3641c4320..0158736d1 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -1,3 +1,6 @@
+from __future__ import annotations
+
+import functools
 import operator
 import pickle
 import re
@@ -16,11 +19,27 @@
 from dask_expr.reductions import Len
 
 
-@pytest.fixture
-def pdf():
+try:
+    import cudf
+except ImportError:
+    cudf = None
+
+
+@functools.cache
+def _pdf():
     pdf = pd.DataFrame({"x": range(100)})
     pdf["y"] = pdf.x * 10.0
-    yield pdf
+    return pdf
+
+
+@functools.cache
+def _gdf():
+    return None if cudf is None else cudf.from_pandas(_pdf())
+
+
+@pytest.fixture
+def pdf():
+    yield _pdf().copy()
 
 
 @pytest.fixture
@@ -28,6 +47,29 @@ def df(pdf):
     yield from_pandas(pdf, npartitions=10)
 
 
+def cpu_gpu(data: dict | None = None, npartitions: int = 10):
+    """DataFrame parameterization for cpu and gpu backed data"""
+    if data is None:
+        pdf, gdf = _pdf().copy(), _gdf().copy()
+    else:
+        pdf = pd.DataFrame(data)
+        gdf = None if cudf is None else cudf.from_pandas(pdf)
+
+    return pytest.mark.parametrize(
+        "pdf,df",
+        [
+            pytest.param(pdf, from_pandas(pdf, npartitions), id="pandas"),
+            pytest.param(
+                gdf,
+                from_pandas(gdf, npartitions) if cudf else None,
+                id="cudf",
+                marks=pytest.mark.skipif(cudf is None, reason="cudf not found"),
+            ),
+        ],
+    )
+
+
+@cpu_gpu()
 def test_del(pdf, df):
     pdf = pdf.copy()
 
@@ -37,6 +79,7 @@ def test_del(pdf, df):
     assert_eq(pdf, df)
 
 
+@cpu_gpu()
 def test_setitem(pdf, df):
     pdf = pdf.copy()
     pdf["z"] = pdf.x + pdf.y
@@ -87,6 +130,7 @@ def test_meta_blockwise():
     assert set(cc.columns) == {"x", "y", "z"}
 
 
+@cpu_gpu()
 def test_dask(pdf, df):
     assert (df.x + df.y).npartitions == 10
     z = (df.x + df.y).sum()
@@ -168,6 +212,7 @@ def test_memory_usage(pdf):
         df.index.memory_usage(index=True)
 
 
+@cpu_gpu()
 @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest])
 def test_nlargest_nsmallest(df, pdf, func):
     assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x"))
@@ -176,6 +221,7 @@ def test_nlargest_nsmallest(df, pdf, func):
         func(df.x, n=5, columns="foo")
 
 
+@cpu_gpu()
 @pytest.mark.parametrize(
     "func",
     [
@@ -232,6 +278,7 @@ def test_unary_operators(func):
     assert_eq(func(pdf), func(df))
 
 
+@cpu_gpu()
 @pytest.mark.parametrize(
     "func",
     [
@@ -300,6 +347,7 @@ def test_rename_axis(pdf):
     assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy"))
 
 
+@cpu_gpu()
 def test_isin(df, pdf):
     values = [1, 2]
     assert_eq(pdf.isin(values), df.isin(values))
@@ -341,6 +389,7 @@ def test_rename_traverse_filter(df):
     assert str(result) == str(expected)
 
 
+@cpu_gpu()
 def test_columns_traverse_filters(pdf, df):
     result = df[df.x > 5].y.simplify()
     expected = df.y[df.x > 5]
@@ -377,6 +426,7 @@ def test_drop_duplicates_subset_simplify(pdf, subset, projection):
     assert str(result) == str(expected)
 
 
+@cpu_gpu()
 def test_broadcast(pdf, df):
     assert_eq(
         df + df.sum(),
@@ -388,6 +438,7 @@ def test_broadcast(pdf, df):
     )
 
 
+@cpu_gpu()
 def test_persist(pdf, df):
     a = df + 2
     b = a.persist()
@@ -400,11 +451,13 @@ def test_persist(pdf, df):
     assert_eq(b.y.sum(), (pdf + 2).y.sum())
 
 
+@cpu_gpu()
 def test_index(pdf, df):
     assert_eq(df.index, pdf.index)
     assert_eq(df.x.index, pdf.x.index)
 
 
+@cpu_gpu()
 @pytest.mark.parametrize("drop", [True, False])
 def test_reset_index(pdf, df, drop):
     assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False)
@@ -413,6 +466,7 @@ def test_reset_index(pdf, df, drop):
     )
 
 
+@cpu_gpu()
 def test_head(pdf, df):
     assert_eq(df.head(compute=False), pdf.head())
     assert_eq(df.head(compute=False, n=7), pdf.head(n=7))
@@ -436,6 +490,7 @@ def test_head_head(df):
     assert a.optimize()._name == b.optimize()._name
 
 
+@cpu_gpu()
 def test_tail(pdf, df):
     assert_eq(df.tail(compute=False), pdf.tail())
     assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7))
@@ -531,6 +586,7 @@ def test_from_pandas(pdf):
     assert "pandas" in df._name
 
 
+@cpu_gpu()
 def test_copy(pdf, df):
     original = df.copy()
     columns = tuple(original.columns)
@@ -541,6 +597,7 @@ def test_copy(pdf, df):
     assert "z" not in original.columns
 
 
+@cpu_gpu()
 def test_partitions(pdf, df):
     assert_eq(df.partitions[0], pdf.iloc[:10])
     assert_eq(df.partitions[1], pdf.iloc[10:20])
@@ -566,6 +623,7 @@ def test_column_getattr(df):
         df.foo
 
 
+@cpu_gpu()
 def test_serialization(pdf, df):
     before = pickle.dumps(df)
 
@@ -711,6 +769,7 @@ def test_repartition_no_op(df):
     assert result._name == df._name
 
 
+@cpu_gpu()
 def test_len(df, pdf):
     df2 = df[["x"]] + 1
     assert len(df2) == len(pdf)
@@ -724,6 +783,7 @@ def test_len(df, pdf):
     assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal)
 
 
+@cpu_gpu()
 def test_astype_simplify(df, pdf):
     q = df.astype({"x": "float64", "y": "float64"})["x"]
     result = q.simplify()
@@ -813,6 +873,7 @@ def test_dir(df):
     assert "sum" in dir(df.index)
 
 
+@cpu_gpu()
 @pytest.mark.parametrize(
     "func, args",
     [
@@ -941,6 +1002,7 @@ def test_assign_non_series_inputs(df, pdf):
     assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a)
 
 
+@cpu_gpu()
 def test_are_co_aligned(pdf, df):
     df2 = df.reset_index()
     assert are_co_aligned(df.expr, df2.expr)

From 3afa3b9ec408a7d257d7baa1a25cbfa9d87e1b86 Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Wed, 5 Jul 2023 13:39:57 -0700
Subject: [PATCH 02/18] formatting

---
 dask_expr/tests/test_collection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 0158736d1..1d514f637 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -18,7 +18,6 @@
 from dask_expr.expr import are_co_aligned
 from dask_expr.reductions import Len
 
-
 try:
     import cudf
 except ImportError:
@@ -49,6 +48,7 @@ def df(pdf):
 
 def cpu_gpu(data: dict | None = None, npartitions: int = 10):
     """DataFrame parameterization for cpu and gpu backed data"""
+
     if data is None:
         pdf, gdf = _pdf().copy(), _gdf().copy()
     else:

From bead5cbe97eadeca7b6bd011cfec47be651d21fe Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Thu, 6 Jul 2023 12:56:51 -0700
Subject: [PATCH 03/18] partial revision

---
 dask_expr/collection.py            |  11 +++
 dask_expr/io/parquet.py            |  11 +--
 dask_expr/io/tests/test_io.py      |  23 ++++++-
 dask_expr/tests/test_collection.py | 104 +++++++++++------------------
 4 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/dask_expr/collection.py b/dask_expr/collection.py
index 33abc9362..8b9807f06 100644
--- a/dask_expr/collection.py
+++ b/dask_expr/collection.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 import pandas as pd
+from dask import config
 from dask.base import DaskMethodsMixin, is_dask_collection, named_schedulers
 from dask.dataframe.core import (
     _concat,
@@ -902,6 +903,7 @@ def read_parquet(
     aggregate_files=None,
     parquet_file_extension=(".parq", ".parquet", ".pq"),
     filesystem="fsspec",
+    engine=None,
     **kwargs,
 ):
     from dask_expr.io.parquet import ReadParquet
@@ -911,6 +913,14 @@ def read_parquet(
 
     kwargs["dtype_backend"] = dtype_backend
 
+    if engine is None:
+        if config.get("dataframe.backend", "pandas") == "cudf":
+            from dask_cudf.io.parquet import CudfEngine
+
+            engine = CudfEngine
+        else:
+            engine = "pyarrow"
+
     return new_collection(
         ReadParquet(
             path,
@@ -927,6 +937,7 @@ def read_parquet(
             aggregate_files=aggregate_files,
             parquet_file_extension=parquet_file_extension,
             filesystem=filesystem,
+            engine=engine,
             kwargs=kwargs,
         )
     )
diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
index de116d7dc..fa170dd43 100644
--- a/dask_expr/io/parquet.py
+++ b/dask_expr/io/parquet.py
@@ -391,6 +391,7 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO):
         "aggregate_files",
         "parquet_file_extension",
         "filesystem",
+        "engine",
         "kwargs",
         "_partitions",
         "_series",
@@ -409,6 +410,7 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO):
         "aggregate_files": None,
         "parquet_file_extension": (".parq", ".parquet", ".pq"),
         "filesystem": "fsspec",
+        "engine": "pyarrow",
         "kwargs": None,
         "_partitions": None,
         "_series": False,
@@ -417,11 +419,10 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO):
 
     @property
     def engine(self):
-        if dask.config.get("dataframe.backend", "pandas") == "cudf":
-            from dask_cudf.io.parquet import CudfEngine
-
-            return CudfEngine
-        return get_engine("pyarrow")
+        _engine = self.operand("engine")
+        if isinstance(_engine, str):
+            return get_engine(_engine)
+        return _engine
 
     @property
     def columns(self):
diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py
index 6790a5fa6..6baae2da9 100644
--- a/dask_expr/io/tests/test_io.py
+++ b/dask_expr/io/tests/test_io.py
@@ -3,6 +3,7 @@
 import dask.dataframe as dd
 import pandas as pd
 import pytest
+from dask import config
 from dask.dataframe.utils import assert_eq
 
 from dask_expr import from_dask_dataframe, from_pandas, optimize, read_csv, read_parquet
@@ -10,6 +11,23 @@
 from dask_expr.io import ReadParquet
 from dask_expr.reductions import Len
 
+try:
+    import cudf
+except ImportError:
+    cudf = None
+
+
+@pytest.fixture(
+    params=[
+        "pandas",
+        pytest.param(
+            "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.")
+        ),
+    ]
+)
+def backend(request):
+    yield request.param
+
 
 def _make_file(dir, format="parquet", df=None):
     fn = os.path.join(str(dir), f"myfile.{format}")
@@ -206,8 +224,9 @@ def test_from_pandas_immutable():
     assert_eq(df, expected)
 
 
-def test_parquet_complex_filters(tmpdir):
-    df = read_parquet(_make_file(tmpdir))
+def test_parquet_complex_filters(tmpdir, backend):
+    with config.set({"dataframe.backend": backend}):
+        df = read_parquet(_make_file(tmpdir))
     pdf = df.compute()
     got = df["a"][df["b"] > df["b"].mean()]
     expect = pdf["a"][pdf["b"] > pdf["b"].mean()]
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 1d514f637..5012b6a01 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import functools
 import operator
 import pickle
 import re
@@ -24,21 +23,11 @@
     cudf = None
 
 
-@functools.cache
-def _pdf():
-    pdf = pd.DataFrame({"x": range(100)})
-    pdf["y"] = pdf.x * 10.0
-    return pdf
-
-
-@functools.cache
-def _gdf():
-    return None if cudf is None else cudf.from_pandas(_pdf())
-
-
 @pytest.fixture
 def pdf():
-    yield _pdf().copy()
+    pdf = pd.DataFrame({"x": range(100)})
+    pdf["y"] = pdf.x * 10.0
+    yield pdf
 
 
 @pytest.fixture
@@ -46,51 +35,55 @@ def df(pdf):
     yield from_pandas(pdf, npartitions=10)
 
 
-def cpu_gpu(data: dict | None = None, npartitions: int = 10):
-    """DataFrame parameterization for cpu and gpu backed data"""
+@pytest.fixture(
+    params=[
+        "pandas",
+        pytest.param(
+            "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.")
+        ),
+    ]
+)
+def backend(request):
+    yield request.param
+
 
-    if data is None:
-        pdf, gdf = _pdf().copy(), _gdf().copy()
+@pytest.fixture
+def bdf(backend, pdf):
+    # Multi-backend DataFrame fixture
+    if backend == "cudf":
+        yield cudf.from_pandas(pdf)
     else:
-        pdf = pd.DataFrame(data)
-        gdf = None if cudf is None else cudf.from_pandas(pdf)
-
-    return pytest.mark.parametrize(
-        "pdf,df",
-        [
-            pytest.param(pdf, from_pandas(pdf, npartitions), id="pandas"),
-            pytest.param(
-                gdf,
-                from_pandas(gdf, npartitions) if cudf else None,
-                id="cudf",
-                marks=pytest.mark.skipif(cudf is None, reason="cudf not found"),
-            ),
-        ],
-    )
+        yield pdf
 
 
-@cpu_gpu()
-def test_del(pdf, df):
+@pytest.fixture
+def xdf(bdf):
+    # Multi-backend Dask-Expression DataFrame fixture
+    yield from_pandas(bdf, npartitions=10)
+
+
+def test_del(pdf, xdf):
     pdf = pdf.copy()
 
     # Check __delitem__
     del pdf["x"]
-    del df["x"]
-    assert_eq(pdf, df)
+    del xdf["x"]
+    assert_eq(pdf, xdf)
 
 
-@cpu_gpu()
-def test_setitem(pdf, df):
+def test_setitem(pdf, xdf):
     pdf = pdf.copy()
     pdf["z"] = pdf.x + pdf.y
 
-    df["z"] = df.x + df.y
+    xdf["z"] = xdf.x + xdf.y
 
-    assert "z" in df.columns
-    assert_eq(df, pdf)
+    assert "z" in xdf.columns
+    assert_eq(xdf, pdf)
 
 
 def test_explode():
+    # CuDF backend does not support explode
+    # (See: https://github.com/rapidsai/cudf/issues/10271)
     pdf = pd.DataFrame({"a": [[1, 2], [3, 4]]})
     df = from_pandas(pdf)
     assert_eq(pdf.explode(column="a"), df.explode(column="a"))
@@ -130,12 +123,11 @@ def test_meta_blockwise():
     assert set(cc.columns) == {"x", "y", "z"}
 
 
-@cpu_gpu()
-def test_dask(pdf, df):
-    assert (df.x + df.y).npartitions == 10
-    z = (df.x + df.y).sum()
+def test_dask(bdf, xdf):
+    assert (xdf.x + xdf.y).npartitions == 10
+    z = (xdf.x + xdf.y).sum()
 
-    assert assert_eq(z, (pdf.x + pdf.y).sum())
+    assert assert_eq(z, (bdf.x + bdf.y).sum())
 
 
 @pytest.mark.parametrize(
@@ -212,7 +204,6 @@ def test_memory_usage(pdf):
         df.index.memory_usage(index=True)
 
 
-@cpu_gpu()
 @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest])
 def test_nlargest_nsmallest(df, pdf, func):
     assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x"))
@@ -221,7 +212,6 @@ def test_nlargest_nsmallest(df, pdf, func):
         func(df.x, n=5, columns="foo")
 
 
-@cpu_gpu()
 @pytest.mark.parametrize(
     "func",
     [
@@ -278,7 +268,6 @@ def test_unary_operators(func):
     assert_eq(func(pdf), func(df))
 
 
-@cpu_gpu()
 @pytest.mark.parametrize(
     "func",
     [
@@ -347,7 +336,6 @@ def test_rename_axis(pdf):
     assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy"))
 
 
-@cpu_gpu()
 def test_isin(df, pdf):
     values = [1, 2]
     assert_eq(pdf.isin(values), df.isin(values))
@@ -389,7 +377,6 @@ def test_rename_traverse_filter(df):
     assert str(result) == str(expected)
 
 
-@cpu_gpu()
 def test_columns_traverse_filters(pdf, df):
     result = df[df.x > 5].y.simplify()
     expected = df.y[df.x > 5]
@@ -426,7 +413,6 @@ def test_drop_duplicates_subset_simplify(pdf, subset, projection):
     assert str(result) == str(expected)
 
 
-@cpu_gpu()
 def test_broadcast(pdf, df):
     assert_eq(
         df + df.sum(),
@@ -438,7 +424,6 @@ def test_broadcast(pdf, df):
     )
 
 
-@cpu_gpu()
 def test_persist(pdf, df):
     a = df + 2
     b = a.persist()
@@ -451,13 +436,11 @@ def test_persist(pdf, df):
     assert_eq(b.y.sum(), (pdf + 2).y.sum())
 
 
-@cpu_gpu()
 def test_index(pdf, df):
     assert_eq(df.index, pdf.index)
     assert_eq(df.x.index, pdf.x.index)
 
 
-@cpu_gpu()
 @pytest.mark.parametrize("drop", [True, False])
 def test_reset_index(pdf, df, drop):
     assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False)
@@ -466,7 +449,6 @@ def test_reset_index(pdf, df, drop):
     )
 
 
-@cpu_gpu()
 def test_head(pdf, df):
     assert_eq(df.head(compute=False), pdf.head())
     assert_eq(df.head(compute=False, n=7), pdf.head(n=7))
@@ -490,7 +472,6 @@ def test_head_head(df):
     assert a.optimize()._name == b.optimize()._name
 
 
-@cpu_gpu()
 def test_tail(pdf, df):
     assert_eq(df.tail(compute=False), pdf.tail())
     assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7))
@@ -586,7 +567,6 @@ def test_from_pandas(pdf):
     assert "pandas" in df._name
 
 
-@cpu_gpu()
 def test_copy(pdf, df):
     original = df.copy()
     columns = tuple(original.columns)
@@ -597,7 +577,6 @@ def test_copy(pdf, df):
     assert "z" not in original.columns
 
 
-@cpu_gpu()
 def test_partitions(pdf, df):
     assert_eq(df.partitions[0], pdf.iloc[:10])
     assert_eq(df.partitions[1], pdf.iloc[10:20])
@@ -623,7 +602,6 @@ def test_column_getattr(df):
         df.foo
 
 
-@cpu_gpu()
 def test_serialization(pdf, df):
     before = pickle.dumps(df)
 
@@ -769,7 +747,6 @@ def test_repartition_no_op(df):
     assert result._name == df._name
 
 
-@cpu_gpu()
 def test_len(df, pdf):
     df2 = df[["x"]] + 1
     assert len(df2) == len(pdf)
@@ -783,7 +760,6 @@ def test_len(df, pdf):
     assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal)
 
 
-@cpu_gpu()
 def test_astype_simplify(df, pdf):
     q = df.astype({"x": "float64", "y": "float64"})["x"]
     result = q.simplify()
@@ -873,7 +849,6 @@ def test_dir(df):
     assert "sum" in dir(df.index)
 
 
-@cpu_gpu()
 @pytest.mark.parametrize(
     "func, args",
     [
@@ -1002,7 +977,6 @@ def test_assign_non_series_inputs(df, pdf):
     assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a)
 
 
-@cpu_gpu()
 def test_are_co_aligned(pdf, df):
     df2 = df.reset_index()
     assert are_co_aligned(df.expr, df2.expr)

From 4381738ba79ec8ea036d7d1fa33701c263a28a8b Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Thu, 6 Jul 2023 14:41:57 -0700
Subject: [PATCH 04/18] configure with backend fixture

---
 dask_expr/_util.py                 |   7 +
 dask_expr/collection.py            |   3 +-
 dask_expr/tests/test_collection.py | 621 +++++++++++++++--------------
 3 files changed, 340 insertions(+), 291 deletions(-)

diff --git a/dask_expr/_util.py b/dask_expr/_util.py
index a206fa9ad..671f09701 100644
--- a/dask_expr/_util.py
+++ b/dask_expr/_util.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from dask import config
+
 
 def _convert_to_list(column) -> list | None:
     if column is None or isinstance(column, list):
@@ -11,3 +13,8 @@ def _convert_to_list(column) -> list | None:
     else:
         column = [column]
     return column
+
+
+def _maybe_import_backend():
+    if config.get("dataframe.backend", "pandas") == "cudf":
+        import dask_cudf  # noqa: F401
diff --git a/dask_expr/collection.py b/dask_expr/collection.py
index 8b9807f06..87a328664 100644
--- a/dask_expr/collection.py
+++ b/dask_expr/collection.py
@@ -24,7 +24,7 @@
 from tlz import first
 
 from dask_expr import expr
-from dask_expr._util import _convert_to_list
+from dask_expr._util import _convert_to_list, _maybe_import_backend
 from dask_expr.concat import Concat
 from dask_expr.expr import Eval, no_default
 from dask_expr.merge import JoinRecursive, Merge
@@ -858,6 +858,7 @@ def optimize(collection, fuse=True):
 def from_pandas(data, *args, **kwargs):
     from dask_expr.io.io import FromPandas
 
+    _maybe_import_backend()
     return new_collection(FromPandas(data.copy(), *args, **kwargs))
 
 
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 5012b6a01..cba74e939 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -23,18 +23,6 @@
     cudf = None
 
 
-@pytest.fixture
-def pdf():
-    pdf = pd.DataFrame({"x": range(100)})
-    pdf["y"] = pdf.x * 10.0
-    yield pdf
-
-
-@pytest.fixture
-def df(pdf):
-    yield from_pandas(pdf, npartitions=10)
-
-
 @pytest.fixture(
     params=[
         "pandas",
@@ -48,12 +36,20 @@ def backend(request):
 
 
 @pytest.fixture
-def bdf(backend, pdf):
+def lib(backend):
-    # Multi-backend DataFrame fixture
+    # Backend library fixture (yields the cudf or pandas module)
     if backend == "cudf":
-        yield cudf.from_pandas(pdf)
+        yield cudf
     else:
-        yield pdf
+        yield pd
+
+
+@pytest.fixture
+def bdf(lib):
+    # Backend DataFrame fixture
+    df = lib.DataFrame({"x": range(100)})
+    df["y"] = df.x * 10.0
+    yield df
 
 
 @pytest.fixture
@@ -62,8 +58,8 @@ def xdf(bdf):
     yield from_pandas(bdf, npartitions=10)
 
 
-def test_del(pdf, xdf):
-    pdf = pdf.copy()
+def test_del(bdf, xdf):
+    pdf = bdf.copy()
 
     # Check __delitem__
     del pdf["x"]
@@ -71,8 +67,8 @@ def test_del(pdf, xdf):
     assert_eq(pdf, xdf)
 
 
-def test_setitem(pdf, xdf):
-    pdf = pdf.copy()
+def test_setitem(bdf, xdf):
+    pdf = bdf.copy()
     pdf["z"] = pdf.x + pdf.y
 
     xdf["z"] = xdf.x + xdf.y
@@ -90,7 +86,10 @@ def test_explode():
     assert_eq(pdf.a.explode(), df.a.explode())
 
 
-def test_explode_simplify(pdf):
+def test_explode_simplify(bdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271")
+    pdf = bdf.copy()
     pdf["z"] = 1
     df = from_pandas(pdf)
     q = df.explode(column="x")["y"]
@@ -99,8 +98,8 @@ def test_explode_simplify(pdf):
     assert result._name == expected._name
 
 
-def test_meta_divisions_name():
-    a = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
+def test_meta_divisions_name(lib):
+    a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
     df = 2 * from_pandas(a, npartitions=2)
     assert list(df.columns) == list(a.columns)
     assert df.npartitions == 2
@@ -112,9 +111,9 @@ def test_meta_divisions_name():
     assert "sum" in df.sum()._name
 
 
-def test_meta_blockwise():
-    a = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
-    b = pd.DataFrame({"z": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
+def test_meta_blockwise(lib):
+    a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
+    b = lib.DataFrame({"z": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
 
     aa = from_pandas(a, npartitions=2)
     bb = from_pandas(b, npartitions=2)
@@ -149,40 +148,45 @@ def test_dask(bdf, xdf):
         ),
     ],
 )
-def test_reductions(func, pdf, df):
-    result = func(df)
+def test_reductions(func, bdf, xdf, backend):
+    if backend == "cudf" and func in [M.idxmin, M.idxmax]:
+        pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602")
+    result = func(xdf)
     assert result.known_divisions
-    assert_eq(result, func(pdf))
-    result = func(df.x)
+    assert_eq(result, func(bdf))
+    result = func(xdf.x)
     assert not result.known_divisions
-    assert_eq(result, func(pdf.x))
+    assert_eq(result, func(bdf.x))
     # check_dtype False because sub-selection of columns that is pushed through
     # is not reflected in the meta calculation
-    assert_eq(func(df)["x"], func(pdf)["x"], check_dtype=False)
+    assert_eq(func(xdf)["x"], func(bdf)["x"], check_dtype=False)
 
 
-def test_nbytes(pdf, df):
+def test_nbytes(bdf, xdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="nbytes not supported by cudf")
     with pytest.raises(NotImplementedError, match="nbytes is not implemented"):
-        df.nbytes
-    assert_eq(df.x.nbytes, pdf.x.nbytes)
+        xdf.nbytes
+    assert_eq(xdf.x.nbytes, bdf.x.nbytes)
 
 
-def test_mode():
-    pdf = pd.DataFrame({"x": [1, 2, 3, 1, 2]})
+def test_mode(lib):
+    pdf = lib.DataFrame({"x": [1, 2, 3, 1, 2]})
     df = from_pandas(pdf, npartitions=3)
 
     assert_eq(df.x.mode(), pdf.x.mode(), check_names=False)
 
 
-def test_value_counts(df, pdf):
+def test_value_counts(xdf, bdf):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'value_counts'"
     ):
-        df.value_counts()
-    assert_eq(df.x.value_counts(), pdf.x.value_counts())
+        xdf.value_counts()
+    assert_eq(xdf.x.value_counts(), bdf.x.value_counts().astype("int64"))
 
 
-def test_dropna(pdf):
+def test_dropna(bdf):
+    pdf = bdf.copy()
     pdf.loc[0, "y"] = np.nan
     df = from_pandas(pdf)
     assert_eq(df.dropna(), pdf.dropna())
@@ -190,9 +194,10 @@ def test_dropna(pdf):
     assert_eq(df.y.dropna(), pdf.y.dropna())
 
 
-def test_memory_usage(pdf):
+def test_memory_usage(bdf):
     # Results are not equal with RangeIndex because pandas has one RangeIndex while
     # we have one RangeIndex per partition
+    pdf = bdf.copy()
     pdf.index = np.arange(len(pdf))
     df = from_pandas(pdf)
     assert_eq(df.memory_usage(), pdf.memory_usage())
@@ -205,11 +210,11 @@ def test_memory_usage(pdf):
 
 
 @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest])
-def test_nlargest_nsmallest(df, pdf, func):
-    assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x"))
-    assert_eq(func(df.x, n=5), func(pdf.x, n=5))
+def test_nlargest_nsmallest(xdf, bdf, func):
+    assert_eq(func(xdf, n=5, columns="x"), func(bdf, n=5, columns="x"))
+    assert_eq(func(xdf.x, n=5), func(bdf.x, n=5))
     with pytest.raises(TypeError, match="got an unexpected keyword argument"):
-        func(df.x, n=5, columns="foo")
+        func(xdf.x, n=5, columns="foo")
 
 
 @pytest.mark.parametrize(
@@ -226,8 +231,8 @@ def test_nlargest_nsmallest(df, pdf, func):
         lambda df: df.x != df.y,
     ],
 )
-def test_conditionals(func, pdf, df):
-    assert_eq(func(pdf), func(df), check_names=False)
+def test_conditionals(func, bdf, xdf):
+    assert_eq(func(bdf), func(xdf), check_names=False)
 
 
 @pytest.mark.parametrize(
@@ -241,8 +246,8 @@ def test_conditionals(func, pdf, df):
         lambda df: df.x.__rxor__(df.y),
     ],
 )
-def test_boolean_operators(func):
-    pdf = pd.DataFrame(
+def test_boolean_operators(func, lib):
+    pdf = lib.DataFrame(
         {"x": [True, False, True, False], "y": [True, False, False, False]}
     )
     df = from_pandas(pdf)
@@ -260,8 +265,8 @@ def test_boolean_operators(func):
         lambda df: +df,
     ],
 )
-def test_unary_operators(func):
-    pdf = pd.DataFrame(
+def test_unary_operators(func, lib):
+    pdf = lib.DataFrame(
         {"x": [True, False, True, False], "y": [True, False, False, False], "z": 1}
     )
     df = from_pandas(pdf)
@@ -275,23 +280,25 @@ def test_unary_operators(func):
         lambda df: df[(df.x > 7) & (df.x < 10)],
     ],
 )
-def test_and_or(func, pdf, df):
-    assert_eq(func(pdf), func(df), check_names=False)
+def test_and_or(func, bdf, xdf):
+    assert_eq(func(bdf), func(xdf), check_names=False)
 
 
 @pytest.mark.parametrize("how", ["start", "end"])
-def test_to_timestamp(pdf, how):
-    pdf.index = pd.period_range("2019-12-31", freq="D", periods=len(pdf))
-    df = from_pandas(pdf)
-    assert_eq(df.to_timestamp(how=how), pdf.to_timestamp(how=how))
-    assert_eq(df.x.to_timestamp(how=how), pdf.x.to_timestamp(how=how))
+def test_to_timestamp(bdf, how, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="period_range not supported by cudf")
+    bdf.index = pd.period_range("2019-12-31", freq="D", periods=len(bdf))
+    df = from_pandas(bdf)
+    assert_eq(df.to_timestamp(how=how), bdf.to_timestamp(how=how))
+    assert_eq(df.x.to_timestamp(how=how), bdf.x.to_timestamp(how=how))
 
 
 @pytest.mark.parametrize(
     "func",
     [
         lambda df: df.astype(int),
-        lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2),
+        # lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2),
         pytest.param(
             lambda df: df.map(lambda x: x + 1),
             marks=pytest.mark.skipif(
@@ -302,7 +309,7 @@ def test_to_timestamp(pdf, how):
         lambda df: df.x.clip(lower=10, upper=50),
         lambda df: df.x.between(left=10, right=50),
         lambda df: df.x.map(lambda x: x + 1),
-        lambda df: df.index.map(lambda x: x + 1),
+        # lambda df: df.index.map(lambda x: x + 1),
         lambda df: df[df.x > 5],
         lambda df: df.assign(a=df.x + df.y, b=df.x - df.y),
         lambda df: df.replace(to_replace=1, value=1000),
@@ -314,8 +321,8 @@ def test_to_timestamp(pdf, how):
         lambda df: df.rename(columns={"x": "xx"}),
         lambda df: df.rename(columns={"x": "xx"}).xx,
         lambda df: df.rename(columns={"x": "xx"})[["xx"]],
-        lambda df: df.combine_first(df),
-        lambda df: df.x.combine_first(df.y),
+        # lambda df: df.combine_first(df),
+        # lambda df: df.x.combine_first(df.y),
         lambda df: df.x.to_frame(),
         lambda df: df.drop(columns="x"),
         lambda df: df.x.index.to_frame(),
@@ -323,11 +330,29 @@ def test_to_timestamp(pdf, how):
         lambda df: df.select_dtypes(include="integer"),
     ],
 )
-def test_blockwise(func, pdf, df):
-    assert_eq(func(pdf), func(df))
+def test_blockwise(func, bdf, xdf):
+    assert_eq(func(bdf), func(xdf))
+
+
+@pytest.mark.parametrize(
+    "func",
+    [
+        lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2),
+        lambda df: df.index.map(lambda x: x + 1),
+        lambda df: df.combine_first(df),
+        lambda df: df.x.combine_first(df.y),
+    ],
+)
+def test_blockwise_cudf_fails(func, bdf, xdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="func not supported by cudf")
+    assert_eq(func(bdf), func(xdf))
 
 
-def test_rename_axis(pdf):
+def test_rename_axis(bdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="rename_axis not supported by cudf")
+    pdf = bdf.copy()
     pdf.index.name = "a"
     pdf.columns.name = "b"
     df = from_pandas(pdf, npartitions=10)
@@ -336,30 +361,34 @@ def test_rename_axis(pdf):
     assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy"))
 
 
-def test_isin(df, pdf):
+def test_isin(xdf, bdf):
     values = [1, 2]
-    assert_eq(pdf.isin(values), df.isin(values))
-    assert_eq(pdf.x.isin(values), df.x.isin(values))
+    assert_eq(bdf.isin(values), xdf.isin(values))
+    assert_eq(bdf.x.isin(values), xdf.x.isin(values))
 
 
-def test_round(pdf):
+def test_round(bdf):
+    pdf = bdf.copy()
     pdf += 0.5555
     df = from_pandas(pdf)
     assert_eq(df.round(decimals=1), pdf.round(decimals=1))
     assert_eq(df.x.round(decimals=1), pdf.x.round(decimals=1))
 
 
-def test_repr(df):
-    assert "+ 1" in str(df + 1)
-    assert "+ 1" in repr(df + 1)
+def test_repr(xdf):
+    assert "+ 1" in str(xdf + 1)
+    assert "+ 1" in repr(xdf + 1)
 
-    s = (df["x"] + 1).sum(skipna=False).expr
+    s = (xdf["x"] + 1).sum(skipna=False).expr
     assert '["x"]' in s or "['x']" in s
     assert "+ 1" in s
     assert "sum(skipna=False)" in s
 
 
-def test_combine_first_simplify(pdf):
+def test_combine_first_simplify(bdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="combine_first not supported by cudf")
+    pdf = bdf.copy()
     df = from_pandas(pdf)
     pdf2 = pdf.rename(columns={"y": "z"})
     df2 = from_pandas(pdf2)
@@ -371,40 +400,41 @@ def test_combine_first_simplify(pdf):
     assert_eq(result, pdf.combine_first(pdf2)[["z", "y"]])
 
 
-def test_rename_traverse_filter(df):
-    result = df.rename(columns={"x": "xx"})[["xx"]].simplify()
-    expected = df[["x"]].rename(columns={"x": "xx"})
+def test_rename_traverse_filter(xdf):
+    result = xdf.rename(columns={"x": "xx"})[["xx"]].simplify()
+    expected = xdf[["x"]].rename(columns={"x": "xx"})
     assert str(result) == str(expected)
 
 
-def test_columns_traverse_filters(pdf, df):
-    result = df[df.x > 5].y.simplify()
-    expected = df.y[df.x > 5]
+def test_columns_traverse_filters(xdf):
+    result = xdf[xdf.x > 5].y.simplify()
+    expected = xdf.y[xdf.x > 5]
 
     assert str(result) == str(expected)
 
 
-def test_clip_traverse_filters(df):
-    result = df.clip(lower=10).y.simplify()
-    expected = df.y.clip(lower=10)
+def test_clip_traverse_filters(xdf):
+    result = xdf.clip(lower=10).y.simplify()
+    expected = xdf.y.clip(lower=10)
 
     assert result._name == expected._name
 
-    result = df.clip(lower=10)[["x", "y"]].simplify()
-    expected = df.clip(lower=10)
+    result = xdf.clip(lower=10)[["x", "y"]].simplify()
+    expected = xdf.clip(lower=10)
 
     assert result._name == expected._name
 
-    arg = df.clip(lower=10)[["x"]]
+    arg = xdf.clip(lower=10)[["x"]]
     result = arg.simplify()
-    expected = df[["x"]].clip(lower=10)
+    expected = xdf[["x"]].clip(lower=10)
 
     assert result._name == expected._name
 
 
 @pytest.mark.parametrize("projection", ["zz", ["zz"], ["zz", "x"], "zz"])
 @pytest.mark.parametrize("subset", ["x", ["x"]])
-def test_drop_duplicates_subset_simplify(pdf, subset, projection):
+def test_drop_duplicates_subset_simplify(bdf, subset, projection):
+    pdf = bdf.copy()
     pdf["zz"] = 1
     df = from_pandas(pdf)
     result = df.drop_duplicates(subset=subset)[projection].simplify()
@@ -413,19 +443,19 @@ def test_drop_duplicates_subset_simplify(pdf, subset, projection):
     assert str(result) == str(expected)
 
 
-def test_broadcast(pdf, df):
+def test_broadcast(bdf, xdf):
     assert_eq(
-        df + df.sum(),
-        pdf + pdf.sum(),
+        xdf + xdf.sum(),
+        bdf + bdf.sum(),
     )
     assert_eq(
-        df.x + df.x.sum(),
-        pdf.x + pdf.x.sum(),
+        xdf.x + xdf.x.sum(),
+        bdf.x + bdf.x.sum(),
     )
 
 
-def test_persist(pdf, df):
-    a = df + 2
+def test_persist(bdf, xdf):
+    a = xdf + 2
     b = a.persist()
 
     assert_eq(a, b)
@@ -433,31 +463,31 @@ def test_persist(pdf, df):
 
     assert len(b.__dask_graph__()) == b.npartitions
 
-    assert_eq(b.y.sum(), (pdf + 2).y.sum())
+    assert_eq(b.y.sum(), (bdf + 2).y.sum())
 
 
-def test_index(pdf, df):
-    assert_eq(df.index, pdf.index)
-    assert_eq(df.x.index, pdf.x.index)
+def test_index(bdf, xdf):
+    assert_eq(xdf.index, bdf.index)
+    assert_eq(xdf.x.index, bdf.x.index)
 
 
 @pytest.mark.parametrize("drop", [True, False])
-def test_reset_index(pdf, df, drop):
-    assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False)
+def test_reset_index(bdf, xdf, drop):
+    assert_eq(xdf.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False)
     assert_eq(
-        df.x.reset_index(drop=drop), pdf.x.reset_index(drop=drop), check_index=False
+        xdf.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False
     )
 
 
-def test_head(pdf, df):
-    assert_eq(df.head(compute=False), pdf.head())
-    assert_eq(df.head(compute=False, n=7), pdf.head(n=7))
+def test_head(bdf, xdf):
+    assert_eq(xdf.head(compute=False), bdf.head())
+    assert_eq(xdf.head(compute=False, n=7), bdf.head(n=7))
 
-    assert df.head(compute=False).npartitions == 1
+    assert xdf.head(compute=False).npartitions == 1
 
 
-def test_head_down(df):
-    result = (df.x + df.y + 1).head(compute=False)
+def test_head_down(xdf):
+    result = (xdf.x + xdf.y + 1).head(compute=False)
     optimized = result.simplify()
 
     assert_eq(result, optimized)
@@ -465,22 +495,22 @@ def test_head_down(df):
     assert not isinstance(optimized.expr, expr.Head)
 
 
-def test_head_head(df):
-    a = df.head(compute=False).head(compute=False)
-    b = df.head(compute=False)
+def test_head_head(xdf):
+    a = xdf.head(compute=False).head(compute=False)
+    b = xdf.head(compute=False)
 
     assert a.optimize()._name == b.optimize()._name
 
 
-def test_tail(pdf, df):
-    assert_eq(df.tail(compute=False), pdf.tail())
-    assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7))
+def test_tail(bdf, xdf):
+    assert_eq(xdf.tail(compute=False), bdf.tail())
+    assert_eq(xdf.tail(compute=False, n=7), bdf.tail(n=7))
 
-    assert df.tail(compute=False).npartitions == 1
+    assert xdf.tail(compute=False).npartitions == 1
 
 
-def test_tail_down(df):
-    result = (df.x + df.y + 1).tail(compute=False)
+def test_tail_down(xdf):
+    result = (xdf.x + xdf.y + 1).tail(compute=False)
     optimized = optimize(result)
 
     assert_eq(result, optimized)
@@ -488,49 +518,49 @@ def test_tail_down(df):
     assert not isinstance(optimized.expr, expr.Tail)
 
 
-def test_tail_tail(df):
-    a = df.tail(compute=False).tail(compute=False)
-    b = df.tail(compute=False)
+def test_tail_tail(xdf):
+    a = xdf.tail(compute=False).tail(compute=False)
+    b = xdf.tail(compute=False)
 
     assert a.optimize()._name == b.optimize()._name
 
 
-def test_tail_repartition(df):
-    a = df.repartition(npartitions=10).tail()
-    b = df.tail()
+def test_tail_repartition(xdf):
+    a = xdf.repartition(npartitions=10).tail()
+    b = xdf.tail()
     assert_eq(a, b)
 
 
-def test_projection_stacking(df):
-    result = df[["x", "y"]]["x"]
+def test_projection_stacking(xdf):
+    result = xdf[["x", "y"]]["x"]
     optimized = result.simplify()
-    expected = df["x"]
+    expected = xdf["x"]
 
     assert optimized._name == expected._name
 
 
-def test_projection_stacking_coercion(pdf):
-    df = from_pandas(pdf)
-    assert_eq(df.x[0], pdf.x[0], check_divisions=False)
-    assert_eq(df.x[[0]], pdf.x[[0]], check_divisions=False)
+def test_projection_stacking_coercion(bdf):
+    df = from_pandas(bdf)
+    assert_eq(df.x[0], bdf.x[0], check_divisions=False)
+    assert_eq(df.x[[0]], bdf.x[[0]], check_divisions=False)
 
 
-def test_remove_unnecessary_projections(df):
-    result = (df + 1)[df.columns]
+def test_remove_unnecessary_projections(xdf):
+    result = (xdf + 1)[xdf.columns]
     optimized = result.simplify()
-    expected = df + 1
+    expected = xdf + 1
 
     assert optimized._name == expected._name
 
-    result = (df[["x"]] + 1)[["x"]]
+    result = (xdf[["x"]] + 1)[["x"]]
     optimized = result.simplify()
-    expected = df[["x"]] + 1
+    expected = xdf[["x"]] + 1
 
     assert optimized._name == expected._name
 
 
-def test_substitute(df):
-    pdf = pd.DataFrame(
+def test_substitute(lib):
+    pdf = lib.DataFrame(
         {
             "a": range(100),
             "b": range(100),
@@ -561,59 +591,59 @@ def test_substitute(df):
     assert result._name == expected._name
 
 
-def test_from_pandas(pdf):
-    df = from_pandas(pdf, npartitions=3)
+def test_from_pandas(bdf):
+    df = from_pandas(bdf, npartitions=3)
     assert df.npartitions == 3
     assert "pandas" in df._name
 
 
-def test_copy(pdf, df):
-    original = df.copy()
+def test_copy(xdf):
+    original = xdf.copy()
     columns = tuple(original.columns)
 
-    df["z"] = df.x + df.y
+    xdf["z"] = xdf.x + xdf.y
 
     assert tuple(original.columns) == columns
     assert "z" not in original.columns
 
 
-def test_partitions(pdf, df):
-    assert_eq(df.partitions[0], pdf.iloc[:10])
-    assert_eq(df.partitions[1], pdf.iloc[10:20])
-    assert_eq(df.partitions[1:3], pdf.iloc[10:30])
-    assert_eq(df.partitions[[3, 4]], pdf.iloc[30:50])
-    assert_eq(df.partitions[-1], pdf.iloc[90:])
+def test_partitions(bdf, xdf):
+    assert_eq(xdf.partitions[0], bdf.iloc[:10])
+    assert_eq(xdf.partitions[1], bdf.iloc[10:20])
+    assert_eq(xdf.partitions[1:3], bdf.iloc[10:30])
+    assert_eq(xdf.partitions[[3, 4]], bdf.iloc[30:50])
+    assert_eq(xdf.partitions[-1], bdf.iloc[90:])
 
-    out = (df + 1).partitions[0].simplify()
+    out = (xdf + 1).partitions[0].simplify()
     assert isinstance(out.expr, expr.Add)
     assert out.expr.left._partitions == [0]
 
     # Check culling
-    out = optimize(df.partitions[1])
+    out = optimize(xdf.partitions[1])
     assert len(out.dask) == 1
-    assert_eq(out, pdf.iloc[10:20])
+    assert_eq(out, bdf.iloc[10:20])
 
 
-def test_column_getattr(df):
-    df = df.expr
-    assert df.x._name == df["x"]._name
+def test_column_getattr(xdf):
+    xdf = xdf.expr
+    assert xdf.x._name == xdf["x"]._name
 
     with pytest.raises(AttributeError):
-        df.foo
+        xdf.foo
 
 
-def test_serialization(pdf, df):
-    before = pickle.dumps(df)
+def test_serialization(bdf, xdf):
+    before = pickle.dumps(xdf)
 
-    assert len(before) < 200 + len(pickle.dumps(pdf))
+    assert len(before) < 200 + len(pickle.dumps(bdf))
 
-    part = df.partitions[0].compute()
+    part = xdf.partitions[0].compute()
     assert (
-        len(pickle.dumps(df.__dask_graph__()))
-        < 1000 + len(pickle.dumps(part)) * df.npartitions
+        len(pickle.dumps(xdf.__dask_graph__()))
+        < 1000 + len(pickle.dumps(part)) * xdf.npartitions
     )
 
-    after = pickle.dumps(df)
+    after = pickle.dumps(xdf)
 
     assert before == after  # caching doesn't affect serialization
 
@@ -621,21 +651,23 @@ def test_serialization(pdf, df):
     assert_eq(pickle.loads(before), pickle.loads(after))
 
 
-def test_size_optimized(df):
-    expr = (df.x + 1).apply(lambda x: x).size
+def test_size_optimized(xdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="Cannot apply lambda function in cudf")
+    expr = (xdf.x + 1).apply(lambda x: x).size
     out = optimize(expr)
-    expected = optimize(df.x.size)
+    expected = optimize(xdf.x.size)
     assert out._name == expected._name
 
-    expr = (df + 1).apply(lambda x: x).size
+    expr = (xdf + 1).apply(lambda x: x).size
     out = optimize(expr)
-    expected = optimize(df.size)
+    expected = optimize(xdf.size)
     assert out._name == expected._name
 
 
 @pytest.mark.parametrize("fuse", [True, False])
-def test_tree_repr(df, fuse):
-    s = df.expr.tree_repr()
+def test_tree_repr(fuse):
+    s = from_pandas(pd.Series(range(10))).expr.tree_repr()
     assert "<pandas>" in s
 
     df = timeseries()
@@ -655,38 +687,38 @@ def test_tree_repr(df, fuse):
         assert s.count("|") == 9
 
 
-def test_simple_graphs(df):
-    expr = (df + 1).expr
+def test_simple_graphs(xdf):
+    expr = (xdf + 1).expr
     graph = expr.__dask_graph__()
 
-    assert graph[(expr._name, 0)] == (operator.add, (df.expr._name, 0), 1)
+    assert graph[(expr._name, 0)] == (operator.add, (xdf.expr._name, 0), 1)
 
 
-def test_map_partitions(df):
+def test_map_partitions(xdf):
     def combine_x_y(x, y, foo=None):
         assert foo == "bar"
         return x + y
 
-    df2 = df.map_partitions(combine_x_y, df + 1, foo="bar")
-    assert_eq(df2, df + (df + 1))
+    df2 = xdf.map_partitions(combine_x_y, xdf + 1, foo="bar")
+    assert_eq(df2, xdf + (xdf + 1))
 
 
-def test_map_partitions_broadcast(df):
+def test_map_partitions_broadcast(xdf):
     def combine_x_y(x, y, val, foo=None):
         assert foo == "bar"
         return x + y + val
 
-    df2 = df.map_partitions(combine_x_y, df["x"].sum(), 123, foo="bar")
-    assert_eq(df2, df + df["x"].sum() + 123)
-    assert_eq(df2.optimize(), df + df["x"].sum() + 123)
+    df2 = xdf.map_partitions(combine_x_y, xdf["x"].sum(), 123, foo="bar")
+    assert_eq(df2, xdf + xdf["x"].sum() + 123)
+    assert_eq(df2.optimize(), xdf + xdf["x"].sum() + 123)
 
 
 @pytest.mark.parametrize("opt", [True, False])
-def test_map_partitions_merge(opt):
+def test_map_partitions_merge(opt, lib):
     # Make simple left & right dfs
-    pdf1 = pd.DataFrame({"x": range(20), "y": range(20)})
+    pdf1 = lib.DataFrame({"x": range(20), "y": range(20)})
     df1 = from_pandas(pdf1, 2)
-    pdf2 = pd.DataFrame({"x": range(0, 20, 2), "z": range(10)})
+    pdf2 = lib.DataFrame({"x": range(0, 20, 2), "z": range(10)})
     df2 = from_pandas(pdf2, 1)
 
     # Partition-wise merge with map_partitions
@@ -703,37 +735,37 @@ def test_map_partitions_merge(opt):
     assert_eq(df3, expect, check_index=False)
 
 
-def test_depth(df):
-    assert df._depth() == 1
-    assert (df + 1)._depth() == 2
-    assert ((df.x + 1) + df.y)._depth() == 4
+def test_depth(xdf):
+    assert xdf._depth() == 1
+    assert (xdf + 1)._depth() == 2
+    assert ((xdf.x + 1) + xdf.y)._depth() == 4
 
 
-def test_partitions_nested(df):
-    a = expr.Partitions(expr.Partitions(df.expr, [2, 4, 6]), [0, 2])
-    b = expr.Partitions(df.expr, [2, 6])
+def test_partitions_nested(xdf):
+    a = expr.Partitions(expr.Partitions(xdf.expr, [2, 4, 6]), [0, 2])
+    b = expr.Partitions(xdf.expr, [2, 6])
 
     assert a.optimize()._name == b.optimize()._name
 
 
 @pytest.mark.parametrize("sort", [True, False])
 @pytest.mark.parametrize("npartitions", [7, 12])
-def test_repartition_npartitions(pdf, npartitions, sort):
-    df = from_pandas(pdf, sort=sort) + 1
+def test_repartition_npartitions(bdf, npartitions, sort):
+    df = from_pandas(bdf, sort=sort) + 1
     df2 = df.repartition(npartitions=npartitions)
     assert df2.npartitions == npartitions
     assert_eq(df, df2)
 
 
 @pytest.mark.parametrize("opt", [True, False])
-def test_repartition_divisions(df, opt):
-    end = df.divisions[-1] + 100
-    stride = end // (df.npartitions + 2)
+def test_repartition_divisions(xdf, opt):
+    end = xdf.divisions[-1] + 100
+    stride = end // (xdf.npartitions + 2)
     divisions = tuple(range(0, end, stride))
-    df2 = (df + 1).repartition(divisions=divisions, force=True)["x"]
+    df2 = (xdf + 1).repartition(divisions=divisions, force=True)["x"]
     df2 = optimize(df2) if opt else df2
     assert df2.divisions == divisions
-    assert_eq((df + 1)["x"], df2)
+    assert_eq((xdf + 1)["x"], df2)
 
     # Check partitions
     for p, part in enumerate(dask.compute(list(df2.index.partitions))[0]):
@@ -742,16 +774,16 @@ def test_repartition_divisions(df, opt):
             assert part.max() < df2.divisions[p + 1]
 
 
-def test_repartition_no_op(df):
-    result = df.repartition(divisions=df.divisions).optimize()
-    assert result._name == df._name
+def test_repartition_no_op(xdf):
+    result = xdf.repartition(divisions=xdf.divisions).optimize()
+    assert result._name == xdf._name
 
 
-def test_len(df, pdf):
-    df2 = df[["x"]] + 1
-    assert len(df2) == len(pdf)
+def test_len(xdf, bdf):
+    df2 = xdf[["x"]] + 1
+    assert len(df2) == len(bdf)
 
-    assert len(df[df.x > 5]) == len(pdf[pdf.x > 5])
+    assert len(xdf[xdf.x > 5]) == len(bdf[bdf.x > 5])
 
     first = df2.partitions[0].compute()
     assert len(df2.partitions[0]) == len(first)
@@ -760,62 +792,63 @@ def test_len(df, pdf):
     assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal)
 
 
-def test_astype_simplify(df, pdf):
-    q = df.astype({"x": "float64", "y": "float64"})["x"]
+def test_astype_simplify(xdf, bdf):
+    q = xdf.astype({"x": "float64", "y": "float64"})["x"]
     result = q.simplify()
-    expected = df["x"].astype({"x": "float64"})
+    expected = xdf["x"].astype({"x": "float64"})
     assert result._name == expected._name
-    assert_eq(q, pdf.astype({"x": "float64", "y": "float64"})["x"])
+    assert_eq(q, bdf.astype({"x": "float64", "y": "float64"})["x"])
 
-    q = df.astype({"y": "float64"})["x"]
+    q = xdf.astype({"y": "float64"})["x"]
     result = q.simplify()
-    expected = df["x"]
+    expected = xdf["x"]
     assert result._name == expected._name
 
-    q = df.astype("float64")["x"]
+    q = xdf.astype("float64")["x"]
     result = q.simplify()
-    expected = df["x"].astype("float64")
+    expected = xdf["x"].astype("float64")
     assert result._name == expected._name
 
 
-def test_drop_duplicates(df, pdf):
-    assert_eq(df.drop_duplicates(), pdf.drop_duplicates())
+def test_drop_duplicates(xdf, bdf, backend):
+    assert_eq(xdf.drop_duplicates(), bdf.drop_duplicates())
     assert_eq(
-        df.drop_duplicates(ignore_index=True), pdf.drop_duplicates(ignore_index=True)
+        xdf.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True)
     )
-    assert_eq(df.drop_duplicates(subset=["x"]), pdf.drop_duplicates(subset=["x"]))
-    assert_eq(df.x.drop_duplicates(), pdf.x.drop_duplicates())
+    assert_eq(xdf.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"]))
+    assert_eq(xdf.x.drop_duplicates(), bdf.x.drop_duplicates())
 
-    with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")):
-        df.drop_duplicates(subset=["a"])
+    if backend == "pandas":
+        with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")):
+            xdf.drop_duplicates(subset=["a"])
 
     with pytest.raises(TypeError, match="got an unexpected keyword argument"):
-        df.x.drop_duplicates(subset=["a"])
+        xdf.x.drop_duplicates(subset=["a"])
 
 
-def test_unique(df, pdf):
+def test_unique(xdf, bdf, lib):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'unique'"
     ):
-        df.unique()
+        xdf.unique()
 
     # pandas returns a numpy array while we return a Series/Index
-    assert_eq(df.x.unique(), pd.Series(pdf.x.unique(), name="x"))
-    assert_eq(df.index.unique(), pd.Index(pdf.index.unique()))
+    assert_eq(xdf.x.unique(), lib.Series(bdf.x.unique(), name="x"))
+    assert_eq(xdf.index.unique(), lib.Index(bdf.index.unique()))
 
 
-def test_walk(df):
-    df2 = df[df["x"] > 1][["y"]] + 1
+def test_walk(xdf):
+    df2 = xdf[xdf["x"] > 1][["y"]] + 1
     assert all(isinstance(ex, expr.Expr) for ex in df2.walk())
     exprs = set(df2.walk())
-    assert df.expr in exprs
-    assert df["x"].expr in exprs
-    assert (df["x"] > 1).expr in exprs
+    assert xdf.expr in exprs
+    assert xdf["x"].expr in exprs
+    assert (xdf["x"] > 1).expr in exprs
     assert 1 not in exprs
 
 
-def test_find_operations(df):
-    df2 = df[df["x"] > 1][["y"]] + 1
+def test_find_operations(xdf):
+    df2 = xdf[xdf["x"] > 1][["y"]] + 1
 
     filters = list(df2.find_operations(expr.Filter))
     assert len(filters) == 1
@@ -832,21 +865,21 @@ def test_find_operations(df):
 
 
 @pytest.mark.parametrize("subset", ["x", ["x"]])
-def test_dropna_simplify(pdf, subset):
-    pdf["z"] = 1
-    df = from_pandas(pdf)
+def test_dropna_simplify(bdf, subset):
+    bdf["z"] = 1
+    df = from_pandas(bdf)
     q = df.dropna(subset=subset)["y"]
     result = q.simplify()
     expected = df[["x", "y"]].dropna(subset=subset)["y"]
     assert result._name == expected._name
-    assert_eq(q, pdf.dropna(subset=subset)["y"])
+    assert_eq(q, bdf.dropna(subset=subset)["y"])
 
 
-def test_dir(df):
-    assert all(c in dir(df) for c in df.columns)
-    assert "sum" in dir(df)
-    assert "sum" in dir(df.x)
-    assert "sum" in dir(df.index)
+def test_dir(xdf):
+    assert all(c in dir(xdf) for c in xdf.columns)
+    assert "sum" in dir(xdf)
+    assert "sum" in dir(xdf.x)
+    assert "sum" in dir(xdf.index)
 
 
 @pytest.mark.parametrize(
@@ -862,38 +895,40 @@ def test_dir(df):
     ],
 )
 @pytest.mark.parametrize("indexer", ["x", ["x"]])
-def test_simplify_up_blockwise(df, pdf, func, args, indexer):
-    q = getattr(df, func)(*args)[indexer]
+def test_simplify_up_blockwise(xdf, bdf, func, args, indexer):
+    q = getattr(xdf, func)(*args)[indexer]
     result = q.simplify()
-    expected = getattr(df[indexer], func)(*args)
+    expected = getattr(xdf[indexer], func)(*args)
     assert result._name == expected._name
 
-    assert_eq(q, getattr(pdf, func)(*args)[indexer])
+    assert_eq(q, getattr(bdf, func)(*args)[indexer])
 
-    q = getattr(df, func)(*args)[["x", "y"]]
+    q = getattr(xdf, func)(*args)[["x", "y"]]
     result = q.simplify()
-    expected = getattr(df, func)(*args)
+    expected = getattr(xdf, func)(*args)
     assert result._name == expected._name
 
 
-def test_sample(df):
-    result = df.sample(frac=0.5)
+def test_sample(xdf):
+    result = xdf.sample(frac=0.5)
 
     assert_eq(result, result)
 
-    result = df.sample(frac=0.5, random_state=1234)
-    expected = df.sample(frac=0.5, random_state=1234)
+    result = xdf.sample(frac=0.5, random_state=1234)
+    expected = xdf.sample(frac=0.5, random_state=1234)
     assert_eq(result, expected)
 
 
-def test_align(df, pdf):
-    result_1, result_2 = df.align(df)
-    pdf_result_1, pdf_result_2 = pdf.align(pdf)
+def test_align(xdf, bdf, backend):
+    if backend == "cudf":
+        pytest.skip(reason="align not supported by cudf")
+    result_1, result_2 = xdf.align(xdf)
+    pdf_result_1, pdf_result_2 = bdf.align(bdf)
     assert_eq(result_1, pdf_result_1)
     assert_eq(result_2, pdf_result_2)
 
-    result_1, result_2 = df.x.align(df.x)
-    pdf_result_1, pdf_result_2 = pdf.x.align(pdf.x)
+    result_1, result_2 = xdf.x.align(xdf.x)
+    pdf_result_1, pdf_result_2 = bdf.x.align(bdf.x)
     assert_eq(result_1, pdf_result_1)
     assert_eq(result_2, pdf_result_2)
 
@@ -930,75 +965,81 @@ def test_unknown_partitions_different_root():
         df.align(df2)
 
 
-def test_nunique_approx(df):
-    result = df.nunique_approx().compute()
+def test_nunique_approx(xdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="compute_hll_array doesn't work for cudf")
+    result = xdf.nunique_approx().compute()
     assert 99 < result < 101
 
 
-def test_assign_simplify(pdf):
-    df = from_pandas(pdf)
-    df2 = from_pandas(pdf)
+def test_assign_simplify(bdf):
+    df = from_pandas(bdf)
+    df2 = from_pandas(bdf)
     df["new"] = df.x > 1
     result = df[["x", "new"]].simplify()
     expected = df2[["x"]].assign(new=df2.x > 1).simplify()
     assert result._name == expected._name
 
-    pdf["new"] = pdf.x > 1
-    assert_eq(pdf[["x", "new"]], result)
+    bdf["new"] = bdf.x > 1
+    assert_eq(bdf[["x", "new"]], result)
 
 
-def test_assign_simplify_new_column_not_needed(pdf):
-    df = from_pandas(pdf)
-    df2 = from_pandas(pdf)
+def test_assign_simplify_new_column_not_needed(bdf):
+    df = from_pandas(bdf)
+    df2 = from_pandas(bdf)
     df["new"] = df.x > 1
     result = df[["x"]].simplify()
     expected = df2[["x"]].simplify()
     assert result._name == expected._name
 
-    pdf["new"] = pdf.x > 1
-    assert_eq(result, pdf[["x"]])
+    bdf["new"] = bdf.x > 1
+    assert_eq(result, bdf[["x"]])
 
 
-def test_assign_simplify_series(pdf):
-    df = from_pandas(pdf)
-    df2 = from_pandas(pdf)
+def test_assign_simplify_series(bdf):
+    df = from_pandas(bdf)
+    df2 = from_pandas(bdf)
     df["new"] = df.x > 1
     result = df.new.simplify()
     expected = df2[[]].assign(new=df2.x > 1).new.simplify()
     assert result._name == expected._name
 
 
-def test_assign_non_series_inputs(df, pdf):
-    assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2))
-    assert_eq(df.assign(a=2), pdf.assign(a=2))
-    assert_eq(df.assign(a=df.x.sum()), pdf.assign(a=pdf.x.sum()))
+def test_assign_non_series_inputs(xdf, bdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="assign function not supported by cudf")
+    assert_eq(xdf.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2))
+    assert_eq(xdf.assign(a=2), bdf.assign(a=2))
+    assert_eq(xdf.assign(a=xdf.x.sum()), bdf.assign(a=bdf.x.sum()))
 
-    assert_eq(df.assign(a=lambda x: x.x * 2).y, pdf.assign(a=lambda x: x.x * 2).y)
-    assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a)
+    assert_eq(xdf.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y)
+    assert_eq(xdf.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a)
 
 
-def test_are_co_aligned(pdf, df):
-    df2 = df.reset_index()
-    assert are_co_aligned(df.expr, df2.expr)
-    assert are_co_aligned(df.expr, df2.sum().expr)
-    assert not are_co_aligned(df.expr, df2.repartition(npartitions=2).expr)
+def test_are_co_aligned(bdf, xdf):
+    df2 = xdf.reset_index()
+    assert are_co_aligned(xdf.expr, df2.expr)
+    assert are_co_aligned(xdf.expr, df2.sum().expr)
+    assert not are_co_aligned(xdf.expr, df2.repartition(npartitions=2).expr)
 
-    assert are_co_aligned(df.expr, df.sum().expr)
-    assert are_co_aligned((df + df.sum()).expr, df.sum().expr)
+    assert are_co_aligned(xdf.expr, xdf.sum().expr)
+    assert are_co_aligned((xdf + xdf.sum()).expr, xdf.sum().expr)
 
-    pdf = pdf.assign(z=1)
-    df3 = from_pandas(pdf, npartitions=10)
-    assert not are_co_aligned(df.expr, df3.expr)
-    assert are_co_aligned(df.expr, df3.sum().expr)
+    bdf = bdf.assign(z=1)
+    df3 = from_pandas(bdf, npartitions=10)
+    assert not are_co_aligned(xdf.expr, df3.expr)
+    assert are_co_aligned(xdf.expr, df3.sum().expr)
 
-    merged = df.merge(df2)
+    merged = xdf.merge(df2)
     merged_first = merged.reset_index()
     merged_second = merged.rename(columns={"x": "a"})
     assert are_co_aligned(merged_first.expr, merged_second.expr)
-    assert not are_co_aligned(merged_first.expr, df.expr)
+    assert not are_co_aligned(merged_first.expr, xdf.expr)
 
 
-def test_astype_categories(df):
-    result = df.astype("category")
+def test_astype_categories(xdf, backend):
+    if backend == "cudf":
+        pytest.xfail(reason="TODO")
+    result = xdf.astype("category")
     assert_eq(result.x._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES]))
     assert_eq(result.y._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES]))

From 3d229a4e7add3f38573878014899777524657c0c Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Thu, 6 Jul 2023 14:44:59 -0700
Subject: [PATCH 05/18] revert xdf name back to df

---
 dask_expr/tests/test_collection.py | 427 ++++++++++++++---------------
 1 file changed, 212 insertions(+), 215 deletions(-)

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index cba74e939..7c66f7623 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -32,12 +32,13 @@
     ]
 )
 def backend(request):
+    # Return backend-library label
     yield request.param
 
 
 @pytest.fixture
 def lib(backend):
-    # Multi-backend DataFrame fixture
+    # Return library associated with `backend` label
     if backend == "cudf":
         yield cudf
     else:
@@ -53,28 +54,28 @@ def bdf(lib):
 
 
 @pytest.fixture
-def xdf(bdf):
+def df(bdf):
     # Multi-backend Dask-Expression DataFrame fixture
     yield from_pandas(bdf, npartitions=10)
 
 
-def test_del(bdf, xdf):
+def test_del(bdf, df):
     pdf = bdf.copy()
 
     # Check __delitem__
     del pdf["x"]
-    del xdf["x"]
-    assert_eq(pdf, xdf)
+    del df["x"]
+    assert_eq(pdf, df)
 
 
-def test_setitem(bdf, xdf):
+def test_setitem(bdf, df):
     pdf = bdf.copy()
     pdf["z"] = pdf.x + pdf.y
 
-    xdf["z"] = xdf.x + xdf.y
+    df["z"] = df.x + df.y
 
-    assert "z" in xdf.columns
-    assert_eq(xdf, pdf)
+    assert "z" in df.columns
+    assert_eq(df, pdf)
 
 
 def test_explode():
@@ -122,9 +123,9 @@ def test_meta_blockwise(lib):
     assert set(cc.columns) == {"x", "y", "z"}
 
 
-def test_dask(bdf, xdf):
-    assert (xdf.x + xdf.y).npartitions == 10
-    z = (xdf.x + xdf.y).sum()
+def test_dask(bdf, df):
+    assert (df.x + df.y).npartitions == 10
+    z = (df.x + df.y).sum()
 
     assert assert_eq(z, (bdf.x + bdf.y).sum())
 
@@ -148,26 +149,26 @@ def test_dask(bdf, xdf):
         ),
     ],
 )
-def test_reductions(func, bdf, xdf, backend):
+def test_reductions(func, bdf, df, backend):
     if backend == "cudf" and func in [M.idxmin, M.idxmax]:
         pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602")
-    result = func(xdf)
+    result = func(df)
     assert result.known_divisions
     assert_eq(result, func(bdf))
-    result = func(xdf.x)
+    result = func(df.x)
     assert not result.known_divisions
     assert_eq(result, func(bdf.x))
     # check_dtype False because sub-selection of columns that is pushed through
     # is not reflected in the meta calculation
-    assert_eq(func(xdf)["x"], func(bdf)["x"], check_dtype=False)
+    assert_eq(func(df)["x"], func(bdf)["x"], check_dtype=False)
 
 
-def test_nbytes(bdf, xdf, backend):
+def test_nbytes(bdf, df, backend):
     if backend == "cudf":
         pytest.xfail(reason="nbytes not supported by cudf")
     with pytest.raises(NotImplementedError, match="nbytes is not implemented"):
-        xdf.nbytes
-    assert_eq(xdf.x.nbytes, bdf.x.nbytes)
+        df.nbytes
+    assert_eq(df.x.nbytes, bdf.x.nbytes)
 
 
 def test_mode(lib):
@@ -177,12 +178,12 @@ def test_mode(lib):
     assert_eq(df.x.mode(), pdf.x.mode(), check_names=False)
 
 
-def test_value_counts(xdf, bdf):
+def test_value_counts(df, bdf):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'value_counts'"
     ):
-        xdf.value_counts()
-    assert_eq(xdf.x.value_counts(), bdf.x.value_counts().astype("int64"))
+        df.value_counts()
+    assert_eq(df.x.value_counts(), bdf.x.value_counts().astype("int64"))
 
 
 def test_dropna(bdf):
@@ -210,11 +211,11 @@ def test_memory_usage(bdf):
 
 
 @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest])
-def test_nlargest_nsmallest(xdf, bdf, func):
-    assert_eq(func(xdf, n=5, columns="x"), func(bdf, n=5, columns="x"))
-    assert_eq(func(xdf.x, n=5), func(bdf.x, n=5))
+def test_nlargest_nsmallest(df, bdf, func):
+    assert_eq(func(df, n=5, columns="x"), func(bdf, n=5, columns="x"))
+    assert_eq(func(df.x, n=5), func(bdf.x, n=5))
     with pytest.raises(TypeError, match="got an unexpected keyword argument"):
-        func(xdf.x, n=5, columns="foo")
+        func(df.x, n=5, columns="foo")
 
 
 @pytest.mark.parametrize(
@@ -231,8 +232,8 @@ def test_nlargest_nsmallest(xdf, bdf, func):
         lambda df: df.x != df.y,
     ],
 )
-def test_conditionals(func, bdf, xdf):
-    assert_eq(func(bdf), func(xdf), check_names=False)
+def test_conditionals(func, bdf, df):
+    assert_eq(func(bdf), func(df), check_names=False)
 
 
 @pytest.mark.parametrize(
@@ -280,8 +281,8 @@ def test_unary_operators(func, lib):
         lambda df: df[(df.x > 7) & (df.x < 10)],
     ],
 )
-def test_and_or(func, bdf, xdf):
-    assert_eq(func(bdf), func(xdf), check_names=False)
+def test_and_or(func, bdf, df):
+    assert_eq(func(bdf), func(df), check_names=False)
 
 
 @pytest.mark.parametrize("how", ["start", "end"])
@@ -298,7 +299,6 @@ def test_to_timestamp(bdf, how, backend):
     "func",
     [
         lambda df: df.astype(int),
-        # lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2),
         pytest.param(
             lambda df: df.map(lambda x: x + 1),
             marks=pytest.mark.skipif(
@@ -309,7 +309,6 @@ def test_to_timestamp(bdf, how, backend):
         lambda df: df.x.clip(lower=10, upper=50),
         lambda df: df.x.between(left=10, right=50),
         lambda df: df.x.map(lambda x: x + 1),
-        # lambda df: df.index.map(lambda x: x + 1),
         lambda df: df[df.x > 5],
         lambda df: df.assign(a=df.x + df.y, b=df.x - df.y),
         lambda df: df.replace(to_replace=1, value=1000),
@@ -321,8 +320,6 @@ def test_to_timestamp(bdf, how, backend):
         lambda df: df.rename(columns={"x": "xx"}),
         lambda df: df.rename(columns={"x": "xx"}).xx,
         lambda df: df.rename(columns={"x": "xx"})[["xx"]],
-        # lambda df: df.combine_first(df),
-        # lambda df: df.x.combine_first(df.y),
         lambda df: df.x.to_frame(),
         lambda df: df.drop(columns="x"),
         lambda df: df.x.index.to_frame(),
@@ -330,8 +327,8 @@ def test_to_timestamp(bdf, how, backend):
         lambda df: df.select_dtypes(include="integer"),
     ],
 )
-def test_blockwise(func, bdf, xdf):
-    assert_eq(func(bdf), func(xdf))
+def test_blockwise(func, bdf, df):
+    assert_eq(func(bdf), func(df))
 
 
 @pytest.mark.parametrize(
@@ -343,10 +340,10 @@ def test_blockwise(func, bdf, xdf):
         lambda df: df.x.combine_first(df.y),
     ],
 )
-def test_blockwise_cudf_fails(func, bdf, xdf, backend):
+def test_blockwise_cudf_fails(func, bdf, df, backend):
     if backend == "cudf":
         pytest.xfail(reason="func not supported by cudf")
-    assert_eq(func(bdf), func(xdf))
+    assert_eq(func(bdf), func(df))
 
 
 def test_rename_axis(bdf, backend):
@@ -361,10 +358,10 @@ def test_rename_axis(bdf, backend):
     assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy"))
 
 
-def test_isin(xdf, bdf):
+def test_isin(df, bdf):
     values = [1, 2]
-    assert_eq(bdf.isin(values), xdf.isin(values))
-    assert_eq(bdf.x.isin(values), xdf.x.isin(values))
+    assert_eq(bdf.isin(values), df.isin(values))
+    assert_eq(bdf.x.isin(values), df.x.isin(values))
 
 
 def test_round(bdf):
@@ -375,11 +372,11 @@ def test_round(bdf):
     assert_eq(df.x.round(decimals=1), pdf.x.round(decimals=1))
 
 
-def test_repr(xdf):
-    assert "+ 1" in str(xdf + 1)
-    assert "+ 1" in repr(xdf + 1)
+def test_repr(df):
+    assert "+ 1" in str(df + 1)
+    assert "+ 1" in repr(df + 1)
 
-    s = (xdf["x"] + 1).sum(skipna=False).expr
+    s = (df["x"] + 1).sum(skipna=False).expr
     assert '["x"]' in s or "['x']" in s
     assert "+ 1" in s
     assert "sum(skipna=False)" in s
@@ -400,33 +397,33 @@ def test_combine_first_simplify(bdf, backend):
     assert_eq(result, pdf.combine_first(pdf2)[["z", "y"]])
 
 
-def test_rename_traverse_filter(xdf):
-    result = xdf.rename(columns={"x": "xx"})[["xx"]].simplify()
-    expected = xdf[["x"]].rename(columns={"x": "xx"})
+def test_rename_traverse_filter(df):
+    result = df.rename(columns={"x": "xx"})[["xx"]].simplify()
+    expected = df[["x"]].rename(columns={"x": "xx"})
     assert str(result) == str(expected)
 
 
-def test_columns_traverse_filters(xdf):
-    result = xdf[xdf.x > 5].y.simplify()
-    expected = xdf.y[xdf.x > 5]
+def test_columns_traverse_filters(df):
+    result = df[df.x > 5].y.simplify()
+    expected = df.y[df.x > 5]
 
     assert str(result) == str(expected)
 
 
-def test_clip_traverse_filters(xdf):
-    result = xdf.clip(lower=10).y.simplify()
-    expected = xdf.y.clip(lower=10)
+def test_clip_traverse_filters(df):
+    result = df.clip(lower=10).y.simplify()
+    expected = df.y.clip(lower=10)
 
     assert result._name == expected._name
 
-    result = xdf.clip(lower=10)[["x", "y"]].simplify()
-    expected = xdf.clip(lower=10)
+    result = df.clip(lower=10)[["x", "y"]].simplify()
+    expected = df.clip(lower=10)
 
     assert result._name == expected._name
 
-    arg = xdf.clip(lower=10)[["x"]]
+    arg = df.clip(lower=10)[["x"]]
     result = arg.simplify()
-    expected = xdf[["x"]].clip(lower=10)
+    expected = df[["x"]].clip(lower=10)
 
     assert result._name == expected._name
 
@@ -443,19 +440,19 @@ def test_drop_duplicates_subset_simplify(bdf, subset, projection):
     assert str(result) == str(expected)
 
 
-def test_broadcast(bdf, xdf):
+def test_broadcast(bdf, df):
     assert_eq(
-        xdf + xdf.sum(),
+        df + df.sum(),
         bdf + bdf.sum(),
     )
     assert_eq(
-        xdf.x + xdf.x.sum(),
+        df.x + df.x.sum(),
         bdf.x + bdf.x.sum(),
     )
 
 
-def test_persist(bdf, xdf):
-    a = xdf + 2
+def test_persist(bdf, df):
+    a = df + 2
     b = a.persist()
 
     assert_eq(a, b)
@@ -466,28 +463,28 @@ def test_persist(bdf, xdf):
     assert_eq(b.y.sum(), (bdf + 2).y.sum())
 
 
-def test_index(bdf, xdf):
-    assert_eq(xdf.index, bdf.index)
-    assert_eq(xdf.x.index, bdf.x.index)
+def test_index(bdf, df):
+    assert_eq(df.index, bdf.index)
+    assert_eq(df.x.index, bdf.x.index)
 
 
 @pytest.mark.parametrize("drop", [True, False])
-def test_reset_index(bdf, xdf, drop):
-    assert_eq(xdf.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False)
+def test_reset_index(bdf, df, drop):
+    assert_eq(df.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False)
     assert_eq(
-        xdf.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False
+        df.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False
     )
 
 
-def test_head(bdf, xdf):
-    assert_eq(xdf.head(compute=False), bdf.head())
-    assert_eq(xdf.head(compute=False, n=7), bdf.head(n=7))
+def test_head(bdf, df):
+    assert_eq(df.head(compute=False), bdf.head())
+    assert_eq(df.head(compute=False, n=7), bdf.head(n=7))
 
-    assert xdf.head(compute=False).npartitions == 1
+    assert df.head(compute=False).npartitions == 1
 
 
-def test_head_down(xdf):
-    result = (xdf.x + xdf.y + 1).head(compute=False)
+def test_head_down(df):
+    result = (df.x + df.y + 1).head(compute=False)
     optimized = result.simplify()
 
     assert_eq(result, optimized)
@@ -495,22 +492,22 @@ def test_head_down(xdf):
     assert not isinstance(optimized.expr, expr.Head)
 
 
-def test_head_head(xdf):
-    a = xdf.head(compute=False).head(compute=False)
-    b = xdf.head(compute=False)
+def test_head_head(df):
+    a = df.head(compute=False).head(compute=False)
+    b = df.head(compute=False)
 
     assert a.optimize()._name == b.optimize()._name
 
 
-def test_tail(bdf, xdf):
-    assert_eq(xdf.tail(compute=False), bdf.tail())
-    assert_eq(xdf.tail(compute=False, n=7), bdf.tail(n=7))
+def test_tail(bdf, df):
+    assert_eq(df.tail(compute=False), bdf.tail())
+    assert_eq(df.tail(compute=False, n=7), bdf.tail(n=7))
 
-    assert xdf.tail(compute=False).npartitions == 1
+    assert df.tail(compute=False).npartitions == 1
 
 
-def test_tail_down(xdf):
-    result = (xdf.x + xdf.y + 1).tail(compute=False)
+def test_tail_down(df):
+    result = (df.x + df.y + 1).tail(compute=False)
     optimized = optimize(result)
 
     assert_eq(result, optimized)
@@ -518,23 +515,23 @@ def test_tail_down(xdf):
     assert not isinstance(optimized.expr, expr.Tail)
 
 
-def test_tail_tail(xdf):
-    a = xdf.tail(compute=False).tail(compute=False)
-    b = xdf.tail(compute=False)
+def test_tail_tail(df):
+    a = df.tail(compute=False).tail(compute=False)
+    b = df.tail(compute=False)
 
     assert a.optimize()._name == b.optimize()._name
 
 
-def test_tail_repartition(xdf):
-    a = xdf.repartition(npartitions=10).tail()
-    b = xdf.tail()
+def test_tail_repartition(df):
+    a = df.repartition(npartitions=10).tail()
+    b = df.tail()
     assert_eq(a, b)
 
 
-def test_projection_stacking(xdf):
-    result = xdf[["x", "y"]]["x"]
+def test_projection_stacking(df):
+    result = df[["x", "y"]]["x"]
     optimized = result.simplify()
-    expected = xdf["x"]
+    expected = df["x"]
 
     assert optimized._name == expected._name
 
@@ -545,16 +542,16 @@ def test_projection_stacking_coercion(bdf):
     assert_eq(df.x[[0]], bdf.x[[0]], check_divisions=False)
 
 
-def test_remove_unnecessary_projections(xdf):
-    result = (xdf + 1)[xdf.columns]
+def test_remove_unnecessary_projections(df):
+    result = (df + 1)[df.columns]
     optimized = result.simplify()
-    expected = xdf + 1
+    expected = df + 1
 
     assert optimized._name == expected._name
 
-    result = (xdf[["x"]] + 1)[["x"]]
+    result = (df[["x"]] + 1)[["x"]]
     optimized = result.simplify()
-    expected = xdf[["x"]] + 1
+    expected = df[["x"]] + 1
 
     assert optimized._name == expected._name
 
@@ -597,53 +594,53 @@ def test_from_pandas(bdf):
     assert "pandas" in df._name
 
 
-def test_copy(xdf):
-    original = xdf.copy()
+def test_copy(df):
+    original = df.copy()
     columns = tuple(original.columns)
 
-    xdf["z"] = xdf.x + xdf.y
+    df["z"] = df.x + df.y
 
     assert tuple(original.columns) == columns
     assert "z" not in original.columns
 
 
-def test_partitions(bdf, xdf):
-    assert_eq(xdf.partitions[0], bdf.iloc[:10])
-    assert_eq(xdf.partitions[1], bdf.iloc[10:20])
-    assert_eq(xdf.partitions[1:3], bdf.iloc[10:30])
-    assert_eq(xdf.partitions[[3, 4]], bdf.iloc[30:50])
-    assert_eq(xdf.partitions[-1], bdf.iloc[90:])
+def test_partitions(bdf, df):
+    assert_eq(df.partitions[0], bdf.iloc[:10])
+    assert_eq(df.partitions[1], bdf.iloc[10:20])
+    assert_eq(df.partitions[1:3], bdf.iloc[10:30])
+    assert_eq(df.partitions[[3, 4]], bdf.iloc[30:50])
+    assert_eq(df.partitions[-1], bdf.iloc[90:])
 
-    out = (xdf + 1).partitions[0].simplify()
+    out = (df + 1).partitions[0].simplify()
     assert isinstance(out.expr, expr.Add)
     assert out.expr.left._partitions == [0]
 
     # Check culling
-    out = optimize(xdf.partitions[1])
+    out = optimize(df.partitions[1])
     assert len(out.dask) == 1
     assert_eq(out, bdf.iloc[10:20])
 
 
-def test_column_getattr(xdf):
-    xdf = xdf.expr
-    assert xdf.x._name == xdf["x"]._name
+def test_column_getattr(df):
+    df = df.expr
+    assert df.x._name == df["x"]._name
 
     with pytest.raises(AttributeError):
-        xdf.foo
+        df.foo
 
 
-def test_serialization(bdf, xdf):
-    before = pickle.dumps(xdf)
+def test_serialization(bdf, df):
+    before = pickle.dumps(df)
 
     assert len(before) < 200 + len(pickle.dumps(bdf))
 
-    part = xdf.partitions[0].compute()
+    part = df.partitions[0].compute()
     assert (
-        len(pickle.dumps(xdf.__dask_graph__()))
-        < 1000 + len(pickle.dumps(part)) * xdf.npartitions
+        len(pickle.dumps(df.__dask_graph__()))
+        < 1000 + len(pickle.dumps(part)) * df.npartitions
     )
 
-    after = pickle.dumps(xdf)
+    after = pickle.dumps(df)
 
     assert before == after  # caching doesn't affect serialization
 
@@ -651,17 +648,17 @@ def test_serialization(bdf, xdf):
     assert_eq(pickle.loads(before), pickle.loads(after))
 
 
-def test_size_optimized(xdf, backend):
+def test_size_optimized(df, backend):
     if backend == "cudf":
         pytest.xfail(reason="Cannot apply lambda function in cudf")
-    expr = (xdf.x + 1).apply(lambda x: x).size
+    expr = (df.x + 1).apply(lambda x: x).size
     out = optimize(expr)
-    expected = optimize(xdf.x.size)
+    expected = optimize(df.x.size)
     assert out._name == expected._name
 
-    expr = (xdf + 1).apply(lambda x: x).size
+    expr = (df + 1).apply(lambda x: x).size
     out = optimize(expr)
-    expected = optimize(xdf.size)
+    expected = optimize(df.size)
     assert out._name == expected._name
 
 
@@ -687,30 +684,30 @@ def test_tree_repr(fuse):
         assert s.count("|") == 9
 
 
-def test_simple_graphs(xdf):
-    expr = (xdf + 1).expr
+def test_simple_graphs(df):
+    expr = (df + 1).expr
     graph = expr.__dask_graph__()
 
-    assert graph[(expr._name, 0)] == (operator.add, (xdf.expr._name, 0), 1)
+    assert graph[(expr._name, 0)] == (operator.add, (df.expr._name, 0), 1)
 
 
-def test_map_partitions(xdf):
+def test_map_partitions(df):
     def combine_x_y(x, y, foo=None):
         assert foo == "bar"
         return x + y
 
-    df2 = xdf.map_partitions(combine_x_y, xdf + 1, foo="bar")
-    assert_eq(df2, xdf + (xdf + 1))
+    df2 = df.map_partitions(combine_x_y, df + 1, foo="bar")
+    assert_eq(df2, df + (df + 1))
 
 
-def test_map_partitions_broadcast(xdf):
+def test_map_partitions_broadcast(df):
     def combine_x_y(x, y, val, foo=None):
         assert foo == "bar"
         return x + y + val
 
-    df2 = xdf.map_partitions(combine_x_y, xdf["x"].sum(), 123, foo="bar")
-    assert_eq(df2, xdf + xdf["x"].sum() + 123)
-    assert_eq(df2.optimize(), xdf + xdf["x"].sum() + 123)
+    df2 = df.map_partitions(combine_x_y, df["x"].sum(), 123, foo="bar")
+    assert_eq(df2, df + df["x"].sum() + 123)
+    assert_eq(df2.optimize(), df + df["x"].sum() + 123)
 
 
 @pytest.mark.parametrize("opt", [True, False])
@@ -735,15 +732,15 @@ def test_map_partitions_merge(opt, lib):
     assert_eq(df3, expect, check_index=False)
 
 
-def test_depth(xdf):
-    assert xdf._depth() == 1
-    assert (xdf + 1)._depth() == 2
-    assert ((xdf.x + 1) + xdf.y)._depth() == 4
+def test_depth(df):
+    assert df._depth() == 1
+    assert (df + 1)._depth() == 2
+    assert ((df.x + 1) + df.y)._depth() == 4
 
 
-def test_partitions_nested(xdf):
-    a = expr.Partitions(expr.Partitions(xdf.expr, [2, 4, 6]), [0, 2])
-    b = expr.Partitions(xdf.expr, [2, 6])
+def test_partitions_nested(df):
+    a = expr.Partitions(expr.Partitions(df.expr, [2, 4, 6]), [0, 2])
+    b = expr.Partitions(df.expr, [2, 6])
 
     assert a.optimize()._name == b.optimize()._name
 
@@ -758,14 +755,14 @@ def test_repartition_npartitions(bdf, npartitions, sort):
 
 
 @pytest.mark.parametrize("opt", [True, False])
-def test_repartition_divisions(xdf, opt):
-    end = xdf.divisions[-1] + 100
-    stride = end // (xdf.npartitions + 2)
+def test_repartition_divisions(df, opt):
+    end = df.divisions[-1] + 100
+    stride = end // (df.npartitions + 2)
     divisions = tuple(range(0, end, stride))
-    df2 = (xdf + 1).repartition(divisions=divisions, force=True)["x"]
+    df2 = (df + 1).repartition(divisions=divisions, force=True)["x"]
     df2 = optimize(df2) if opt else df2
     assert df2.divisions == divisions
-    assert_eq((xdf + 1)["x"], df2)
+    assert_eq((df + 1)["x"], df2)
 
     # Check partitions
     for p, part in enumerate(dask.compute(list(df2.index.partitions))[0]):
@@ -774,16 +771,16 @@ def test_repartition_divisions(xdf, opt):
             assert part.max() < df2.divisions[p + 1]
 
 
-def test_repartition_no_op(xdf):
-    result = xdf.repartition(divisions=xdf.divisions).optimize()
-    assert result._name == xdf._name
+def test_repartition_no_op(df):
+    result = df.repartition(divisions=df.divisions).optimize()
+    assert result._name == df._name
 
 
-def test_len(xdf, bdf):
-    df2 = xdf[["x"]] + 1
+def test_len(df, bdf):
+    df2 = df[["x"]] + 1
     assert len(df2) == len(bdf)
 
-    assert len(xdf[xdf.x > 5]) == len(bdf[bdf.x > 5])
+    assert len(df[df.x > 5]) == len(bdf[bdf.x > 5])
 
     first = df2.partitions[0].compute()
     assert len(df2.partitions[0]) == len(first)
@@ -792,63 +789,63 @@ def test_len(xdf, bdf):
     assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal)
 
 
-def test_astype_simplify(xdf, bdf):
-    q = xdf.astype({"x": "float64", "y": "float64"})["x"]
+def test_astype_simplify(df, bdf):
+    q = df.astype({"x": "float64", "y": "float64"})["x"]
     result = q.simplify()
-    expected = xdf["x"].astype({"x": "float64"})
+    expected = df["x"].astype({"x": "float64"})
     assert result._name == expected._name
     assert_eq(q, bdf.astype({"x": "float64", "y": "float64"})["x"])
 
-    q = xdf.astype({"y": "float64"})["x"]
+    q = df.astype({"y": "float64"})["x"]
     result = q.simplify()
-    expected = xdf["x"]
+    expected = df["x"]
     assert result._name == expected._name
 
-    q = xdf.astype("float64")["x"]
+    q = df.astype("float64")["x"]
     result = q.simplify()
-    expected = xdf["x"].astype("float64")
+    expected = df["x"].astype("float64")
     assert result._name == expected._name
 
 
-def test_drop_duplicates(xdf, bdf, backend):
-    assert_eq(xdf.drop_duplicates(), bdf.drop_duplicates())
+def test_drop_duplicates(df, bdf, backend):
+    assert_eq(df.drop_duplicates(), bdf.drop_duplicates())
     assert_eq(
-        xdf.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True)
+        df.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True)
     )
-    assert_eq(xdf.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"]))
-    assert_eq(xdf.x.drop_duplicates(), bdf.x.drop_duplicates())
+    assert_eq(df.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"]))
+    assert_eq(df.x.drop_duplicates(), bdf.x.drop_duplicates())
 
     if backend == "pandas":
         with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")):
-            xdf.drop_duplicates(subset=["a"])
+            df.drop_duplicates(subset=["a"])
 
     with pytest.raises(TypeError, match="got an unexpected keyword argument"):
-        xdf.x.drop_duplicates(subset=["a"])
+        df.x.drop_duplicates(subset=["a"])
 
 
-def test_unique(xdf, bdf, lib):
+def test_unique(df, bdf, lib):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'unique'"
     ):
-        xdf.unique()
+        df.unique()
 
     # pandas returns a numpy array while we return a Series/Index
-    assert_eq(xdf.x.unique(), lib.Series(bdf.x.unique(), name="x"))
-    assert_eq(xdf.index.unique(), lib.Index(bdf.index.unique()))
+    assert_eq(df.x.unique(), lib.Series(bdf.x.unique(), name="x"))
+    assert_eq(df.index.unique(), lib.Index(bdf.index.unique()))
 
 
-def test_walk(xdf):
-    df2 = xdf[xdf["x"] > 1][["y"]] + 1
+def test_walk(df):
+    df2 = df[df["x"] > 1][["y"]] + 1
     assert all(isinstance(ex, expr.Expr) for ex in df2.walk())
     exprs = set(df2.walk())
-    assert xdf.expr in exprs
-    assert xdf["x"].expr in exprs
-    assert (xdf["x"] > 1).expr in exprs
+    assert df.expr in exprs
+    assert df["x"].expr in exprs
+    assert (df["x"] > 1).expr in exprs
     assert 1 not in exprs
 
 
-def test_find_operations(xdf):
-    df2 = xdf[xdf["x"] > 1][["y"]] + 1
+def test_find_operations(df):
+    df2 = df[df["x"] > 1][["y"]] + 1
 
     filters = list(df2.find_operations(expr.Filter))
     assert len(filters) == 1
@@ -875,11 +872,11 @@ def test_dropna_simplify(bdf, subset):
     assert_eq(q, bdf.dropna(subset=subset)["y"])
 
 
-def test_dir(xdf):
-    assert all(c in dir(xdf) for c in xdf.columns)
-    assert "sum" in dir(xdf)
-    assert "sum" in dir(xdf.x)
-    assert "sum" in dir(xdf.index)
+def test_dir(df):
+    assert all(c in dir(df) for c in df.columns)
+    assert "sum" in dir(df)
+    assert "sum" in dir(df.x)
+    assert "sum" in dir(df.index)
 
 
 @pytest.mark.parametrize(
@@ -895,39 +892,39 @@ def test_dir(xdf):
     ],
 )
 @pytest.mark.parametrize("indexer", ["x", ["x"]])
-def test_simplify_up_blockwise(xdf, bdf, func, args, indexer):
-    q = getattr(xdf, func)(*args)[indexer]
+def test_simplify_up_blockwise(df, bdf, func, args, indexer):
+    q = getattr(df, func)(*args)[indexer]
     result = q.simplify()
-    expected = getattr(xdf[indexer], func)(*args)
+    expected = getattr(df[indexer], func)(*args)
     assert result._name == expected._name
 
     assert_eq(q, getattr(bdf, func)(*args)[indexer])
 
-    q = getattr(xdf, func)(*args)[["x", "y"]]
+    q = getattr(df, func)(*args)[["x", "y"]]
     result = q.simplify()
-    expected = getattr(xdf, func)(*args)
+    expected = getattr(df, func)(*args)
     assert result._name == expected._name
 
 
-def test_sample(xdf):
-    result = xdf.sample(frac=0.5)
+def test_sample(df):
+    result = df.sample(frac=0.5)
 
     assert_eq(result, result)
 
-    result = xdf.sample(frac=0.5, random_state=1234)
-    expected = xdf.sample(frac=0.5, random_state=1234)
+    result = df.sample(frac=0.5, random_state=1234)
+    expected = df.sample(frac=0.5, random_state=1234)
     assert_eq(result, expected)
 
 
-def test_align(xdf, bdf, backend):
+def test_align(df, bdf, backend):
     if backend == "cudf":
         pytest.skip(reason="align not supported by cudf")
-    result_1, result_2 = xdf.align(xdf)
+    result_1, result_2 = df.align(df)
     pdf_result_1, pdf_result_2 = bdf.align(bdf)
     assert_eq(result_1, pdf_result_1)
     assert_eq(result_2, pdf_result_2)
 
-    result_1, result_2 = xdf.x.align(xdf.x)
+    result_1, result_2 = df.x.align(df.x)
     pdf_result_1, pdf_result_2 = bdf.x.align(bdf.x)
     assert_eq(result_1, pdf_result_1)
     assert_eq(result_2, pdf_result_2)
@@ -965,10 +962,10 @@ def test_unknown_partitions_different_root():
         df.align(df2)
 
 
-def test_nunique_approx(xdf, backend):
+def test_nunique_approx(df, backend):
     if backend == "cudf":
         pytest.xfail(reason="compute_hll_array doesn't work for cudf")
-    result = xdf.nunique_approx().compute()
+    result = df.nunique_approx().compute()
     assert 99 < result < 101
 
 
@@ -1005,41 +1002,41 @@ def test_assign_simplify_series(bdf):
     assert result._name == expected._name
 
 
-def test_assign_non_series_inputs(xdf, bdf, backend):
+def test_assign_non_series_inputs(df, bdf, backend):
     if backend == "cudf":
         pytest.xfail(reason="assign function not supported by cudf")
-    assert_eq(xdf.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2))
-    assert_eq(xdf.assign(a=2), bdf.assign(a=2))
-    assert_eq(xdf.assign(a=xdf.x.sum()), bdf.assign(a=bdf.x.sum()))
+    assert_eq(df.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2))
+    assert_eq(df.assign(a=2), bdf.assign(a=2))
+    assert_eq(df.assign(a=df.x.sum()), bdf.assign(a=bdf.x.sum()))
 
-    assert_eq(xdf.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y)
-    assert_eq(xdf.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a)
+    assert_eq(df.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y)
+    assert_eq(df.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a)
 
 
-def test_are_co_aligned(bdf, xdf):
-    df2 = xdf.reset_index()
-    assert are_co_aligned(xdf.expr, df2.expr)
-    assert are_co_aligned(xdf.expr, df2.sum().expr)
-    assert not are_co_aligned(xdf.expr, df2.repartition(npartitions=2).expr)
+def test_are_co_aligned(bdf, df):
+    df2 = df.reset_index()
+    assert are_co_aligned(df.expr, df2.expr)
+    assert are_co_aligned(df.expr, df2.sum().expr)
+    assert not are_co_aligned(df.expr, df2.repartition(npartitions=2).expr)
 
-    assert are_co_aligned(xdf.expr, xdf.sum().expr)
-    assert are_co_aligned((xdf + xdf.sum()).expr, xdf.sum().expr)
+    assert are_co_aligned(df.expr, df.sum().expr)
+    assert are_co_aligned((df + df.sum()).expr, df.sum().expr)
 
     bdf = bdf.assign(z=1)
     df3 = from_pandas(bdf, npartitions=10)
-    assert not are_co_aligned(xdf.expr, df3.expr)
-    assert are_co_aligned(xdf.expr, df3.sum().expr)
+    assert not are_co_aligned(df.expr, df3.expr)
+    assert are_co_aligned(df.expr, df3.sum().expr)
 
-    merged = xdf.merge(df2)
+    merged = df.merge(df2)
     merged_first = merged.reset_index()
     merged_second = merged.rename(columns={"x": "a"})
     assert are_co_aligned(merged_first.expr, merged_second.expr)
-    assert not are_co_aligned(merged_first.expr, xdf.expr)
+    assert not are_co_aligned(merged_first.expr, df.expr)
 
 
-def test_astype_categories(xdf, backend):
+def test_astype_categories(df, backend):
     if backend == "cudf":
         pytest.xfail(reason="TODO")
-    result = xdf.astype("category")
+    result = df.astype("category")
     assert_eq(result.x._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES]))
     assert_eq(result.y._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES]))

From 06f3bf818b185a0b21ae40938c69acc061922f0b Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Thu, 13 Jul 2023 09:55:57 -0700
Subject: [PATCH 06/18] fix pdf in test

---
 dask_expr/tests/test_collection.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index ced04d57e..b9463c972 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -1068,7 +1068,8 @@ def test_op_align():
     assert_eq(df - df2, pdf - pdf2)
 
 
-def test_can_co_align(df, pdf):
+def test_can_co_align(df, bdf):
+    pdf = bdf.copy()
     q = (df.x + df.y).optimize(fuse=False)
     expected = df.x + df.y
     assert q._name == expected._name

From 7d7a400ea88bec48f804c58b169962b771094dec Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Fri, 14 Jul 2023 11:36:22 -0700
Subject: [PATCH 07/18] fix predicate-pushdown test

---
 dask_expr/_collection.py           |   3 +-
 dask_expr/_util.py                 |   5 -
 dask_expr/io/parquet.py            |   7 +-
 dask_expr/io/tests/test_io.py      |  48 ++++-----
 dask_expr/tests/test_collection.py | 155 +++++++++++++----------------
 5 files changed, 96 insertions(+), 122 deletions(-)

diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py
index 80412d8a1..d0584a2ea 100644
--- a/dask_expr/_collection.py
+++ b/dask_expr/_collection.py
@@ -41,7 +41,7 @@
 )
 from dask_expr._repartition import Repartition
 from dask_expr._shuffle import SetIndex, SetIndexBlockwise
-from dask_expr._util import _convert_to_list, _maybe_import_backend
+from dask_expr._util import _convert_to_list
 
 #
 # Utilities to wrap Expr API
@@ -949,7 +949,6 @@ def optimize(collection, fuse=True):
 def from_pandas(data, npartitions=1, sort=True):
     from dask_expr.io.io import FromPandas
 
-    _maybe_import_backend()
     return new_collection(FromPandas(data.copy(), npartitions=npartitions, sort=sort))
 
 
diff --git a/dask_expr/_util.py b/dask_expr/_util.py
index bccd61669..6275e3769 100644
--- a/dask_expr/_util.py
+++ b/dask_expr/_util.py
@@ -18,11 +18,6 @@ def _convert_to_list(column) -> list | None:
     return column
 
 
-def _maybe_import_backend():
-    if config.get("dataframe.backend", "pandas") == "cudf":
-        import dask_cudf  # noqa F401
-
-
 @normalize_token.register(LambdaType)
 def _normalize_lambda(func):
     return str(func)
diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
index 55c7d4986..c3ea94bae 100644
--- a/dask_expr/io/parquet.py
+++ b/dask_expr/io/parquet.py
@@ -24,7 +24,7 @@
 from dask.dataframe.io.parquet.utils import _split_user_options
 from dask.dataframe.io.utils import _is_local_fs
 from dask.delayed import delayed
-from dask.utils import apply, natural_sort_key
+from dask.utils import apply, natural_sort_key, typename
 from fsspec.utils import stringify_path
 
 from dask_expr._expr import (
@@ -177,6 +177,11 @@ def to_parquet(
     from dask_expr._collection import new_collection
     from dask_expr.io.parquet import NONE_LABEL, ToParquet
 
+    if typename(df._meta).split(".")[0] == "cudf":
+        from dask_cudf.io.parquet import CudfEngine
+
+        engine = CudfEngine
+
     compute_kwargs = compute_kwargs or {}
 
     partition_on = partition_on or []
diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py
index 2eae4aa07..8c97bbb92 100644
--- a/dask_expr/io/tests/test_io.py
+++ b/dask_expr/io/tests/test_io.py
@@ -1,7 +1,7 @@
+import importlib
 import os
 
 import dask.dataframe as dd
-import pandas as pd
 import pytest
 from dask import config
 from dask.dataframe.utils import assert_eq
@@ -11,28 +11,15 @@
 from dask_expr._reductions import Len
 from dask_expr.io import ReadParquet
 
-try:
-    import cudf
-except ImportError:
-    cudf = None
-
-
-@pytest.fixture(
-    params=[
-        "pandas",
-        pytest.param(
-            "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.")
-        ),
-    ]
-)
-def backend(request):
-    yield request.param
+# Import backend DataFrame library to test
+BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas")
+lib = importlib.import_module(BACKEND)
 
 
 def _make_file(dir, format="parquet", df=None):
     fn = os.path.join(str(dir), f"myfile.{format}")
     if df is None:
-        df = pd.DataFrame({c: range(10) for c in "abcde"})
+        df = lib.DataFrame({c: range(10) for c in "abcde"})
     if format == "csv":
         df.to_csv(fn)
     elif format == "parquet":
@@ -101,7 +88,7 @@ def test_io_fusion(tmpdir, fmt):
 
 
 def test_predicate_pushdown(tmpdir):
-    original = pd.DataFrame(
+    original = lib.DataFrame(
         {
             "a": [1, 2, 3, 4, 5] * 10,
             "b": [0, 1, 2, 3, 4] * 10,
@@ -128,7 +115,7 @@ def test_predicate_pushdown(tmpdir):
 
 
 def test_predicate_pushdown_compound(tmpdir):
-    pdf = pd.DataFrame(
+    pdf = lib.DataFrame(
         {
             "a": [1, 2, 3, 4, 5] * 10,
             "b": [0, 1, 2, 3, 4] * 10,
@@ -152,15 +139,18 @@ def test_predicate_pushdown_compound(tmpdir):
     )
 
     # Test OR
-    x = df[(df.a == 5) | (df.c > 20)][df.b != 0]["b"]
+    x = df[(df.a == 5) | (df.c > 20)]
+    x = x[x.b != 0]["b"]
     y = optimize(x, fuse=False)
     assert isinstance(y.expr, ReadParquet)
     filters = [set(y.filters[0]), set(y.filters[1])]
     assert {("c", ">", 20), ("b", "!=", 0)} in filters
     assert {("a", "==", 5), ("b", "!=", 0)} in filters
+    expect = pdf[(pdf.a == 5) | (pdf.c > 20)]
+    expect = expect[expect.b != 0]["b"]
     assert_eq(
         y,
-        pdf[(pdf.a == 5) | (pdf.c > 20)][pdf.b != 0]["b"],
+        expect,
         check_index=False,
     )
 
@@ -176,7 +166,7 @@ def test_predicate_pushdown_compound(tmpdir):
 
 @pytest.mark.parametrize("fmt", ["parquet", "csv", "pandas"])
 def test_io_culling(tmpdir, fmt):
-    pdf = pd.DataFrame({c: range(10) for c in "abcde"})
+    pdf = lib.DataFrame({c: range(10) for c in "abcde"})
     if fmt == "parquet":
         dd.from_pandas(pdf, 2).to_parquet(tmpdir)
         df = read_parquet(tmpdir)
@@ -209,7 +199,7 @@ def _check_culling(expr, partitions):
 
 @pytest.mark.parametrize("sort", [True, False])
 def test_from_pandas(sort):
-    pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]})
+    pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]})
     df = from_pandas(pdf, npartitions=2, sort=sort)
 
     assert df.divisions == (0, 3, 5) if sort else (None,) * 3
@@ -217,15 +207,15 @@ def test_from_pandas(sort):
 
 
 def test_from_pandas_immutable():
-    pdf = pd.DataFrame({"x": [1, 2, 3, 4]})
+    pdf = lib.DataFrame({"x": [1, 2, 3, 4]})
     expected = pdf.copy()
     df = from_pandas(pdf)
     pdf["z"] = 100
     assert_eq(df, expected)
 
 
-def test_parquet_complex_filters(tmpdir, backend):
-    with config.set({"dataframe.backend": backend}):
+def test_parquet_complex_filters(tmpdir):
+    with config.set({"dataframe.backend": BACKEND}):
         df = read_parquet(_make_file(tmpdir))
     pdf = df.compute()
     got = df["a"][df["b"] > df["b"].mean()]
@@ -266,7 +256,7 @@ def test_from_dask_dataframe(optimize):
 
 @pytest.mark.parametrize("optimize", [True, False])
 def test_to_dask_dataframe(optimize):
-    pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]})
+    pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]})
     df = from_pandas(pdf, npartitions=2)
     ddf = df.to_dask_dataframe(optimize=optimize)
     assert isinstance(ddf, dd.DataFrame)
@@ -275,7 +265,7 @@ def test_to_dask_dataframe(optimize):
 
 @pytest.mark.parametrize("write_metadata_file", [True, False])
 def test_to_parquet(tmpdir, write_metadata_file):
-    pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]})
+    pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]})
     df = from_pandas(pdf, npartitions=2)
 
     # Check basic parquet round trip
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index b9463c972..8f0c62a7e 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -1,12 +1,13 @@
 from __future__ import annotations
 
+import importlib
 import operator
+import os
 import pickle
 import re
 
 import dask
 import numpy as np
-import pandas as pd
 import pytest
 from dask.dataframe._compat import PANDAS_GT_210
 from dask.dataframe.utils import UNKNOWN_CATEGORIES, assert_eq
@@ -17,45 +18,20 @@
 from dask_expr._reductions import Len
 from dask_expr.datasets import timeseries
 
-try:
-    import cudf
-except ImportError:
-    cudf = None
-
-
-@pytest.fixture(
-    params=[
-        "pandas",
-        pytest.param(
-            "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.")
-        ),
-    ]
-)
-def backend(request):
-    # Return backend-library label
-    yield request.param
-
-
-@pytest.fixture
-def lib(backend):
-    # Return library associated with `backend` label
-    if backend == "cudf":
-        yield cudf
-    else:
-        yield pd
+# Import backend DataFrame library to test
+BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas")
+lib = importlib.import_module(BACKEND)
 
 
 @pytest.fixture
-def bdf(lib):
-    # Backend DataFrame fixture
-    df = lib.DataFrame({"x": range(100)})
-    df["y"] = df.x * 10.0
-    yield df
+def bdf():
+    bdf = lib.DataFrame({"x": range(100)})
+    bdf["y"] = bdf.x * 10.0
+    yield bdf
 
 
 @pytest.fixture
 def df(bdf):
-    # Multi-backend Dask-Expression DataFrame fixture
     yield from_pandas(bdf, npartitions=10)
 
 
@@ -79,16 +55,16 @@ def test_setitem(bdf, df):
 
 
 def test_explode():
-    # CuDF backend does not support explode
-    # (See: https://github.com/rapidsai/cudf/issues/10271)
-    pdf = pd.DataFrame({"a": [[1, 2], [3, 4]]})
+    if BACKEND == "cudf":
+        pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271")
+    pdf = lib.DataFrame({"a": [[1, 2], [3, 4]]})
     df = from_pandas(pdf)
     assert_eq(pdf.explode(column="a"), df.explode(column="a"))
     assert_eq(pdf.a.explode(), df.a.explode())
 
 
-def test_explode_simplify(bdf, backend):
-    if backend == "cudf":
+def test_explode_simplify(bdf):
+    if BACKEND == "cudf":
         pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271")
     pdf = bdf.copy()
     pdf["z"] = 1
@@ -99,7 +75,7 @@ def test_explode_simplify(bdf, backend):
     assert result._name == expected._name
 
 
-def test_meta_divisions_name(lib):
+def test_meta_divisions_name():
     a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
     df = 2 * from_pandas(a, npartitions=2)
     assert list(df.columns) == list(a.columns)
@@ -112,7 +88,7 @@ def test_meta_divisions_name(lib):
     assert "sum" in df.sum()._name
 
 
-def test_meta_blockwise(lib):
+def test_meta_blockwise():
     a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
     b = lib.DataFrame({"z": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]})
 
@@ -149,8 +125,8 @@ def test_dask(bdf, df):
         ),
     ],
 )
-def test_reductions(func, bdf, df, backend):
-    if backend == "cudf" and func in [M.idxmin, M.idxmax]:
+def test_reductions(func, bdf, df):
+    if BACKEND == "cudf" and func in [M.idxmin, M.idxmax]:
         pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602")
     result = func(df)
     assert result.known_divisions
@@ -163,15 +139,15 @@ def test_reductions(func, bdf, df, backend):
     assert_eq(func(df)["x"], func(bdf)["x"], check_dtype=False)
 
 
-def test_nbytes(bdf, df, backend):
-    if backend == "cudf":
+def test_nbytes(bdf, df):
+    if BACKEND == "cudf":
         pytest.xfail(reason="nbytes not supported by cudf")
     with pytest.raises(NotImplementedError, match="nbytes is not implemented"):
         df.nbytes
     assert_eq(df.x.nbytes, bdf.x.nbytes)
 
 
-def test_mode(lib):
+def test_mode():
     pdf = lib.DataFrame({"x": [1, 2, 3, 1, 2]})
     df = from_pandas(pdf, npartitions=3)
 
@@ -195,7 +171,7 @@ def test_dropna(bdf):
     assert_eq(df.y.dropna(), pdf.y.dropna())
 
 
-def test_fillna(lib):
+def test_fillna():
     pdf = lib.DataFrame({"x": [1, 2, None, None, 5, 6]})
     df = from_pandas(pdf, npartitions=2)
     actual = df.fillna(value=100)
@@ -255,7 +231,7 @@ def test_conditionals(func, bdf, df):
         lambda df: df.x.__rxor__(df.y),
     ],
 )
-def test_boolean_operators(func, lib):
+def test_boolean_operators(func):
     pdf = lib.DataFrame(
         {"x": [True, False, True, False], "y": [True, False, False, False]}
     )
@@ -274,7 +250,7 @@ def test_boolean_operators(func, lib):
         lambda df: +df,
     ],
 )
-def test_unary_operators(func, lib):
+def test_unary_operators(func):
     pdf = lib.DataFrame(
         {"x": [True, False, True, False], "y": [True, False, False, False], "z": 1}
     )
@@ -294,10 +270,10 @@ def test_and_or(func, bdf, df):
 
 
 @pytest.mark.parametrize("how", ["start", "end"])
-def test_to_timestamp(bdf, how, backend):
-    if backend == "cudf":
+def test_to_timestamp(bdf, how):
+    if BACKEND == "cudf":
         pytest.xfail(reason="period_range not supported by cudf")
-    bdf.index = pd.period_range("2019-12-31", freq="D", periods=len(bdf))
+    bdf.index = lib.period_range("2019-12-31", freq="D", periods=len(bdf))
     df = from_pandas(bdf)
     assert_eq(df.to_timestamp(how=how), bdf.to_timestamp(how=how))
     assert_eq(df.x.to_timestamp(how=how), bdf.x.to_timestamp(how=how))
@@ -348,14 +324,14 @@ def test_blockwise(func, bdf, df):
         lambda df: df.x.combine_first(df.y),
     ],
 )
-def test_blockwise_cudf_fails(func, bdf, df, backend):
-    if backend == "cudf":
+def test_blockwise_cudf_fails(func, bdf, df):
+    if BACKEND == "cudf":
         pytest.xfail(reason="func not supported by cudf")
     assert_eq(func(bdf), func(df))
 
 
-def test_rename_axis(bdf, backend):
-    if backend == "cudf":
+def test_rename_axis(bdf):
+    if BACKEND == "cudf":
         pytest.xfail(reason="rename_axis not supported by cudf")
     pdf = bdf.copy()
     pdf.index.name = "a"
@@ -390,8 +366,8 @@ def test_repr(df):
     assert "sum(skipna=False)" in s
 
 
-def test_combine_first_simplify(bdf, backend):
-    if backend == "cudf":
+def test_combine_first_simplify(bdf):
+    if BACKEND == "cudf":
         pytest.xfail(reason="combine_first not supported by cudf")
     pdf = bdf.copy()
     df = from_pandas(pdf)
@@ -564,7 +540,7 @@ def test_remove_unnecessary_projections(df):
     assert optimized._name == expected._name
 
 
-def test_substitute(lib):
+def test_substitute():
     pdf = lib.DataFrame(
         {
             "a": range(100),
@@ -656,8 +632,8 @@ def test_serialization(bdf, df):
     assert_eq(pickle.loads(before), pickle.loads(after))
 
 
-def test_size_optimized(df, backend):
-    if backend == "cudf":
+def test_size_optimized(df):
+    if BACKEND == "cudf":
         pytest.xfail(reason="Cannot apply lambda function in cudf")
     expr = (df.x + 1).apply(lambda x: x).size
     out = optimize(expr)
@@ -672,8 +648,11 @@ def test_size_optimized(df, backend):
 
 @pytest.mark.parametrize("fuse", [True, False])
 def test_tree_repr(fuse):
-    s = from_pandas(pd.Series(range(10))).expr.tree_repr()
-    assert "<pandas>" in s
+    s = from_pandas(lib.Series(range(10))).expr.tree_repr()
+    if BACKEND == "pandas":
+        assert "<pandas>" in s
+    else:
+        assert "<series>" in s
 
     df = timeseries()
     expr = ((df.x + 1).sum(skipna=False) + df.y.mean()).expr
@@ -719,7 +698,7 @@ def combine_x_y(x, y, val, foo=None):
 
 
 @pytest.mark.parametrize("opt", [True, False])
-def test_map_partitions_merge(opt, lib):
+def test_map_partitions_merge(opt):
     # Make simple left & right dfs
     pdf1 = lib.DataFrame({"x": range(20), "y": range(20)})
     df1 = from_pandas(pdf1, 2)
@@ -815,7 +794,7 @@ def test_astype_simplify(df, bdf):
     assert result._name == expected._name
 
 
-def test_drop_duplicates(df, bdf, backend):
+def test_drop_duplicates(df, bdf):
     assert_eq(df.drop_duplicates(), bdf.drop_duplicates())
     assert_eq(
         df.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True)
@@ -823,7 +802,7 @@ def test_drop_duplicates(df, bdf, backend):
     assert_eq(df.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"]))
     assert_eq(df.x.drop_duplicates(), bdf.x.drop_duplicates())
 
-    if backend == "pandas":
+    if BACKEND == "pandas":
         with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")):
             df.drop_duplicates(subset=["a"])
 
@@ -831,7 +810,7 @@ def test_drop_duplicates(df, bdf, backend):
         df.x.drop_duplicates(subset=["a"])
 
 
-def test_unique(df, bdf, lib):
+def test_unique(df, bdf):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'unique'"
     ):
@@ -925,8 +904,8 @@ def test_sample(df):
     assert_eq(result, expected)
 
 
-def test_align(df, bdf, backend):
-    if backend == "cudf":
+def test_align(df, bdf):
+    if BACKEND == "cudf":
         pytest.skip(reason="align not supported by cudf")
     result_1, result_2 = df.align(df)
     pdf_result_1, pdf_result_2 = bdf.align(bdf)
@@ -940,9 +919,11 @@ def test_align(df, bdf, backend):
 
 
 def test_align_different_partitions():
-    pdf = pd.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]})
+    if BACKEND == "cudf":
+        pytest.skip(reason="align not supported by cudf")
+    pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]})
     df = from_pandas(pdf, npartitions=2)
-    pdf2 = pd.DataFrame(
+    pdf2 = lib.DataFrame(
         {"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]},
         index=[-2, -1, 0, 1, 2, 3],
     )
@@ -954,7 +935,9 @@ def test_align_different_partitions():
 
 
 def test_align_unknown_partitions_same_root():
-    pdf = pd.DataFrame({"a": 1}, index=[3, 2, 1])
+    if BACKEND == "cudf":
+        pytest.skip(reason="align not supported by cudf")
+    pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)
     result_1, result_2 = df.align(df)
     pdf_result_1, pdf_result_2 = pdf.align(pdf)
@@ -963,16 +946,18 @@ def test_align_unknown_partitions_same_root():
 
 
 def test_unknown_partitions_different_root():
-    pdf = pd.DataFrame({"a": 1}, index=[3, 2, 1])
+    if BACKEND == "cudf":
+        pytest.skip(reason="align not supported by cudf")
+    pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)
-    pdf2 = pd.DataFrame({"a": 1}, index=[4, 3, 2, 1])
+    pdf2 = lib.DataFrame({"a": 1}, index=[4, 3, 2, 1])
     df2 = from_pandas(pdf2, npartitions=2, sort=False)
     with pytest.raises(ValueError, match="Not all divisions"):
         df.align(df2)
 
 
-def test_nunique_approx(df, backend):
-    if backend == "cudf":
+def test_nunique_approx(df):
+    if BACKEND == "cudf":
         pytest.xfail(reason="compute_hll_array doesn't work for cudf")
     result = df.nunique_approx().compute()
     assert 99 < result < 101
@@ -1011,8 +996,8 @@ def test_assign_simplify_series(bdf):
     assert result._name == expected._name
 
 
-def test_assign_non_series_inputs(df, bdf, backend):
-    if backend == "cudf":
+def test_assign_non_series_inputs(df, bdf):
+    if BACKEND == "cudf":
         pytest.xfail(reason="assign function not supported by cudf")
     assert_eq(df.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2))
     assert_eq(df.assign(a=2), bdf.assign(a=2))
@@ -1043,12 +1028,12 @@ def test_are_co_aligned(bdf, df):
     assert not are_co_aligned(merged_first.expr, df.expr)
 
 
-def test_astype_categories(df, backend):
-    if backend == "cudf":
+def test_astype_categories(df):
+    if BACKEND == "cudf":
         pytest.xfail(reason="TODO")
     result = df.astype("category")
-    assert_eq(result.x._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES]))
-    assert_eq(result.y._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES]))
+    assert_eq(result.x._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES]))
+    assert_eq(result.y._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES]))
 
 
 def test_drop_simplify(df):
@@ -1059,10 +1044,10 @@ def test_drop_simplify(df):
 
 
 def test_op_align():
-    pdf = pd.DataFrame({"x": [1, 2, 3], "y": 1})
+    pdf = lib.DataFrame({"x": [1, 2, 3], "y": 1})
     df = from_pandas(pdf, npartitions=2)
 
-    pdf2 = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": 1})
+    pdf2 = lib.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": 1})
     df2 = from_pandas(pdf2, npartitions=2)
 
     assert_eq(df - df2, pdf - pdf2)
@@ -1083,10 +1068,10 @@ def test_can_co_align(df, bdf):
 def test_avoid_alignment():
     from dask_expr._align import AlignPartitions
 
-    a = pd.DataFrame({"x": range(100)})
+    a = lib.DataFrame({"x": range(100)})
     da = from_pandas(a, npartitions=4)
 
-    b = pd.DataFrame({"y": range(100)})
+    b = lib.DataFrame({"y": range(100)})
     b["z"] = b.y * 2
     db = from_pandas(b, npartitions=3)
 

From 934bd0324796395eb77d1efde5a7699fdab4bb52 Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Fri, 14 Jul 2023 13:09:47 -0700
Subject: [PATCH 08/18] rely on DASK_DATAFRAME__BACKEND=cudf for now

---
 dask_expr/io/tests/test_io.py      | 2 +-
 dask_expr/tests/test_collection.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py
index 8c97bbb92..b2a8cd788 100644
--- a/dask_expr/io/tests/test_io.py
+++ b/dask_expr/io/tests/test_io.py
@@ -12,7 +12,7 @@
 from dask_expr.io import ReadParquet
 
 # Import backend DataFrame library to test
-BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas")
+BACKEND = config.get("dataframe.backend", "pandas")
 lib = importlib.import_module(BACKEND)
 
 
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 8f0c62a7e..c0189e66e 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -2,7 +2,6 @@
 
 import importlib
 import operator
-import os
 import pickle
 import re
 
@@ -19,7 +18,7 @@
 from dask_expr.datasets import timeseries
 
 # Import backend DataFrame library to test
-BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas")
+BACKEND = dask.config.get("dataframe.backend", "pandas")
 lib = importlib.import_module(BACKEND)
 
 

From 65252c482880af6cde124544d2e4cb91fad1844e Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 17 Jul 2023 13:21:35 -0700
Subject: [PATCH 09/18] add _set_engine utility for parquet

---
 dask_expr/_collection.py      | 13 ++-----------
 dask_expr/io/parquet.py       | 22 ++++++++++++++++------
 dask_expr/io/tests/test_io.py |  2 +-
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py
index d0584a2ea..3cf7a62ca 100644
--- a/dask_expr/_collection.py
+++ b/dask_expr/_collection.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 import pandas as pd
-from dask import config
 from dask.base import DaskMethodsMixin, is_dask_collection, named_schedulers
 from dask.dataframe.core import (
     _concat,
@@ -999,21 +998,13 @@ def read_parquet(
     engine=None,
     **kwargs,
 ):
-    from dask_expr.io.parquet import ReadParquet
+    from dask_expr.io.parquet import ReadParquet, _set_engine
 
     if not isinstance(path, str):
         path = stringify_path(path)
 
     kwargs["dtype_backend"] = dtype_backend
 
-    if engine is None:
-        if config.get("dataframe.backend", "pandas") == "cudf":
-            from dask_cudf.io.parquet import CudfEngine
-
-            engine = CudfEngine
-        else:
-            engine = "pyarrow"
-
     return new_collection(
         ReadParquet(
             path,
@@ -1030,7 +1021,7 @@ def read_parquet(
             aggregate_files=aggregate_files,
             parquet_file_extension=parquet_file_extension,
             filesystem=filesystem,
-            engine=engine,
+            engine=_set_engine(engine),
             kwargs=kwargs,
         )
     )
diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
index c3ea94bae..17f59e2f3 100644
--- a/dask_expr/io/parquet.py
+++ b/dask_expr/io/parquet.py
@@ -157,7 +157,7 @@ def _layer(self):
 def to_parquet(
     df,
     path,
-    engine="pyarrow",
+    engine=None,
     compression="snappy",
     write_index=True,
     append=False,
@@ -177,11 +177,7 @@ def to_parquet(
     from dask_expr._collection import new_collection
     from dask_expr.io.parquet import NONE_LABEL, ToParquet
 
-    if typename(df._meta).split(".")[0] == "cudf":
-        from dask_cudf.io.parquet import CudfEngine
-
-        engine = CudfEngine
-
+    engine = _set_engine(meta=df._meta)
     compute_kwargs = compute_kwargs or {}
 
     partition_on = partition_on or []
@@ -657,6 +653,20 @@ def _update_length_statistics(self):
 #
 
 
+def _set_engine(engine=None, meta=None):
+    # Use `engine` or `meta` input to set the parquet engine
+    if engine is None:
+        if (
+            meta is not None and typename(meta).split(".")[0] == "cudf"
+        ) or dask.config.get("dataframe.backend", "pandas") == "cudf":
+            from dask_cudf.io.parquet import CudfEngine
+
+            engine = CudfEngine
+        else:
+            engine = "pyarrow"
+    return engine
+
+
 def _align_statistics(parts, statistics):
     # Make sure parts and statistics are aligned
     # (if statistics is not empty)
diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py
index b2a8cd788..bb3c97998 100644
--- a/dask_expr/io/tests/test_io.py
+++ b/dask_expr/io/tests/test_io.py
@@ -111,7 +111,7 @@ def test_predicate_pushdown(tmpdir):
     y_result = y.compute()
     assert y_result.name == "b"
     assert len(y_result) == 6
-    assert all(y_result == 4)
+    assert (y_result == 4).all()
 
 
 def test_predicate_pushdown_compound(tmpdir):

From b2ebdf96e15ae356e172f6ab7e3624e31dfc4c9f Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 17 Jul 2023 13:28:17 -0700
Subject: [PATCH 10/18] remove unnecessary engine arg

---
 dask_expr/io/parquet.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
index 17f59e2f3..4d4ca5fde 100644
--- a/dask_expr/io/parquet.py
+++ b/dask_expr/io/parquet.py
@@ -157,7 +157,6 @@ def _layer(self):
 def to_parquet(
     df,
     path,
-    engine=None,
     compression="snappy",
     write_index=True,
     append=False,

From 8573040dfe2278b673300ebd9027c8b8aeb14a77 Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 17 Jul 2023 13:31:00 -0700
Subject: [PATCH 11/18] fix test

---
 dask_expr/io/tests/test_io.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py
index e622c26d4..deeeac6c9 100644
--- a/dask_expr/io/tests/test_io.py
+++ b/dask_expr/io/tests/test_io.py
@@ -286,7 +286,7 @@ def test_to_parquet(tmpdir, write_metadata_file):
 
 
 def test_combine_similar(tmpdir):
-    pdf = pd.DataFrame(
+    pdf = lib.DataFrame(
         {"x": [0, 1, 2, 3] * 4, "y": range(16), "z": [None, 1, 2, 3] * 4}
     )
     fn = _make_file(tmpdir, format="parquet", df=pdf)

From 5c4c019f9b356d5ed8be01601c3bb8ce0ea62b04 Mon Sep 17 00:00:00 2001
From: Rick Zamora <rzamora217@gmail.com>
Date: Tue, 18 Jul 2023 09:26:30 -0500
Subject: [PATCH 12/18] revert pdf renaming

---
 dask_expr/tests/test_collection.py | 258 ++++++++++++++---------------
 1 file changed, 125 insertions(+), 133 deletions(-)

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index c0189e66e..42cdd78ce 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -23,19 +23,19 @@
 
 
 @pytest.fixture
-def bdf():
-    bdf = lib.DataFrame({"x": range(100)})
-    bdf["y"] = bdf.x * 10.0
-    yield bdf
+def pdf():
+    pdf = lib.DataFrame({"x": range(100)})
+    pdf["y"] = pdf.x * 10.0
+    yield pdf
 
 
 @pytest.fixture
-def df(bdf):
-    yield from_pandas(bdf, npartitions=10)
+def df(pdf):
+    yield from_pandas(pdf, npartitions=10)
 
 
-def test_del(bdf, df):
-    pdf = bdf.copy()
+def test_del(pdf, df):
+    pdf = pdf.copy()
 
     # Check __delitem__
     del pdf["x"]
@@ -43,8 +43,8 @@ def test_del(bdf, df):
     assert_eq(pdf, df)
 
 
-def test_setitem(bdf, df):
-    pdf = bdf.copy()
+def test_setitem(pdf, df):
+    pdf = pdf.copy()
     pdf["z"] = pdf.x + pdf.y
 
     df["z"] = df.x + df.y
@@ -62,10 +62,9 @@ def test_explode():
     assert_eq(pdf.a.explode(), df.a.explode())
 
 
-def test_explode_simplify(bdf):
+def test_explode_simplify(pdf):
     if BACKEND == "cudf":
         pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271")
-    pdf = bdf.copy()
     pdf["z"] = 1
     df = from_pandas(pdf)
     q = df.explode(column="x")["y"]
@@ -98,11 +97,11 @@ def test_meta_blockwise():
     assert set(cc.columns) == {"x", "y", "z"}
 
 
-def test_dask(bdf, df):
+def test_dask(pdf, df):
     assert (df.x + df.y).npartitions == 10
     z = (df.x + df.y).sum()
 
-    assert assert_eq(z, (bdf.x + bdf.y).sum())
+    assert assert_eq(z, (pdf.x + pdf.y).sum())
 
 
 @pytest.mark.parametrize(
@@ -124,26 +123,26 @@ def test_dask(bdf, df):
         ),
     ],
 )
-def test_reductions(func, bdf, df):
+def test_reductions(func, pdf, df):
     if BACKEND == "cudf" and func in [M.idxmin, M.idxmax]:
         pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602")
     result = func(df)
     assert result.known_divisions
-    assert_eq(result, func(bdf))
+    assert_eq(result, func(pdf))
     result = func(df.x)
     assert not result.known_divisions
-    assert_eq(result, func(bdf.x))
+    assert_eq(result, func(pdf.x))
     # check_dtype False because sub-selection of columns that is pushed through
     # is not reflected in the meta calculation
-    assert_eq(func(df)["x"], func(bdf)["x"], check_dtype=False)
+    assert_eq(func(df)["x"], func(pdf)["x"], check_dtype=False)
 
 
-def test_nbytes(bdf, df):
+def test_nbytes(pdf, df):
     if BACKEND == "cudf":
         pytest.xfail(reason="nbytes not supported by cudf")
     with pytest.raises(NotImplementedError, match="nbytes is not implemented"):
         df.nbytes
-    assert_eq(df.x.nbytes, bdf.x.nbytes)
+    assert_eq(df.x.nbytes, pdf.x.nbytes)
 
 
 def test_mode():
@@ -153,16 +152,15 @@ def test_mode():
     assert_eq(df.x.mode(), pdf.x.mode(), check_names=False)
 
 
-def test_value_counts(df, bdf):
+def test_value_counts(df, pdf):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'value_counts'"
     ):
         df.value_counts()
-    assert_eq(df.x.value_counts(), bdf.x.value_counts().astype("int64"))
+    assert_eq(df.x.value_counts(), pdf.x.value_counts().astype("int64"))
 
 
-def test_dropna(bdf):
-    pdf = bdf.copy()
+def test_dropna(pdf):
     pdf.loc[0, "y"] = np.nan
     df = from_pandas(pdf)
     assert_eq(df.dropna(), pdf.dropna())
@@ -178,10 +176,9 @@ def test_fillna():
     assert_eq(actual, expected)
 
 
-def test_memory_usage(bdf):
+def test_memory_usage(pdf):
     # Results are not equal with RangeIndex because pandas has one RangeIndex while
     # we have one RangeIndex per partition
-    pdf = bdf.copy()
     pdf.index = np.arange(len(pdf))
     df = from_pandas(pdf)
     assert_eq(df.memory_usage(), pdf.memory_usage())
@@ -194,9 +191,9 @@ def test_memory_usage(bdf):
 
 
 @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest])
-def test_nlargest_nsmallest(df, bdf, func):
-    assert_eq(func(df, n=5, columns="x"), func(bdf, n=5, columns="x"))
-    assert_eq(func(df.x, n=5), func(bdf.x, n=5))
+def test_nlargest_nsmallest(df, pdf, func):
+    assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x"))
+    assert_eq(func(df.x, n=5), func(pdf.x, n=5))
     with pytest.raises(TypeError, match="got an unexpected keyword argument"):
         func(df.x, n=5, columns="foo")
 
@@ -215,8 +212,8 @@ def test_nlargest_nsmallest(df, bdf, func):
         lambda df: df.x != df.y,
     ],
 )
-def test_conditionals(func, bdf, df):
-    assert_eq(func(bdf), func(df), check_names=False)
+def test_conditionals(func, pdf, df):
+    assert_eq(func(pdf), func(df), check_names=False)
 
 
 @pytest.mark.parametrize(
@@ -264,18 +261,18 @@ def test_unary_operators(func):
         lambda df: df[(df.x > 7) & (df.x < 10)],
     ],
 )
-def test_and_or(func, bdf, df):
-    assert_eq(func(bdf), func(df), check_names=False)
+def test_and_or(func, pdf, df):
+    assert_eq(func(pdf), func(df), check_names=False)
 
 
 @pytest.mark.parametrize("how", ["start", "end"])
-def test_to_timestamp(bdf, how):
+def test_to_timestamp(pdf, how):
     if BACKEND == "cudf":
         pytest.xfail(reason="period_range not supported by cudf")
-    bdf.index = lib.period_range("2019-12-31", freq="D", periods=len(bdf))
-    df = from_pandas(bdf)
-    assert_eq(df.to_timestamp(how=how), bdf.to_timestamp(how=how))
-    assert_eq(df.x.to_timestamp(how=how), bdf.x.to_timestamp(how=how))
+    pdf.index = lib.period_range("2019-12-31", freq="D", periods=len(pdf))
+    df = from_pandas(pdf)
+    assert_eq(df.to_timestamp(how=how), pdf.to_timestamp(how=how))
+    assert_eq(df.x.to_timestamp(how=how), pdf.x.to_timestamp(how=how))
 
 
 @pytest.mark.parametrize(
@@ -310,8 +307,8 @@ def test_to_timestamp(bdf, how):
         lambda df: df.select_dtypes(include="integer"),
     ],
 )
-def test_blockwise(func, bdf, df):
-    assert_eq(func(bdf), func(df))
+def test_blockwise(func, pdf, df):
+    assert_eq(func(pdf), func(df))
 
 
 @pytest.mark.parametrize(
@@ -323,16 +320,15 @@ def test_blockwise(func, bdf, df):
         lambda df: df.x.combine_first(df.y),
     ],
 )
-def test_blockwise_cudf_fails(func, bdf, df):
+def test_blockwise_cudf_fails(func, pdf, df):
     if BACKEND == "cudf":
         pytest.xfail(reason="func not supported by cudf")
-    assert_eq(func(bdf), func(df))
+    assert_eq(func(pdf), func(df))
 
 
-def test_rename_axis(bdf):
+def test_rename_axis(pdf):
     if BACKEND == "cudf":
         pytest.xfail(reason="rename_axis not supported by cudf")
-    pdf = bdf.copy()
     pdf.index.name = "a"
     pdf.columns.name = "b"
     df = from_pandas(pdf, npartitions=10)
@@ -341,14 +337,13 @@ def test_rename_axis(bdf):
     assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy"))
 
 
-def test_isin(df, bdf):
+def test_isin(df, pdf):
     values = [1, 2]
-    assert_eq(bdf.isin(values), df.isin(values))
-    assert_eq(bdf.x.isin(values), df.x.isin(values))
+    assert_eq(pdf.isin(values), df.isin(values))
+    assert_eq(pdf.x.isin(values), df.x.isin(values))
 
 
-def test_round(bdf):
-    pdf = bdf.copy()
+def test_round(pdf):
     pdf += 0.5555
     df = from_pandas(pdf)
     assert_eq(df.round(decimals=1), pdf.round(decimals=1))
@@ -365,10 +360,9 @@ def test_repr(df):
     assert "sum(skipna=False)" in s
 
 
-def test_combine_first_simplify(bdf):
+def test_combine_first_simplify(pdf):
     if BACKEND == "cudf":
         pytest.xfail(reason="combine_first not supported by cudf")
-    pdf = bdf.copy()
     df = from_pandas(pdf)
     pdf2 = pdf.rename(columns={"y": "z"})
     df2 = from_pandas(pdf2)
@@ -413,8 +407,7 @@ def test_clip_traverse_filters(df):
 
 @pytest.mark.parametrize("projection", ["zz", ["zz"], ["zz", "x"], "zz"])
 @pytest.mark.parametrize("subset", ["x", ["x"]])
-def test_drop_duplicates_subset_simplify(bdf, subset, projection):
-    pdf = bdf.copy()
+def test_drop_duplicates_subset_simplify(pdf, subset, projection):
     pdf["zz"] = 1
     df = from_pandas(pdf)
     result = df.drop_duplicates(subset=subset)[projection].simplify()
@@ -423,18 +416,18 @@ def test_drop_duplicates_subset_simplify(bdf, subset, projection):
     assert str(result) == str(expected)
 
 
-def test_broadcast(bdf, df):
+def test_broadcast(pdf, df):
     assert_eq(
         df + df.sum(),
-        bdf + bdf.sum(),
+        pdf + pdf.sum(),
     )
     assert_eq(
         df.x + df.x.sum(),
-        bdf.x + bdf.x.sum(),
+        pdf.x + pdf.x.sum(),
     )
 
 
-def test_persist(bdf, df):
+def test_persist(pdf, df):
     a = df + 2
     b = a.persist()
 
@@ -443,25 +436,25 @@ def test_persist(bdf, df):
 
     assert len(b.__dask_graph__()) == b.npartitions
 
-    assert_eq(b.y.sum(), (bdf + 2).y.sum())
+    assert_eq(b.y.sum(), (pdf + 2).y.sum())
 
 
-def test_index(bdf, df):
-    assert_eq(df.index, bdf.index)
-    assert_eq(df.x.index, bdf.x.index)
+def test_index(pdf, df):
+    assert_eq(df.index, pdf.index)
+    assert_eq(df.x.index, pdf.x.index)
 
 
 @pytest.mark.parametrize("drop", [True, False])
-def test_reset_index(bdf, df, drop):
-    assert_eq(df.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False)
+def test_reset_index(pdf, df, drop):
+    assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False)
     assert_eq(
-        df.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False
+        df.x.reset_index(drop=drop), pdf.x.reset_index(drop=drop), check_index=False
     )
 
 
-def test_head(bdf, df):
-    assert_eq(df.head(compute=False), bdf.head())
-    assert_eq(df.head(compute=False, n=7), bdf.head(n=7))
+def test_head(pdf, df):
+    assert_eq(df.head(compute=False), pdf.head())
+    assert_eq(df.head(compute=False, n=7), pdf.head(n=7))
 
     assert df.head(compute=False).npartitions == 1
 
@@ -482,9 +475,9 @@ def test_head_head(df):
     assert a.optimize()._name == b.optimize()._name
 
 
-def test_tail(bdf, df):
-    assert_eq(df.tail(compute=False), bdf.tail())
-    assert_eq(df.tail(compute=False, n=7), bdf.tail(n=7))
+def test_tail(pdf, df):
+    assert_eq(df.tail(compute=False), pdf.tail())
+    assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7))
 
     assert df.tail(compute=False).npartitions == 1
 
@@ -519,10 +512,10 @@ def test_projection_stacking(df):
     assert optimized._name == expected._name
 
 
-def test_projection_stacking_coercion(bdf):
-    df = from_pandas(bdf)
-    assert_eq(df.x[0], bdf.x[0], check_divisions=False)
-    assert_eq(df.x[[0]], bdf.x[[0]], check_divisions=False)
+def test_projection_stacking_coercion(pdf):
+    df = from_pandas(pdf)
+    assert_eq(df.x[0], pdf.x[0], check_divisions=False)
+    assert_eq(df.x[[0]], pdf.x[[0]], check_divisions=False)
 
 
 def test_remove_unnecessary_projections(df):
@@ -571,8 +564,8 @@ def test_substitute():
     assert result._name == expected._name
 
 
-def test_from_pandas(bdf):
-    df = from_pandas(bdf, npartitions=3)
+def test_from_pandas(pdf):
+    df = from_pandas(pdf, npartitions=3)
     assert df.npartitions == 3
     assert "pandas" in df._name
 
@@ -587,12 +580,12 @@ def test_copy(df):
     assert "z" not in original.columns
 
 
-def test_partitions(bdf, df):
-    assert_eq(df.partitions[0], bdf.iloc[:10])
-    assert_eq(df.partitions[1], bdf.iloc[10:20])
-    assert_eq(df.partitions[1:3], bdf.iloc[10:30])
-    assert_eq(df.partitions[[3, 4]], bdf.iloc[30:50])
-    assert_eq(df.partitions[-1], bdf.iloc[90:])
+def test_partitions(pdf, df):
+    assert_eq(df.partitions[0], pdf.iloc[:10])
+    assert_eq(df.partitions[1], pdf.iloc[10:20])
+    assert_eq(df.partitions[1:3], pdf.iloc[10:30])
+    assert_eq(df.partitions[[3, 4]], pdf.iloc[30:50])
+    assert_eq(df.partitions[-1], pdf.iloc[90:])
 
     out = (df + 1).partitions[0].simplify()
     assert isinstance(out.expr, expr.Add)
@@ -601,7 +594,7 @@ def test_partitions(bdf, df):
     # Check culling
     out = optimize(df.partitions[1])
     assert len(out.dask) == 1
-    assert_eq(out, bdf.iloc[10:20])
+    assert_eq(out, pdf.iloc[10:20])
 
 
 def test_column_getattr(df):
@@ -612,10 +605,10 @@ def test_column_getattr(df):
         df.foo
 
 
-def test_serialization(bdf, df):
+def test_serialization(pdf, df):
     before = pickle.dumps(df)
 
-    assert len(before) < 200 + len(pickle.dumps(bdf))
+    assert len(before) < 200 + len(pickle.dumps(pdf))
 
     part = df.partitions[0].compute()
     assert (
@@ -733,8 +726,8 @@ def test_partitions_nested(df):
 
 @pytest.mark.parametrize("sort", [True, False])
 @pytest.mark.parametrize("npartitions", [7, 12])
-def test_repartition_npartitions(bdf, npartitions, sort):
-    df = from_pandas(bdf, sort=sort) + 1
+def test_repartition_npartitions(pdf, npartitions, sort):
+    df = from_pandas(pdf, sort=sort) + 1
     df2 = df.repartition(npartitions=npartitions)
     assert df2.npartitions == npartitions
     assert_eq(df, df2)
@@ -762,11 +755,11 @@ def test_repartition_no_op(df):
     assert result._name == df._name
 
 
-def test_len(df, bdf):
+def test_len(df, pdf):
     df2 = df[["x"]] + 1
-    assert len(df2) == len(bdf)
+    assert len(df2) == len(pdf)
 
-    assert len(df[df.x > 5]) == len(bdf[bdf.x > 5])
+    assert len(df[df.x > 5]) == len(pdf[pdf.x > 5])
 
     first = df2.partitions[0].compute()
     assert len(df2.partitions[0]) == len(first)
@@ -775,12 +768,12 @@ def test_len(df, bdf):
     assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal)
 
 
-def test_astype_simplify(df, bdf):
+def test_astype_simplify(df, pdf):
     q = df.astype({"x": "float64", "y": "float64"})["x"]
     result = q.simplify()
     expected = df["x"].astype({"x": "float64"})
     assert result._name == expected._name
-    assert_eq(q, bdf.astype({"x": "float64", "y": "float64"})["x"])
+    assert_eq(q, pdf.astype({"x": "float64", "y": "float64"})["x"])
 
     q = df.astype({"y": "float64"})["x"]
     result = q.simplify()
@@ -793,13 +786,13 @@ def test_astype_simplify(df, bdf):
     assert result._name == expected._name
 
 
-def test_drop_duplicates(df, bdf):
-    assert_eq(df.drop_duplicates(), bdf.drop_duplicates())
+def test_drop_duplicates(df, pdf):
+    assert_eq(df.drop_duplicates(), pdf.drop_duplicates())
     assert_eq(
-        df.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True)
+        df.drop_duplicates(ignore_index=True), pdf.drop_duplicates(ignore_index=True)
     )
-    assert_eq(df.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"]))
-    assert_eq(df.x.drop_duplicates(), bdf.x.drop_duplicates())
+    assert_eq(df.drop_duplicates(subset=["x"]), pdf.drop_duplicates(subset=["x"]))
+    assert_eq(df.x.drop_duplicates(), pdf.x.drop_duplicates())
 
     if BACKEND == "pandas":
         with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")):
@@ -809,15 +802,15 @@ def test_drop_duplicates(df, bdf):
         df.x.drop_duplicates(subset=["a"])
 
 
-def test_unique(df, bdf):
+def test_unique(df, pdf):
     with pytest.raises(
         AttributeError, match="'DataFrame' object has no attribute 'unique'"
     ):
         df.unique()
 
     # pandas returns a numpy array while we return a Series/Index
-    assert_eq(df.x.unique(), lib.Series(bdf.x.unique(), name="x"))
-    assert_eq(df.index.unique(), lib.Index(bdf.index.unique()))
+    assert_eq(df.x.unique(), lib.Series(pdf.x.unique(), name="x"))
+    assert_eq(df.index.unique(), lib.Index(pdf.index.unique()))
 
 
 def test_walk(df):
@@ -848,14 +841,14 @@ def test_find_operations(df):
 
 
 @pytest.mark.parametrize("subset", ["x", ["x"]])
-def test_dropna_simplify(bdf, subset):
-    bdf["z"] = 1
-    df = from_pandas(bdf)
+def test_dropna_simplify(pdf, subset):
+    pdf["z"] = 1
+    df = from_pandas(pdf)
     q = df.dropna(subset=subset)["y"]
     result = q.simplify()
     expected = df[["x", "y"]].dropna(subset=subset)["y"]
     assert result._name == expected._name
-    assert_eq(q, bdf.dropna(subset=subset)["y"])
+    assert_eq(q, pdf.dropna(subset=subset)["y"])
 
 
 def test_dir(df):
@@ -879,13 +872,13 @@ def test_dir(df):
     ],
 )
 @pytest.mark.parametrize("indexer", ["x", ["x"]])
-def test_simplify_up_blockwise(df, bdf, func, args, indexer):
+def test_simplify_up_blockwise(df, pdf, func, args, indexer):
     q = getattr(df, func)(*args)[indexer]
     result = q.simplify()
     expected = getattr(df[indexer], func)(*args)
     assert result._name == expected._name
 
-    assert_eq(q, getattr(bdf, func)(*args)[indexer])
+    assert_eq(q, getattr(pdf, func)(*args)[indexer])
 
     q = getattr(df, func)(*args)[["x", "y"]]
     result = q.simplify()
@@ -903,16 +896,16 @@ def test_sample(df):
     assert_eq(result, expected)
 
 
-def test_align(df, bdf):
+def test_align(df, pdf):
     if BACKEND == "cudf":
         pytest.skip(reason="align not supported by cudf")
     result_1, result_2 = df.align(df)
-    pdf_result_1, pdf_result_2 = bdf.align(bdf)
+    pdf_result_1, pdf_result_2 = pdf.align(pdf)
     assert_eq(result_1, pdf_result_1)
     assert_eq(result_2, pdf_result_2)
 
     result_1, result_2 = df.x.align(df.x)
-    pdf_result_1, pdf_result_2 = bdf.x.align(bdf.x)
+    pdf_result_1, pdf_result_2 = pdf.x.align(pdf.x)
     assert_eq(result_1, pdf_result_1)
     assert_eq(result_2, pdf_result_2)
 
@@ -962,51 +955,51 @@ def test_nunique_approx(df):
     assert 99 < result < 101
 
 
-def test_assign_simplify(bdf):
-    df = from_pandas(bdf)
-    df2 = from_pandas(bdf)
+def test_assign_simplify(pdf):
+    df = from_pandas(pdf)
+    df2 = from_pandas(pdf)
     df["new"] = df.x > 1
     result = df[["x", "new"]].simplify()
     expected = df2[["x"]].assign(new=df2.x > 1).simplify()
     assert result._name == expected._name
 
-    bdf["new"] = bdf.x > 1
-    assert_eq(bdf[["x", "new"]], result)
+    pdf["new"] = pdf.x > 1
+    assert_eq(pdf[["x", "new"]], result)
 
 
-def test_assign_simplify_new_column_not_needed(bdf):
-    df = from_pandas(bdf)
-    df2 = from_pandas(bdf)
+def test_assign_simplify_new_column_not_needed(pdf):
+    df = from_pandas(pdf)
+    df2 = from_pandas(pdf)
     df["new"] = df.x > 1
     result = df[["x"]].simplify()
     expected = df2[["x"]].simplify()
     assert result._name == expected._name
 
-    bdf["new"] = bdf.x > 1
-    assert_eq(result, bdf[["x"]])
+    pdf["new"] = pdf.x > 1
+    assert_eq(result, pdf[["x"]])
 
 
-def test_assign_simplify_series(bdf):
-    df = from_pandas(bdf)
-    df2 = from_pandas(bdf)
+def test_assign_simplify_series(pdf):
+    df = from_pandas(pdf)
+    df2 = from_pandas(pdf)
     df["new"] = df.x > 1
     result = df.new.simplify()
     expected = df2[[]].assign(new=df2.x > 1).new.simplify()
     assert result._name == expected._name
 
 
-def test_assign_non_series_inputs(df, bdf):
+def test_assign_non_series_inputs(df, pdf):
     if BACKEND == "cudf":
         pytest.xfail(reason="assign function not supported by cudf")
-    assert_eq(df.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2))
-    assert_eq(df.assign(a=2), bdf.assign(a=2))
-    assert_eq(df.assign(a=df.x.sum()), bdf.assign(a=bdf.x.sum()))
+    assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2))
+    assert_eq(df.assign(a=2), pdf.assign(a=2))
+    assert_eq(df.assign(a=df.x.sum()), pdf.assign(a=pdf.x.sum()))
 
-    assert_eq(df.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y)
-    assert_eq(df.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a)
+    assert_eq(df.assign(a=lambda x: x.x * 2).y, pdf.assign(a=lambda x: x.x * 2).y)
+    assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a)
 
 
-def test_are_co_aligned(bdf, df):
+def test_are_co_aligned(pdf, df):
     df2 = df.reset_index()
     assert are_co_aligned(df.expr, df2.expr)
     assert are_co_aligned(df.expr, df2.sum().expr)
@@ -1015,8 +1008,8 @@ def test_are_co_aligned(bdf, df):
     assert are_co_aligned(df.expr, df.sum().expr)
     assert are_co_aligned((df + df.sum()).expr, df.sum().expr)
 
-    bdf = bdf.assign(z=1)
-    df3 = from_pandas(bdf, npartitions=10)
+    pdf = pdf.assign(z=1)
+    df3 = from_pandas(pdf, npartitions=10)
     assert not are_co_aligned(df.expr, df3.expr)
     assert are_co_aligned(df.expr, df3.sum().expr)
 
@@ -1052,8 +1045,7 @@ def test_op_align():
     assert_eq(df - df2, pdf - pdf2)
 
 
-def test_can_co_align(df, bdf):
-    pdf = bdf.copy()
+def test_can_co_align(df, pdf):
     q = (df.x + df.y).optimize(fuse=False)
     expected = df.x + df.y
     assert q._name == expected._name

From 6d308d7b6f076a81496129f27cf774d33f30af86 Mon Sep 17 00:00:00 2001
From: Rick Zamora <rzamora217@gmail.com>
Date: Fri, 21 Jul 2023 14:51:01 -0500
Subject: [PATCH 13/18] update test

---
 dask_expr/tests/test_collection.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 725eacfd9..fb49d1173 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -143,7 +143,7 @@ def test_reductions(func, pdf, df):
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("ddof", [1, 2])
 def test_std_kwargs(axis, skipna, ddof):
-    pdf = pd.DataFrame(
+    pdf = lib.DataFrame(
         {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15}
     )
     df = from_pandas(pdf, npartitions=3)
@@ -581,7 +581,7 @@ def test_substitute():
 
 
 def test_substitute_parameters(df):
-    pdf = pd.DataFrame(
+    pdf = lib.DataFrame(
         {
             "a": range(100),
             "b": range(100),

From b9624da452dc7c1b9faa28621ddc6ee79cd1a250 Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Fri, 21 Jul 2023 15:38:45 -0700
Subject: [PATCH 14/18] address problems with cudf var/std behavior

---
 dask_expr/_reductions.py           | 7 ++++++-
 dask_expr/tests/test_collection.py | 2 ++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py
index bf56e8810..dcadba327 100644
--- a/dask_expr/_reductions.py
+++ b/dask_expr/_reductions.py
@@ -417,7 +417,7 @@ def aggregate_kwargs(self):
     @classmethod
     def reduction_chunk(cls, x, skipna=True, numeric_only=False):
         kwargs = {"numeric_only": numeric_only} if is_dataframe_like(x) else {}
-        if skipna:
+        if skipna or numeric_only:
             n = x.count(**kwargs)
             kwargs["skipna"] = skipna
             avg = x.mean(**kwargs)
@@ -427,6 +427,11 @@ def reduction_chunk(cls, x, skipna=True, numeric_only=False):
             n = len(x)
             kwargs["skipna"] = skipna
             avg = x.sum(**kwargs) / n
+        if numeric_only:
+            # Workaround for cudf bug
+            # (see: https://github.com/rapidsai/cudf/issues/13731)
+            x = x.select_dtypes("number")
+            n = n.loc[x.columns]
         m2 = ((x - avg) ** 2).sum(**kwargs)
         return n, avg, m2
 
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index fb49d1173..5586cafd7 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -143,6 +143,8 @@ def test_reductions(func, pdf, df):
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("ddof", [1, 2])
 def test_std_kwargs(axis, skipna, ddof):
+    if BACKEND == "cudf" and skipna is False:
+        pytest.xfail(reason="cudf requires skipna=True when nulls are present.")
     pdf = lib.DataFrame(
         {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15}
     )

From f502717a4d4a29f0bb2da6668f9ad3af2e72c539 Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 24 Jul 2023 08:22:31 -0700
Subject: [PATCH 15/18] use decorators

---
 dask_expr/tests/test_collection.py | 56 +++++++++++-------------------
 1 file changed, 20 insertions(+), 36 deletions(-)

diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 5586cafd7..9293df46c 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -3,7 +3,6 @@
 import importlib
 import operator
 import pickle
-import re
 
 import dask
 import numpy as np
@@ -19,6 +18,7 @@
 
 # Import backend DataFrame library to test
 BACKEND = dask.config.get("dataframe.backend", "pandas")
+CUDF_BACKEND = BACKEND == "cudf"
 lib = importlib.import_module(BACKEND)
 
 
@@ -53,18 +53,16 @@ def test_setitem(pdf, df):
     assert_eq(df, pdf)
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="https://github.com/rapidsai/cudf/issues/10271")
 def test_explode():
-    if BACKEND == "cudf":
-        pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271")
     pdf = lib.DataFrame({"a": [[1, 2], [3, 4]]})
     df = from_pandas(pdf)
     assert_eq(pdf.explode(column="a"), df.explode(column="a"))
     assert_eq(pdf.a.explode(), df.a.explode())
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="https://github.com/rapidsai/cudf/issues/10271")
 def test_explode_simplify(pdf):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271")
     pdf["z"] = 1
     df = from_pandas(pdf)
     q = df.explode(column="x")["y"]
@@ -126,7 +124,7 @@ def test_dask(pdf, df):
     ],
 )
 def test_reductions(func, pdf, df):
-    if BACKEND == "cudf" and func in [M.idxmin, M.idxmax]:
+    if CUDF_BACKEND and func in [M.idxmin, M.idxmax]:
         pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602")
     result = func(df)
     assert result.known_divisions
@@ -143,7 +141,7 @@ def test_reductions(func, pdf, df):
 @pytest.mark.parametrize("skipna", [True, False])
 @pytest.mark.parametrize("ddof", [1, 2])
 def test_std_kwargs(axis, skipna, ddof):
-    if BACKEND == "cudf" and skipna is False:
+    if CUDF_BACKEND and skipna is False:
         pytest.xfail(reason="cudf requires skipna=True when nulls are present.")
     pdf = lib.DataFrame(
         {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15}
@@ -155,9 +153,8 @@ def test_std_kwargs(axis, skipna, ddof):
     )
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="nbytes not supported by cudf")
 def test_nbytes(pdf, df):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="nbytes not supported by cudf")
     with pytest.raises(NotImplementedError, match="nbytes is not implemented"):
         df.nbytes
     assert_eq(df.x.nbytes, pdf.x.nbytes)
@@ -283,10 +280,9 @@ def test_and_or(func, pdf, df):
     assert_eq(func(pdf), func(df), check_names=False)
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="period_range not supported by cudf")
 @pytest.mark.parametrize("how", ["start", "end"])
 def test_to_timestamp(pdf, how):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="period_range not supported by cudf")
     pdf.index = lib.period_range("2019-12-31", freq="D", periods=len(pdf))
     df = from_pandas(pdf)
     assert_eq(df.to_timestamp(how=how), pdf.to_timestamp(how=how))
@@ -329,6 +325,7 @@ def test_blockwise(func, pdf, df):
     assert_eq(func(pdf), func(df))
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="func not supported by cudf")
 @pytest.mark.parametrize(
     "func",
     [
@@ -339,14 +336,11 @@ def test_blockwise(func, pdf, df):
     ],
 )
 def test_blockwise_cudf_fails(func, pdf, df):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="func not supported by cudf")
     assert_eq(func(pdf), func(df))
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="rename_axis not supported by cudf")
 def test_rename_axis(pdf):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="rename_axis not supported by cudf")
     pdf.index.name = "a"
     pdf.columns.name = "b"
     df = from_pandas(pdf, npartitions=10)
@@ -378,9 +372,8 @@ def test_repr(df):
     assert "sum(skipna=False)" in s
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="combine_first not supported by cudf")
 def test_combine_first_simplify(pdf):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="combine_first not supported by cudf")
     df = from_pandas(pdf)
     pdf2 = pdf.rename(columns={"y": "z"})
     df2 = from_pandas(pdf2)
@@ -667,9 +660,8 @@ def test_serialization(pdf, df):
     assert_eq(pickle.loads(before), pickle.loads(after))
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="Cannot apply lambda function in cudf")
 def test_size_optimized(df):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="Cannot apply lambda function in cudf")
     expr = (df.x + 1).apply(lambda x: x).size
     out = optimize(expr)
     expected = optimize(df.x.size)
@@ -837,9 +829,8 @@ def test_drop_duplicates(df, pdf):
     assert_eq(df.drop_duplicates(subset=["x"]), pdf.drop_duplicates(subset=["x"]))
     assert_eq(df.x.drop_duplicates(), pdf.x.drop_duplicates())
 
-    if BACKEND == "pandas":
-        with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")):
-            df.drop_duplicates(subset=["a"])
+    with pytest.raises(KeyError, match="'a'"):
+        df.drop_duplicates(subset=["a"])
 
     with pytest.raises(TypeError, match="got an unexpected keyword argument"):
         df.x.drop_duplicates(subset=["a"])
@@ -939,9 +930,8 @@ def test_sample(df):
     assert_eq(result, expected)
 
 
+@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
 def test_align(df, pdf):
-    if BACKEND == "cudf":
-        pytest.skip(reason="align not supported by cudf")
     result_1, result_2 = df.align(df)
     pdf_result_1, pdf_result_2 = pdf.align(pdf)
     assert_eq(result_1, pdf_result_1)
@@ -953,9 +943,8 @@ def test_align(df, pdf):
     assert_eq(result_2, pdf_result_2)
 
 
+@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
 def test_align_different_partitions():
-    if BACKEND == "cudf":
-        pytest.skip(reason="align not supported by cudf")
     pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]})
     df = from_pandas(pdf, npartitions=2)
     pdf2 = lib.DataFrame(
@@ -969,9 +958,8 @@ def test_align_different_partitions():
     assert_eq(result_2, pdf_result_2)
 
 
+@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
 def test_align_unknown_partitions_same_root():
-    if BACKEND == "cudf":
-        pytest.skip(reason="align not supported by cudf")
     pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)
     result_1, result_2 = df.align(df)
@@ -980,9 +968,8 @@ def test_align_unknown_partitions_same_root():
     assert_eq(result_2, pdf_result_2)
 
 
+@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
 def test_unknown_partitions_different_root():
-    if BACKEND == "cudf":
-        pytest.skip(reason="align not supported by cudf")
     pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)
     pdf2 = lib.DataFrame({"a": 1}, index=[4, 3, 2, 1])
@@ -991,9 +978,8 @@ def test_unknown_partitions_different_root():
         df.align(df2)
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="compute_hll_array doesn't work for cudf")
 def test_nunique_approx(df):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="compute_hll_array doesn't work for cudf")
     result = df.nunique_approx().compute()
     assert 99 < result < 101
 
@@ -1031,9 +1017,8 @@ def test_assign_simplify_series(pdf):
     assert result._name == expected._name
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="assign function not supported by cudf")
 def test_assign_non_series_inputs(df, pdf):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="assign function not supported by cudf")
     assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2))
     assert_eq(df.assign(a=2), pdf.assign(a=2))
     assert_eq(df.assign(a=df.x.sum()), pdf.assign(a=pdf.x.sum()))
@@ -1063,9 +1048,8 @@ def test_are_co_aligned(pdf, df):
     assert not are_co_aligned(merged_first.expr, df.expr)
 
 
+@pytest.mark.xfail(CUDF_BACKEND, reason="TODO")
 def test_astype_categories(df):
-    if BACKEND == "cudf":
-        pytest.xfail(reason="TODO")
     result = df.astype("category")
     assert_eq(result.x._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES]))
     assert_eq(result.y._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES]))

From fc884022e89ec1f8aa65a4b15490037f36c2bcdb Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 24 Jul 2023 08:26:39 -0700
Subject: [PATCH 16/18] rename _set_engine to _set_parquet_engine

---
 dask_expr/_collection.py | 4 ++--
 dask_expr/io/parquet.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py
index 4b23dd1f3..1de19f30b 100644
--- a/dask_expr/_collection.py
+++ b/dask_expr/_collection.py
@@ -1081,7 +1081,7 @@ def read_parquet(
     engine=None,
     **kwargs,
 ):
-    from dask_expr.io.parquet import ReadParquet, _set_engine
+    from dask_expr.io.parquet import ReadParquet, _set_parquet_engine
 
     if not isinstance(path, str):
         path = stringify_path(path)
@@ -1104,7 +1104,7 @@ def read_parquet(
             aggregate_files=aggregate_files,
             parquet_file_extension=parquet_file_extension,
             filesystem=filesystem,
-            engine=_set_engine(engine),
+            engine=_set_parquet_engine(engine),
             kwargs=kwargs,
         )
     )
diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py
index 4dec13345..8f9927f07 100644
--- a/dask_expr/io/parquet.py
+++ b/dask_expr/io/parquet.py
@@ -176,7 +176,7 @@ def to_parquet(
     from dask_expr._collection import new_collection
     from dask_expr.io.parquet import NONE_LABEL, ToParquet
 
-    engine = _set_engine(meta=df._meta)
+    engine = _set_parquet_engine(meta=df._meta)
     compute_kwargs = compute_kwargs or {}
 
     partition_on = partition_on or []
@@ -686,7 +686,7 @@ def _update_length_statistics(self):
 #
 
 
-def _set_engine(engine=None, meta=None):
+def _set_parquet_engine(engine=None, meta=None):
     # Use `engine` or `meta` input to set the parquet engine
     if engine is None:
         if (

From 5d927c19eb654d349e3103e4a3a70d1953d4e76e Mon Sep 17 00:00:00 2001
From: rjzamora <rzamora217@gmail.com>
Date: Mon, 24 Jul 2023 10:11:04 -0700
Subject: [PATCH 17/18] introduce _required_attribute

---
 dask_expr/_expr.py                 | 26 +++++++++++++++++++++++++-
 dask_expr/_reductions.py           | 10 ++++++++++
 dask_expr/tests/test_collection.py | 22 +++++++++++-----------
 3 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/dask_expr/_expr.py b/dask_expr/_expr.py
index 2e08b4e18..6d25db7f4 100644
--- a/dask_expr/_expr.py
+++ b/dask_expr/_expr.py
@@ -54,6 +54,18 @@ def __init__(self, *args, **kwargs):
                 operands.append(type(self)._defaults[parameter])
         assert not kwargs
         self.operands = operands
+        if self._required_attribute:
+            dep = next(iter(self.dependencies()))._meta
+            if not hasattr(dep, self._required_attribute):
+                # Raise a ValueError instead of AttributeError to
+                # avoid infinite recursion
+                raise ValueError(f"{dep} has no attribute {self._required_attribute}")
+
+    @property
+    def _required_attribute(self) -> str:
+        # Specify if the first `dependency` must support
+        # a specific attribute for valid behavior.
+        return None
 
     @functools.cached_property
     def ndim(self):
@@ -941,6 +953,12 @@ class Blockwise(Expr):
     _keyword_only = []
     _projection_passthrough = False
 
+    @property
+    def _required_attribute(self):
+        if isinstance(self.operation, type(M.method_caller)):
+            return self.operation.method
+        return None
+
     @functools.cached_property
     def _meta(self):
         args = [op._meta if isinstance(op, Expr) else op for op in self._args]
@@ -1027,7 +1045,13 @@ def _combine_similar(self, root: Expr):
         # Push projections back up through `_projection_passthrough`
         # operations if it reduces the number of unique expression nodes.
         if self._projection_passthrough and isinstance(self.frame, Projection):
-            common = type(self)(self.frame.frame, *self.operands[1:])
+            try:
+                common = type(self)(self.frame.frame, *self.operands[1:])
+            except ValueError:
+                # May have encountered a problem with `_required_attribute`.
+                # (There is no guarantee that the same method will exist for
+                # both a Series and DataFrame)
+                return None
             projection = self.frame.operand("columns")
             push_up_projection = False
             for op in self._find_similar_operations(root, ignore=self._parameters):
diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py
index dcadba327..592ffea9b 100644
--- a/dask_expr/_reductions.py
+++ b/dask_expr/_reductions.py
@@ -42,6 +42,15 @@ class ApplyConcatApply(Expr):
     combine_kwargs = {}
     aggregate_kwargs = {}
 
+    # def __init__(self, *args, **kwargs):
+    #     super().__init__(*args, **kwargs)
+    #     if self._required_attribute:
+    #         dep = next(iter(self.dependencies()))._meta
+    #         if not hasattr(dep, self._required_attribute):
+    #             # Raise a ValueError instead of AttributeError to
+    #             # avoid infinite recursion
+    #             raise ValueError(f"{dep} has no attribute {self._required_attribute}")
+
     def __dask_postcompute__(self):
         return toolz.first, ()
 
@@ -386,6 +395,7 @@ class NBytes(Reduction):
     # Only supported for Series objects
     reduction_chunk = lambda ser: ser.nbytes
     reduction_aggregate = sum
+    _required_attribute = "nbytes"
 
 
 class Var(Reduction):
diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py
index 9293df46c..410e40125 100644
--- a/dask_expr/tests/test_collection.py
+++ b/dask_expr/tests/test_collection.py
@@ -7,7 +7,7 @@
 import dask
 import numpy as np
 import pytest
-from dask.dataframe._compat import PANDAS_GT_210
+from dask.dataframe._compat import PANDAS_GE_210
 from dask.dataframe.utils import UNKNOWN_CATEGORIES, assert_eq
 from dask.utils import M
 
@@ -293,12 +293,6 @@ def test_to_timestamp(pdf, how):
     "func",
     [
         lambda df: df.astype(int),
-        pytest.param(
-            lambda df: df.map(lambda x: x + 1),
-            marks=pytest.mark.skipif(
-                not PANDAS_GT_210, reason="Only available from 2.1"
-            ),
-        ),
         lambda df: df.clip(lower=10, upper=50),
         lambda df: df.x.clip(lower=10, upper=50),
         lambda df: df.x.between(left=10, right=50),
@@ -331,11 +325,17 @@ def test_blockwise(func, pdf, df):
     [
         lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2),
         lambda df: df.index.map(lambda x: x + 1),
+        pytest.param(
+            lambda df: df.map(lambda x: x + 1),
+            marks=pytest.mark.skipif(
+                not PANDAS_GE_210, reason="Only available from 2.1"
+            ),
+        ),
         lambda df: df.combine_first(df),
         lambda df: df.x.combine_first(df.y),
     ],
 )
-def test_blockwise_cudf_fails(func, pdf, df):
+def test_blockwise_pandas_only(func, pdf, df):
     assert_eq(func(pdf), func(df))
 
 
@@ -930,7 +930,7 @@ def test_sample(df):
     assert_eq(result, expected)
 
 
-@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
+@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf")
 def test_align(df, pdf):
     result_1, result_2 = df.align(df)
     pdf_result_1, pdf_result_2 = pdf.align(pdf)
@@ -943,7 +943,7 @@ def test_align(df, pdf):
     assert_eq(result_2, pdf_result_2)
 
 
-@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
+@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf")
 def test_align_different_partitions():
     pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]})
     df = from_pandas(pdf, npartitions=2)
@@ -958,7 +958,7 @@ def test_align_different_partitions():
     assert_eq(result_2, pdf_result_2)
 
 
-@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf")
+@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf")
 def test_align_unknown_partitions_same_root():
     pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1])
     df = from_pandas(pdf, npartitions=2, sort=False)

From 7f87c8ffa021dd8ea2d4cf8d76420e4a8ee8f858 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 24 Jul 2023 12:19:37 -0500
Subject: [PATCH 18/18] Update dask_expr/_reductions.py

---
 dask_expr/_reductions.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py
index 592ffea9b..5515799a7 100644
--- a/dask_expr/_reductions.py
+++ b/dask_expr/_reductions.py
@@ -42,15 +42,6 @@ class ApplyConcatApply(Expr):
     combine_kwargs = {}
     aggregate_kwargs = {}
 
-    # def __init__(self, *args, **kwargs):
-    #     super().__init__(*args, **kwargs)
-    #     if self._required_attribute:
-    #         dep = next(iter(self.dependencies()))._meta
-    #         if not hasattr(dep, self._required_attribute):
-    #             # Raise a ValueError instead of AttributeError to
-    #             # avoid infinite recursion
-    #             raise ValueError(f"{dep} has no attribute {self._required_attribute}")
-
     def __dask_postcompute__(self):
         return toolz.first, ()