From 9875b2783c36716f957bdc91e9b946b8fefa4d8a Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 5 Jul 2023 13:37:05 -0700 Subject: [PATCH 01/18] basic cudf backend support --- dask_expr/io/parquet.py | 4 ++ dask_expr/tests/test_collection.py | 68 ++++++++++++++++++++++++++++-- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py index 0205a2f12..de116d7dc 100644 --- a/dask_expr/io/parquet.py +++ b/dask_expr/io/parquet.py @@ -417,6 +417,10 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO): @property def engine(self): + if dask.config.get("dataframe.backend", "pandas") == "cudf": + from dask_cudf.io.parquet import CudfEngine + + return CudfEngine return get_engine("pyarrow") @property diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 3641c4320..0158736d1 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -1,3 +1,6 @@ +from __future__ import annotations + +import functools import operator import pickle import re @@ -16,11 +19,27 @@ from dask_expr.reductions import Len -@pytest.fixture -def pdf(): +try: + import cudf +except ImportError: + cudf = None + + +@functools.cache +def _pdf(): pdf = pd.DataFrame({"x": range(100)}) pdf["y"] = pdf.x * 10.0 - yield pdf + return pdf + + +@functools.cache +def _gdf(): + return None if cudf is None else cudf.from_pandas(_pdf()) + + +@pytest.fixture +def pdf(): + yield _pdf().copy() @pytest.fixture @@ -28,6 +47,29 @@ def df(pdf): yield from_pandas(pdf, npartitions=10) +def cpu_gpu(data: dict | None = None, npartitions: int = 10): + """DataFrame parameterization for cpu and gpu backed data""" + if data is None: + pdf, gdf = _pdf().copy(), _gdf().copy() + else: + pdf = pd.DataFrame(data) + gdf = None if cudf is None else cudf.from_pandas(pdf) + + return pytest.mark.parametrize( + "pdf,df", + [ + pytest.param(pdf, from_pandas(pdf, npartitions), id="pandas"), + pytest.param( + gdf, + 
from_pandas(gdf, npartitions) if cudf else None, + id="cudf", + marks=pytest.mark.skipif(cudf is None, reason="cudf not found"), + ), + ], + ) + + +@cpu_gpu() def test_del(pdf, df): pdf = pdf.copy() @@ -37,6 +79,7 @@ def test_del(pdf, df): assert_eq(pdf, df) +@cpu_gpu() def test_setitem(pdf, df): pdf = pdf.copy() pdf["z"] = pdf.x + pdf.y @@ -87,6 +130,7 @@ def test_meta_blockwise(): assert set(cc.columns) == {"x", "y", "z"} +@cpu_gpu() def test_dask(pdf, df): assert (df.x + df.y).npartitions == 10 z = (df.x + df.y).sum() @@ -168,6 +212,7 @@ def test_memory_usage(pdf): df.index.memory_usage(index=True) +@cpu_gpu() @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest]) def test_nlargest_nsmallest(df, pdf, func): assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x")) @@ -176,6 +221,7 @@ def test_nlargest_nsmallest(df, pdf, func): func(df.x, n=5, columns="foo") +@cpu_gpu() @pytest.mark.parametrize( "func", [ @@ -232,6 +278,7 @@ def test_unary_operators(func): assert_eq(func(pdf), func(df)) +@cpu_gpu() @pytest.mark.parametrize( "func", [ @@ -300,6 +347,7 @@ def test_rename_axis(pdf): assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy")) +@cpu_gpu() def test_isin(df, pdf): values = [1, 2] assert_eq(pdf.isin(values), df.isin(values)) @@ -341,6 +389,7 @@ def test_rename_traverse_filter(df): assert str(result) == str(expected) +@cpu_gpu() def test_columns_traverse_filters(pdf, df): result = df[df.x > 5].y.simplify() expected = df.y[df.x > 5] @@ -377,6 +426,7 @@ def test_drop_duplicates_subset_simplify(pdf, subset, projection): assert str(result) == str(expected) +@cpu_gpu() def test_broadcast(pdf, df): assert_eq( df + df.sum(), @@ -388,6 +438,7 @@ def test_broadcast(pdf, df): ) +@cpu_gpu() def test_persist(pdf, df): a = df + 2 b = a.persist() @@ -400,11 +451,13 @@ def test_persist(pdf, df): assert_eq(b.y.sum(), (pdf + 2).y.sum()) +@cpu_gpu() def test_index(pdf, df): assert_eq(df.index, pdf.index) assert_eq(df.x.index, pdf.x.index) 
+@cpu_gpu() @pytest.mark.parametrize("drop", [True, False]) def test_reset_index(pdf, df, drop): assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False) @@ -413,6 +466,7 @@ def test_reset_index(pdf, df, drop): ) +@cpu_gpu() def test_head(pdf, df): assert_eq(df.head(compute=False), pdf.head()) assert_eq(df.head(compute=False, n=7), pdf.head(n=7)) @@ -436,6 +490,7 @@ def test_head_head(df): assert a.optimize()._name == b.optimize()._name +@cpu_gpu() def test_tail(pdf, df): assert_eq(df.tail(compute=False), pdf.tail()) assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7)) @@ -531,6 +586,7 @@ def test_from_pandas(pdf): assert "pandas" in df._name +@cpu_gpu() def test_copy(pdf, df): original = df.copy() columns = tuple(original.columns) @@ -541,6 +597,7 @@ def test_copy(pdf, df): assert "z" not in original.columns +@cpu_gpu() def test_partitions(pdf, df): assert_eq(df.partitions[0], pdf.iloc[:10]) assert_eq(df.partitions[1], pdf.iloc[10:20]) @@ -566,6 +623,7 @@ def test_column_getattr(df): df.foo +@cpu_gpu() def test_serialization(pdf, df): before = pickle.dumps(df) @@ -711,6 +769,7 @@ def test_repartition_no_op(df): assert result._name == df._name +@cpu_gpu() def test_len(df, pdf): df2 = df[["x"]] + 1 assert len(df2) == len(pdf) @@ -724,6 +783,7 @@ def test_len(df, pdf): assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal) +@cpu_gpu() def test_astype_simplify(df, pdf): q = df.astype({"x": "float64", "y": "float64"})["x"] result = q.simplify() @@ -813,6 +873,7 @@ def test_dir(df): assert "sum" in dir(df.index) +@cpu_gpu() @pytest.mark.parametrize( "func, args", [ @@ -941,6 +1002,7 @@ def test_assign_non_series_inputs(df, pdf): assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a) +@cpu_gpu() def test_are_co_aligned(pdf, df): df2 = df.reset_index() assert are_co_aligned(df.expr, df2.expr) From 3afa3b9ec408a7d257d7baa1a25cbfa9d87e1b86 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Wed, 5 Jul 2023 13:39:57 
-0700 Subject: [PATCH 02/18] formatting --- dask_expr/tests/test_collection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 0158736d1..1d514f637 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -18,7 +18,6 @@ from dask_expr.expr import are_co_aligned from dask_expr.reductions import Len - try: import cudf except ImportError: @@ -49,6 +48,7 @@ def df(pdf): def cpu_gpu(data: dict | None = None, npartitions: int = 10): """DataFrame parameterization for cpu and gpu backed data""" + if data is None: pdf, gdf = _pdf().copy(), _gdf().copy() else: From bead5cbe97eadeca7b6bd011cfec47be651d21fe Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 6 Jul 2023 12:56:51 -0700 Subject: [PATCH 03/18] partial revision --- dask_expr/collection.py | 11 +++ dask_expr/io/parquet.py | 11 +-- dask_expr/io/tests/test_io.py | 23 ++++++- dask_expr/tests/test_collection.py | 104 +++++++++++------------------ 4 files changed, 77 insertions(+), 72 deletions(-) diff --git a/dask_expr/collection.py b/dask_expr/collection.py index 33abc9362..8b9807f06 100644 --- a/dask_expr/collection.py +++ b/dask_expr/collection.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +from dask import config from dask.base import DaskMethodsMixin, is_dask_collection, named_schedulers from dask.dataframe.core import ( _concat, @@ -902,6 +903,7 @@ def read_parquet( aggregate_files=None, parquet_file_extension=(".parq", ".parquet", ".pq"), filesystem="fsspec", + engine=None, **kwargs, ): from dask_expr.io.parquet import ReadParquet @@ -911,6 +913,14 @@ def read_parquet( kwargs["dtype_backend"] = dtype_backend + if engine is None: + if config.get("dataframe.backend", "pandas") == "cudf": + from dask_cudf.io.parquet import CudfEngine + + engine = CudfEngine + else: + engine = "pyarrow" + return new_collection( ReadParquet( path, @@ -927,6 +937,7 @@ def read_parquet( 
aggregate_files=aggregate_files, parquet_file_extension=parquet_file_extension, filesystem=filesystem, + engine=engine, kwargs=kwargs, ) ) diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py index de116d7dc..fa170dd43 100644 --- a/dask_expr/io/parquet.py +++ b/dask_expr/io/parquet.py @@ -391,6 +391,7 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO): "aggregate_files", "parquet_file_extension", "filesystem", + "engine", "kwargs", "_partitions", "_series", @@ -409,6 +410,7 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO): "aggregate_files": None, "parquet_file_extension": (".parq", ".parquet", ".pq"), "filesystem": "fsspec", + "engine": "pyarrow", "kwargs": None, "_partitions": None, "_series": False, @@ -417,11 +419,10 @@ class ReadParquet(PartitionsFiltered, BlockwiseIO): @property def engine(self): - if dask.config.get("dataframe.backend", "pandas") == "cudf": - from dask_cudf.io.parquet import CudfEngine - - return CudfEngine - return get_engine("pyarrow") + _engine = self.operand("engine") + if isinstance(_engine, str): + return get_engine(_engine) + return _engine @property def columns(self): diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py index 6790a5fa6..6baae2da9 100644 --- a/dask_expr/io/tests/test_io.py +++ b/dask_expr/io/tests/test_io.py @@ -3,6 +3,7 @@ import dask.dataframe as dd import pandas as pd import pytest +from dask import config from dask.dataframe.utils import assert_eq from dask_expr import from_dask_dataframe, from_pandas, optimize, read_csv, read_parquet @@ -10,6 +11,23 @@ from dask_expr.io import ReadParquet from dask_expr.reductions import Len +try: + import cudf +except ImportError: + cudf = None + + +@pytest.fixture( + params=[ + "pandas", + pytest.param( + "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.") + ), + ] +) +def backend(request): + yield request.param + def _make_file(dir, format="parquet", df=None): fn = os.path.join(str(dir), f"myfile.{format}") @@ -206,8 
+224,9 @@ def test_from_pandas_immutable(): assert_eq(df, expected) -def test_parquet_complex_filters(tmpdir): - df = read_parquet(_make_file(tmpdir)) +def test_parquet_complex_filters(tmpdir, backend): + with config.set({"dataframe.backend": backend}): + df = read_parquet(_make_file(tmpdir)) pdf = df.compute() got = df["a"][df["b"] > df["b"].mean()] expect = pdf["a"][pdf["b"] > pdf["b"].mean()] diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 1d514f637..5012b6a01 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -1,6 +1,5 @@ from __future__ import annotations -import functools import operator import pickle import re @@ -24,21 +23,11 @@ cudf = None -@functools.cache -def _pdf(): - pdf = pd.DataFrame({"x": range(100)}) - pdf["y"] = pdf.x * 10.0 - return pdf - - -@functools.cache -def _gdf(): - return None if cudf is None else cudf.from_pandas(_pdf()) - - @pytest.fixture def pdf(): - yield _pdf().copy() + pdf = pd.DataFrame({"x": range(100)}) + pdf["y"] = pdf.x * 10.0 + yield pdf @pytest.fixture @@ -46,51 +35,55 @@ def df(pdf): yield from_pandas(pdf, npartitions=10) -def cpu_gpu(data: dict | None = None, npartitions: int = 10): - """DataFrame parameterization for cpu and gpu backed data""" +@pytest.fixture( + params=[ + "pandas", + pytest.param( + "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.") + ), + ] +) +def backend(request): + yield request.param + - if data is None: - pdf, gdf = _pdf().copy(), _gdf().copy() +@pytest.fixture +def bdf(backend, pdf): + # Multi-backend DataFrame fixture + if backend == "cudf": + yield cudf.from_pandas(pdf) else: - pdf = pd.DataFrame(data) - gdf = None if cudf is None else cudf.from_pandas(pdf) - - return pytest.mark.parametrize( - "pdf,df", - [ - pytest.param(pdf, from_pandas(pdf, npartitions), id="pandas"), - pytest.param( - gdf, - from_pandas(gdf, npartitions) if cudf else None, - id="cudf", - marks=pytest.mark.skipif(cudf 
is None, reason="cudf not found"), - ), - ], - ) + yield pdf -@cpu_gpu() -def test_del(pdf, df): +@pytest.fixture +def xdf(bdf): + # Multi-backend Dask-Expression DataFrame fixture + yield from_pandas(bdf, npartitions=10) + + +def test_del(pdf, xdf): pdf = pdf.copy() # Check __delitem__ del pdf["x"] - del df["x"] - assert_eq(pdf, df) + del xdf["x"] + assert_eq(pdf, xdf) -@cpu_gpu() -def test_setitem(pdf, df): +def test_setitem(pdf, xdf): pdf = pdf.copy() pdf["z"] = pdf.x + pdf.y - df["z"] = df.x + df.y + xdf["z"] = xdf.x + xdf.y - assert "z" in df.columns - assert_eq(df, pdf) + assert "z" in xdf.columns + assert_eq(xdf, pdf) def test_explode(): + # CuDF backend does not support explode + # (See: https://github.com/rapidsai/cudf/issues/10271) pdf = pd.DataFrame({"a": [[1, 2], [3, 4]]}) df = from_pandas(pdf) assert_eq(pdf.explode(column="a"), df.explode(column="a")) @@ -130,12 +123,11 @@ def test_meta_blockwise(): assert set(cc.columns) == {"x", "y", "z"} -@cpu_gpu() -def test_dask(pdf, df): - assert (df.x + df.y).npartitions == 10 - z = (df.x + df.y).sum() +def test_dask(bdf, xdf): + assert (xdf.x + xdf.y).npartitions == 10 + z = (xdf.x + xdf.y).sum() - assert assert_eq(z, (pdf.x + pdf.y).sum()) + assert assert_eq(z, (bdf.x + bdf.y).sum()) @pytest.mark.parametrize( @@ -212,7 +204,6 @@ def test_memory_usage(pdf): df.index.memory_usage(index=True) -@cpu_gpu() @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest]) def test_nlargest_nsmallest(df, pdf, func): assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x")) @@ -221,7 +212,6 @@ def test_nlargest_nsmallest(df, pdf, func): func(df.x, n=5, columns="foo") -@cpu_gpu() @pytest.mark.parametrize( "func", [ @@ -278,7 +268,6 @@ def test_unary_operators(func): assert_eq(func(pdf), func(df)) -@cpu_gpu() @pytest.mark.parametrize( "func", [ @@ -347,7 +336,6 @@ def test_rename_axis(pdf): assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy")) -@cpu_gpu() def test_isin(df, pdf): values = [1, 2] 
assert_eq(pdf.isin(values), df.isin(values)) @@ -389,7 +377,6 @@ def test_rename_traverse_filter(df): assert str(result) == str(expected) -@cpu_gpu() def test_columns_traverse_filters(pdf, df): result = df[df.x > 5].y.simplify() expected = df.y[df.x > 5] @@ -426,7 +413,6 @@ def test_drop_duplicates_subset_simplify(pdf, subset, projection): assert str(result) == str(expected) -@cpu_gpu() def test_broadcast(pdf, df): assert_eq( df + df.sum(), @@ -438,7 +424,6 @@ def test_broadcast(pdf, df): ) -@cpu_gpu() def test_persist(pdf, df): a = df + 2 b = a.persist() @@ -451,13 +436,11 @@ def test_persist(pdf, df): assert_eq(b.y.sum(), (pdf + 2).y.sum()) -@cpu_gpu() def test_index(pdf, df): assert_eq(df.index, pdf.index) assert_eq(df.x.index, pdf.x.index) -@cpu_gpu() @pytest.mark.parametrize("drop", [True, False]) def test_reset_index(pdf, df, drop): assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False) @@ -466,7 +449,6 @@ def test_reset_index(pdf, df, drop): ) -@cpu_gpu() def test_head(pdf, df): assert_eq(df.head(compute=False), pdf.head()) assert_eq(df.head(compute=False, n=7), pdf.head(n=7)) @@ -490,7 +472,6 @@ def test_head_head(df): assert a.optimize()._name == b.optimize()._name -@cpu_gpu() def test_tail(pdf, df): assert_eq(df.tail(compute=False), pdf.tail()) assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7)) @@ -586,7 +567,6 @@ def test_from_pandas(pdf): assert "pandas" in df._name -@cpu_gpu() def test_copy(pdf, df): original = df.copy() columns = tuple(original.columns) @@ -597,7 +577,6 @@ def test_copy(pdf, df): assert "z" not in original.columns -@cpu_gpu() def test_partitions(pdf, df): assert_eq(df.partitions[0], pdf.iloc[:10]) assert_eq(df.partitions[1], pdf.iloc[10:20]) @@ -623,7 +602,6 @@ def test_column_getattr(df): df.foo -@cpu_gpu() def test_serialization(pdf, df): before = pickle.dumps(df) @@ -769,7 +747,6 @@ def test_repartition_no_op(df): assert result._name == df._name -@cpu_gpu() def test_len(df, pdf): df2 = df[["x"]] + 1 
assert len(df2) == len(pdf) @@ -783,7 +760,6 @@ def test_len(df, pdf): assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal) -@cpu_gpu() def test_astype_simplify(df, pdf): q = df.astype({"x": "float64", "y": "float64"})["x"] result = q.simplify() @@ -873,7 +849,6 @@ def test_dir(df): assert "sum" in dir(df.index) -@cpu_gpu() @pytest.mark.parametrize( "func, args", [ @@ -1002,7 +977,6 @@ def test_assign_non_series_inputs(df, pdf): assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a) -@cpu_gpu() def test_are_co_aligned(pdf, df): df2 = df.reset_index() assert are_co_aligned(df.expr, df2.expr) From 4381738ba79ec8ea036d7d1fa33701c263a28a8b Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 6 Jul 2023 14:41:57 -0700 Subject: [PATCH 04/18] configure with backend fixture --- dask_expr/_util.py | 7 + dask_expr/collection.py | 3 +- dask_expr/tests/test_collection.py | 621 +++++++++++++++-------------- 3 files changed, 340 insertions(+), 291 deletions(-) diff --git a/dask_expr/_util.py b/dask_expr/_util.py index a206fa9ad..671f09701 100644 --- a/dask_expr/_util.py +++ b/dask_expr/_util.py @@ -1,5 +1,7 @@ from __future__ import annotations +from dask import config + def _convert_to_list(column) -> list | None: if column is None or isinstance(column, list): @@ -11,3 +13,8 @@ def _convert_to_list(column) -> list | None: else: column = [column] return column + + +def _maybe_import_backend(): + if config.get("dataframe.backend", "pandas") == "cudf": + import dask_cudf # noqa F401 diff --git a/dask_expr/collection.py b/dask_expr/collection.py index 8b9807f06..87a328664 100644 --- a/dask_expr/collection.py +++ b/dask_expr/collection.py @@ -24,7 +24,7 @@ from tlz import first from dask_expr import expr -from dask_expr._util import _convert_to_list +from dask_expr._util import _convert_to_list, _maybe_import_backend from dask_expr.concat import Concat from dask_expr.expr import Eval, no_default from dask_expr.merge import JoinRecursive, Merge @@ 
-858,6 +858,7 @@ def optimize(collection, fuse=True): def from_pandas(data, *args, **kwargs): from dask_expr.io.io import FromPandas + _maybe_import_backend() return new_collection(FromPandas(data.copy(), *args, **kwargs)) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 5012b6a01..cba74e939 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -23,18 +23,6 @@ cudf = None -@pytest.fixture -def pdf(): - pdf = pd.DataFrame({"x": range(100)}) - pdf["y"] = pdf.x * 10.0 - yield pdf - - -@pytest.fixture -def df(pdf): - yield from_pandas(pdf, npartitions=10) - - @pytest.fixture( params=[ "pandas", @@ -48,12 +36,20 @@ def backend(request): @pytest.fixture -def bdf(backend, pdf): +def lib(backend): # Multi-backend DataFrame fixture if backend == "cudf": - yield cudf.from_pandas(pdf) + yield cudf else: - yield pdf + yield pd + + +@pytest.fixture +def bdf(lib): + # Backend DataFrame fixture + df = lib.DataFrame({"x": range(100)}) + df["y"] = df.x * 10.0 + yield df @pytest.fixture @@ -62,8 +58,8 @@ def xdf(bdf): yield from_pandas(bdf, npartitions=10) -def test_del(pdf, xdf): - pdf = pdf.copy() +def test_del(bdf, xdf): + pdf = bdf.copy() # Check __delitem__ del pdf["x"] @@ -71,8 +67,8 @@ def test_del(pdf, xdf): assert_eq(pdf, xdf) -def test_setitem(pdf, xdf): - pdf = pdf.copy() +def test_setitem(bdf, xdf): + pdf = bdf.copy() pdf["z"] = pdf.x + pdf.y xdf["z"] = xdf.x + xdf.y @@ -90,7 +86,10 @@ def test_explode(): assert_eq(pdf.a.explode(), df.a.explode()) -def test_explode_simplify(pdf): +def test_explode_simplify(bdf, backend): + if backend == "cudf": + pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271") + pdf = bdf.copy() pdf["z"] = 1 df = from_pandas(pdf) q = df.explode(column="x")["y"] @@ -99,8 +98,8 @@ def test_explode_simplify(pdf): assert result._name == expected._name -def test_meta_divisions_name(): - a = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) +def 
test_meta_divisions_name(lib): + a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) df = 2 * from_pandas(a, npartitions=2) assert list(df.columns) == list(a.columns) assert df.npartitions == 2 @@ -112,9 +111,9 @@ def test_meta_divisions_name(): assert "sum" in df.sum()._name -def test_meta_blockwise(): - a = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) - b = pd.DataFrame({"z": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) +def test_meta_blockwise(lib): + a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) + b = lib.DataFrame({"z": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) aa = from_pandas(a, npartitions=2) bb = from_pandas(b, npartitions=2) @@ -149,40 +148,45 @@ def test_dask(bdf, xdf): ), ], ) -def test_reductions(func, pdf, df): - result = func(df) +def test_reductions(func, bdf, xdf, backend): + if backend == "cudf" and func in [M.idxmin, M.idxmax]: + pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602") + result = func(xdf) assert result.known_divisions - assert_eq(result, func(pdf)) - result = func(df.x) + assert_eq(result, func(bdf)) + result = func(xdf.x) assert not result.known_divisions - assert_eq(result, func(pdf.x)) + assert_eq(result, func(bdf.x)) # check_dtype False because sub-selection of columns that is pushed through # is not reflected in the meta calculation - assert_eq(func(df)["x"], func(pdf)["x"], check_dtype=False) + assert_eq(func(xdf)["x"], func(bdf)["x"], check_dtype=False) -def test_nbytes(pdf, df): +def test_nbytes(bdf, xdf, backend): + if backend == "cudf": + pytest.xfail(reason="nbytes not supported by cudf") with pytest.raises(NotImplementedError, match="nbytes is not implemented"): - df.nbytes - assert_eq(df.x.nbytes, pdf.x.nbytes) + xdf.nbytes + assert_eq(xdf.x.nbytes, bdf.x.nbytes) -def test_mode(): - pdf = pd.DataFrame({"x": [1, 2, 3, 1, 2]}) +def test_mode(lib): + pdf = lib.DataFrame({"x": [1, 2, 3, 1, 2]}) df = from_pandas(pdf, npartitions=3) assert_eq(df.x.mode(), 
pdf.x.mode(), check_names=False) -def test_value_counts(df, pdf): +def test_value_counts(xdf, bdf): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'value_counts'" ): - df.value_counts() - assert_eq(df.x.value_counts(), pdf.x.value_counts()) + xdf.value_counts() + assert_eq(xdf.x.value_counts(), bdf.x.value_counts().astype("int64")) -def test_dropna(pdf): +def test_dropna(bdf): + pdf = bdf.copy() pdf.loc[0, "y"] = np.nan df = from_pandas(pdf) assert_eq(df.dropna(), pdf.dropna()) @@ -190,9 +194,10 @@ def test_dropna(pdf): assert_eq(df.y.dropna(), pdf.y.dropna()) -def test_memory_usage(pdf): +def test_memory_usage(bdf): # Results are not equal with RangeIndex because pandas has one RangeIndex while # we have one RangeIndex per partition + pdf = bdf.copy() pdf.index = np.arange(len(pdf)) df = from_pandas(pdf) assert_eq(df.memory_usage(), pdf.memory_usage()) @@ -205,11 +210,11 @@ def test_memory_usage(pdf): @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest]) -def test_nlargest_nsmallest(df, pdf, func): - assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x")) - assert_eq(func(df.x, n=5), func(pdf.x, n=5)) +def test_nlargest_nsmallest(xdf, bdf, func): + assert_eq(func(xdf, n=5, columns="x"), func(bdf, n=5, columns="x")) + assert_eq(func(xdf.x, n=5), func(bdf.x, n=5)) with pytest.raises(TypeError, match="got an unexpected keyword argument"): - func(df.x, n=5, columns="foo") + func(xdf.x, n=5, columns="foo") @pytest.mark.parametrize( @@ -226,8 +231,8 @@ def test_nlargest_nsmallest(df, pdf, func): lambda df: df.x != df.y, ], ) -def test_conditionals(func, pdf, df): - assert_eq(func(pdf), func(df), check_names=False) +def test_conditionals(func, bdf, xdf): + assert_eq(func(bdf), func(xdf), check_names=False) @pytest.mark.parametrize( @@ -241,8 +246,8 @@ def test_conditionals(func, pdf, df): lambda df: df.x.__rxor__(df.y), ], ) -def test_boolean_operators(func): - pdf = pd.DataFrame( +def test_boolean_operators(func, lib): + 
pdf = lib.DataFrame( {"x": [True, False, True, False], "y": [True, False, False, False]} ) df = from_pandas(pdf) @@ -260,8 +265,8 @@ def test_boolean_operators(func): lambda df: +df, ], ) -def test_unary_operators(func): - pdf = pd.DataFrame( +def test_unary_operators(func, lib): + pdf = lib.DataFrame( {"x": [True, False, True, False], "y": [True, False, False, False], "z": 1} ) df = from_pandas(pdf) @@ -275,23 +280,25 @@ def test_unary_operators(func): lambda df: df[(df.x > 7) & (df.x < 10)], ], ) -def test_and_or(func, pdf, df): - assert_eq(func(pdf), func(df), check_names=False) +def test_and_or(func, bdf, xdf): + assert_eq(func(bdf), func(xdf), check_names=False) @pytest.mark.parametrize("how", ["start", "end"]) -def test_to_timestamp(pdf, how): - pdf.index = pd.period_range("2019-12-31", freq="D", periods=len(pdf)) - df = from_pandas(pdf) - assert_eq(df.to_timestamp(how=how), pdf.to_timestamp(how=how)) - assert_eq(df.x.to_timestamp(how=how), pdf.x.to_timestamp(how=how)) +def test_to_timestamp(bdf, how, backend): + if backend == "cudf": + pytest.xfail(reason="period_range not supported by cudf") + bdf.index = pd.period_range("2019-12-31", freq="D", periods=len(bdf)) + df = from_pandas(bdf) + assert_eq(df.to_timestamp(how=how), bdf.to_timestamp(how=how)) + assert_eq(df.x.to_timestamp(how=how), bdf.x.to_timestamp(how=how)) @pytest.mark.parametrize( "func", [ lambda df: df.astype(int), - lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2), + # lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2), pytest.param( lambda df: df.map(lambda x: x + 1), marks=pytest.mark.skipif( @@ -302,7 +309,7 @@ def test_to_timestamp(pdf, how): lambda df: df.x.clip(lower=10, upper=50), lambda df: df.x.between(left=10, right=50), lambda df: df.x.map(lambda x: x + 1), - lambda df: df.index.map(lambda x: x + 1), + # lambda df: df.index.map(lambda x: x + 1), lambda df: df[df.x > 5], lambda df: df.assign(a=df.x + df.y, b=df.x - df.y), lambda df: df.replace(to_replace=1, 
value=1000), @@ -314,8 +321,8 @@ def test_to_timestamp(pdf, how): lambda df: df.rename(columns={"x": "xx"}), lambda df: df.rename(columns={"x": "xx"}).xx, lambda df: df.rename(columns={"x": "xx"})[["xx"]], - lambda df: df.combine_first(df), - lambda df: df.x.combine_first(df.y), + # lambda df: df.combine_first(df), + # lambda df: df.x.combine_first(df.y), lambda df: df.x.to_frame(), lambda df: df.drop(columns="x"), lambda df: df.x.index.to_frame(), @@ -323,11 +330,29 @@ def test_to_timestamp(pdf, how): lambda df: df.select_dtypes(include="integer"), ], ) -def test_blockwise(func, pdf, df): - assert_eq(func(pdf), func(df)) +def test_blockwise(func, bdf, xdf): + assert_eq(func(bdf), func(xdf)) + + +@pytest.mark.parametrize( + "func", + [ + lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2), + lambda df: df.index.map(lambda x: x + 1), + lambda df: df.combine_first(df), + lambda df: df.x.combine_first(df.y), + ], +) +def test_blockwise_cudf_fails(func, bdf, xdf, backend): + if backend == "cudf": + pytest.xfail(reason="func not supported by cudf") + assert_eq(func(bdf), func(xdf)) -def test_rename_axis(pdf): +def test_rename_axis(bdf, backend): + if backend == "cudf": + pytest.xfail(reason="rename_axis not supported by cudf") + pdf = bdf.copy() pdf.index.name = "a" pdf.columns.name = "b" df = from_pandas(pdf, npartitions=10) @@ -336,30 +361,34 @@ def test_rename_axis(pdf): assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy")) -def test_isin(df, pdf): +def test_isin(xdf, bdf): values = [1, 2] - assert_eq(pdf.isin(values), df.isin(values)) - assert_eq(pdf.x.isin(values), df.x.isin(values)) + assert_eq(bdf.isin(values), xdf.isin(values)) + assert_eq(bdf.x.isin(values), xdf.x.isin(values)) -def test_round(pdf): +def test_round(bdf): + pdf = bdf.copy() pdf += 0.5555 df = from_pandas(pdf) assert_eq(df.round(decimals=1), pdf.round(decimals=1)) assert_eq(df.x.round(decimals=1), pdf.x.round(decimals=1)) -def test_repr(df): - assert "+ 1" in str(df 
+ 1) - assert "+ 1" in repr(df + 1) +def test_repr(xdf): + assert "+ 1" in str(xdf + 1) + assert "+ 1" in repr(xdf + 1) - s = (df["x"] + 1).sum(skipna=False).expr + s = (xdf["x"] + 1).sum(skipna=False).expr assert '["x"]' in s or "['x']" in s assert "+ 1" in s assert "sum(skipna=False)" in s -def test_combine_first_simplify(pdf): +def test_combine_first_simplify(bdf, backend): + if backend == "cudf": + pytest.xfail(reason="combine_first not supported by cudf") + pdf = bdf.copy() df = from_pandas(pdf) pdf2 = pdf.rename(columns={"y": "z"}) df2 = from_pandas(pdf2) @@ -371,40 +400,41 @@ def test_combine_first_simplify(pdf): assert_eq(result, pdf.combine_first(pdf2)[["z", "y"]]) -def test_rename_traverse_filter(df): - result = df.rename(columns={"x": "xx"})[["xx"]].simplify() - expected = df[["x"]].rename(columns={"x": "xx"}) +def test_rename_traverse_filter(xdf): + result = xdf.rename(columns={"x": "xx"})[["xx"]].simplify() + expected = xdf[["x"]].rename(columns={"x": "xx"}) assert str(result) == str(expected) -def test_columns_traverse_filters(pdf, df): - result = df[df.x > 5].y.simplify() - expected = df.y[df.x > 5] +def test_columns_traverse_filters(xdf): + result = xdf[xdf.x > 5].y.simplify() + expected = xdf.y[xdf.x > 5] assert str(result) == str(expected) -def test_clip_traverse_filters(df): - result = df.clip(lower=10).y.simplify() - expected = df.y.clip(lower=10) +def test_clip_traverse_filters(xdf): + result = xdf.clip(lower=10).y.simplify() + expected = xdf.y.clip(lower=10) assert result._name == expected._name - result = df.clip(lower=10)[["x", "y"]].simplify() - expected = df.clip(lower=10) + result = xdf.clip(lower=10)[["x", "y"]].simplify() + expected = xdf.clip(lower=10) assert result._name == expected._name - arg = df.clip(lower=10)[["x"]] + arg = xdf.clip(lower=10)[["x"]] result = arg.simplify() - expected = df[["x"]].clip(lower=10) + expected = xdf[["x"]].clip(lower=10) assert result._name == expected._name @pytest.mark.parametrize("projection", 
["zz", ["zz"], ["zz", "x"], "zz"]) @pytest.mark.parametrize("subset", ["x", ["x"]]) -def test_drop_duplicates_subset_simplify(pdf, subset, projection): +def test_drop_duplicates_subset_simplify(bdf, subset, projection): + pdf = bdf.copy() pdf["zz"] = 1 df = from_pandas(pdf) result = df.drop_duplicates(subset=subset)[projection].simplify() @@ -413,19 +443,19 @@ def test_drop_duplicates_subset_simplify(pdf, subset, projection): assert str(result) == str(expected) -def test_broadcast(pdf, df): +def test_broadcast(bdf, xdf): assert_eq( - df + df.sum(), - pdf + pdf.sum(), + xdf + xdf.sum(), + bdf + bdf.sum(), ) assert_eq( - df.x + df.x.sum(), - pdf.x + pdf.x.sum(), + xdf.x + xdf.x.sum(), + bdf.x + bdf.x.sum(), ) -def test_persist(pdf, df): - a = df + 2 +def test_persist(bdf, xdf): + a = xdf + 2 b = a.persist() assert_eq(a, b) @@ -433,31 +463,31 @@ def test_persist(pdf, df): assert len(b.__dask_graph__()) == b.npartitions - assert_eq(b.y.sum(), (pdf + 2).y.sum()) + assert_eq(b.y.sum(), (bdf + 2).y.sum()) -def test_index(pdf, df): - assert_eq(df.index, pdf.index) - assert_eq(df.x.index, pdf.x.index) +def test_index(bdf, xdf): + assert_eq(xdf.index, bdf.index) + assert_eq(xdf.x.index, bdf.x.index) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index(pdf, df, drop): - assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False) +def test_reset_index(bdf, xdf, drop): + assert_eq(xdf.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False) assert_eq( - df.x.reset_index(drop=drop), pdf.x.reset_index(drop=drop), check_index=False + xdf.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False ) -def test_head(pdf, df): - assert_eq(df.head(compute=False), pdf.head()) - assert_eq(df.head(compute=False, n=7), pdf.head(n=7)) +def test_head(bdf, xdf): + assert_eq(xdf.head(compute=False), bdf.head()) + assert_eq(xdf.head(compute=False, n=7), bdf.head(n=7)) - assert df.head(compute=False).npartitions == 1 + assert 
xdf.head(compute=False).npartitions == 1 -def test_head_down(df): - result = (df.x + df.y + 1).head(compute=False) +def test_head_down(xdf): + result = (xdf.x + xdf.y + 1).head(compute=False) optimized = result.simplify() assert_eq(result, optimized) @@ -465,22 +495,22 @@ def test_head_down(df): assert not isinstance(optimized.expr, expr.Head) -def test_head_head(df): - a = df.head(compute=False).head(compute=False) - b = df.head(compute=False) +def test_head_head(xdf): + a = xdf.head(compute=False).head(compute=False) + b = xdf.head(compute=False) assert a.optimize()._name == b.optimize()._name -def test_tail(pdf, df): - assert_eq(df.tail(compute=False), pdf.tail()) - assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7)) +def test_tail(bdf, xdf): + assert_eq(xdf.tail(compute=False), bdf.tail()) + assert_eq(xdf.tail(compute=False, n=7), bdf.tail(n=7)) - assert df.tail(compute=False).npartitions == 1 + assert xdf.tail(compute=False).npartitions == 1 -def test_tail_down(df): - result = (df.x + df.y + 1).tail(compute=False) +def test_tail_down(xdf): + result = (xdf.x + xdf.y + 1).tail(compute=False) optimized = optimize(result) assert_eq(result, optimized) @@ -488,49 +518,49 @@ def test_tail_down(df): assert not isinstance(optimized.expr, expr.Tail) -def test_tail_tail(df): - a = df.tail(compute=False).tail(compute=False) - b = df.tail(compute=False) +def test_tail_tail(xdf): + a = xdf.tail(compute=False).tail(compute=False) + b = xdf.tail(compute=False) assert a.optimize()._name == b.optimize()._name -def test_tail_repartition(df): - a = df.repartition(npartitions=10).tail() - b = df.tail() +def test_tail_repartition(xdf): + a = xdf.repartition(npartitions=10).tail() + b = xdf.tail() assert_eq(a, b) -def test_projection_stacking(df): - result = df[["x", "y"]]["x"] +def test_projection_stacking(xdf): + result = xdf[["x", "y"]]["x"] optimized = result.simplify() - expected = df["x"] + expected = xdf["x"] assert optimized._name == expected._name -def 
test_projection_stacking_coercion(pdf): - df = from_pandas(pdf) - assert_eq(df.x[0], pdf.x[0], check_divisions=False) - assert_eq(df.x[[0]], pdf.x[[0]], check_divisions=False) +def test_projection_stacking_coercion(bdf): + df = from_pandas(bdf) + assert_eq(df.x[0], bdf.x[0], check_divisions=False) + assert_eq(df.x[[0]], bdf.x[[0]], check_divisions=False) -def test_remove_unnecessary_projections(df): - result = (df + 1)[df.columns] +def test_remove_unnecessary_projections(xdf): + result = (xdf + 1)[xdf.columns] optimized = result.simplify() - expected = df + 1 + expected = xdf + 1 assert optimized._name == expected._name - result = (df[["x"]] + 1)[["x"]] + result = (xdf[["x"]] + 1)[["x"]] optimized = result.simplify() - expected = df[["x"]] + 1 + expected = xdf[["x"]] + 1 assert optimized._name == expected._name -def test_substitute(df): - pdf = pd.DataFrame( +def test_substitute(lib): + pdf = lib.DataFrame( { "a": range(100), "b": range(100), @@ -561,59 +591,59 @@ def test_substitute(df): assert result._name == expected._name -def test_from_pandas(pdf): - df = from_pandas(pdf, npartitions=3) +def test_from_pandas(bdf): + df = from_pandas(bdf, npartitions=3) assert df.npartitions == 3 assert "pandas" in df._name -def test_copy(pdf, df): - original = df.copy() +def test_copy(xdf): + original = xdf.copy() columns = tuple(original.columns) - df["z"] = df.x + df.y + xdf["z"] = xdf.x + xdf.y assert tuple(original.columns) == columns assert "z" not in original.columns -def test_partitions(pdf, df): - assert_eq(df.partitions[0], pdf.iloc[:10]) - assert_eq(df.partitions[1], pdf.iloc[10:20]) - assert_eq(df.partitions[1:3], pdf.iloc[10:30]) - assert_eq(df.partitions[[3, 4]], pdf.iloc[30:50]) - assert_eq(df.partitions[-1], pdf.iloc[90:]) +def test_partitions(bdf, xdf): + assert_eq(xdf.partitions[0], bdf.iloc[:10]) + assert_eq(xdf.partitions[1], bdf.iloc[10:20]) + assert_eq(xdf.partitions[1:3], bdf.iloc[10:30]) + assert_eq(xdf.partitions[[3, 4]], bdf.iloc[30:50]) + 
assert_eq(xdf.partitions[-1], bdf.iloc[90:]) - out = (df + 1).partitions[0].simplify() + out = (xdf + 1).partitions[0].simplify() assert isinstance(out.expr, expr.Add) assert out.expr.left._partitions == [0] # Check culling - out = optimize(df.partitions[1]) + out = optimize(xdf.partitions[1]) assert len(out.dask) == 1 - assert_eq(out, pdf.iloc[10:20]) + assert_eq(out, bdf.iloc[10:20]) -def test_column_getattr(df): - df = df.expr - assert df.x._name == df["x"]._name +def test_column_getattr(xdf): + xdf = xdf.expr + assert xdf.x._name == xdf["x"]._name with pytest.raises(AttributeError): - df.foo + xdf.foo -def test_serialization(pdf, df): - before = pickle.dumps(df) +def test_serialization(bdf, xdf): + before = pickle.dumps(xdf) - assert len(before) < 200 + len(pickle.dumps(pdf)) + assert len(before) < 200 + len(pickle.dumps(bdf)) - part = df.partitions[0].compute() + part = xdf.partitions[0].compute() assert ( - len(pickle.dumps(df.__dask_graph__())) - < 1000 + len(pickle.dumps(part)) * df.npartitions + len(pickle.dumps(xdf.__dask_graph__())) + < 1000 + len(pickle.dumps(part)) * xdf.npartitions ) - after = pickle.dumps(df) + after = pickle.dumps(xdf) assert before == after # caching doesn't affect serialization @@ -621,21 +651,23 @@ def test_serialization(pdf, df): assert_eq(pickle.loads(before), pickle.loads(after)) -def test_size_optimized(df): - expr = (df.x + 1).apply(lambda x: x).size +def test_size_optimized(xdf, backend): + if backend == "cudf": + pytest.xfail(reason="Cannot apply lambda function in cudf") + expr = (xdf.x + 1).apply(lambda x: x).size out = optimize(expr) - expected = optimize(df.x.size) + expected = optimize(xdf.x.size) assert out._name == expected._name - expr = (df + 1).apply(lambda x: x).size + expr = (xdf + 1).apply(lambda x: x).size out = optimize(expr) - expected = optimize(df.size) + expected = optimize(xdf.size) assert out._name == expected._name @pytest.mark.parametrize("fuse", [True, False]) -def test_tree_repr(df, fuse): - s = 
df.expr.tree_repr() +def test_tree_repr(fuse): + s = from_pandas(pd.Series(range(10))).expr.tree_repr() assert "" in s df = timeseries() @@ -655,38 +687,38 @@ def test_tree_repr(df, fuse): assert s.count("|") == 9 -def test_simple_graphs(df): - expr = (df + 1).expr +def test_simple_graphs(xdf): + expr = (xdf + 1).expr graph = expr.__dask_graph__() - assert graph[(expr._name, 0)] == (operator.add, (df.expr._name, 0), 1) + assert graph[(expr._name, 0)] == (operator.add, (xdf.expr._name, 0), 1) -def test_map_partitions(df): +def test_map_partitions(xdf): def combine_x_y(x, y, foo=None): assert foo == "bar" return x + y - df2 = df.map_partitions(combine_x_y, df + 1, foo="bar") - assert_eq(df2, df + (df + 1)) + df2 = xdf.map_partitions(combine_x_y, xdf + 1, foo="bar") + assert_eq(df2, xdf + (xdf + 1)) -def test_map_partitions_broadcast(df): +def test_map_partitions_broadcast(xdf): def combine_x_y(x, y, val, foo=None): assert foo == "bar" return x + y + val - df2 = df.map_partitions(combine_x_y, df["x"].sum(), 123, foo="bar") - assert_eq(df2, df + df["x"].sum() + 123) - assert_eq(df2.optimize(), df + df["x"].sum() + 123) + df2 = xdf.map_partitions(combine_x_y, xdf["x"].sum(), 123, foo="bar") + assert_eq(df2, xdf + xdf["x"].sum() + 123) + assert_eq(df2.optimize(), xdf + xdf["x"].sum() + 123) @pytest.mark.parametrize("opt", [True, False]) -def test_map_partitions_merge(opt): +def test_map_partitions_merge(opt, lib): # Make simple left & right dfs - pdf1 = pd.DataFrame({"x": range(20), "y": range(20)}) + pdf1 = lib.DataFrame({"x": range(20), "y": range(20)}) df1 = from_pandas(pdf1, 2) - pdf2 = pd.DataFrame({"x": range(0, 20, 2), "z": range(10)}) + pdf2 = lib.DataFrame({"x": range(0, 20, 2), "z": range(10)}) df2 = from_pandas(pdf2, 1) # Partition-wise merge with map_partitions @@ -703,37 +735,37 @@ def test_map_partitions_merge(opt): assert_eq(df3, expect, check_index=False) -def test_depth(df): - assert df._depth() == 1 - assert (df + 1)._depth() == 2 - assert ((df.x + 1) + 
df.y)._depth() == 4 +def test_depth(xdf): + assert xdf._depth() == 1 + assert (xdf + 1)._depth() == 2 + assert ((xdf.x + 1) + xdf.y)._depth() == 4 -def test_partitions_nested(df): - a = expr.Partitions(expr.Partitions(df.expr, [2, 4, 6]), [0, 2]) - b = expr.Partitions(df.expr, [2, 6]) +def test_partitions_nested(xdf): + a = expr.Partitions(expr.Partitions(xdf.expr, [2, 4, 6]), [0, 2]) + b = expr.Partitions(xdf.expr, [2, 6]) assert a.optimize()._name == b.optimize()._name @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("npartitions", [7, 12]) -def test_repartition_npartitions(pdf, npartitions, sort): - df = from_pandas(pdf, sort=sort) + 1 +def test_repartition_npartitions(bdf, npartitions, sort): + df = from_pandas(bdf, sort=sort) + 1 df2 = df.repartition(npartitions=npartitions) assert df2.npartitions == npartitions assert_eq(df, df2) @pytest.mark.parametrize("opt", [True, False]) -def test_repartition_divisions(df, opt): - end = df.divisions[-1] + 100 - stride = end // (df.npartitions + 2) +def test_repartition_divisions(xdf, opt): + end = xdf.divisions[-1] + 100 + stride = end // (xdf.npartitions + 2) divisions = tuple(range(0, end, stride)) - df2 = (df + 1).repartition(divisions=divisions, force=True)["x"] + df2 = (xdf + 1).repartition(divisions=divisions, force=True)["x"] df2 = optimize(df2) if opt else df2 assert df2.divisions == divisions - assert_eq((df + 1)["x"], df2) + assert_eq((xdf + 1)["x"], df2) # Check partitions for p, part in enumerate(dask.compute(list(df2.index.partitions))[0]): @@ -742,16 +774,16 @@ def test_repartition_divisions(df, opt): assert part.max() < df2.divisions[p + 1] -def test_repartition_no_op(df): - result = df.repartition(divisions=df.divisions).optimize() - assert result._name == df._name +def test_repartition_no_op(xdf): + result = xdf.repartition(divisions=xdf.divisions).optimize() + assert result._name == xdf._name -def test_len(df, pdf): - df2 = df[["x"]] + 1 - assert len(df2) == len(pdf) +def 
test_len(xdf, bdf): + df2 = xdf[["x"]] + 1 + assert len(df2) == len(bdf) - assert len(df[df.x > 5]) == len(pdf[pdf.x > 5]) + assert len(xdf[xdf.x > 5]) == len(bdf[bdf.x > 5]) first = df2.partitions[0].compute() assert len(df2.partitions[0]) == len(first) @@ -760,62 +792,63 @@ def test_len(df, pdf): assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal) -def test_astype_simplify(df, pdf): - q = df.astype({"x": "float64", "y": "float64"})["x"] +def test_astype_simplify(xdf, bdf): + q = xdf.astype({"x": "float64", "y": "float64"})["x"] result = q.simplify() - expected = df["x"].astype({"x": "float64"}) + expected = xdf["x"].astype({"x": "float64"}) assert result._name == expected._name - assert_eq(q, pdf.astype({"x": "float64", "y": "float64"})["x"]) + assert_eq(q, bdf.astype({"x": "float64", "y": "float64"})["x"]) - q = df.astype({"y": "float64"})["x"] + q = xdf.astype({"y": "float64"})["x"] result = q.simplify() - expected = df["x"] + expected = xdf["x"] assert result._name == expected._name - q = df.astype("float64")["x"] + q = xdf.astype("float64")["x"] result = q.simplify() - expected = df["x"].astype("float64") + expected = xdf["x"].astype("float64") assert result._name == expected._name -def test_drop_duplicates(df, pdf): - assert_eq(df.drop_duplicates(), pdf.drop_duplicates()) +def test_drop_duplicates(xdf, bdf, backend): + assert_eq(xdf.drop_duplicates(), bdf.drop_duplicates()) assert_eq( - df.drop_duplicates(ignore_index=True), pdf.drop_duplicates(ignore_index=True) + xdf.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True) ) - assert_eq(df.drop_duplicates(subset=["x"]), pdf.drop_duplicates(subset=["x"])) - assert_eq(df.x.drop_duplicates(), pdf.x.drop_duplicates()) + assert_eq(xdf.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"])) + assert_eq(xdf.x.drop_duplicates(), bdf.x.drop_duplicates()) - with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")): - df.drop_duplicates(subset=["a"]) + if 
backend == "pandas": + with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")): + xdf.drop_duplicates(subset=["a"]) with pytest.raises(TypeError, match="got an unexpected keyword argument"): - df.x.drop_duplicates(subset=["a"]) + xdf.x.drop_duplicates(subset=["a"]) -def test_unique(df, pdf): +def test_unique(xdf, bdf, lib): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'unique'" ): - df.unique() + xdf.unique() # pandas returns a numpy array while we return a Series/Index - assert_eq(df.x.unique(), pd.Series(pdf.x.unique(), name="x")) - assert_eq(df.index.unique(), pd.Index(pdf.index.unique())) + assert_eq(xdf.x.unique(), lib.Series(bdf.x.unique(), name="x")) + assert_eq(xdf.index.unique(), lib.Index(bdf.index.unique())) -def test_walk(df): - df2 = df[df["x"] > 1][["y"]] + 1 +def test_walk(xdf): + df2 = xdf[xdf["x"] > 1][["y"]] + 1 assert all(isinstance(ex, expr.Expr) for ex in df2.walk()) exprs = set(df2.walk()) - assert df.expr in exprs - assert df["x"].expr in exprs - assert (df["x"] > 1).expr in exprs + assert xdf.expr in exprs + assert xdf["x"].expr in exprs + assert (xdf["x"] > 1).expr in exprs assert 1 not in exprs -def test_find_operations(df): - df2 = df[df["x"] > 1][["y"]] + 1 +def test_find_operations(xdf): + df2 = xdf[xdf["x"] > 1][["y"]] + 1 filters = list(df2.find_operations(expr.Filter)) assert len(filters) == 1 @@ -832,21 +865,21 @@ def test_find_operations(df): @pytest.mark.parametrize("subset", ["x", ["x"]]) -def test_dropna_simplify(pdf, subset): - pdf["z"] = 1 - df = from_pandas(pdf) +def test_dropna_simplify(bdf, subset): + bdf["z"] = 1 + df = from_pandas(bdf) q = df.dropna(subset=subset)["y"] result = q.simplify() expected = df[["x", "y"]].dropna(subset=subset)["y"] assert result._name == expected._name - assert_eq(q, pdf.dropna(subset=subset)["y"]) + assert_eq(q, bdf.dropna(subset=subset)["y"]) -def test_dir(df): - assert all(c in dir(df) for c in df.columns) - assert "sum" in dir(df) - 
assert "sum" in dir(df.x) - assert "sum" in dir(df.index) +def test_dir(xdf): + assert all(c in dir(xdf) for c in xdf.columns) + assert "sum" in dir(xdf) + assert "sum" in dir(xdf.x) + assert "sum" in dir(xdf.index) @pytest.mark.parametrize( @@ -862,38 +895,40 @@ def test_dir(df): ], ) @pytest.mark.parametrize("indexer", ["x", ["x"]]) -def test_simplify_up_blockwise(df, pdf, func, args, indexer): - q = getattr(df, func)(*args)[indexer] +def test_simplify_up_blockwise(xdf, bdf, func, args, indexer): + q = getattr(xdf, func)(*args)[indexer] result = q.simplify() - expected = getattr(df[indexer], func)(*args) + expected = getattr(xdf[indexer], func)(*args) assert result._name == expected._name - assert_eq(q, getattr(pdf, func)(*args)[indexer]) + assert_eq(q, getattr(bdf, func)(*args)[indexer]) - q = getattr(df, func)(*args)[["x", "y"]] + q = getattr(xdf, func)(*args)[["x", "y"]] result = q.simplify() - expected = getattr(df, func)(*args) + expected = getattr(xdf, func)(*args) assert result._name == expected._name -def test_sample(df): - result = df.sample(frac=0.5) +def test_sample(xdf): + result = xdf.sample(frac=0.5) assert_eq(result, result) - result = df.sample(frac=0.5, random_state=1234) - expected = df.sample(frac=0.5, random_state=1234) + result = xdf.sample(frac=0.5, random_state=1234) + expected = xdf.sample(frac=0.5, random_state=1234) assert_eq(result, expected) -def test_align(df, pdf): - result_1, result_2 = df.align(df) - pdf_result_1, pdf_result_2 = pdf.align(pdf) +def test_align(xdf, bdf, backend): + if backend == "cudf": + pytest.skip(reason="align not supported by cudf") + result_1, result_2 = xdf.align(xdf) + pdf_result_1, pdf_result_2 = bdf.align(bdf) assert_eq(result_1, pdf_result_1) assert_eq(result_2, pdf_result_2) - result_1, result_2 = df.x.align(df.x) - pdf_result_1, pdf_result_2 = pdf.x.align(pdf.x) + result_1, result_2 = xdf.x.align(xdf.x) + pdf_result_1, pdf_result_2 = bdf.x.align(bdf.x) assert_eq(result_1, pdf_result_1) 
assert_eq(result_2, pdf_result_2) @@ -930,75 +965,81 @@ def test_unknown_partitions_different_root(): df.align(df2) -def test_nunique_approx(df): - result = df.nunique_approx().compute() +def test_nunique_approx(xdf, backend): + if backend == "cudf": + pytest.xfail(reason="compute_hll_array doesn't work for cudf") + result = xdf.nunique_approx().compute() assert 99 < result < 101 -def test_assign_simplify(pdf): - df = from_pandas(pdf) - df2 = from_pandas(pdf) +def test_assign_simplify(bdf): + df = from_pandas(bdf) + df2 = from_pandas(bdf) df["new"] = df.x > 1 result = df[["x", "new"]].simplify() expected = df2[["x"]].assign(new=df2.x > 1).simplify() assert result._name == expected._name - pdf["new"] = pdf.x > 1 - assert_eq(pdf[["x", "new"]], result) + bdf["new"] = bdf.x > 1 + assert_eq(bdf[["x", "new"]], result) -def test_assign_simplify_new_column_not_needed(pdf): - df = from_pandas(pdf) - df2 = from_pandas(pdf) +def test_assign_simplify_new_column_not_needed(bdf): + df = from_pandas(bdf) + df2 = from_pandas(bdf) df["new"] = df.x > 1 result = df[["x"]].simplify() expected = df2[["x"]].simplify() assert result._name == expected._name - pdf["new"] = pdf.x > 1 - assert_eq(result, pdf[["x"]]) + bdf["new"] = bdf.x > 1 + assert_eq(result, bdf[["x"]]) -def test_assign_simplify_series(pdf): - df = from_pandas(pdf) - df2 = from_pandas(pdf) +def test_assign_simplify_series(bdf): + df = from_pandas(bdf) + df2 = from_pandas(bdf) df["new"] = df.x > 1 result = df.new.simplify() expected = df2[[]].assign(new=df2.x > 1).new.simplify() assert result._name == expected._name -def test_assign_non_series_inputs(df, pdf): - assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2)) - assert_eq(df.assign(a=2), pdf.assign(a=2)) - assert_eq(df.assign(a=df.x.sum()), pdf.assign(a=pdf.x.sum())) +def test_assign_non_series_inputs(xdf, bdf, backend): + if backend == "cudf": + pytest.xfail(reason="assign function not supported by cudf") + assert_eq(xdf.assign(a=lambda x: x.x * 
2), bdf.assign(a=lambda x: x.x * 2)) + assert_eq(xdf.assign(a=2), bdf.assign(a=2)) + assert_eq(xdf.assign(a=xdf.x.sum()), bdf.assign(a=bdf.x.sum())) - assert_eq(df.assign(a=lambda x: x.x * 2).y, pdf.assign(a=lambda x: x.x * 2).y) - assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a) + assert_eq(xdf.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y) + assert_eq(xdf.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a) -def test_are_co_aligned(pdf, df): - df2 = df.reset_index() - assert are_co_aligned(df.expr, df2.expr) - assert are_co_aligned(df.expr, df2.sum().expr) - assert not are_co_aligned(df.expr, df2.repartition(npartitions=2).expr) +def test_are_co_aligned(bdf, xdf): + df2 = xdf.reset_index() + assert are_co_aligned(xdf.expr, df2.expr) + assert are_co_aligned(xdf.expr, df2.sum().expr) + assert not are_co_aligned(xdf.expr, df2.repartition(npartitions=2).expr) - assert are_co_aligned(df.expr, df.sum().expr) - assert are_co_aligned((df + df.sum()).expr, df.sum().expr) + assert are_co_aligned(xdf.expr, xdf.sum().expr) + assert are_co_aligned((xdf + xdf.sum()).expr, xdf.sum().expr) - pdf = pdf.assign(z=1) - df3 = from_pandas(pdf, npartitions=10) - assert not are_co_aligned(df.expr, df3.expr) - assert are_co_aligned(df.expr, df3.sum().expr) + bdf = bdf.assign(z=1) + df3 = from_pandas(bdf, npartitions=10) + assert not are_co_aligned(xdf.expr, df3.expr) + assert are_co_aligned(xdf.expr, df3.sum().expr) - merged = df.merge(df2) + merged = xdf.merge(df2) merged_first = merged.reset_index() merged_second = merged.rename(columns={"x": "a"}) assert are_co_aligned(merged_first.expr, merged_second.expr) - assert not are_co_aligned(merged_first.expr, df.expr) + assert not are_co_aligned(merged_first.expr, xdf.expr) -def test_astype_categories(df): - result = df.astype("category") +def test_astype_categories(xdf, backend): + if backend == "cudf": + pytest.xfail(reason="TODO") + result = xdf.astype("category") 
assert_eq(result.x._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES])) assert_eq(result.y._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES])) From 3d229a4e7add3f38573878014899777524657c0c Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 6 Jul 2023 14:44:59 -0700 Subject: [PATCH 05/18] revert xdf name back to df --- dask_expr/tests/test_collection.py | 427 ++++++++++++++--------------- 1 file changed, 212 insertions(+), 215 deletions(-) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index cba74e939..7c66f7623 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -32,12 +32,13 @@ ] ) def backend(request): + # Return backend-library label yield request.param @pytest.fixture def lib(backend): - # Multi-backend DataFrame fixture + # Return library associated with `backend` label if backend == "cudf": yield cudf else: @@ -53,28 +54,28 @@ def bdf(lib): @pytest.fixture -def xdf(bdf): +def df(bdf): # Multi-backend Dask-Expression DataFrame fixture yield from_pandas(bdf, npartitions=10) -def test_del(bdf, xdf): +def test_del(bdf, df): pdf = bdf.copy() # Check __delitem__ del pdf["x"] - del xdf["x"] - assert_eq(pdf, xdf) + del df["x"] + assert_eq(pdf, df) -def test_setitem(bdf, xdf): +def test_setitem(bdf, df): pdf = bdf.copy() pdf["z"] = pdf.x + pdf.y - xdf["z"] = xdf.x + xdf.y + df["z"] = df.x + df.y - assert "z" in xdf.columns - assert_eq(xdf, pdf) + assert "z" in df.columns + assert_eq(df, pdf) def test_explode(): @@ -122,9 +123,9 @@ def test_meta_blockwise(lib): assert set(cc.columns) == {"x", "y", "z"} -def test_dask(bdf, xdf): - assert (xdf.x + xdf.y).npartitions == 10 - z = (xdf.x + xdf.y).sum() +def test_dask(bdf, df): + assert (df.x + df.y).npartitions == 10 + z = (df.x + df.y).sum() assert assert_eq(z, (bdf.x + bdf.y).sum()) @@ -148,26 +149,26 @@ def test_dask(bdf, xdf): ), ], ) -def test_reductions(func, bdf, xdf, backend): +def test_reductions(func, bdf, df, backend): if backend == 
"cudf" and func in [M.idxmin, M.idxmax]: pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602") - result = func(xdf) + result = func(df) assert result.known_divisions assert_eq(result, func(bdf)) - result = func(xdf.x) + result = func(df.x) assert not result.known_divisions assert_eq(result, func(bdf.x)) # check_dtype False because sub-selection of columns that is pushed through # is not reflected in the meta calculation - assert_eq(func(xdf)["x"], func(bdf)["x"], check_dtype=False) + assert_eq(func(df)["x"], func(bdf)["x"], check_dtype=False) -def test_nbytes(bdf, xdf, backend): +def test_nbytes(bdf, df, backend): if backend == "cudf": pytest.xfail(reason="nbytes not supported by cudf") with pytest.raises(NotImplementedError, match="nbytes is not implemented"): - xdf.nbytes - assert_eq(xdf.x.nbytes, bdf.x.nbytes) + df.nbytes + assert_eq(df.x.nbytes, bdf.x.nbytes) def test_mode(lib): @@ -177,12 +178,12 @@ def test_mode(lib): assert_eq(df.x.mode(), pdf.x.mode(), check_names=False) -def test_value_counts(xdf, bdf): +def test_value_counts(df, bdf): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'value_counts'" ): - xdf.value_counts() - assert_eq(xdf.x.value_counts(), bdf.x.value_counts().astype("int64")) + df.value_counts() + assert_eq(df.x.value_counts(), bdf.x.value_counts().astype("int64")) def test_dropna(bdf): @@ -210,11 +211,11 @@ def test_memory_usage(bdf): @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest]) -def test_nlargest_nsmallest(xdf, bdf, func): - assert_eq(func(xdf, n=5, columns="x"), func(bdf, n=5, columns="x")) - assert_eq(func(xdf.x, n=5), func(bdf.x, n=5)) +def test_nlargest_nsmallest(df, bdf, func): + assert_eq(func(df, n=5, columns="x"), func(bdf, n=5, columns="x")) + assert_eq(func(df.x, n=5), func(bdf.x, n=5)) with pytest.raises(TypeError, match="got an unexpected keyword argument"): - func(xdf.x, n=5, columns="foo") + func(df.x, n=5, columns="foo") @pytest.mark.parametrize( @@ -231,8 +232,8 
@@ def test_nlargest_nsmallest(xdf, bdf, func): lambda df: df.x != df.y, ], ) -def test_conditionals(func, bdf, xdf): - assert_eq(func(bdf), func(xdf), check_names=False) +def test_conditionals(func, bdf, df): + assert_eq(func(bdf), func(df), check_names=False) @pytest.mark.parametrize( @@ -280,8 +281,8 @@ def test_unary_operators(func, lib): lambda df: df[(df.x > 7) & (df.x < 10)], ], ) -def test_and_or(func, bdf, xdf): - assert_eq(func(bdf), func(xdf), check_names=False) +def test_and_or(func, bdf, df): + assert_eq(func(bdf), func(df), check_names=False) @pytest.mark.parametrize("how", ["start", "end"]) @@ -298,7 +299,6 @@ def test_to_timestamp(bdf, how, backend): "func", [ lambda df: df.astype(int), - # lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2), pytest.param( lambda df: df.map(lambda x: x + 1), marks=pytest.mark.skipif( @@ -309,7 +309,6 @@ def test_to_timestamp(bdf, how, backend): lambda df: df.x.clip(lower=10, upper=50), lambda df: df.x.between(left=10, right=50), lambda df: df.x.map(lambda x: x + 1), - # lambda df: df.index.map(lambda x: x + 1), lambda df: df[df.x > 5], lambda df: df.assign(a=df.x + df.y, b=df.x - df.y), lambda df: df.replace(to_replace=1, value=1000), @@ -321,8 +320,6 @@ def test_to_timestamp(bdf, how, backend): lambda df: df.rename(columns={"x": "xx"}), lambda df: df.rename(columns={"x": "xx"}).xx, lambda df: df.rename(columns={"x": "xx"})[["xx"]], - # lambda df: df.combine_first(df), - # lambda df: df.x.combine_first(df.y), lambda df: df.x.to_frame(), lambda df: df.drop(columns="x"), lambda df: df.x.index.to_frame(), @@ -330,8 +327,8 @@ def test_to_timestamp(bdf, how, backend): lambda df: df.select_dtypes(include="integer"), ], ) -def test_blockwise(func, bdf, xdf): - assert_eq(func(bdf), func(xdf)) +def test_blockwise(func, bdf, df): + assert_eq(func(bdf), func(df)) @pytest.mark.parametrize( @@ -343,10 +340,10 @@ def test_blockwise(func, bdf, xdf): lambda df: df.x.combine_first(df.y), ], ) -def 
test_blockwise_cudf_fails(func, bdf, xdf, backend): +def test_blockwise_cudf_fails(func, bdf, df, backend): if backend == "cudf": pytest.xfail(reason="func not supported by cudf") - assert_eq(func(bdf), func(xdf)) + assert_eq(func(bdf), func(df)) def test_rename_axis(bdf, backend): @@ -361,10 +358,10 @@ def test_rename_axis(bdf, backend): assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy")) -def test_isin(xdf, bdf): +def test_isin(df, bdf): values = [1, 2] - assert_eq(bdf.isin(values), xdf.isin(values)) - assert_eq(bdf.x.isin(values), xdf.x.isin(values)) + assert_eq(bdf.isin(values), df.isin(values)) + assert_eq(bdf.x.isin(values), df.x.isin(values)) def test_round(bdf): @@ -375,11 +372,11 @@ def test_round(bdf): assert_eq(df.x.round(decimals=1), pdf.x.round(decimals=1)) -def test_repr(xdf): - assert "+ 1" in str(xdf + 1) - assert "+ 1" in repr(xdf + 1) +def test_repr(df): + assert "+ 1" in str(df + 1) + assert "+ 1" in repr(df + 1) - s = (xdf["x"] + 1).sum(skipna=False).expr + s = (df["x"] + 1).sum(skipna=False).expr assert '["x"]' in s or "['x']" in s assert "+ 1" in s assert "sum(skipna=False)" in s @@ -400,33 +397,33 @@ def test_combine_first_simplify(bdf, backend): assert_eq(result, pdf.combine_first(pdf2)[["z", "y"]]) -def test_rename_traverse_filter(xdf): - result = xdf.rename(columns={"x": "xx"})[["xx"]].simplify() - expected = xdf[["x"]].rename(columns={"x": "xx"}) +def test_rename_traverse_filter(df): + result = df.rename(columns={"x": "xx"})[["xx"]].simplify() + expected = df[["x"]].rename(columns={"x": "xx"}) assert str(result) == str(expected) -def test_columns_traverse_filters(xdf): - result = xdf[xdf.x > 5].y.simplify() - expected = xdf.y[xdf.x > 5] +def test_columns_traverse_filters(df): + result = df[df.x > 5].y.simplify() + expected = df.y[df.x > 5] assert str(result) == str(expected) -def test_clip_traverse_filters(xdf): - result = xdf.clip(lower=10).y.simplify() - expected = xdf.y.clip(lower=10) +def 
test_clip_traverse_filters(df): + result = df.clip(lower=10).y.simplify() + expected = df.y.clip(lower=10) assert result._name == expected._name - result = xdf.clip(lower=10)[["x", "y"]].simplify() - expected = xdf.clip(lower=10) + result = df.clip(lower=10)[["x", "y"]].simplify() + expected = df.clip(lower=10) assert result._name == expected._name - arg = xdf.clip(lower=10)[["x"]] + arg = df.clip(lower=10)[["x"]] result = arg.simplify() - expected = xdf[["x"]].clip(lower=10) + expected = df[["x"]].clip(lower=10) assert result._name == expected._name @@ -443,19 +440,19 @@ def test_drop_duplicates_subset_simplify(bdf, subset, projection): assert str(result) == str(expected) -def test_broadcast(bdf, xdf): +def test_broadcast(bdf, df): assert_eq( - xdf + xdf.sum(), + df + df.sum(), bdf + bdf.sum(), ) assert_eq( - xdf.x + xdf.x.sum(), + df.x + df.x.sum(), bdf.x + bdf.x.sum(), ) -def test_persist(bdf, xdf): - a = xdf + 2 +def test_persist(bdf, df): + a = df + 2 b = a.persist() assert_eq(a, b) @@ -466,28 +463,28 @@ def test_persist(bdf, xdf): assert_eq(b.y.sum(), (bdf + 2).y.sum()) -def test_index(bdf, xdf): - assert_eq(xdf.index, bdf.index) - assert_eq(xdf.x.index, bdf.x.index) +def test_index(bdf, df): + assert_eq(df.index, bdf.index) + assert_eq(df.x.index, bdf.x.index) @pytest.mark.parametrize("drop", [True, False]) -def test_reset_index(bdf, xdf, drop): - assert_eq(xdf.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False) +def test_reset_index(bdf, df, drop): + assert_eq(df.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False) assert_eq( - xdf.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False + df.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False ) -def test_head(bdf, xdf): - assert_eq(xdf.head(compute=False), bdf.head()) - assert_eq(xdf.head(compute=False, n=7), bdf.head(n=7)) +def test_head(bdf, df): + assert_eq(df.head(compute=False), bdf.head()) + assert_eq(df.head(compute=False, 
n=7), bdf.head(n=7)) - assert xdf.head(compute=False).npartitions == 1 + assert df.head(compute=False).npartitions == 1 -def test_head_down(xdf): - result = (xdf.x + xdf.y + 1).head(compute=False) +def test_head_down(df): + result = (df.x + df.y + 1).head(compute=False) optimized = result.simplify() assert_eq(result, optimized) @@ -495,22 +492,22 @@ def test_head_down(xdf): assert not isinstance(optimized.expr, expr.Head) -def test_head_head(xdf): - a = xdf.head(compute=False).head(compute=False) - b = xdf.head(compute=False) +def test_head_head(df): + a = df.head(compute=False).head(compute=False) + b = df.head(compute=False) assert a.optimize()._name == b.optimize()._name -def test_tail(bdf, xdf): - assert_eq(xdf.tail(compute=False), bdf.tail()) - assert_eq(xdf.tail(compute=False, n=7), bdf.tail(n=7)) +def test_tail(bdf, df): + assert_eq(df.tail(compute=False), bdf.tail()) + assert_eq(df.tail(compute=False, n=7), bdf.tail(n=7)) - assert xdf.tail(compute=False).npartitions == 1 + assert df.tail(compute=False).npartitions == 1 -def test_tail_down(xdf): - result = (xdf.x + xdf.y + 1).tail(compute=False) +def test_tail_down(df): + result = (df.x + df.y + 1).tail(compute=False) optimized = optimize(result) assert_eq(result, optimized) @@ -518,23 +515,23 @@ def test_tail_down(xdf): assert not isinstance(optimized.expr, expr.Tail) -def test_tail_tail(xdf): - a = xdf.tail(compute=False).tail(compute=False) - b = xdf.tail(compute=False) +def test_tail_tail(df): + a = df.tail(compute=False).tail(compute=False) + b = df.tail(compute=False) assert a.optimize()._name == b.optimize()._name -def test_tail_repartition(xdf): - a = xdf.repartition(npartitions=10).tail() - b = xdf.tail() +def test_tail_repartition(df): + a = df.repartition(npartitions=10).tail() + b = df.tail() assert_eq(a, b) -def test_projection_stacking(xdf): - result = xdf[["x", "y"]]["x"] +def test_projection_stacking(df): + result = df[["x", "y"]]["x"] optimized = result.simplify() - expected = xdf["x"] + 
expected = df["x"] assert optimized._name == expected._name @@ -545,16 +542,16 @@ def test_projection_stacking_coercion(bdf): assert_eq(df.x[[0]], bdf.x[[0]], check_divisions=False) -def test_remove_unnecessary_projections(xdf): - result = (xdf + 1)[xdf.columns] +def test_remove_unnecessary_projections(df): + result = (df + 1)[df.columns] optimized = result.simplify() - expected = xdf + 1 + expected = df + 1 assert optimized._name == expected._name - result = (xdf[["x"]] + 1)[["x"]] + result = (df[["x"]] + 1)[["x"]] optimized = result.simplify() - expected = xdf[["x"]] + 1 + expected = df[["x"]] + 1 assert optimized._name == expected._name @@ -597,53 +594,53 @@ def test_from_pandas(bdf): assert "pandas" in df._name -def test_copy(xdf): - original = xdf.copy() +def test_copy(df): + original = df.copy() columns = tuple(original.columns) - xdf["z"] = xdf.x + xdf.y + df["z"] = df.x + df.y assert tuple(original.columns) == columns assert "z" not in original.columns -def test_partitions(bdf, xdf): - assert_eq(xdf.partitions[0], bdf.iloc[:10]) - assert_eq(xdf.partitions[1], bdf.iloc[10:20]) - assert_eq(xdf.partitions[1:3], bdf.iloc[10:30]) - assert_eq(xdf.partitions[[3, 4]], bdf.iloc[30:50]) - assert_eq(xdf.partitions[-1], bdf.iloc[90:]) +def test_partitions(bdf, df): + assert_eq(df.partitions[0], bdf.iloc[:10]) + assert_eq(df.partitions[1], bdf.iloc[10:20]) + assert_eq(df.partitions[1:3], bdf.iloc[10:30]) + assert_eq(df.partitions[[3, 4]], bdf.iloc[30:50]) + assert_eq(df.partitions[-1], bdf.iloc[90:]) - out = (xdf + 1).partitions[0].simplify() + out = (df + 1).partitions[0].simplify() assert isinstance(out.expr, expr.Add) assert out.expr.left._partitions == [0] # Check culling - out = optimize(xdf.partitions[1]) + out = optimize(df.partitions[1]) assert len(out.dask) == 1 assert_eq(out, bdf.iloc[10:20]) -def test_column_getattr(xdf): - xdf = xdf.expr - assert xdf.x._name == xdf["x"]._name +def test_column_getattr(df): + df = df.expr + assert df.x._name == df["x"]._name 
with pytest.raises(AttributeError): - xdf.foo + df.foo -def test_serialization(bdf, xdf): - before = pickle.dumps(xdf) +def test_serialization(bdf, df): + before = pickle.dumps(df) assert len(before) < 200 + len(pickle.dumps(bdf)) - part = xdf.partitions[0].compute() + part = df.partitions[0].compute() assert ( - len(pickle.dumps(xdf.__dask_graph__())) - < 1000 + len(pickle.dumps(part)) * xdf.npartitions + len(pickle.dumps(df.__dask_graph__())) + < 1000 + len(pickle.dumps(part)) * df.npartitions ) - after = pickle.dumps(xdf) + after = pickle.dumps(df) assert before == after # caching doesn't affect serialization @@ -651,17 +648,17 @@ def test_serialization(bdf, xdf): assert_eq(pickle.loads(before), pickle.loads(after)) -def test_size_optimized(xdf, backend): +def test_size_optimized(df, backend): if backend == "cudf": pytest.xfail(reason="Cannot apply lambda function in cudf") - expr = (xdf.x + 1).apply(lambda x: x).size + expr = (df.x + 1).apply(lambda x: x).size out = optimize(expr) - expected = optimize(xdf.x.size) + expected = optimize(df.x.size) assert out._name == expected._name - expr = (xdf + 1).apply(lambda x: x).size + expr = (df + 1).apply(lambda x: x).size out = optimize(expr) - expected = optimize(xdf.size) + expected = optimize(df.size) assert out._name == expected._name @@ -687,30 +684,30 @@ def test_tree_repr(fuse): assert s.count("|") == 9 -def test_simple_graphs(xdf): - expr = (xdf + 1).expr +def test_simple_graphs(df): + expr = (df + 1).expr graph = expr.__dask_graph__() - assert graph[(expr._name, 0)] == (operator.add, (xdf.expr._name, 0), 1) + assert graph[(expr._name, 0)] == (operator.add, (df.expr._name, 0), 1) -def test_map_partitions(xdf): +def test_map_partitions(df): def combine_x_y(x, y, foo=None): assert foo == "bar" return x + y - df2 = xdf.map_partitions(combine_x_y, xdf + 1, foo="bar") - assert_eq(df2, xdf + (xdf + 1)) + df2 = df.map_partitions(combine_x_y, df + 1, foo="bar") + assert_eq(df2, df + (df + 1)) -def 
test_map_partitions_broadcast(xdf): +def test_map_partitions_broadcast(df): def combine_x_y(x, y, val, foo=None): assert foo == "bar" return x + y + val - df2 = xdf.map_partitions(combine_x_y, xdf["x"].sum(), 123, foo="bar") - assert_eq(df2, xdf + xdf["x"].sum() + 123) - assert_eq(df2.optimize(), xdf + xdf["x"].sum() + 123) + df2 = df.map_partitions(combine_x_y, df["x"].sum(), 123, foo="bar") + assert_eq(df2, df + df["x"].sum() + 123) + assert_eq(df2.optimize(), df + df["x"].sum() + 123) @pytest.mark.parametrize("opt", [True, False]) @@ -735,15 +732,15 @@ def test_map_partitions_merge(opt, lib): assert_eq(df3, expect, check_index=False) -def test_depth(xdf): - assert xdf._depth() == 1 - assert (xdf + 1)._depth() == 2 - assert ((xdf.x + 1) + xdf.y)._depth() == 4 +def test_depth(df): + assert df._depth() == 1 + assert (df + 1)._depth() == 2 + assert ((df.x + 1) + df.y)._depth() == 4 -def test_partitions_nested(xdf): - a = expr.Partitions(expr.Partitions(xdf.expr, [2, 4, 6]), [0, 2]) - b = expr.Partitions(xdf.expr, [2, 6]) +def test_partitions_nested(df): + a = expr.Partitions(expr.Partitions(df.expr, [2, 4, 6]), [0, 2]) + b = expr.Partitions(df.expr, [2, 6]) assert a.optimize()._name == b.optimize()._name @@ -758,14 +755,14 @@ def test_repartition_npartitions(bdf, npartitions, sort): @pytest.mark.parametrize("opt", [True, False]) -def test_repartition_divisions(xdf, opt): - end = xdf.divisions[-1] + 100 - stride = end // (xdf.npartitions + 2) +def test_repartition_divisions(df, opt): + end = df.divisions[-1] + 100 + stride = end // (df.npartitions + 2) divisions = tuple(range(0, end, stride)) - df2 = (xdf + 1).repartition(divisions=divisions, force=True)["x"] + df2 = (df + 1).repartition(divisions=divisions, force=True)["x"] df2 = optimize(df2) if opt else df2 assert df2.divisions == divisions - assert_eq((xdf + 1)["x"], df2) + assert_eq((df + 1)["x"], df2) # Check partitions for p, part in enumerate(dask.compute(list(df2.index.partitions))[0]): @@ -774,16 +771,16 @@ 
def test_repartition_divisions(xdf, opt): assert part.max() < df2.divisions[p + 1] -def test_repartition_no_op(xdf): - result = xdf.repartition(divisions=xdf.divisions).optimize() - assert result._name == xdf._name +def test_repartition_no_op(df): + result = df.repartition(divisions=df.divisions).optimize() + assert result._name == df._name -def test_len(xdf, bdf): - df2 = xdf[["x"]] + 1 +def test_len(df, bdf): + df2 = df[["x"]] + 1 assert len(df2) == len(bdf) - assert len(xdf[xdf.x > 5]) == len(bdf[bdf.x > 5]) + assert len(df[df.x > 5]) == len(bdf[bdf.x > 5]) first = df2.partitions[0].compute() assert len(df2.partitions[0]) == len(first) @@ -792,63 +789,63 @@ def test_len(xdf, bdf): assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal) -def test_astype_simplify(xdf, bdf): - q = xdf.astype({"x": "float64", "y": "float64"})["x"] +def test_astype_simplify(df, bdf): + q = df.astype({"x": "float64", "y": "float64"})["x"] result = q.simplify() - expected = xdf["x"].astype({"x": "float64"}) + expected = df["x"].astype({"x": "float64"}) assert result._name == expected._name assert_eq(q, bdf.astype({"x": "float64", "y": "float64"})["x"]) - q = xdf.astype({"y": "float64"})["x"] + q = df.astype({"y": "float64"})["x"] result = q.simplify() - expected = xdf["x"] + expected = df["x"] assert result._name == expected._name - q = xdf.astype("float64")["x"] + q = df.astype("float64")["x"] result = q.simplify() - expected = xdf["x"].astype("float64") + expected = df["x"].astype("float64") assert result._name == expected._name -def test_drop_duplicates(xdf, bdf, backend): - assert_eq(xdf.drop_duplicates(), bdf.drop_duplicates()) +def test_drop_duplicates(df, bdf, backend): + assert_eq(df.drop_duplicates(), bdf.drop_duplicates()) assert_eq( - xdf.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True) + df.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True) ) - assert_eq(xdf.drop_duplicates(subset=["x"]), 
bdf.drop_duplicates(subset=["x"])) - assert_eq(xdf.x.drop_duplicates(), bdf.x.drop_duplicates()) + assert_eq(df.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"])) + assert_eq(df.x.drop_duplicates(), bdf.x.drop_duplicates()) if backend == "pandas": with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")): - xdf.drop_duplicates(subset=["a"]) + df.drop_duplicates(subset=["a"]) with pytest.raises(TypeError, match="got an unexpected keyword argument"): - xdf.x.drop_duplicates(subset=["a"]) + df.x.drop_duplicates(subset=["a"]) -def test_unique(xdf, bdf, lib): +def test_unique(df, bdf, lib): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'unique'" ): - xdf.unique() + df.unique() # pandas returns a numpy array while we return a Series/Index - assert_eq(xdf.x.unique(), lib.Series(bdf.x.unique(), name="x")) - assert_eq(xdf.index.unique(), lib.Index(bdf.index.unique())) + assert_eq(df.x.unique(), lib.Series(bdf.x.unique(), name="x")) + assert_eq(df.index.unique(), lib.Index(bdf.index.unique())) -def test_walk(xdf): - df2 = xdf[xdf["x"] > 1][["y"]] + 1 +def test_walk(df): + df2 = df[df["x"] > 1][["y"]] + 1 assert all(isinstance(ex, expr.Expr) for ex in df2.walk()) exprs = set(df2.walk()) - assert xdf.expr in exprs - assert xdf["x"].expr in exprs - assert (xdf["x"] > 1).expr in exprs + assert df.expr in exprs + assert df["x"].expr in exprs + assert (df["x"] > 1).expr in exprs assert 1 not in exprs -def test_find_operations(xdf): - df2 = xdf[xdf["x"] > 1][["y"]] + 1 +def test_find_operations(df): + df2 = df[df["x"] > 1][["y"]] + 1 filters = list(df2.find_operations(expr.Filter)) assert len(filters) == 1 @@ -875,11 +872,11 @@ def test_dropna_simplify(bdf, subset): assert_eq(q, bdf.dropna(subset=subset)["y"]) -def test_dir(xdf): - assert all(c in dir(xdf) for c in xdf.columns) - assert "sum" in dir(xdf) - assert "sum" in dir(xdf.x) - assert "sum" in dir(xdf.index) +def test_dir(df): + assert all(c in dir(df) for c 
in df.columns) + assert "sum" in dir(df) + assert "sum" in dir(df.x) + assert "sum" in dir(df.index) @pytest.mark.parametrize( @@ -895,39 +892,39 @@ def test_dir(xdf): ], ) @pytest.mark.parametrize("indexer", ["x", ["x"]]) -def test_simplify_up_blockwise(xdf, bdf, func, args, indexer): - q = getattr(xdf, func)(*args)[indexer] +def test_simplify_up_blockwise(df, bdf, func, args, indexer): + q = getattr(df, func)(*args)[indexer] result = q.simplify() - expected = getattr(xdf[indexer], func)(*args) + expected = getattr(df[indexer], func)(*args) assert result._name == expected._name assert_eq(q, getattr(bdf, func)(*args)[indexer]) - q = getattr(xdf, func)(*args)[["x", "y"]] + q = getattr(df, func)(*args)[["x", "y"]] result = q.simplify() - expected = getattr(xdf, func)(*args) + expected = getattr(df, func)(*args) assert result._name == expected._name -def test_sample(xdf): - result = xdf.sample(frac=0.5) +def test_sample(df): + result = df.sample(frac=0.5) assert_eq(result, result) - result = xdf.sample(frac=0.5, random_state=1234) - expected = xdf.sample(frac=0.5, random_state=1234) + result = df.sample(frac=0.5, random_state=1234) + expected = df.sample(frac=0.5, random_state=1234) assert_eq(result, expected) -def test_align(xdf, bdf, backend): +def test_align(df, bdf, backend): if backend == "cudf": pytest.skip(reason="align not supported by cudf") - result_1, result_2 = xdf.align(xdf) + result_1, result_2 = df.align(df) pdf_result_1, pdf_result_2 = bdf.align(bdf) assert_eq(result_1, pdf_result_1) assert_eq(result_2, pdf_result_2) - result_1, result_2 = xdf.x.align(xdf.x) + result_1, result_2 = df.x.align(df.x) pdf_result_1, pdf_result_2 = bdf.x.align(bdf.x) assert_eq(result_1, pdf_result_1) assert_eq(result_2, pdf_result_2) @@ -965,10 +962,10 @@ def test_unknown_partitions_different_root(): df.align(df2) -def test_nunique_approx(xdf, backend): +def test_nunique_approx(df, backend): if backend == "cudf": pytest.xfail(reason="compute_hll_array doesn't work for cudf") 
- result = xdf.nunique_approx().compute() + result = df.nunique_approx().compute() assert 99 < result < 101 @@ -1005,41 +1002,41 @@ def test_assign_simplify_series(bdf): assert result._name == expected._name -def test_assign_non_series_inputs(xdf, bdf, backend): +def test_assign_non_series_inputs(df, bdf, backend): if backend == "cudf": pytest.xfail(reason="assign function not supported by cudf") - assert_eq(xdf.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2)) - assert_eq(xdf.assign(a=2), bdf.assign(a=2)) - assert_eq(xdf.assign(a=xdf.x.sum()), bdf.assign(a=bdf.x.sum())) + assert_eq(df.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2)) + assert_eq(df.assign(a=2), bdf.assign(a=2)) + assert_eq(df.assign(a=df.x.sum()), bdf.assign(a=bdf.x.sum())) - assert_eq(xdf.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y) - assert_eq(xdf.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a) + assert_eq(df.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y) + assert_eq(df.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a) -def test_are_co_aligned(bdf, xdf): - df2 = xdf.reset_index() - assert are_co_aligned(xdf.expr, df2.expr) - assert are_co_aligned(xdf.expr, df2.sum().expr) - assert not are_co_aligned(xdf.expr, df2.repartition(npartitions=2).expr) +def test_are_co_aligned(bdf, df): + df2 = df.reset_index() + assert are_co_aligned(df.expr, df2.expr) + assert are_co_aligned(df.expr, df2.sum().expr) + assert not are_co_aligned(df.expr, df2.repartition(npartitions=2).expr) - assert are_co_aligned(xdf.expr, xdf.sum().expr) - assert are_co_aligned((xdf + xdf.sum()).expr, xdf.sum().expr) + assert are_co_aligned(df.expr, df.sum().expr) + assert are_co_aligned((df + df.sum()).expr, df.sum().expr) bdf = bdf.assign(z=1) df3 = from_pandas(bdf, npartitions=10) - assert not are_co_aligned(xdf.expr, df3.expr) - assert are_co_aligned(xdf.expr, df3.sum().expr) + assert not are_co_aligned(df.expr, df3.expr) + assert 
are_co_aligned(df.expr, df3.sum().expr) - merged = xdf.merge(df2) + merged = df.merge(df2) merged_first = merged.reset_index() merged_second = merged.rename(columns={"x": "a"}) assert are_co_aligned(merged_first.expr, merged_second.expr) - assert not are_co_aligned(merged_first.expr, xdf.expr) + assert not are_co_aligned(merged_first.expr, df.expr) -def test_astype_categories(xdf, backend): +def test_astype_categories(df, backend): if backend == "cudf": pytest.xfail(reason="TODO") - result = xdf.astype("category") + result = df.astype("category") assert_eq(result.x._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES])) assert_eq(result.y._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES])) From 06f3bf818b185a0b21ae40938c69acc061922f0b Mon Sep 17 00:00:00 2001 From: rjzamora Date: Thu, 13 Jul 2023 09:55:57 -0700 Subject: [PATCH 06/18] fix pdf in test --- dask_expr/tests/test_collection.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index ced04d57e..b9463c972 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -1068,7 +1068,8 @@ def test_op_align(): assert_eq(df - df2, pdf - pdf2) -def test_can_co_align(df, pdf): +def test_can_co_align(df, bdf): + pdf = bdf.copy() q = (df.x + df.y).optimize(fuse=False) expected = df.x + df.y assert q._name == expected._name From 7d7a400ea88bec48f804c58b169962b771094dec Mon Sep 17 00:00:00 2001 From: rjzamora Date: Fri, 14 Jul 2023 11:36:22 -0700 Subject: [PATCH 07/18] fix predicate-pushdown test --- dask_expr/_collection.py | 3 +- dask_expr/_util.py | 5 - dask_expr/io/parquet.py | 7 +- dask_expr/io/tests/test_io.py | 48 ++++----- dask_expr/tests/test_collection.py | 155 +++++++++++++---------------- 5 files changed, 96 insertions(+), 122 deletions(-) diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py index 80412d8a1..d0584a2ea 100644 --- a/dask_expr/_collection.py +++ 
b/dask_expr/_collection.py @@ -41,7 +41,7 @@ ) from dask_expr._repartition import Repartition from dask_expr._shuffle import SetIndex, SetIndexBlockwise -from dask_expr._util import _convert_to_list, _maybe_import_backend +from dask_expr._util import _convert_to_list # # Utilities to wrap Expr API @@ -949,7 +949,6 @@ def optimize(collection, fuse=True): def from_pandas(data, npartitions=1, sort=True): from dask_expr.io.io import FromPandas - _maybe_import_backend() return new_collection(FromPandas(data.copy(), npartitions=npartitions, sort=sort)) diff --git a/dask_expr/_util.py b/dask_expr/_util.py index bccd61669..6275e3769 100644 --- a/dask_expr/_util.py +++ b/dask_expr/_util.py @@ -18,11 +18,6 @@ def _convert_to_list(column) -> list | None: return column -def _maybe_import_backend(): - if config.get("dataframe.backend", "pandas") == "cudf": - import dask_cudf # noqa F401 - - @normalize_token.register(LambdaType) def _normalize_lambda(func): return str(func) diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py index 55c7d4986..c3ea94bae 100644 --- a/dask_expr/io/parquet.py +++ b/dask_expr/io/parquet.py @@ -24,7 +24,7 @@ from dask.dataframe.io.parquet.utils import _split_user_options from dask.dataframe.io.utils import _is_local_fs from dask.delayed import delayed -from dask.utils import apply, natural_sort_key +from dask.utils import apply, natural_sort_key, typename from fsspec.utils import stringify_path from dask_expr._expr import ( @@ -177,6 +177,11 @@ def to_parquet( from dask_expr._collection import new_collection from dask_expr.io.parquet import NONE_LABEL, ToParquet + if typename(df._meta).split(".")[0] == "cudf": + from dask_cudf.io.parquet import CudfEngine + + engine = CudfEngine + compute_kwargs = compute_kwargs or {} partition_on = partition_on or [] diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py index 2eae4aa07..8c97bbb92 100644 --- a/dask_expr/io/tests/test_io.py +++ b/dask_expr/io/tests/test_io.py @@ -1,7 +1,7 
@@ +import importlib import os import dask.dataframe as dd -import pandas as pd import pytest from dask import config from dask.dataframe.utils import assert_eq @@ -11,28 +11,15 @@ from dask_expr._reductions import Len from dask_expr.io import ReadParquet -try: - import cudf -except ImportError: - cudf = None - - -@pytest.fixture( - params=[ - "pandas", - pytest.param( - "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.") - ), - ] -) -def backend(request): - yield request.param +# Import backend DataFrame library to test +BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas") +lib = importlib.import_module(BACKEND) def _make_file(dir, format="parquet", df=None): fn = os.path.join(str(dir), f"myfile.{format}") if df is None: - df = pd.DataFrame({c: range(10) for c in "abcde"}) + df = lib.DataFrame({c: range(10) for c in "abcde"}) if format == "csv": df.to_csv(fn) elif format == "parquet": @@ -101,7 +88,7 @@ def test_io_fusion(tmpdir, fmt): def test_predicate_pushdown(tmpdir): - original = pd.DataFrame( + original = lib.DataFrame( { "a": [1, 2, 3, 4, 5] * 10, "b": [0, 1, 2, 3, 4] * 10, @@ -128,7 +115,7 @@ def test_predicate_pushdown(tmpdir): def test_predicate_pushdown_compound(tmpdir): - pdf = pd.DataFrame( + pdf = lib.DataFrame( { "a": [1, 2, 3, 4, 5] * 10, "b": [0, 1, 2, 3, 4] * 10, @@ -152,15 +139,18 @@ def test_predicate_pushdown_compound(tmpdir): ) # Test OR - x = df[(df.a == 5) | (df.c > 20)][df.b != 0]["b"] + x = df[(df.a == 5) | (df.c > 20)] + x = x[x.b != 0]["b"] y = optimize(x, fuse=False) assert isinstance(y.expr, ReadParquet) filters = [set(y.filters[0]), set(y.filters[1])] assert {("c", ">", 20), ("b", "!=", 0)} in filters assert {("a", "==", 5), ("b", "!=", 0)} in filters + expect = pdf[(pdf.a == 5) | (pdf.c > 20)] + expect = expect[expect.b != 0]["b"] assert_eq( y, - pdf[(pdf.a == 5) | (pdf.c > 20)][pdf.b != 0]["b"], + expect, check_index=False, ) @@ -176,7 +166,7 @@ def test_predicate_pushdown_compound(tmpdir): 
@pytest.mark.parametrize("fmt", ["parquet", "csv", "pandas"]) def test_io_culling(tmpdir, fmt): - pdf = pd.DataFrame({c: range(10) for c in "abcde"}) + pdf = lib.DataFrame({c: range(10) for c in "abcde"}) if fmt == "parquet": dd.from_pandas(pdf, 2).to_parquet(tmpdir) df = read_parquet(tmpdir) @@ -209,7 +199,7 @@ def _check_culling(expr, partitions): @pytest.mark.parametrize("sort", [True, False]) def test_from_pandas(sort): - pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) + pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) df = from_pandas(pdf, npartitions=2, sort=sort) assert df.divisions == (0, 3, 5) if sort else (None,) * 3 @@ -217,15 +207,15 @@ def test_from_pandas(sort): def test_from_pandas_immutable(): - pdf = pd.DataFrame({"x": [1, 2, 3, 4]}) + pdf = lib.DataFrame({"x": [1, 2, 3, 4]}) expected = pdf.copy() df = from_pandas(pdf) pdf["z"] = 100 assert_eq(df, expected) -def test_parquet_complex_filters(tmpdir, backend): - with config.set({"dataframe.backend": backend}): +def test_parquet_complex_filters(tmpdir): + with config.set({"dataframe.backend": BACKEND}): df = read_parquet(_make_file(tmpdir)) pdf = df.compute() got = df["a"][df["b"] > df["b"].mean()] @@ -266,7 +256,7 @@ def test_from_dask_dataframe(optimize): @pytest.mark.parametrize("optimize", [True, False]) def test_to_dask_dataframe(optimize): - pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) + pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) df = from_pandas(pdf, npartitions=2) ddf = df.to_dask_dataframe(optimize=optimize) assert isinstance(ddf, dd.DataFrame) @@ -275,7 +265,7 @@ def test_to_dask_dataframe(optimize): @pytest.mark.parametrize("write_metadata_file", [True, False]) def test_to_parquet(tmpdir, write_metadata_file): - pdf = pd.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) + pdf = lib.DataFrame({"x": [1, 4, 3, 2, 0, 5]}) df = from_pandas(pdf, npartitions=2) # Check basic parquet round trip diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index b9463c972..8f0c62a7e 
100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -1,12 +1,13 @@ from __future__ import annotations +import importlib import operator +import os import pickle import re import dask import numpy as np -import pandas as pd import pytest from dask.dataframe._compat import PANDAS_GT_210 from dask.dataframe.utils import UNKNOWN_CATEGORIES, assert_eq @@ -17,45 +18,20 @@ from dask_expr._reductions import Len from dask_expr.datasets import timeseries -try: - import cudf -except ImportError: - cudf = None - - -@pytest.fixture( - params=[ - "pandas", - pytest.param( - "cudf", marks=pytest.mark.skipif(cudf is None, reason="cudf not found.") - ), - ] -) -def backend(request): - # Return backend-library label - yield request.param - - -@pytest.fixture -def lib(backend): - # Return library associated with `backend` label - if backend == "cudf": - yield cudf - else: - yield pd +# Import backend DataFrame library to test +BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas") +lib = importlib.import_module(BACKEND) @pytest.fixture -def bdf(lib): - # Backend DataFrame fixture - df = lib.DataFrame({"x": range(100)}) - df["y"] = df.x * 10.0 - yield df +def bdf(): + bdf = lib.DataFrame({"x": range(100)}) + bdf["y"] = bdf.x * 10.0 + yield bdf @pytest.fixture def df(bdf): - # Multi-backend Dask-Expression DataFrame fixture yield from_pandas(bdf, npartitions=10) @@ -79,16 +55,16 @@ def test_setitem(bdf, df): def test_explode(): - # CuDF backend does not support explode - # (See: https://github.com/rapidsai/cudf/issues/10271) - pdf = pd.DataFrame({"a": [[1, 2], [3, 4]]}) + if BACKEND == "cudf": + pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271") + pdf = lib.DataFrame({"a": [[1, 2], [3, 4]]}) df = from_pandas(pdf) assert_eq(pdf.explode(column="a"), df.explode(column="a")) assert_eq(pdf.a.explode(), df.a.explode()) -def test_explode_simplify(bdf, backend): - if backend == "cudf": +def test_explode_simplify(bdf): + if BACKEND 
== "cudf": pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271") pdf = bdf.copy() pdf["z"] = 1 @@ -99,7 +75,7 @@ def test_explode_simplify(bdf, backend): assert result._name == expected._name -def test_meta_divisions_name(lib): +def test_meta_divisions_name(): a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) df = 2 * from_pandas(a, npartitions=2) assert list(df.columns) == list(a.columns) @@ -112,7 +88,7 @@ def test_meta_divisions_name(lib): assert "sum" in df.sum()._name -def test_meta_blockwise(lib): +def test_meta_blockwise(): a = lib.DataFrame({"x": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) b = lib.DataFrame({"z": [1, 2, 3, 4], "y": [1.0, 2.0, 3.0, 4.0]}) @@ -149,8 +125,8 @@ def test_dask(bdf, df): ), ], ) -def test_reductions(func, bdf, df, backend): - if backend == "cudf" and func in [M.idxmin, M.idxmax]: +def test_reductions(func, bdf, df): + if BACKEND == "cudf" and func in [M.idxmin, M.idxmax]: pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602") result = func(df) assert result.known_divisions @@ -163,15 +139,15 @@ def test_reductions(func, bdf, df, backend): assert_eq(func(df)["x"], func(bdf)["x"], check_dtype=False) -def test_nbytes(bdf, df, backend): - if backend == "cudf": +def test_nbytes(bdf, df): + if BACKEND == "cudf": pytest.xfail(reason="nbytes not supported by cudf") with pytest.raises(NotImplementedError, match="nbytes is not implemented"): df.nbytes assert_eq(df.x.nbytes, bdf.x.nbytes) -def test_mode(lib): +def test_mode(): pdf = lib.DataFrame({"x": [1, 2, 3, 1, 2]}) df = from_pandas(pdf, npartitions=3) @@ -195,7 +171,7 @@ def test_dropna(bdf): assert_eq(df.y.dropna(), pdf.y.dropna()) -def test_fillna(lib): +def test_fillna(): pdf = lib.DataFrame({"x": [1, 2, None, None, 5, 6]}) df = from_pandas(pdf, npartitions=2) actual = df.fillna(value=100) @@ -255,7 +231,7 @@ def test_conditionals(func, bdf, df): lambda df: df.x.__rxor__(df.y), ], ) -def test_boolean_operators(func, lib): +def 
test_boolean_operators(func): pdf = lib.DataFrame( {"x": [True, False, True, False], "y": [True, False, False, False]} ) @@ -274,7 +250,7 @@ def test_boolean_operators(func, lib): lambda df: +df, ], ) -def test_unary_operators(func, lib): +def test_unary_operators(func): pdf = lib.DataFrame( {"x": [True, False, True, False], "y": [True, False, False, False], "z": 1} ) @@ -294,10 +270,10 @@ def test_and_or(func, bdf, df): @pytest.mark.parametrize("how", ["start", "end"]) -def test_to_timestamp(bdf, how, backend): - if backend == "cudf": +def test_to_timestamp(bdf, how): + if BACKEND == "cudf": pytest.xfail(reason="period_range not supported by cudf") - bdf.index = pd.period_range("2019-12-31", freq="D", periods=len(bdf)) + bdf.index = lib.period_range("2019-12-31", freq="D", periods=len(bdf)) df = from_pandas(bdf) assert_eq(df.to_timestamp(how=how), bdf.to_timestamp(how=how)) assert_eq(df.x.to_timestamp(how=how), bdf.x.to_timestamp(how=how)) @@ -348,14 +324,14 @@ def test_blockwise(func, bdf, df): lambda df: df.x.combine_first(df.y), ], ) -def test_blockwise_cudf_fails(func, bdf, df, backend): - if backend == "cudf": +def test_blockwise_cudf_fails(func, bdf, df): + if BACKEND == "cudf": pytest.xfail(reason="func not supported by cudf") assert_eq(func(bdf), func(df)) -def test_rename_axis(bdf, backend): - if backend == "cudf": +def test_rename_axis(bdf): + if BACKEND == "cudf": pytest.xfail(reason="rename_axis not supported by cudf") pdf = bdf.copy() pdf.index.name = "a" @@ -390,8 +366,8 @@ def test_repr(df): assert "sum(skipna=False)" in s -def test_combine_first_simplify(bdf, backend): - if backend == "cudf": +def test_combine_first_simplify(bdf): + if BACKEND == "cudf": pytest.xfail(reason="combine_first not supported by cudf") pdf = bdf.copy() df = from_pandas(pdf) @@ -564,7 +540,7 @@ def test_remove_unnecessary_projections(df): assert optimized._name == expected._name -def test_substitute(lib): +def test_substitute(): pdf = lib.DataFrame( { "a": range(100), @@ 
-656,8 +632,8 @@ def test_serialization(bdf, df): assert_eq(pickle.loads(before), pickle.loads(after)) -def test_size_optimized(df, backend): - if backend == "cudf": +def test_size_optimized(df): + if BACKEND == "cudf": pytest.xfail(reason="Cannot apply lambda function in cudf") expr = (df.x + 1).apply(lambda x: x).size out = optimize(expr) @@ -672,8 +648,11 @@ def test_size_optimized(df, backend): @pytest.mark.parametrize("fuse", [True, False]) def test_tree_repr(fuse): - s = from_pandas(pd.Series(range(10))).expr.tree_repr() - assert "" in s + s = from_pandas(lib.Series(range(10))).expr.tree_repr() + if BACKEND == "pandas": + assert "" in s + else: + assert "" in s df = timeseries() expr = ((df.x + 1).sum(skipna=False) + df.y.mean()).expr @@ -719,7 +698,7 @@ def combine_x_y(x, y, val, foo=None): @pytest.mark.parametrize("opt", [True, False]) -def test_map_partitions_merge(opt, lib): +def test_map_partitions_merge(opt): # Make simple left & right dfs pdf1 = lib.DataFrame({"x": range(20), "y": range(20)}) df1 = from_pandas(pdf1, 2) @@ -815,7 +794,7 @@ def test_astype_simplify(df, bdf): assert result._name == expected._name -def test_drop_duplicates(df, bdf, backend): +def test_drop_duplicates(df, bdf): assert_eq(df.drop_duplicates(), bdf.drop_duplicates()) assert_eq( df.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True) @@ -823,7 +802,7 @@ def test_drop_duplicates(df, bdf, backend): assert_eq(df.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"])) assert_eq(df.x.drop_duplicates(), bdf.x.drop_duplicates()) - if backend == "pandas": + if BACKEND == "pandas": with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")): df.drop_duplicates(subset=["a"]) @@ -831,7 +810,7 @@ def test_drop_duplicates(df, bdf, backend): df.x.drop_duplicates(subset=["a"]) -def test_unique(df, bdf, lib): +def test_unique(df, bdf): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'unique'" ): @@ -925,8 +904,8 
@@ def test_sample(df): assert_eq(result, expected) -def test_align(df, bdf, backend): - if backend == "cudf": +def test_align(df, bdf): + if BACKEND == "cudf": pytest.skip(reason="align not supported by cudf") result_1, result_2 = df.align(df) pdf_result_1, pdf_result_2 = bdf.align(bdf) @@ -940,9 +919,11 @@ def test_align(df, bdf, backend): def test_align_different_partitions(): - pdf = pd.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]}) + if BACKEND == "cudf": + pytest.skip(reason="align not supported by cudf") + pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]}) df = from_pandas(pdf, npartitions=2) - pdf2 = pd.DataFrame( + pdf2 = lib.DataFrame( {"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]}, index=[-2, -1, 0, 1, 2, 3], ) @@ -954,7 +935,9 @@ def test_align_different_partitions(): def test_align_unknown_partitions_same_root(): - pdf = pd.DataFrame({"a": 1}, index=[3, 2, 1]) + if BACKEND == "cudf": + pytest.skip(reason="align not supported by cudf") + pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1]) df = from_pandas(pdf, npartitions=2, sort=False) result_1, result_2 = df.align(df) pdf_result_1, pdf_result_2 = pdf.align(pdf) @@ -963,16 +946,18 @@ def test_align_unknown_partitions_same_root(): def test_unknown_partitions_different_root(): - pdf = pd.DataFrame({"a": 1}, index=[3, 2, 1]) + if BACKEND == "cudf": + pytest.skip(reason="align not supported by cudf") + pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1]) df = from_pandas(pdf, npartitions=2, sort=False) - pdf2 = pd.DataFrame({"a": 1}, index=[4, 3, 2, 1]) + pdf2 = lib.DataFrame({"a": 1}, index=[4, 3, 2, 1]) df2 = from_pandas(pdf2, npartitions=2, sort=False) with pytest.raises(ValueError, match="Not all divisions"): df.align(df2) -def test_nunique_approx(df, backend): - if backend == "cudf": +def test_nunique_approx(df): + if BACKEND == "cudf": pytest.xfail(reason="compute_hll_array doesn't work for cudf") result = df.nunique_approx().compute() assert 99 < result < 101 @@ 
-1011,8 +996,8 @@ def test_assign_simplify_series(bdf): assert result._name == expected._name -def test_assign_non_series_inputs(df, bdf, backend): - if backend == "cudf": +def test_assign_non_series_inputs(df, bdf): + if BACKEND == "cudf": pytest.xfail(reason="assign function not supported by cudf") assert_eq(df.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2)) assert_eq(df.assign(a=2), bdf.assign(a=2)) @@ -1043,12 +1028,12 @@ def test_are_co_aligned(bdf, df): assert not are_co_aligned(merged_first.expr, df.expr) -def test_astype_categories(df, backend): - if backend == "cudf": +def test_astype_categories(df): + if BACKEND == "cudf": pytest.xfail(reason="TODO") result = df.astype("category") - assert_eq(result.x._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES])) - assert_eq(result.y._meta.cat.categories, pd.Index([UNKNOWN_CATEGORIES])) + assert_eq(result.x._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES])) + assert_eq(result.y._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES])) def test_drop_simplify(df): @@ -1059,10 +1044,10 @@ def test_drop_simplify(df): def test_op_align(): - pdf = pd.DataFrame({"x": [1, 2, 3], "y": 1}) + pdf = lib.DataFrame({"x": [1, 2, 3], "y": 1}) df = from_pandas(pdf, npartitions=2) - pdf2 = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": 1}) + pdf2 = lib.DataFrame({"x": [1, 2, 3, 4, 5, 6], "y": 1}) df2 = from_pandas(pdf2, npartitions=2) assert_eq(df - df2, pdf - pdf2) @@ -1083,10 +1068,10 @@ def test_can_co_align(df, bdf): def test_avoid_alignment(): from dask_expr._align import AlignPartitions - a = pd.DataFrame({"x": range(100)}) + a = lib.DataFrame({"x": range(100)}) da = from_pandas(a, npartitions=4) - b = pd.DataFrame({"y": range(100)}) + b = lib.DataFrame({"y": range(100)}) b["z"] = b.y * 2 db = from_pandas(b, npartitions=3) From 934bd0324796395eb77d1efde5a7699fdab4bb52 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Fri, 14 Jul 2023 13:09:47 -0700 Subject: [PATCH 08/18] rely on DASK_DATAFRAME__BACKEND=cudf for now 
--- dask_expr/io/tests/test_io.py | 2 +- dask_expr/tests/test_collection.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py index 8c97bbb92..b2a8cd788 100644 --- a/dask_expr/io/tests/test_io.py +++ b/dask_expr/io/tests/test_io.py @@ -12,7 +12,7 @@ from dask_expr.io import ReadParquet # Import backend DataFrame library to test -BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas") +BACKEND = config.get("dataframe.backend", "pandas") lib = importlib.import_module(BACKEND) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 8f0c62a7e..c0189e66e 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -2,7 +2,6 @@ import importlib import operator -import os import pickle import re @@ -19,7 +18,7 @@ from dask_expr.datasets import timeseries # Import backend DataFrame library to test -BACKEND = os.environ.get("TEST_DASK_EXPR_BACKEND", "pandas") +BACKEND = dask.config.get("dataframe.backend", "pandas") lib = importlib.import_module(BACKEND) From 65252c482880af6cde124544d2e4cb91fad1844e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 17 Jul 2023 13:21:35 -0700 Subject: [PATCH 09/18] add _set_engine utility for parquet --- dask_expr/_collection.py | 13 ++----------- dask_expr/io/parquet.py | 22 ++++++++++++++++------ dask_expr/io/tests/test_io.py | 2 +- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py index d0584a2ea..3cf7a62ca 100644 --- a/dask_expr/_collection.py +++ b/dask_expr/_collection.py @@ -7,7 +7,6 @@ import numpy as np import pandas as pd -from dask import config from dask.base import DaskMethodsMixin, is_dask_collection, named_schedulers from dask.dataframe.core import ( _concat, @@ -999,21 +998,13 @@ def read_parquet( engine=None, **kwargs, ): - from dask_expr.io.parquet import ReadParquet + from dask_expr.io.parquet import 
ReadParquet, _set_engine if not isinstance(path, str): path = stringify_path(path) kwargs["dtype_backend"] = dtype_backend - if engine is None: - if config.get("dataframe.backend", "pandas") == "cudf": - from dask_cudf.io.parquet import CudfEngine - - engine = CudfEngine - else: - engine = "pyarrow" - return new_collection( ReadParquet( path, @@ -1030,7 +1021,7 @@ def read_parquet( aggregate_files=aggregate_files, parquet_file_extension=parquet_file_extension, filesystem=filesystem, - engine=engine, + engine=_set_engine(engine), kwargs=kwargs, ) ) diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py index c3ea94bae..17f59e2f3 100644 --- a/dask_expr/io/parquet.py +++ b/dask_expr/io/parquet.py @@ -157,7 +157,7 @@ def _layer(self): def to_parquet( df, path, - engine="pyarrow", + engine=None, compression="snappy", write_index=True, append=False, @@ -177,11 +177,7 @@ def to_parquet( from dask_expr._collection import new_collection from dask_expr.io.parquet import NONE_LABEL, ToParquet - if typename(df._meta).split(".")[0] == "cudf": - from dask_cudf.io.parquet import CudfEngine - - engine = CudfEngine - + engine = _set_engine(meta=df._meta) compute_kwargs = compute_kwargs or {} partition_on = partition_on or [] @@ -657,6 +653,20 @@ def _update_length_statistics(self): # +def _set_engine(engine=None, meta=None): + # Use `engine` or `meta` input to set the parquet engine + if engine is None: + if ( + meta is not None and typename(meta).split(".")[0] == "cudf" + ) or dask.config.get("dataframe.backend", "pandas") == "cudf": + from dask_cudf.io.parquet import CudfEngine + + engine = CudfEngine + else: + engine = "pyarrow" + return engine + + def _align_statistics(parts, statistics): # Make sure parts and statistics are aligned # (if statistics is not empty) diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py index b2a8cd788..bb3c97998 100644 --- a/dask_expr/io/tests/test_io.py +++ b/dask_expr/io/tests/test_io.py @@ -111,7 +111,7 @@ def 
test_predicate_pushdown(tmpdir): y_result = y.compute() assert y_result.name == "b" assert len(y_result) == 6 - assert all(y_result == 4) + assert (y_result == 4).all() def test_predicate_pushdown_compound(tmpdir): From b2ebdf96e15ae356e172f6ab7e3624e31dfc4c9f Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 17 Jul 2023 13:28:17 -0700 Subject: [PATCH 10/18] remove unnecesary engine arg --- dask_expr/io/parquet.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py index 17f59e2f3..4d4ca5fde 100644 --- a/dask_expr/io/parquet.py +++ b/dask_expr/io/parquet.py @@ -157,7 +157,6 @@ def _layer(self): def to_parquet( df, path, - engine=None, compression="snappy", write_index=True, append=False, From 8573040dfe2278b673300ebd9027c8b8aeb14a77 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 17 Jul 2023 13:31:00 -0700 Subject: [PATCH 11/18] fix test --- dask_expr/io/tests/test_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_expr/io/tests/test_io.py b/dask_expr/io/tests/test_io.py index e622c26d4..deeeac6c9 100644 --- a/dask_expr/io/tests/test_io.py +++ b/dask_expr/io/tests/test_io.py @@ -286,7 +286,7 @@ def test_to_parquet(tmpdir, write_metadata_file): def test_combine_similar(tmpdir): - pdf = pd.DataFrame( + pdf = lib.DataFrame( {"x": [0, 1, 2, 3] * 4, "y": range(16), "z": [None, 1, 2, 3] * 4} ) fn = _make_file(tmpdir, format="parquet", df=pdf) From 5c4c019f9b356d5ed8be01601c3bb8ce0ea62b04 Mon Sep 17 00:00:00 2001 From: Rick Zamora Date: Tue, 18 Jul 2023 09:26:30 -0500 Subject: [PATCH 12/18] revert pdf renaming --- dask_expr/tests/test_collection.py | 258 ++++++++++++++--------------- 1 file changed, 125 insertions(+), 133 deletions(-) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index c0189e66e..42cdd78ce 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -23,19 +23,19 @@ @pytest.fixture -def bdf(): - bdf = 
lib.DataFrame({"x": range(100)}) - bdf["y"] = bdf.x * 10.0 - yield bdf +def pdf(): + pdf = lib.DataFrame({"x": range(100)}) + pdf["y"] = pdf.x * 10.0 + yield pdf @pytest.fixture -def df(bdf): - yield from_pandas(bdf, npartitions=10) +def df(pdf): + yield from_pandas(pdf, npartitions=10) -def test_del(bdf, df): - pdf = bdf.copy() +def test_del(pdf, df): + pdf = pdf.copy() # Check __delitem__ del pdf["x"] @@ -43,8 +43,8 @@ def test_del(bdf, df): assert_eq(pdf, df) -def test_setitem(bdf, df): - pdf = bdf.copy() +def test_setitem(pdf, df): + pdf = pdf.copy() pdf["z"] = pdf.x + pdf.y df["z"] = df.x + df.y @@ -62,10 +62,9 @@ def test_explode(): assert_eq(pdf.a.explode(), df.a.explode()) -def test_explode_simplify(bdf): +def test_explode_simplify(pdf): if BACKEND == "cudf": pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271") - pdf = bdf.copy() pdf["z"] = 1 df = from_pandas(pdf) q = df.explode(column="x")["y"] @@ -98,11 +97,11 @@ def test_meta_blockwise(): assert set(cc.columns) == {"x", "y", "z"} -def test_dask(bdf, df): +def test_dask(pdf, df): assert (df.x + df.y).npartitions == 10 z = (df.x + df.y).sum() - assert assert_eq(z, (bdf.x + bdf.y).sum()) + assert assert_eq(z, (pdf.x + pdf.y).sum()) @pytest.mark.parametrize( @@ -124,26 +123,26 @@ def test_dask(bdf, df): ), ], ) -def test_reductions(func, bdf, df): +def test_reductions(func, pdf, df): if BACKEND == "cudf" and func in [M.idxmin, M.idxmax]: pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602") result = func(df) assert result.known_divisions - assert_eq(result, func(bdf)) + assert_eq(result, func(pdf)) result = func(df.x) assert not result.known_divisions - assert_eq(result, func(bdf.x)) + assert_eq(result, func(pdf.x)) # check_dtype False because sub-selection of columns that is pushed through # is not reflected in the meta calculation - assert_eq(func(df)["x"], func(bdf)["x"], check_dtype=False) + assert_eq(func(df)["x"], func(pdf)["x"], check_dtype=False) -def test_nbytes(bdf, 
df): +def test_nbytes(pdf, df): if BACKEND == "cudf": pytest.xfail(reason="nbytes not supported by cudf") with pytest.raises(NotImplementedError, match="nbytes is not implemented"): df.nbytes - assert_eq(df.x.nbytes, bdf.x.nbytes) + assert_eq(df.x.nbytes, pdf.x.nbytes) def test_mode(): @@ -153,16 +152,15 @@ def test_mode(): assert_eq(df.x.mode(), pdf.x.mode(), check_names=False) -def test_value_counts(df, bdf): +def test_value_counts(df, pdf): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'value_counts'" ): df.value_counts() - assert_eq(df.x.value_counts(), bdf.x.value_counts().astype("int64")) + assert_eq(df.x.value_counts(), pdf.x.value_counts().astype("int64")) -def test_dropna(bdf): - pdf = bdf.copy() +def test_dropna(pdf): pdf.loc[0, "y"] = np.nan df = from_pandas(pdf) assert_eq(df.dropna(), pdf.dropna()) @@ -178,10 +176,9 @@ def test_fillna(): assert_eq(actual, expected) -def test_memory_usage(bdf): +def test_memory_usage(pdf): # Results are not equal with RangeIndex because pandas has one RangeIndex while # we have one RangeIndex per partition - pdf = bdf.copy() pdf.index = np.arange(len(pdf)) df = from_pandas(pdf) assert_eq(df.memory_usage(), pdf.memory_usage()) @@ -194,9 +191,9 @@ def test_memory_usage(bdf): @pytest.mark.parametrize("func", [M.nlargest, M.nsmallest]) -def test_nlargest_nsmallest(df, bdf, func): - assert_eq(func(df, n=5, columns="x"), func(bdf, n=5, columns="x")) - assert_eq(func(df.x, n=5), func(bdf.x, n=5)) +def test_nlargest_nsmallest(df, pdf, func): + assert_eq(func(df, n=5, columns="x"), func(pdf, n=5, columns="x")) + assert_eq(func(df.x, n=5), func(pdf.x, n=5)) with pytest.raises(TypeError, match="got an unexpected keyword argument"): func(df.x, n=5, columns="foo") @@ -215,8 +212,8 @@ def test_nlargest_nsmallest(df, bdf, func): lambda df: df.x != df.y, ], ) -def test_conditionals(func, bdf, df): - assert_eq(func(bdf), func(df), check_names=False) +def test_conditionals(func, pdf, df): + 
assert_eq(func(pdf), func(df), check_names=False) @pytest.mark.parametrize( @@ -264,18 +261,18 @@ def test_unary_operators(func): lambda df: df[(df.x > 7) & (df.x < 10)], ], ) -def test_and_or(func, bdf, df): - assert_eq(func(bdf), func(df), check_names=False) +def test_and_or(func, pdf, df): + assert_eq(func(pdf), func(df), check_names=False) @pytest.mark.parametrize("how", ["start", "end"]) -def test_to_timestamp(bdf, how): +def test_to_timestamp(pdf, how): if BACKEND == "cudf": pytest.xfail(reason="period_range not supported by cudf") - bdf.index = lib.period_range("2019-12-31", freq="D", periods=len(bdf)) - df = from_pandas(bdf) - assert_eq(df.to_timestamp(how=how), bdf.to_timestamp(how=how)) - assert_eq(df.x.to_timestamp(how=how), bdf.x.to_timestamp(how=how)) + pdf.index = lib.period_range("2019-12-31", freq="D", periods=len(pdf)) + df = from_pandas(pdf) + assert_eq(df.to_timestamp(how=how), pdf.to_timestamp(how=how)) + assert_eq(df.x.to_timestamp(how=how), pdf.x.to_timestamp(how=how)) @pytest.mark.parametrize( @@ -310,8 +307,8 @@ def test_to_timestamp(bdf, how): lambda df: df.select_dtypes(include="integer"), ], ) -def test_blockwise(func, bdf, df): - assert_eq(func(bdf), func(df)) +def test_blockwise(func, pdf, df): + assert_eq(func(pdf), func(df)) @pytest.mark.parametrize( @@ -323,16 +320,15 @@ def test_blockwise(func, bdf, df): lambda df: df.x.combine_first(df.y), ], ) -def test_blockwise_cudf_fails(func, bdf, df): +def test_blockwise_cudf_fails(func, pdf, df): if BACKEND == "cudf": pytest.xfail(reason="func not supported by cudf") - assert_eq(func(bdf), func(df)) + assert_eq(func(pdf), func(df)) -def test_rename_axis(bdf): +def test_rename_axis(pdf): if BACKEND == "cudf": pytest.xfail(reason="rename_axis not supported by cudf") - pdf = bdf.copy() pdf.index.name = "a" pdf.columns.name = "b" df = from_pandas(pdf, npartitions=10) @@ -341,14 +337,13 @@ def test_rename_axis(bdf): assert_eq(df.x.rename_axis(index="dummy"), pdf.x.rename_axis(index="dummy")) -def 
test_isin(df, bdf): +def test_isin(df, pdf): values = [1, 2] - assert_eq(bdf.isin(values), df.isin(values)) - assert_eq(bdf.x.isin(values), df.x.isin(values)) + assert_eq(pdf.isin(values), df.isin(values)) + assert_eq(pdf.x.isin(values), df.x.isin(values)) -def test_round(bdf): - pdf = bdf.copy() +def test_round(pdf): pdf += 0.5555 df = from_pandas(pdf) assert_eq(df.round(decimals=1), pdf.round(decimals=1)) @@ -365,10 +360,9 @@ def test_repr(df): assert "sum(skipna=False)" in s -def test_combine_first_simplify(bdf): +def test_combine_first_simplify(pdf): if BACKEND == "cudf": pytest.xfail(reason="combine_first not supported by cudf") - pdf = bdf.copy() df = from_pandas(pdf) pdf2 = pdf.rename(columns={"y": "z"}) df2 = from_pandas(pdf2) @@ -413,8 +407,7 @@ def test_clip_traverse_filters(df): @pytest.mark.parametrize("projection", ["zz", ["zz"], ["zz", "x"], "zz"]) @pytest.mark.parametrize("subset", ["x", ["x"]]) -def test_drop_duplicates_subset_simplify(bdf, subset, projection): - pdf = bdf.copy() +def test_drop_duplicates_subset_simplify(pdf, subset, projection): pdf["zz"] = 1 df = from_pandas(pdf) result = df.drop_duplicates(subset=subset)[projection].simplify() @@ -423,18 +416,18 @@ def test_drop_duplicates_subset_simplify(bdf, subset, projection): assert str(result) == str(expected) -def test_broadcast(bdf, df): +def test_broadcast(pdf, df): assert_eq( df + df.sum(), - bdf + bdf.sum(), + pdf + pdf.sum(), ) assert_eq( df.x + df.x.sum(), - bdf.x + bdf.x.sum(), + pdf.x + pdf.x.sum(), ) -def test_persist(bdf, df): +def test_persist(pdf, df): a = df + 2 b = a.persist() @@ -443,25 +436,25 @@ def test_persist(bdf, df): assert len(b.__dask_graph__()) == b.npartitions - assert_eq(b.y.sum(), (bdf + 2).y.sum()) + assert_eq(b.y.sum(), (pdf + 2).y.sum()) -def test_index(bdf, df): - assert_eq(df.index, bdf.index) - assert_eq(df.x.index, bdf.x.index) +def test_index(pdf, df): + assert_eq(df.index, pdf.index) + assert_eq(df.x.index, pdf.x.index) @pytest.mark.parametrize("drop", 
[True, False]) -def test_reset_index(bdf, df, drop): - assert_eq(df.reset_index(drop=drop), bdf.reset_index(drop=drop), check_index=False) +def test_reset_index(pdf, df, drop): + assert_eq(df.reset_index(drop=drop), pdf.reset_index(drop=drop), check_index=False) assert_eq( - df.x.reset_index(drop=drop), bdf.x.reset_index(drop=drop), check_index=False + df.x.reset_index(drop=drop), pdf.x.reset_index(drop=drop), check_index=False ) -def test_head(bdf, df): - assert_eq(df.head(compute=False), bdf.head()) - assert_eq(df.head(compute=False, n=7), bdf.head(n=7)) +def test_head(pdf, df): + assert_eq(df.head(compute=False), pdf.head()) + assert_eq(df.head(compute=False, n=7), pdf.head(n=7)) assert df.head(compute=False).npartitions == 1 @@ -482,9 +475,9 @@ def test_head_head(df): assert a.optimize()._name == b.optimize()._name -def test_tail(bdf, df): - assert_eq(df.tail(compute=False), bdf.tail()) - assert_eq(df.tail(compute=False, n=7), bdf.tail(n=7)) +def test_tail(pdf, df): + assert_eq(df.tail(compute=False), pdf.tail()) + assert_eq(df.tail(compute=False, n=7), pdf.tail(n=7)) assert df.tail(compute=False).npartitions == 1 @@ -519,10 +512,10 @@ def test_projection_stacking(df): assert optimized._name == expected._name -def test_projection_stacking_coercion(bdf): - df = from_pandas(bdf) - assert_eq(df.x[0], bdf.x[0], check_divisions=False) - assert_eq(df.x[[0]], bdf.x[[0]], check_divisions=False) +def test_projection_stacking_coercion(pdf): + df = from_pandas(pdf) + assert_eq(df.x[0], pdf.x[0], check_divisions=False) + assert_eq(df.x[[0]], pdf.x[[0]], check_divisions=False) def test_remove_unnecessary_projections(df): @@ -571,8 +564,8 @@ def test_substitute(): assert result._name == expected._name -def test_from_pandas(bdf): - df = from_pandas(bdf, npartitions=3) +def test_from_pandas(pdf): + df = from_pandas(pdf, npartitions=3) assert df.npartitions == 3 assert "pandas" in df._name @@ -587,12 +580,12 @@ def test_copy(df): assert "z" not in original.columns -def 
test_partitions(bdf, df): - assert_eq(df.partitions[0], bdf.iloc[:10]) - assert_eq(df.partitions[1], bdf.iloc[10:20]) - assert_eq(df.partitions[1:3], bdf.iloc[10:30]) - assert_eq(df.partitions[[3, 4]], bdf.iloc[30:50]) - assert_eq(df.partitions[-1], bdf.iloc[90:]) +def test_partitions(pdf, df): + assert_eq(df.partitions[0], pdf.iloc[:10]) + assert_eq(df.partitions[1], pdf.iloc[10:20]) + assert_eq(df.partitions[1:3], pdf.iloc[10:30]) + assert_eq(df.partitions[[3, 4]], pdf.iloc[30:50]) + assert_eq(df.partitions[-1], pdf.iloc[90:]) out = (df + 1).partitions[0].simplify() assert isinstance(out.expr, expr.Add) @@ -601,7 +594,7 @@ def test_partitions(bdf, df): # Check culling out = optimize(df.partitions[1]) assert len(out.dask) == 1 - assert_eq(out, bdf.iloc[10:20]) + assert_eq(out, pdf.iloc[10:20]) def test_column_getattr(df): @@ -612,10 +605,10 @@ def test_column_getattr(df): df.foo -def test_serialization(bdf, df): +def test_serialization(pdf, df): before = pickle.dumps(df) - assert len(before) < 200 + len(pickle.dumps(bdf)) + assert len(before) < 200 + len(pickle.dumps(pdf)) part = df.partitions[0].compute() assert ( @@ -733,8 +726,8 @@ def test_partitions_nested(df): @pytest.mark.parametrize("sort", [True, False]) @pytest.mark.parametrize("npartitions", [7, 12]) -def test_repartition_npartitions(bdf, npartitions, sort): - df = from_pandas(bdf, sort=sort) + 1 +def test_repartition_npartitions(pdf, npartitions, sort): + df = from_pandas(pdf, sort=sort) + 1 df2 = df.repartition(npartitions=npartitions) assert df2.npartitions == npartitions assert_eq(df, df2) @@ -762,11 +755,11 @@ def test_repartition_no_op(df): assert result._name == df._name -def test_len(df, bdf): +def test_len(df, pdf): df2 = df[["x"]] + 1 - assert len(df2) == len(bdf) + assert len(df2) == len(pdf) - assert len(df[df.x > 5]) == len(bdf[bdf.x > 5]) + assert len(df[df.x > 5]) == len(pdf[pdf.x > 5]) first = df2.partitions[0].compute() assert len(df2.partitions[0]) == len(first) @@ -775,12 +768,12 @@ 
def test_len(df, bdf): assert isinstance(expr.Lengths(df2.expr).optimize(), expr.Literal) -def test_astype_simplify(df, bdf): +def test_astype_simplify(df, pdf): q = df.astype({"x": "float64", "y": "float64"})["x"] result = q.simplify() expected = df["x"].astype({"x": "float64"}) assert result._name == expected._name - assert_eq(q, bdf.astype({"x": "float64", "y": "float64"})["x"]) + assert_eq(q, pdf.astype({"x": "float64", "y": "float64"})["x"]) q = df.astype({"y": "float64"})["x"] result = q.simplify() @@ -793,13 +786,13 @@ def test_astype_simplify(df, bdf): assert result._name == expected._name -def test_drop_duplicates(df, bdf): - assert_eq(df.drop_duplicates(), bdf.drop_duplicates()) +def test_drop_duplicates(df, pdf): + assert_eq(df.drop_duplicates(), pdf.drop_duplicates()) assert_eq( - df.drop_duplicates(ignore_index=True), bdf.drop_duplicates(ignore_index=True) + df.drop_duplicates(ignore_index=True), pdf.drop_duplicates(ignore_index=True) ) - assert_eq(df.drop_duplicates(subset=["x"]), bdf.drop_duplicates(subset=["x"])) - assert_eq(df.x.drop_duplicates(), bdf.x.drop_duplicates()) + assert_eq(df.drop_duplicates(subset=["x"]), pdf.drop_duplicates(subset=["x"])) + assert_eq(df.x.drop_duplicates(), pdf.x.drop_duplicates()) if BACKEND == "pandas": with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")): @@ -809,15 +802,15 @@ def test_drop_duplicates(df, bdf): df.x.drop_duplicates(subset=["a"]) -def test_unique(df, bdf): +def test_unique(df, pdf): with pytest.raises( AttributeError, match="'DataFrame' object has no attribute 'unique'" ): df.unique() # pandas returns a numpy array while we return a Series/Index - assert_eq(df.x.unique(), lib.Series(bdf.x.unique(), name="x")) - assert_eq(df.index.unique(), lib.Index(bdf.index.unique())) + assert_eq(df.x.unique(), lib.Series(pdf.x.unique(), name="x")) + assert_eq(df.index.unique(), lib.Index(pdf.index.unique())) def test_walk(df): @@ -848,14 +841,14 @@ def test_find_operations(df): 
@pytest.mark.parametrize("subset", ["x", ["x"]]) -def test_dropna_simplify(bdf, subset): - bdf["z"] = 1 - df = from_pandas(bdf) +def test_dropna_simplify(pdf, subset): + pdf["z"] = 1 + df = from_pandas(pdf) q = df.dropna(subset=subset)["y"] result = q.simplify() expected = df[["x", "y"]].dropna(subset=subset)["y"] assert result._name == expected._name - assert_eq(q, bdf.dropna(subset=subset)["y"]) + assert_eq(q, pdf.dropna(subset=subset)["y"]) def test_dir(df): @@ -879,13 +872,13 @@ def test_dir(df): ], ) @pytest.mark.parametrize("indexer", ["x", ["x"]]) -def test_simplify_up_blockwise(df, bdf, func, args, indexer): +def test_simplify_up_blockwise(df, pdf, func, args, indexer): q = getattr(df, func)(*args)[indexer] result = q.simplify() expected = getattr(df[indexer], func)(*args) assert result._name == expected._name - assert_eq(q, getattr(bdf, func)(*args)[indexer]) + assert_eq(q, getattr(pdf, func)(*args)[indexer]) q = getattr(df, func)(*args)[["x", "y"]] result = q.simplify() @@ -903,16 +896,16 @@ def test_sample(df): assert_eq(result, expected) -def test_align(df, bdf): +def test_align(df, pdf): if BACKEND == "cudf": pytest.skip(reason="align not supported by cudf") result_1, result_2 = df.align(df) - pdf_result_1, pdf_result_2 = bdf.align(bdf) + pdf_result_1, pdf_result_2 = pdf.align(pdf) assert_eq(result_1, pdf_result_1) assert_eq(result_2, pdf_result_2) result_1, result_2 = df.x.align(df.x) - pdf_result_1, pdf_result_2 = bdf.x.align(bdf.x) + pdf_result_1, pdf_result_2 = pdf.x.align(pdf.x) assert_eq(result_1, pdf_result_1) assert_eq(result_2, pdf_result_2) @@ -962,51 +955,51 @@ def test_nunique_approx(df): assert 99 < result < 101 -def test_assign_simplify(bdf): - df = from_pandas(bdf) - df2 = from_pandas(bdf) +def test_assign_simplify(pdf): + df = from_pandas(pdf) + df2 = from_pandas(pdf) df["new"] = df.x > 1 result = df[["x", "new"]].simplify() expected = df2[["x"]].assign(new=df2.x > 1).simplify() assert result._name == expected._name - bdf["new"] = bdf.x 
> 1 - assert_eq(bdf[["x", "new"]], result) + pdf["new"] = pdf.x > 1 + assert_eq(pdf[["x", "new"]], result) -def test_assign_simplify_new_column_not_needed(bdf): - df = from_pandas(bdf) - df2 = from_pandas(bdf) +def test_assign_simplify_new_column_not_needed(pdf): + df = from_pandas(pdf) + df2 = from_pandas(pdf) df["new"] = df.x > 1 result = df[["x"]].simplify() expected = df2[["x"]].simplify() assert result._name == expected._name - bdf["new"] = bdf.x > 1 - assert_eq(result, bdf[["x"]]) + pdf["new"] = pdf.x > 1 + assert_eq(result, pdf[["x"]]) -def test_assign_simplify_series(bdf): - df = from_pandas(bdf) - df2 = from_pandas(bdf) +def test_assign_simplify_series(pdf): + df = from_pandas(pdf) + df2 = from_pandas(pdf) df["new"] = df.x > 1 result = df.new.simplify() expected = df2[[]].assign(new=df2.x > 1).new.simplify() assert result._name == expected._name -def test_assign_non_series_inputs(df, bdf): +def test_assign_non_series_inputs(df, pdf): if BACKEND == "cudf": pytest.xfail(reason="assign function not supported by cudf") - assert_eq(df.assign(a=lambda x: x.x * 2), bdf.assign(a=lambda x: x.x * 2)) - assert_eq(df.assign(a=2), bdf.assign(a=2)) - assert_eq(df.assign(a=df.x.sum()), bdf.assign(a=bdf.x.sum())) + assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2)) + assert_eq(df.assign(a=2), pdf.assign(a=2)) + assert_eq(df.assign(a=df.x.sum()), pdf.assign(a=pdf.x.sum())) - assert_eq(df.assign(a=lambda x: x.x * 2).y, bdf.assign(a=lambda x: x.x * 2).y) - assert_eq(df.assign(a=lambda x: x.x * 2).a, bdf.assign(a=lambda x: x.x * 2).a) + assert_eq(df.assign(a=lambda x: x.x * 2).y, pdf.assign(a=lambda x: x.x * 2).y) + assert_eq(df.assign(a=lambda x: x.x * 2).a, pdf.assign(a=lambda x: x.x * 2).a) -def test_are_co_aligned(bdf, df): +def test_are_co_aligned(pdf, df): df2 = df.reset_index() assert are_co_aligned(df.expr, df2.expr) assert are_co_aligned(df.expr, df2.sum().expr) @@ -1015,8 +1008,8 @@ def test_are_co_aligned(bdf, df): assert 
are_co_aligned(df.expr, df.sum().expr) assert are_co_aligned((df + df.sum()).expr, df.sum().expr) - bdf = bdf.assign(z=1) - df3 = from_pandas(bdf, npartitions=10) + pdf = pdf.assign(z=1) + df3 = from_pandas(pdf, npartitions=10) assert not are_co_aligned(df.expr, df3.expr) assert are_co_aligned(df.expr, df3.sum().expr) @@ -1052,8 +1045,7 @@ def test_op_align(): assert_eq(df - df2, pdf - pdf2) -def test_can_co_align(df, bdf): - pdf = bdf.copy() +def test_can_co_align(df, pdf): q = (df.x + df.y).optimize(fuse=False) expected = df.x + df.y assert q._name == expected._name From 6d308d7b6f076a81496129f27cf774d33f30af86 Mon Sep 17 00:00:00 2001 From: Rick Zamora Date: Fri, 21 Jul 2023 14:51:01 -0500 Subject: [PATCH 13/18] update test --- dask_expr/tests/test_collection.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 725eacfd9..fb49d1173 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -143,7 +143,7 @@ def test_reductions(func, pdf, df): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("ddof", [1, 2]) def test_std_kwargs(axis, skipna, ddof): - pdf = pd.DataFrame( + pdf = lib.DataFrame( {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15} ) df = from_pandas(pdf, npartitions=3) @@ -581,7 +581,7 @@ def test_substitute(): def test_substitute_parameters(df): - pdf = pd.DataFrame( + pdf = lib.DataFrame( { "a": range(100), "b": range(100), From b9624da452dc7c1b9faa28621ddc6ee79cd1a250 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Fri, 21 Jul 2023 15:38:45 -0700 Subject: [PATCH 14/18] address problems with cudf var/std behavior --- dask_expr/_reductions.py | 7 ++++++- dask_expr/tests/test_collection.py | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py index bf56e8810..dcadba327 100644 --- a/dask_expr/_reductions.py +++ 
b/dask_expr/_reductions.py @@ -417,7 +417,7 @@ def aggregate_kwargs(self): @classmethod def reduction_chunk(cls, x, skipna=True, numeric_only=False): kwargs = {"numeric_only": numeric_only} if is_dataframe_like(x) else {} - if skipna: + if skipna or numeric_only: n = x.count(**kwargs) kwargs["skipna"] = skipna avg = x.mean(**kwargs) @@ -427,6 +427,11 @@ def reduction_chunk(cls, x, skipna=True, numeric_only=False): n = len(x) kwargs["skipna"] = skipna avg = x.sum(**kwargs) / n + if numeric_only: + # Workaround for cudf bug + # (see: https://github.com/rapidsai/cudf/issues/13731) + x = x.select_dtypes("number") + n = n.loc[x.columns] m2 = ((x - avg) ** 2).sum(**kwargs) return n, avg, m2 diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index fb49d1173..5586cafd7 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -143,6 +143,8 @@ def test_reductions(func, pdf, df): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("ddof", [1, 2]) def test_std_kwargs(axis, skipna, ddof): + if BACKEND == "cudf" and skipna is False: + pytest.xfail(reason="cudf requires skipna=True when nulls are present.") pdf = lib.DataFrame( {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15} ) From f502717a4d4a29f0bb2da6668f9ad3af2e72c539 Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 24 Jul 2023 08:22:31 -0700 Subject: [PATCH 15/18] use decorators --- dask_expr/tests/test_collection.py | 56 +++++++++++------------------- 1 file changed, 20 insertions(+), 36 deletions(-) diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 5586cafd7..9293df46c 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -3,7 +3,6 @@ import importlib import operator import pickle -import re import dask import numpy as np @@ -19,6 +18,7 @@ # Import backend DataFrame library to test BACKEND = dask.config.get("dataframe.backend", 
"pandas") +CUDF_BACKEND = BACKEND == "cudf" lib = importlib.import_module(BACKEND) @@ -53,18 +53,16 @@ def test_setitem(pdf, df): assert_eq(df, pdf) +@pytest.mark.xfail(CUDF_BACKEND, reason="https://github.com/rapidsai/cudf/issues/10271") def test_explode(): - if BACKEND == "cudf": - pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271") pdf = lib.DataFrame({"a": [[1, 2], [3, 4]]}) df = from_pandas(pdf) assert_eq(pdf.explode(column="a"), df.explode(column="a")) assert_eq(pdf.a.explode(), df.a.explode()) +@pytest.mark.xfail(CUDF_BACKEND, reason="https://github.com/rapidsai/cudf/issues/10271") def test_explode_simplify(pdf): - if BACKEND == "cudf": - pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/10271") pdf["z"] = 1 df = from_pandas(pdf) q = df.explode(column="x")["y"] @@ -126,7 +124,7 @@ def test_dask(pdf, df): ], ) def test_reductions(func, pdf, df): - if BACKEND == "cudf" and func in [M.idxmin, M.idxmax]: + if CUDF_BACKEND and func in [M.idxmin, M.idxmax]: pytest.xfail(reason="https://github.com/rapidsai/cudf/issues/9602") result = func(df) assert result.known_divisions @@ -143,7 +141,7 @@ def test_reductions(func, pdf, df): @pytest.mark.parametrize("skipna", [True, False]) @pytest.mark.parametrize("ddof", [1, 2]) def test_std_kwargs(axis, skipna, ddof): - if BACKEND == "cudf" and skipna is False: + if CUDF_BACKEND and skipna is False: pytest.xfail(reason="cudf requires skipna=True when nulls are present.") pdf = lib.DataFrame( {"x": range(30), "y": [1, 2, None] * 10, "z": ["dog", "cat"] * 15} @@ -155,9 +153,8 @@ def test_std_kwargs(axis, skipna, ddof): ) +@pytest.mark.xfail(CUDF_BACKEND, reason="nbytes not supported by cudf") def test_nbytes(pdf, df): - if BACKEND == "cudf": - pytest.xfail(reason="nbytes not supported by cudf") with pytest.raises(NotImplementedError, match="nbytes is not implemented"): df.nbytes assert_eq(df.x.nbytes, pdf.x.nbytes) @@ -283,10 +280,9 @@ def test_and_or(func, pdf, df): assert_eq(func(pdf), func(df), 
check_names=False) +@pytest.mark.xfail(CUDF_BACKEND, reason="period_range not supported by cudf") @pytest.mark.parametrize("how", ["start", "end"]) def test_to_timestamp(pdf, how): - if BACKEND == "cudf": - pytest.xfail(reason="period_range not supported by cudf") pdf.index = lib.period_range("2019-12-31", freq="D", periods=len(pdf)) df = from_pandas(pdf) assert_eq(df.to_timestamp(how=how), pdf.to_timestamp(how=how)) @@ -329,6 +325,7 @@ def test_blockwise(func, pdf, df): assert_eq(func(pdf), func(df)) +@pytest.mark.xfail(CUDF_BACKEND, reason="func not supported by cudf") @pytest.mark.parametrize( "func", [ @@ -339,14 +336,11 @@ def test_blockwise(func, pdf, df): ], ) def test_blockwise_cudf_fails(func, pdf, df): - if BACKEND == "cudf": - pytest.xfail(reason="func not supported by cudf") assert_eq(func(pdf), func(df)) +@pytest.mark.xfail(CUDF_BACKEND, reason="rename_axis not supported by cudf") def test_rename_axis(pdf): - if BACKEND == "cudf": - pytest.xfail(reason="rename_axis not supported by cudf") pdf.index.name = "a" pdf.columns.name = "b" df = from_pandas(pdf, npartitions=10) @@ -378,9 +372,8 @@ def test_repr(df): assert "sum(skipna=False)" in s +@pytest.mark.xfail(CUDF_BACKEND, reason="combine_first not supported by cudf") def test_combine_first_simplify(pdf): - if BACKEND == "cudf": - pytest.xfail(reason="combine_first not supported by cudf") df = from_pandas(pdf) pdf2 = pdf.rename(columns={"y": "z"}) df2 = from_pandas(pdf2) @@ -667,9 +660,8 @@ def test_serialization(pdf, df): assert_eq(pickle.loads(before), pickle.loads(after)) +@pytest.mark.xfail(CUDF_BACKEND, reason="Cannot apply lambda function in cudf") def test_size_optimized(df): - if BACKEND == "cudf": - pytest.xfail(reason="Cannot apply lambda function in cudf") expr = (df.x + 1).apply(lambda x: x).size out = optimize(expr) expected = optimize(df.x.size) @@ -837,9 +829,8 @@ def test_drop_duplicates(df, pdf): assert_eq(df.drop_duplicates(subset=["x"]), pdf.drop_duplicates(subset=["x"])) 
assert_eq(df.x.drop_duplicates(), pdf.x.drop_duplicates()) - if BACKEND == "pandas": - with pytest.raises(KeyError, match=re.escape("Index(['a'], dtype='object')")): - df.drop_duplicates(subset=["a"]) + with pytest.raises(KeyError, match="'a'"): + df.drop_duplicates(subset=["a"]) with pytest.raises(TypeError, match="got an unexpected keyword argument"): df.x.drop_duplicates(subset=["a"]) @@ -939,9 +930,8 @@ def test_sample(df): assert_eq(result, expected) +@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") def test_align(df, pdf): - if BACKEND == "cudf": - pytest.skip(reason="align not supported by cudf") result_1, result_2 = df.align(df) pdf_result_1, pdf_result_2 = pdf.align(pdf) assert_eq(result_1, pdf_result_1) @@ -953,9 +943,8 @@ def test_align(df, pdf): assert_eq(result_2, pdf_result_2) +@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") def test_align_different_partitions(): - if BACKEND == "cudf": - pytest.skip(reason="align not supported by cudf") pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 3], "b": [1, 2, 3, 4, 5, 6]}) df = from_pandas(pdf, npartitions=2) pdf2 = lib.DataFrame( @@ -969,9 +958,8 @@ def test_align_different_partitions(): assert_eq(result_2, pdf_result_2) +@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") def test_align_unknown_partitions_same_root(): - if BACKEND == "cudf": - pytest.skip(reason="align not supported by cudf") pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1]) df = from_pandas(pdf, npartitions=2, sort=False) result_1, result_2 = df.align(df) @@ -980,9 +968,8 @@ def test_align_unknown_partitions_same_root(): assert_eq(result_2, pdf_result_2) +@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") def test_unknown_partitions_different_root(): - if BACKEND == "cudf": - pytest.skip(reason="align not supported by cudf") pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1]) df = from_pandas(pdf, npartitions=2, sort=False) pdf2 = lib.DataFrame({"a": 1}, index=[4, 
3, 2, 1]) @@ -991,9 +978,8 @@ def test_unknown_partitions_different_root(): df.align(df2) +@pytest.mark.xfail(CUDF_BACKEND, reason="compute_hll_array doesn't work for cudf") def test_nunique_approx(df): - if BACKEND == "cudf": - pytest.xfail(reason="compute_hll_array doesn't work for cudf") result = df.nunique_approx().compute() assert 99 < result < 101 @@ -1031,9 +1017,8 @@ def test_assign_simplify_series(pdf): assert result._name == expected._name +@pytest.mark.xfail(CUDF_BACKEND, reason="assign function not supported by cudf") def test_assign_non_series_inputs(df, pdf): - if BACKEND == "cudf": - pytest.xfail(reason="assign function not supported by cudf") assert_eq(df.assign(a=lambda x: x.x * 2), pdf.assign(a=lambda x: x.x * 2)) assert_eq(df.assign(a=2), pdf.assign(a=2)) assert_eq(df.assign(a=df.x.sum()), pdf.assign(a=pdf.x.sum())) @@ -1063,9 +1048,8 @@ def test_are_co_aligned(pdf, df): assert not are_co_aligned(merged_first.expr, df.expr) +@pytest.mark.xfail(CUDF_BACKEND, reason="TODO") def test_astype_categories(df): - if BACKEND == "cudf": - pytest.xfail(reason="TODO") result = df.astype("category") assert_eq(result.x._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES])) assert_eq(result.y._meta.cat.categories, lib.Index([UNKNOWN_CATEGORIES])) From fc884022e89ec1f8aa65a4b15490037f36c2bcdb Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 24 Jul 2023 08:26:39 -0700 Subject: [PATCH 16/18] rename _set_engine to _set_parquet_engine --- dask_expr/_collection.py | 4 ++-- dask_expr/io/parquet.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dask_expr/_collection.py b/dask_expr/_collection.py index 4b23dd1f3..1de19f30b 100644 --- a/dask_expr/_collection.py +++ b/dask_expr/_collection.py @@ -1081,7 +1081,7 @@ def read_parquet( engine=None, **kwargs, ): - from dask_expr.io.parquet import ReadParquet, _set_engine + from dask_expr.io.parquet import ReadParquet, _set_parquet_engine if not isinstance(path, str): path = stringify_path(path) @@ 
-1104,7 +1104,7 @@ def read_parquet( aggregate_files=aggregate_files, parquet_file_extension=parquet_file_extension, filesystem=filesystem, - engine=_set_engine(engine), + engine=_set_parquet_engine(engine), kwargs=kwargs, ) ) diff --git a/dask_expr/io/parquet.py b/dask_expr/io/parquet.py index 4dec13345..8f9927f07 100644 --- a/dask_expr/io/parquet.py +++ b/dask_expr/io/parquet.py @@ -176,7 +176,7 @@ def to_parquet( from dask_expr._collection import new_collection from dask_expr.io.parquet import NONE_LABEL, ToParquet - engine = _set_engine(meta=df._meta) + engine = _set_parquet_engine(meta=df._meta) compute_kwargs = compute_kwargs or {} partition_on = partition_on or [] @@ -686,7 +686,7 @@ def _update_length_statistics(self): # -def _set_engine(engine=None, meta=None): +def _set_parquet_engine(engine=None, meta=None): # Use `engine` or `meta` input to set the parquet engine if engine is None: if ( From 5d927c19eb654d349e3103e4a3a70d1953d4e76e Mon Sep 17 00:00:00 2001 From: rjzamora Date: Mon, 24 Jul 2023 10:11:04 -0700 Subject: [PATCH 17/18] introduce _required_attribute --- dask_expr/_expr.py | 26 +++++++++++++++++++++++++- dask_expr/_reductions.py | 10 ++++++++++ dask_expr/tests/test_collection.py | 22 +++++++++++----------- 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/dask_expr/_expr.py b/dask_expr/_expr.py index 2e08b4e18..6d25db7f4 100644 --- a/dask_expr/_expr.py +++ b/dask_expr/_expr.py @@ -54,6 +54,18 @@ def __init__(self, *args, **kwargs): operands.append(type(self)._defaults[parameter]) assert not kwargs self.operands = operands + if self._required_attribute: + dep = next(iter(self.dependencies()))._meta + if not hasattr(dep, self._required_attribute): + # Raise a ValueError instead of AttributeError to + # avoid infinite recursion + raise ValueError(f"{dep} has no attribute {self._required_attribute}") + + @property + def _required_attribute(self) -> str: + # Specify if the first `dependency` must support + # a specific attribute for 
valid behavior. + return None @functools.cached_property def ndim(self): @@ -941,6 +953,12 @@ class Blockwise(Expr): _keyword_only = [] _projection_passthrough = False + @property + def _required_attribute(self): + if isinstance(self.operation, type(M.method_caller)): + return self.operation.method + return None + @functools.cached_property def _meta(self): args = [op._meta if isinstance(op, Expr) else op for op in self._args] @@ -1027,7 +1045,13 @@ def _combine_similar(self, root: Expr): # Push projections back up through `_projection_passthrough` # operations if it reduces the number of unique expression nodes. if self._projection_passthrough and isinstance(self.frame, Projection): - common = type(self)(self.frame.frame, *self.operands[1:]) + try: + common = type(self)(self.frame.frame, *self.operands[1:]) + except ValueError: + # May have encountered a problem with `_required_attribute`. + # (There is no guarentee that the same method will exist for + # both a Series and DataFrame) + return None projection = self.frame.operand("columns") push_up_projection = False for op in self._find_similar_operations(root, ignore=self._parameters): diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py index dcadba327..592ffea9b 100644 --- a/dask_expr/_reductions.py +++ b/dask_expr/_reductions.py @@ -42,6 +42,15 @@ class ApplyConcatApply(Expr): combine_kwargs = {} aggregate_kwargs = {} + # def __init__(self, *args, **kwargs): + # super().__init__(*args, **kwargs) + # if self._required_attribute: + # dep = next(iter(self.dependencies()))._meta + # if not hasattr(dep, self._required_attribute): + # # Raise a ValueError instead of AttributeError to + # # avoid infinite recursion + # raise ValueError(f"{dep} has no attribute {self._required_attribute}") + def __dask_postcompute__(self): return toolz.first, () @@ -386,6 +395,7 @@ class NBytes(Reduction): # Only supported for Series objects reduction_chunk = lambda ser: ser.nbytes reduction_aggregate = sum + 
_required_attribute = "nbytes" class Var(Reduction): diff --git a/dask_expr/tests/test_collection.py b/dask_expr/tests/test_collection.py index 9293df46c..410e40125 100644 --- a/dask_expr/tests/test_collection.py +++ b/dask_expr/tests/test_collection.py @@ -7,7 +7,7 @@ import dask import numpy as np import pytest -from dask.dataframe._compat import PANDAS_GT_210 +from dask.dataframe._compat import PANDAS_GE_210 from dask.dataframe.utils import UNKNOWN_CATEGORIES, assert_eq from dask.utils import M @@ -293,12 +293,6 @@ def test_to_timestamp(pdf, how): "func", [ lambda df: df.astype(int), - pytest.param( - lambda df: df.map(lambda x: x + 1), - marks=pytest.mark.skipif( - not PANDAS_GT_210, reason="Only available from 2.1" - ), - ), lambda df: df.clip(lower=10, upper=50), lambda df: df.x.clip(lower=10, upper=50), lambda df: df.x.between(left=10, right=50), @@ -331,11 +325,17 @@ def test_blockwise(func, pdf, df): [ lambda df: df.apply(lambda row, x, y=10: row * x + y, x=2), lambda df: df.index.map(lambda x: x + 1), + pytest.param( + lambda df: df.map(lambda x: x + 1), + marks=pytest.mark.skipif( + not PANDAS_GE_210, reason="Only available from 2.1" + ), + ), lambda df: df.combine_first(df), lambda df: df.x.combine_first(df.y), ], ) -def test_blockwise_cudf_fails(func, pdf, df): +def test_blockwise_pandas_only(func, pdf, df): assert_eq(func(pdf), func(df)) @@ -930,7 +930,7 @@ def test_sample(df): assert_eq(result, expected) -@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") +@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf") def test_align(df, pdf): result_1, result_2 = df.align(df) pdf_result_1, pdf_result_2 = pdf.align(pdf) @@ -943,7 +943,7 @@ def test_align(df, pdf): assert_eq(result_2, pdf_result_2) -@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") +@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf") def test_align_different_partitions(): pdf = lib.DataFrame({"a": [11, 12, 31, 1, 2, 
3], "b": [1, 2, 3, 4, 5, 6]}) df = from_pandas(pdf, npartitions=2) @@ -958,7 +958,7 @@ def test_align_different_partitions(): assert_eq(result_2, pdf_result_2) -@pytest.mark.skipif(CUDF_BACKEND, reason="align not supported by cudf") +@pytest.mark.xfail(CUDF_BACKEND, reason="align not supported by cudf") def test_align_unknown_partitions_same_root(): pdf = lib.DataFrame({"a": 1}, index=[3, 2, 1]) df = from_pandas(pdf, npartitions=2, sort=False) From 7f87c8ffa021dd8ea2d4cf8d76420e4a8ee8f858 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 24 Jul 2023 12:19:37 -0500 Subject: [PATCH 18/18] Update dask_expr/_reductions.py --- dask_expr/_reductions.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/dask_expr/_reductions.py b/dask_expr/_reductions.py index 592ffea9b..5515799a7 100644 --- a/dask_expr/_reductions.py +++ b/dask_expr/_reductions.py @@ -42,15 +42,6 @@ class ApplyConcatApply(Expr): combine_kwargs = {} aggregate_kwargs = {} - # def __init__(self, *args, **kwargs): - # super().__init__(*args, **kwargs) - # if self._required_attribute: - # dep = next(iter(self.dependencies()))._meta - # if not hasattr(dep, self._required_attribute): - # # Raise a ValueError instead of AttributeError to - # # avoid infinite recursion - # raise ValueError(f"{dep} has no attribute {self._required_attribute}") - def __dask_postcompute__(self): return toolz.first, ()