-
-
Notifications
You must be signed in to change notification settings - Fork 762
Reduce memory footprint of P2P shuffling #8157
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
671ab10
9f08ff3
bf4c737
c80bf63
2417a5a
871c043
20634e9
f76c622
5a3b33e
0149bbd
a60cc07
d5f81f4
491b9e6
bbbe303
6c80354
bdc0d8e
f14aba6
67282f7
429a7ac
2f92de0
e2368b5
629124a
ecfe534
396c719
ecca1d8
3bc2c5a
d0286a4
f23c1aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,12 @@ | ||
| from __future__ import annotations | ||
|
|
||
| from io import BytesIO | ||
| from typing import TYPE_CHECKING | ||
| from pathlib import Path | ||
| from typing import TYPE_CHECKING, Any | ||
|
|
||
| from packaging.version import parse | ||
|
|
||
| from dask.utils import parse_bytes | ||
|
|
||
| if TYPE_CHECKING: | ||
| import pandas as pd | ||
| import pyarrow as pa | ||
|
|
@@ -29,34 +31,27 @@ def check_minimal_arrow_version() -> None: | |
| """Verify that the correct version of pyarrow is installed to support | ||
| the P2P extension. | ||
|
|
||
| Raises a RuntimeError in case pyarrow is not installed or installed version | ||
| is not recent enough. | ||
| Raises a ModuleNotFoundError if pyarrow is not installed or an | ||
| ImportError if the installed version is not recent enough. | ||
| """ | ||
| # First version to introduce Table.sort_by | ||
| minversion = "7.0.0" | ||
| # First version that supports concatenating extension arrays (apache/arrow#14463) | ||
| minversion = "12.0.0" | ||
| try: | ||
| import pyarrow as pa | ||
| except ImportError: | ||
| raise RuntimeError(f"P2P shuffling requires pyarrow>={minversion}") | ||
|
|
||
| except ModuleNotFoundError: | ||
| raise ModuleNotFoundError(f"P2P shuffling requires pyarrow>={minversion}") | ||
| if parse(pa.__version__) < parse(minversion): | ||
| raise RuntimeError( | ||
| raise ImportError( | ||
| f"P2P shuffling requires pyarrow>={minversion} but only found {pa.__version__}" | ||
| ) | ||
|
|
||
|
|
||
| def convert_partition(data: bytes, meta: pd.DataFrame) -> pd.DataFrame: | ||
| def convert_shards(shards: list[pa.Table], meta: pd.DataFrame) -> pd.DataFrame: | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. (disclaimer: still in early review) I once tried to move tables around instead of bytes but that messed up the event loop. We should check this before merging. |
||
| import pyarrow as pa | ||
|
|
||
| from dask.dataframe.dispatch import from_pyarrow_table_dispatch | ||
|
|
||
| file = BytesIO(data) | ||
| end = len(data) | ||
| shards = [] | ||
| while file.tell() < end: | ||
| sr = pa.RecordBatchStreamReader(file) | ||
| shards.append(sr.read_all()) | ||
| table = pa.concat_tables(shards, promote=True) | ||
| table = pa.concat_tables(shards) | ||
|
|
||
| df = from_pyarrow_table_dispatch(meta, table, self_destruct=True) | ||
| return df.astype(meta.dtypes, copy=False) | ||
|
|
@@ -66,9 +61,7 @@ def list_of_buffers_to_table(data: list[bytes]) -> pa.Table: | |
| """Convert a list of arrow buffers and a schema to an Arrow Table""" | ||
| import pyarrow as pa | ||
|
|
||
| return pa.concat_tables( | ||
| (deserialize_table(buffer) for buffer in data), promote=True | ||
| ) | ||
| return pa.concat_tables(deserialize_table(buffer) for buffer in data) | ||
|
|
||
|
|
||
| def serialize_table(table: pa.Table) -> bytes: | ||
|
|
@@ -85,3 +78,42 @@ def deserialize_table(buffer: bytes) -> pa.Table: | |
|
|
||
| with pa.ipc.open_stream(pa.py_buffer(buffer)) as reader: | ||
| return reader.read_all() | ||
|
|
||
|
|
||
| def read_from_disk(path: Path, meta: pd.DataFrame) -> tuple[Any, int]: | ||
| import pyarrow as pa | ||
|
|
||
| from dask.dataframe.dispatch import pyarrow_schema_dispatch | ||
|
|
||
| batch_size = parse_bytes("1 MiB") | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is fragile and I don't really like it, but for now it seems to do the job. We will have to spend more time on performance optimization and understanding memory (de)allocation here to make this more robust. |
||
| batch = [] | ||
| shards = [] | ||
| schema = pyarrow_schema_dispatch(meta, preserve_index=True) | ||
|
|
||
| with pa.OSFile(str(path), mode="rb") as f: | ||
| size = f.seek(0, whence=2) | ||
| f.seek(0) | ||
| prev = 0 | ||
| offset = f.tell() | ||
| while offset < size: | ||
| sr = pa.RecordBatchStreamReader(f) | ||
| shard = sr.read_all() | ||
| offset = f.tell() | ||
| batch.append(shard) | ||
|
|
||
| if offset - prev >= batch_size: | ||
| table = pa.concat_tables(batch) | ||
| shards.append(_copy_table(table, schema)) | ||
| batch = [] | ||
| prev = offset | ||
| if batch: | ||
| table = pa.concat_tables(batch) | ||
| shards.append(_copy_table(table, schema)) | ||
| return shards, size | ||
|
|
||
|
|
||
| def _copy_table(table: pa.Table, schema: pa.Schema) -> pa.Table: | ||
| import pyarrow as pa | ||
|
|
||
| arrs = [pa.concat_arrays(column.chunks) for column in table.columns] | ||
| return pa.table(data=arrs, schema=schema) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@fjetter: Together with dask/dask#10496,
`get_default_shuffle_method` should raise if pyarrow is outdated and choose `tasks` if it's not installed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(testing it manually)