Skip to content

Support arrow Table/RecordBatch types #2103

Description

@dhirschfeld

xref: #614

I'm sure this is possible with #606, so this issue is mostly just to document my attempt to get this working. Out of the box (1.22.0+9.gdb758d0f), attempting to pass an Arrow Table or RecordBatch to a task results in a TypeError:

TypeError: no default __reduce__ due to non-trivial __cinit__
from distributed import Client
import pandas as pd
import pyarrow as pa

# Create the distributed client used to submit the tasks below
# (no scheduler address is passed).
client = Client()

# Two-column pandas frame: 'A' holds strings, 'B' holds integers.
df = pd.DataFrame({'A': list('abc'), 'B': [1,2,3]})
# Arrow Table built from the same data; preserve_index=False asks pyarrow not
# to carry the pandas index over into the table.
tbl = pa.Table.from_pandas(df, preserve_index=False)

def echo(arg):
    """Return ``arg`` unchanged.

    Used as the submitted task in this reproduction: because it does nothing
    to its argument, any failure comes from handing the argument to
    ``client.submit`` rather than from the function itself.
    """
    return arg
>>> client.submit(echo, df).result().equals(df)
True
>>> client.submit(echo, tbl).result()
distributed.protocol.pickle - INFO - Failed to serialize (pyarrow.Table
A: string
B: int64
metadata
--------
{b'pandas': b'{"index_columns": [], "column_indexes": [], "columns": [{"name":'
            b' "A", "field_name": "A", "pandas_type": "unicode", "numpy_type":'
            b' "object", "metadata": null}, {"name": "B", "field_name": "B", "'
            b'pandas_type": "int64", "numpy_type": "int64", "metadata": null}]'
            b', "pandas_version": "0.23.1"}'},). Exception: no default __reduce__ due to non-trivial __cinit__
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
C:\Miniconda3\lib\site-packages\distributed\protocol\pickle.py in dumps(x)
     37     try:
---> 38         result = pickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
     39         if len(result) < 1000:

C:\Miniconda3\lib\site-packages\pyarrow\lib.cp36-win_amd64.pyd in pyarrow.lib.RecordBatch.__reduce_cython__()

TypeError: no default __reduce__ due to non-trivial __cinit__

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-48-4e8e2ea90e79> in <module>()
----> 1 client.submit(echo, tbl).result()

C:\Miniconda3\lib\site-packages\distributed\client.py in submit(self, func, *args, **kwargs)
   1236                                          resources={skey: resources} if resources else None,
   1237                                          retries=retries,
-> 1238                                          fifo_timeout=fifo_timeout)
   1239 
   1240         logger.debug("Submit %s(...), %s", funcname(func), key)

C:\Miniconda3\lib\site-packages\distributed\client.py in _graph_to_futures(self, dsk, keys, restrictions, loose_restrictions, priority, user_priority, resources, retries, fifo_timeout)
   2093 
   2094             self._send_to_scheduler({'op': 'update-graph',
-> 2095                                      'tasks': valmap(dumps_task, dsk3),
   2096                                      'dependencies': dependencies,
   2097                                      'keys': list(flatkeys),

C:\Miniconda3\lib\site-packages\cytoolz\dicttoolz.pyx in cytoolz.dicttoolz.valmap()

C:\Miniconda3\lib\site-packages\cytoolz\dicttoolz.pyx in cytoolz.dicttoolz.valmap()

C:\Miniconda3\lib\site-packages\distributed\worker.py in dumps_task(task)
    799         elif not any(map(_maybe_complex, task[1:])):
    800             return {'function': dumps_function(task[0]),
--> 801                     'args': warn_dumps(task[1:])}
    802     return to_serialize(task)
    803 

C:\Miniconda3\lib\site-packages\distributed\worker.py in warn_dumps(obj, dumps, limit)
    808 def warn_dumps(obj, dumps=pickle.dumps, limit=1e6):
    809     """ Dump an object to bytes, warn if those bytes are large """
--> 810     b = dumps(obj)
    811     if not _warn_dumps_warned[0] and len(b) > limit:
    812         _warn_dumps_warned[0] = True

C:\Miniconda3\lib\site-packages\distributed\protocol\pickle.py in dumps(x)
     49     except Exception:
     50         try:
---> 51             return cloudpickle.dumps(x, protocol=pickle.HIGHEST_PROTOCOL)
     52         except Exception as e:
     53             logger.info("Failed to serialize %s. Exception: %s", x, e)

C:\Miniconda3\lib\site-packages\cloudpickle\cloudpickle.py in dumps(obj, protocol)
    893     try:
    894         cp = CloudPickler(file, protocol=protocol)
--> 895         cp.dump(obj)
    896         return file.getvalue()
    897     finally:

C:\Miniconda3\lib\site-packages\cloudpickle\cloudpickle.py in dump(self, obj)
    266         self.inject_addons()
    267         try:
--> 268             return Pickler.dump(self, obj)
    269         except RuntimeError as e:
    270             if 'recursion' in e.args[0]:

C:\Miniconda3\lib\pickle.py in dump(self, obj)
    407         if self.proto >= 4:
    408             self.framer.start_framing()
--> 409         self.save(obj)
    410         self.write(STOP)
    411         self.framer.end_framing()

C:\Miniconda3\lib\pickle.py in save(self, obj, save_persistent_id)
    474         f = self.dispatch.get(t)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return
    478 

C:\Miniconda3\lib\pickle.py in save_tuple(self, obj)
    734         if n <= 3 and self.proto >= 2:
    735             for element in obj:
--> 736                 save(element)
    737             # Subtle.  Same as in the big comment below.
    738             if id(obj) in memo:

C:\Miniconda3\lib\pickle.py in save(self, obj, save_persistent_id)
    494             reduce = getattr(obj, "__reduce_ex__", None)
    495             if reduce is not None:
--> 496                 rv = reduce(self.proto)
    497             else:
    498                 reduce = getattr(obj, "__reduce__", None)

C:\Miniconda3\lib\site-packages\pyarrow\lib.cp36-win_amd64.pyd in pyarrow.lib.RecordBatch.__reduce_cython__()

TypeError: no default __reduce__ due to non-trivial __cinit__

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions