Martin Durant created ARROW-5144:
------------------------------------

             Summary: ParquetDataset and CloudParquetPiece not serializable
                 Key: ARROW-5144
                 URL: https://issues.apache.org/jira/browse/ARROW-5144
             Project: Apache Arrow
          Issue Type: Bug
    Affects Versions: 0.13.0
         Environment: osx python36/conda cloudpickle 0.8.1
arrow-cpp                 0.13.0           py36ha71616b_0    conda-forge
pyarrow                   0.13.0           py36hb37e6aa_0    conda-forge

            Reporter: Martin Durant


Since 0.13.0, parquet instances are no longer serializable, which means that 
dask.distributed cannot pass them between processes in order to load parquet in 
parallel.

Example:
```
>>> import cloudpickle
>>> import pyarrow.parquet as pq
>>> pf = pq.ParquetDataset('nation.impala.parquet')
>>> cloudpickle.dumps(pf)
~/anaconda/envs/py36/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in 
dumps(obj, protocol)
    893     try:
    894         cp = CloudPickler(file, protocol=protocol)
--> 895         cp.dump(obj)
    896         return file.getvalue()
    897     finally:

~/anaconda/envs/py36/lib/python3.6/site-packages/cloudpickle/cloudpickle.py in 
dump(self, obj)
    266         self.inject_addons()
    267         try:
--> 268             return Pickler.dump(self, obj)
    269         except RuntimeError as e:
    270             if 'recursion' in e.args[0]:

~/anaconda/envs/py36/lib/python3.6/pickle.py in dump(self, obj)
    407         if self.proto >= 4:
    408             self.framer.start_framing()
--> 409         self.save(obj)
    410         self.write(STOP)
    411         self.framer.end_framing()

~/anaconda/envs/py36/lib/python3.6/pickle.py in save(self, obj, 
save_persistent_id)
    519
    520         # Save the reduce() output and finally memoize the object
--> 521         self.save_reduce(obj=obj, *rv)
    522
    523     def persistent_id(self, obj):

~/anaconda/envs/py36/lib/python3.6/pickle.py in save_reduce(self, func, args, 
state, listitems, dictitems, obj)
    632
    633         if state is not None:
--> 634             save(state)
    635             write(BUILD)
    636

~/anaconda/envs/py36/lib/python3.6/pickle.py in save(self, obj, 
save_persistent_id)
    474         f = self.dispatch.get(t)
    475         if f is not None:
--> 476             f(self, obj) # Call unbound method with explicit self
    477             return
    478

~/anaconda/envs/py36/lib/python3.6/pickle.py in save_dict(self, obj)
    819
    820         self.memoize(obj)
--> 821         self._batch_setitems(obj.items())
    822
    823     dispatch[dict] = save_dict

~/anaconda/envs/py36/lib/python3.6/pickle.py in _batch_setitems(self, items)
    845                 for k, v in tmp:
    846                     save(k)
--> 847                     save(v)
    848                 write(SETITEMS)
    849             elif n:

~/anaconda/envs/py36/lib/python3.6/pickle.py in save(self, obj, 
save_persistent_id)
    494             reduce = getattr(obj, "__reduce_ex__", None)
    495             if reduce is not None:
--> 496                 rv = reduce(self.proto)
    497             else:
    498                 reduce = getattr(obj, "__reduce__", None)

~/anaconda/envs/py36/lib/python3.6/site-packages/pyarrow/_parquet.cpython-36m-darwin.so
 in pyarrow._parquet.ParquetSchema.__reduce_cython__()

TypeError: no default __reduce__ due to non-trivial __cinit__
```
The indicated schema instance is also referenced by the ParquetDatasetPiece instances.

ref: https://github.com/dask/distributed/issues/2597



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to