Serialization problem with arrow Tables?
See original GitHub issue
I think the below code should work (but it doesn't). Is this a bug or am I missing something?
Setup code
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
# Write ten small Parquet files, part0.pq .. part9.pq, into the current
# working directory. File n holds a 10x5 frame of ones multiplied by n.
filenames = [f'part{n}.pq' for n in range(10)]
for n, filename in enumerate(filenames):
    df = n * pd.DataFrame(np.ones((10, 5)))
    tbl = pa.Table.from_pandas(df)
    pq.write_table(tbl, filename, version='2.0')
Demo of problem:
def read_table(filename):
    """Read the Parquet file at ``filename`` and return the resulting table.

    ``pyarrow.parquet`` is imported inside the function body rather than
    taken from module scope -- presumably so the function stays
    self-contained when it is shipped to a worker process.
    """
    import pyarrow.parquet as pq

    return pq.read_table(filename)
from distributed import LocalCluster, Client
# Start a local cluster; the positional 2 is presumably the worker count,
# with one thread per worker.
cluster = LocalCluster(2, threads_per_worker=1)
# Connect a client to that cluster's scheduler.
client = Client(cluster.scheduler.address)
# One future per Parquet file; each task calls read_table on a file name.
futs = client.map(read_table, filenames)
# Concatenate the per-file tables in a single task and fetch the result.
# This is the statement the traceback below points at.
tbl = client.submit(pa.concat_tables, futs).result()
Exception:
---------------------------------------------------------------------------
Exception Traceback (most recent call last)
<ipython-input-67-74f644c68e47> in <module>
1 futs = client.map(read_table, filenames)
----> 2 tbl = client.submit(pa.concat_tables, futs).result()
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\client.py in result(self, timeout)
191 # shorten error traceback
192 result = self.client.sync(self._result, callback_timeout=timeout,
--> 193 raiseit=False)
194 if self.status == 'error':
195 six.reraise(*result)
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\client.py in sync(self, func, *args, **kwargs)
673 return future
674 else:
--> 675 return sync(self.loop, func, *args, **kwargs)
676
677 def __repr__(self):
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\utils.py in sync(loop, func, *args, **kwargs)
275 e.wait(10)
276 if error[0]:
--> 277 six.reraise(*error[0])
278 else:
279 return result[0]
C:\Miniconda3\envs\py-dev\lib\site-packages\six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\utils.py in f()
260 if timeout is not None:
261 future = gen.with_timeout(timedelta(seconds=timeout), future)
--> 262 result[0] = yield future
263 except Exception as exc:
264 error[0] = sys.exc_info()
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\client.py in _result(self, raiseit)
216 raise gen.Return(exception)
217 else:
--> 218 result = yield self.client._gather([self])
219 raise gen.Return(result[0])
220
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\client.py in _gather(self, futures, errors, direct, local_worker)
1520 else:
1521 self._gather_future = future
-> 1522 response = yield future
1523
1524 if response['status'] == 'error':
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\client.py in _gather_remote(self, direct, local_worker)
1569
1570 else: # ask scheduler to gather data for us
-> 1571 response = yield self.scheduler.gather(keys=keys)
1572 finally:
1573 self._gather_semaphore.release()
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\core.py in send_recv_from_rpc(**kwargs)
578 try:
579 comm = yield self.live_comm()
--> 580 result = yield send_recv(comm=comm, op=key, **kwargs)
581 except (RPCClosed, CommClosedError) as e:
582 raise e.__class__("%s: while trying to call remote method %r"
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run(self)
1145 exc_info = None
1146 else:
-> 1147 yielded = self.gen.send(value)
1148
1149 if stack_context._state.contexts is not orig_stack_contexts:
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\core.py in send_recv(comm, reply, serializers, deserializers, **kwargs)
470 if isinstance(response, dict) and response.get('status') == 'uncaught-error':
471 if comm.deserialize:
--> 472 six.reraise(*clean_exception(**response))
473 else:
474 raise Exception(response['text'])
C:\Miniconda3\envs\py-dev\lib\site-packages\six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\core.py in handle_comm()
344 if type(result) is gen.Future:
345 self._ongoing_coroutines.add(result)
--> 346 result = yield result
347 except (CommClosedError, CancelledError) as e:
348 if self.status == 'running':
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\scheduler.py in gather()
2315
2316 data, missing_keys, missing_workers = yield gather_from_workers(
-> 2317 who_has, rpc=self.rpc, close=False, serializers=serializers)
2318 if not missing_keys:
2319 result = {'status': 'OK', 'data': data}
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\utils_comm.py in gather_from_workers()
65 for worker, c in coroutines.items():
66 try:
---> 67 r = yield c
68 except EnvironmentError:
69 missing_workers.add(worker)
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1139 if exc_info is not None:
1140 try:
-> 1141 yielded = self.gen.throw(*exc_info)
1142 finally:
1143 # Break up a reference to itself
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\worker.py in get_data_from_worker()
2669 deserializers=deserializers,
2670 op='get_data', keys=keys, who=who,
-> 2671 max_connections=max_connections)
2672 try:
2673 status = response['status']
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1131
1132 try:
-> 1133 value = future.result()
1134 except Exception:
1135 self.had_exception = True
C:\Miniconda3\envs\py-dev\lib\site-packages\tornado\gen.py in run()
1145 exc_info = None
1146 else:
-> 1147 yielded = self.gen.send(value)
1148
1149 if stack_context._state.contexts is not orig_stack_contexts:
C:\Miniconda3\envs\py-dev\lib\site-packages\distributed\core.py in send_recv()
472 six.reraise(*clean_exception(**response))
473 else:
--> 474 raise Exception(response['text'])
475 raise gen.Return(response)
476
Exception: Object <pyarrow.lib.Buffer object at 0x000002A6C3FF03E8> is neither a bytes object nor has an encode method
I'll try updating and reproducing but won't get to that for a while, so any confirmation appreciated…
print(pa.__version__)
print(dask.__version__)
print(distributed.__version__)
0.12.0
1.1.1
1.25.3
Issue Analytics
- State:
- Created 4 years ago
- Comments:5 (5 by maintainers)
Top Results From Across the Web
How to serialize apache arrow c++ table, trans through socket ...
Flight uses grpc to send arrow data back and forth and it will remove some of the tedium of using sockets. Here is...
Read more >Serialization and IPC — Apache Arrow v10.0.1
The serialization functionality is deprecated in pyarrow 2.0, and will be removed in a future version. Use the standard library pickle or the...
Read more >Apache Arrow: A New Gold Standard for Dataset Transport
This talk will discuss the role that Apache Arrow and Arrow Flight ... common ones is the time that the CPU spends doing...
Read more >What is Apache Arrow, How it Works & More - Streamlit Blog
Streamlit walks through what Apache Arrow is, how it works, what it means for ... crafted module that serialized Pandas DataFrames (read "fancy...
Read more >Getting started with Apache Arrow - Notes from a data witch
That's a problem when a large data set needs to be passed around between multiple platforms. Loading a CSV into R incurs...
Read more >Top Related Medium Post
No results found
Top Related StackOverflow Question
No results found
Troubleshoot Live Code
Lightrun enables developers to add logs, metrics and snapshots to live code - no restarts or redeploys required.
Start FreeTop Related Reddit Thread
No results found
Top Related Hackernoon Post
No results found
Top Related Tweet
No results found
Top Related Dev.to Post
No results found
Top Related Hashnode Post
No results found
Top GitHub Comments
You might want to try updating Dask. That version is decently old.
On Mon, Aug 19, 2019 at 9:32 PM Dave Hirschfeld notifications@github.com wrote:
Thanks @TomAugspurger! I was just going to take a look myself, only to find you've already fixed it! 😄