Intermittent KilledWorker Error
See original GitHub issue

Sorry for an uninformative bug report. However, this issue is really affecting me and I feel compelled to report it.
I am creating a big xarray dataset from a lot of custom code and writing it to zarr on an HPC cluster. I am using dask_jobqueue.PBSCluster.

My command is something like this:
xr_dataset.to_zarr(output_dir)
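For context, the surrounding setup looks roughly like the sketch below. The PBS queue name, worker sizes, the stand-in dataset, and the output_dir value are illustrative placeholders, not the configuration actually used in this report.

import numpy as np
import xarray as xr
from dask.distributed import Client
from dask_jobqueue import PBSCluster

# Hypothetical PBS settings -- the real queue, cores, and memory differ.
cluster = PBSCluster(queue='normal', cores=36, memory='100GB')
cluster.scale(10)             # request workers from the PBS queue
client = Client(cluster)      # attach the distributed scheduler

# Stand-in for the real dataset, which is built from custom code and
# backed by dask arrays.
xr_dataset = xr.Dataset(
    {'V': (('time', 'x'), np.random.rand(100, 100))}
).chunk({'time': 10})

output_dir = 'output.zarr'        # placeholder path
xr_dataset.to_zarr(output_dir)    # this is the call that intermittently fails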
Usually everything works fine, but every so often I get this sort of error:
---------------------------------------------------------------------------
KilledWorker Traceback (most recent call last)
<ipython-input-49-b2d316dabe2a> in <module>()
1 for vname in ['V']:
2 output_dir = base_dir + vname
----> 3 ds_subset[[vname]].reset_coords(drop=True).to_zarr(output_dir)
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/xarray/core/dataset.py in to_zarr(self, store, mode, synchronizer, group, encoding, compute)
1185 from ..backends.api import to_zarr
1186 return to_zarr(self, store=store, mode=mode, synchronizer=synchronizer,
-> 1187 group=group, encoding=encoding, compute=compute)
1188
1189 def __unicode__(self):
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/xarray/backends/api.py in to_zarr(dataset, store, mode, synchronizer, group, encoding, compute)
856 # I think zarr stores should always be sync'd immediately
857 # TODO: figure out how to properly handle unlimited_dims
--> 858 dataset.dump_to_store(store, sync=True, encoding=encoding, compute=compute)
859
860 if not compute:
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/xarray/core/dataset.py in dump_to_store(self, store, encoder, sync, encoding, unlimited_dims, compute)
1075 unlimited_dims=unlimited_dims)
1076 if sync:
-> 1077 store.sync(compute=compute)
1078
1079 def to_netcdf(self, path=None, mode='w', format=None, group=None,
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/xarray/backends/zarr.py in sync(self, compute)
344
345 def sync(self, compute=True):
--> 346 self.delayed_store = self.writer.sync(compute=compute)
347
348
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/xarray/backends/common.py in sync(self, compute)
271 delayed_store = da.store(self.sources, self.targets,
272 lock=self.lock, compute=compute,
--> 273 flush=True)
274 self.sources = []
275 self.targets = []
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/dask/array/core.py in store(sources, targets, lock, regions, compute, return_stored, **kwargs)
959
960 if compute:
--> 961 result.compute(**kwargs)
962 return None
963 else:
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
154 dask.base.compute
155 """
--> 156 (result,) = compute(self, traverse=False, **kwargs)
157 return result
158
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
400 keys = [x.__dask_keys__() for x in collections]
401 postcomputes = [x.__dask_postcompute__() for x in collections]
--> 402 results = schedule(dsk, keys, **kwargs)
403 return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
404
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/distributed/client.py in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, direct, retries, priority, fifo_timeout, **kwargs)
2157 try:
2158 results = self.gather(packed, asynchronous=asynchronous,
-> 2159 direct=direct)
2160 finally:
2161 for f in futures.values():
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/distributed/client.py in gather(self, futures, errors, maxsize, direct, asynchronous)
1560 return self.sync(self._gather, futures, errors=errors,
1561 direct=direct, local_worker=local_worker,
-> 1562 asynchronous=asynchronous)
1563
1564 @gen.coroutine
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/distributed/client.py in sync(self, func, *args, **kwargs)
650 return future
651 else:
--> 652 return sync(self.loop, func, *args, **kwargs)
653
654 def __repr__(self):
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
273 e.wait(10)
274 if error[0]:
--> 275 six.reraise(*error[0])
276 else:
277 return result[0]
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/distributed/utils.py in f()
258 yield gen.moment
259 thread_state.asynchronous = True
--> 260 result[0] = yield make_coro()
261 except Exception as exc:
262 error[0] = sys.exc_info()
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1437 six.reraise(type(exception),
1438 exception,
-> 1439 traceback)
1440 if errors == 'skip':
1441 bad_keys.add(key)
/nobackup/rpaberna/conda/envs/pangeo/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
KilledWorker: ("('from-value-load_level_from_3D_field-concatenate-4708995cd4ff01559e3e73caa5120d44', 8930, 0, 0, 0)", 'tcp://10.150.11.16:45421')
The computation fails at that point.
How can I interpret this error message in order to better debug my problem?
Top GitHub Comments
Yes, and from your description it does. However, it no longer trusts some of the tasks that you've computed because they were commonly found on workers that were killed unexpectedly. If you want to make your system robust to failing workers you might want to play with the distributed.scheduler.allowed-failures configuration value, but I strongly recommend instead determining why your workers are failing in the first place. I recommend looking at your memory configuration and ensuring that it matches the specs of your machine. HPC job schedulers without local storage are particularly picky about memory constraints.
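For reference, a minimal sketch of raising that setting through dask's configuration system; the value 10 is arbitrary, and this assumes a dask version recent enough to expose dask.config.

import dask

# Allow a task to be present on a dying worker more times before the
# scheduler marks it as suspect (the default is 3).  Set this before the
# scheduler is created.
dask.config.set({'distributed.scheduler.allowed-failures': 10})

# The same setting can also live in ~/.config/dask/distributed.yaml:
# distributed:
#   scheduler:
#     allowed-failures: 10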
@jhamman client.scheduler_info() should give you the information you want. Look for memory_limit.
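A minimal sketch of that check, assuming client is the connected distributed Client:

# Compare the per-worker memory limit the scheduler sees against the
# memory actually requested in the PBS job script.
info = client.scheduler_info()
for address, worker in info['workers'].items():
    print(address, worker['memory_limit'])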