Too many open files: LocalCluster not cleaning up
I have been sporadically getting "too many open files" errors, and even hangs, when calling LocalCluster with distributed (pip installed from master). This seems to be a common theme in other bug reports, but as I couldn't find anything that quite fit my case, I tried to create the simplest possible test case to reproduce the bug.
It turns out the bug is really easy to reproduce, and it seems to boil down to the LocalCluster object not cleaning up after itself.
In [1]: from distributed import LocalCluster
In [2]: for i in range(100):
...: print("trial ", i)
...: cluster = LocalCluster()
...: cluster.close()
...:
trial 0
...
trial 59
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-2-f166cff3bce1> in <module>()
1 for i in range(100):
2 print("trial ", i)
----> 3 cluster = LocalCluster()
4 cluster.close()
5
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in __init__(self, n_workers, threads_per_worker, processes, loop, start, ip, scheduler_port, silence_logs, diagnostics_port, services, worker_services, service_kwargs, asynchronous, **worker_kwargs)
128 self.worker_kwargs = worker_kwargs
129
--> 130 self.start(ip=ip, n_workers=n_workers)
131
132 clusters_to_close.add(self)
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in start(self, **kwargs)
146 self._started = self._start(**kwargs)
147 else:
--> 148 sync(self.loop, self._start, **kwargs)
149
150 @gen.coroutine
~/miniconda3/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
251 e.wait(10)
252 if error[0]:
--> 253 six.reraise(*error[0])
254 else:
255 return result[0]
~/miniconda3/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/miniconda3/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 error[0] = sys.exc_info()
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in _start(self, ip, n_workers)
164 self.scheduler.start(scheduler_address)
165
--> 166 yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)]
167
168 self.status = 'running'
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in callback(f)
847 for f in children:
848 try:
--> 849 result_list.append(f.result())
850 except Exception as e:
851 if future.done():
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in _start_worker(self, death_timeout, **kwargs)
181 death_timeout=death_timeout,
182 silence_logs=self.silence_logs, **kwargs)
--> 183 yield w._start()
184
185 self.workers.append(w)
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/nanny.py in _start(self, addr_or_port)
154
155 logger.info(' Start Nanny at: %r', self.address)
--> 156 response = yield self.instantiate()
157 if response == 'running':
158 assert self.worker_address
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/nanny.py in instantiate(self, comm)
222 result = yield gen.with_timeout(
223 timedelta(seconds=self.death_timeout),
--> 224 self.process.start()
225 )
226 except gen.TimeoutError:
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in wrapper(*args, **kwargs)
313 try:
314 orig_stack_contexts = stack_context._state.contexts
--> 315 yielded = next(result)
316 if stack_context._state.contexts is not orig_stack_contexts:
317 yielded = _create_future()
~/miniconda3/lib/python3.6/site-packages/distributed/nanny.py in start(self)
357 init_result_q=self.init_result_q,
358 child_stop_q=self.child_stop_q,
--> 359 uid=uid),
360 )
361 self.process.daemon = True
~/miniconda3/lib/python3.6/site-packages/distributed/process.py in __init__(self, loop, target, name, args, kwargs)
71 # (for example due to SIGKILL). This variable is otherwise unused except
72 # for the assignment here.
---> 73 parent_alive_pipe, self._keep_child_alive = mp_context.Pipe(duplex=False)
74
75 self._process = mp_context.Process(target=self._run, name=name,
~/miniconda3/lib/python3.6/multiprocessing/context.py in Pipe(self, duplex)
60 '''Returns two connection object connected by a pipe'''
61 from .connection import Pipe
---> 62 return Pipe(duplex)
63
64 def Lock(self):
~/miniconda3/lib/python3.6/multiprocessing/connection.py in Pipe(duplex)
510 c2 = Connection(s2.detach())
511 else:
--> 512 fd1, fd2 = os.pipe()
513 c1 = Connection(fd1, writable=False)
514 c2 = Connection(fd2, readable=False)
OSError: [Errno 24] Too many open files
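To verify that descriptors are actually leaking, rather than the limit simply being too low, the parent process's open-fd count can be tracked across iterations. Below is a minimal sketch, assuming psutil is installed (psutil is not part of the original report):

import psutil
from distributed import LocalCluster

proc = psutil.Process()  # the parent (IPython) process

for i in range(10):
    cluster = LocalCluster()
    cluster.close()
    # num_fds() is POSIX-only; if close() released everything,
    # this count would stay roughly flat instead of growing
    print("trial", i, "open fds:", proc.num_fds())

If the count climbs by a fixed amount per trial, the descriptors are being leaked by cluster setup and teardown rather than by anything else in the loop.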
At this point I began to distrust close(), so I tried del instead:
In [1]: from distributed import LocalCluster
In [2]: for i in range(100):
...: print("trial ", i)
...: cluster = LocalCluster()
...: del cluster
...:
trial 0
...
trial 58
Exception in callback BaseAsyncIOLoop._handle_events(197, 1)
handle: <Handle BaseAsyncIOLoop._handle_events(197, 1)>
Traceback (most recent call last):
File "/Users/ggorman/miniconda3/lib/python3.6/asyncio/events.py", line 145, in _run
File "/Users/ggorman/miniconda3/lib/python3.6/site-packages/tornado/platform/asyncio.py", line 117, in _handle_events
File "/Users/ggorman/miniconda3/lib/python3.6/site-packages/tornado/stack_context.py", line 276, in null_wrapper
File "/Users/ggorman/miniconda3/lib/python3.6/site-packages/tornado/netutil.py", line 239, in accept_handler
File "/Users/ggorman/miniconda3/lib/python3.6/socket.py", line 205, in accept
OSError: [Errno 24] Too many open files
trial 59
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
<ipython-input-2-282e9a27564c> in <module>()
1 for i in range(100):
2 print("trial ", i)
----> 3 cluster = LocalCluster()
4 del cluster
5
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in __init__(self, n_workers, threads_per_worker, processes, loop, start, ip, scheduler_port, silence_logs, diagnostics_port, services, worker_services, service_kwargs, asynchronous, **worker_kwargs)
128 self.worker_kwargs = worker_kwargs
129
--> 130 self.start(ip=ip, n_workers=n_workers)
131
132 clusters_to_close.add(self)
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in start(self, **kwargs)
146 self._started = self._start(**kwargs)
147 else:
--> 148 sync(self.loop, self._start, **kwargs)
149
150 @gen.coroutine
~/miniconda3/lib/python3.6/site-packages/distributed/utils.py in sync(loop, func, *args, **kwargs)
251 e.wait(10)
252 if error[0]:
--> 253 six.reraise(*error[0])
254 else:
255 return result[0]
~/miniconda3/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/miniconda3/lib/python3.6/site-packages/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 error[0] = sys.exc_info()
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in _start(self, ip, n_workers)
164 self.scheduler.start(scheduler_address)
165
--> 166 yield [self._start_worker(**self.worker_kwargs) for i in range(n_workers)]
167
168 self.status = 'running'
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in callback(f)
847 for f in children:
848 try:
--> 849 result_list.append(f.result())
850 except Exception as e:
851 if future.done():
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/deploy/local.py in _start_worker(self, death_timeout, **kwargs)
181 death_timeout=death_timeout,
182 silence_logs=self.silence_logs, **kwargs)
--> 183 yield w._start()
184
185 self.workers.append(w)
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/nanny.py in _start(self, addr_or_port)
154
155 logger.info(' Start Nanny at: %r', self.address)
--> 156 response = yield self.instantiate()
157 if response == 'running':
158 assert self.worker_address
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1105 if exc_info is not None:
1106 try:
-> 1107 yielded = self.gen.throw(*exc_info)
1108 finally:
1109 # Break up a reference to itself
~/miniconda3/lib/python3.6/site-packages/distributed/nanny.py in instantiate(self, comm)
222 result = yield gen.with_timeout(
223 timedelta(seconds=self.death_timeout),
--> 224 self.process.start()
225 )
226 except gen.TimeoutError:
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in run(self)
1097
1098 try:
-> 1099 value = future.result()
1100 except Exception:
1101 self.had_exception = True
~/miniconda3/lib/python3.6/site-packages/tornado/gen.py in wrapper(*args, **kwargs)
313 try:
314 orig_stack_contexts = stack_context._state.contexts
--> 315 yielded = next(result)
316 if stack_context._state.contexts is not orig_stack_contexts:
317 yielded = _create_future()
~/miniconda3/lib/python3.6/site-packages/distributed/nanny.py in start(self)
346
347 self.init_result_q = init_q = mp_context.Queue()
--> 348 self.child_stop_q = mp_context.Queue()
349 uid = uuid.uuid4().hex
350
~/miniconda3/lib/python3.6/multiprocessing/context.py in Queue(self, maxsize)
100 '''Returns a queue object'''
101 from .queues import Queue
--> 102 return Queue(maxsize, ctx=self.get_context())
103
104 def JoinableQueue(self, maxsize=0):
~/miniconda3/lib/python3.6/multiprocessing/queues.py in __init__(self, maxsize, ctx)
45 self._wlock = None
46 else:
---> 47 self._wlock = ctx.Lock()
48 self._sem = ctx.BoundedSemaphore(maxsize)
49 # For use by concurrent.futures
~/miniconda3/lib/python3.6/multiprocessing/context.py in Lock(self)
65 '''Returns a non-recursive lock object'''
66 from .synchronize import Lock
---> 67 return Lock(ctx=self.get_context())
68
69 def RLock(self):
~/miniconda3/lib/python3.6/multiprocessing/synchronize.py in __init__(self, ctx)
161
162 def __init__(self, *, ctx):
--> 163 SemLock.__init__(self, SEMAPHORE, 1, 1, ctx=ctx)
164
165 def __repr__(self):
~/miniconda3/lib/python3.6/multiprocessing/synchronize.py in __init__(self, kind, value, maxvalue, ctx)
58 sl = self._semlock = _multiprocessing.SemLock(
59 kind, value, maxvalue, self._make_name(),
---> 60 unlink_now)
61 except FileExistsError:
62 pass
OSError: [Errno 24] Too many open files
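Until the leak is fixed, one stopgap is to raise the per-process descriptor limit from within Python before starting any clusters. This is a workaround, not a fix; the sketch below uses only the standard-library resource module, and the target of 4096 is an arbitrary choice:

import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
print("current limits: soft =", soft, "hard =", hard)

# Raising only the soft limit needs no extra privileges as long as it
# stays at or below the hard limit. On macOS the hard limit can be
# reported as RLIM_INFINITY even though the kernel caps open files,
# so avoid passing RLIM_INFINITY as the new soft value.
target = 4096 if hard == resource.RLIM_INFINITY else min(4096, hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))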
Top GitHub Comments
Seems like the orphaned fds arise from tornado/util.py (using tornado 5.0.2).
I ran the code in the original post and things seemed fine. Closing.
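For anyone who lands here later: LocalCluster can also be used as a context manager, which guarantees close() runs even if the body raises, instead of relying on del and garbage collection. A minimal sketch, assuming a distributed version in which the leak above is fixed:

from distributed import LocalCluster

for i in range(100):
    print("trial", i)
    # __exit__ calls close(), so no descriptors should outlive
    # each iteration once the underlying leak is fixed
    with LocalCluster() as cluster:
        pass  # normally: attach a Client and do work here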