
memory leak when using distributed.Client with delayed

See original GitHub issue

I have used dask.delayed to wire together some classes, and when using dask.threaded.get everything works properly. When the same code is run using distributed.Client, the memory used by the process keeps growing.
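
A distilled sketch of the two code paths (my own illustration, not the full reproducer): the same delayed object computed once with the local threaded scheduler and once through a distributed Client backed by a local worker process.

from dask import delayed, compute
from dask.threaded import get as threaded_get
from distributed import Client

d = delayed(sum)([1, 2, 3])

# local threaded scheduler (the path that behaves well in this report)
print(compute(d, scheduler=threaded_get)[0])

# distributed client with one local worker process (the path that leaks here)
client = Client(processes=True, n_workers=1, threads_per_worker=1)
print(client.compute(d).result())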

The full dummy code to reproduce the issue is below.

import gc
import os

import psutil
from dask import delayed

# generate random strings: https://stackoverflow.com/a/16310739
class Data():
    def __init__(self):
        self.tbl = bytes.maketrans(bytearray(range(256)),
                              bytearray([ord(b'a') + b % 26 for b in range(256)]))

    @staticmethod
    def split_len(seq, length):
        return [seq[i:i + length] for i in range(0, len(seq), length)]

    def get_data(self):
        l = self.split_len(os.urandom(1000000).translate(self.tbl), 1000)
        return l


class Calc():
    def __init__(self, l):
        self.l = l

    def nth_nth_item(self, n):
        return self.l[n][n]


class Combiner():
    def __init__(self):
        self.delayed_data = delayed(Data())

    def get_calc(self):
        d_l = self.delayed_data.get_data(pure=True)
        return delayed(Calc, pure=True)(d_l)

    def mem_usage_mb(self):
        process = psutil.Process(os.getpid())
        return "%.2f" % (process.memory_info().rss * 1e-6)

    def results(self):
        return {
            '0': self.get_calc().nth_nth_item(0),
            '1': self.get_calc().nth_nth_item(1),
            '2': self.get_calc().nth_nth_item(2),
            'mem_usage_mb': self.mem_usage_mb()
        }

    def delayed_results(self):
        return delayed(self.results())


def main_threaded_get():
    from dask.threaded import get as threaded_get
    from dask import compute

    for i in range(300):
        delayed_obj = Combiner().delayed_results()
        res = compute(delayed_obj, scheduler=threaded_get)[0]  # run on the local threaded scheduler
        #print(res)
        print("#%d, mem: %s mb" % (i, res['mem_usage_mb']))
        gc.collect()


def main_distributed_client():
    from distributed import Client
    client = Client(processes=True, n_workers=1, threads_per_worker=1)

    for i in range(1000):
        delayed_obj = Combiner().delayed_results()
        future = client.compute(delayed_obj)
        res = future.result()
        print("#%d, mem: %s mb" % (i, res['mem_usage_mb']))

        collect_res = client.run(lambda: gc.collect()) # doesn't help
        # print(collect_res)

if __name__ == "__main__":
    main_threaded_get()
    main_distributed_client()

Results:

main_threaded_get():

#100, mem: 33.64 mb
#200, mem: 33.64 mb
#299, mem: 33.64 mb

main_distributed_client():

#100, mem: 94.02 mb
#200, mem: 96.02 mb
#300, mem: 97.95 mb
#400, mem: 100.11 mb
#500, mem: 102.29 mb
#600, mem: 104.48 mb
#700, mem: 106.72 mb
#800, mem: 108.20 mb
#900, mem: 110.02 mb
#999, mem: 112.22 mb

Starting at around i=30, the run also produces "distributed.utils_perf - WARNING - full garbage collections took 60% CPU time recently (threshold: 10%)" messages.

Versions:

Python 3.6.5
>>> dask.__version__
'0.18.0'
>>> distributed.__version__
'1.22.0'
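
As a cross-check, the worker process's RSS can also be queried from the client with client.run, reusing the same psutil call as the reproducer. A minimal sketch, assuming a local cluster like the one above; worker_rss_mb is a helper added here for illustration:

import os
import psutil
from distributed import Client

def worker_rss_mb():
    # runs inside the worker process; returns its resident set size in MB
    return psutil.Process(os.getpid()).memory_info().rss * 1e-6

client = Client(processes=True, n_workers=1, threads_per_worker=1)
print(client.run(worker_rss_mb))  # dict keyed by worker address, e.g. {'tcp://127.0.0.1:...': 95.3}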

Issue Analytics

  • State: open
  • Created: 5 years ago
  • Reactions: 1
  • Comments: 26 (7 by maintainers)

Top GitHub Comments

4 reactions
abast commented, Nov 18, 2018

@Axel-CH I’ve also noticed a mismatch between the memory usage reported by dask distributed and by the OS. What helped me resolve problems with frozen and killed workers was to change the configuration described here to the following:

     # Fractions of worker memory at which we take action to avoid memory blowup
     # Set any of the lower three values to False to turn off the behavior entirely
     memory:
       target: 0.95  # target fraction to stay below
       spill: False  # fraction at which we spill to disk
       pause: False  # fraction at which we pause worker threads
       terminate: False  # fraction at which we terminate the worker
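
The same thresholds can also be set programmatically before the cluster starts. A minimal sketch with dask.config.set, assuming a recent dask/distributed where these settings live under distributed.worker.memory:

import dask
from distributed import Client

dask.config.set({
    "distributed.worker.memory.target": 0.95,      # target fraction to stay below
    "distributed.worker.memory.spill": False,      # never spill to disk
    "distributed.worker.memory.pause": False,      # never pause worker threads
    "distributed.worker.memory.terminate": False,  # never terminate the worker
})

client = Client(processes=True, n_workers=1, threads_per_worker=1)
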
3 reactions
iljau commented, Jun 28, 2018
Code for: mleak.py
import gc
import os
import tracemalloc

import psutil
from dask import delayed
from dask.distributed import get_worker

# generate random strings: https://stackoverflow.com/a/16310739
class Data():
    def __init__(self):
        self.tbl = bytes.maketrans(bytearray(range(256)),
                              bytearray([ord(b'a') + b % 26 for b in range(256)]))

    @staticmethod
    def split_len(seq, length):
        return [seq[i:i + length] for i in range(0, len(seq), length)]

    def get_data(self):
        l = self.split_len(os.urandom(1000000).translate(self.tbl), 1000)
        return l


class Calc():
    def __init__(self, l):
        self.l = l

    def nth_nth_item(self, n):
        return self.l[n][n]


class Combiner():
    def __init__(self):
        self.delayed_data = delayed(Data())

    def get_calc(self):
        d_l = self.delayed_data.get_data(pure=True)
        return delayed(Calc, pure=True)(d_l)

    def mem_usage_mb(self):
        process = psutil.Process(os.getpid())
        return "%.2f" % (process.memory_info().rss * 1e-6)

    def results(self):
        return {
            '0': self.get_calc().nth_nth_item(0),
            '1': self.get_calc().nth_nth_item(1),
            '2': self.get_calc().nth_nth_item(2),
            'mem_usage_mb': self.mem_usage_mb()
        }

    def delayed_results(self):
        return delayed(self.results())

##
##


def snapshot():
    worker = get_worker()
    worker.snapshot1 = tracemalloc.take_snapshot()


def top_stats(do_print_trace=False):
    worker = get_worker()
    snapshot2 = tracemalloc.take_snapshot()

    stats = snapshot2.compare_to(worker.snapshot1, 'traceback')

    for stat in stats[:5]:
        print(stat)
        if do_print_trace:
            for line in stat.traceback.format():
                print(line)

##
##


def main_threaded_get():
    from dask.threaded import get as threaded_get
    from dask import compute

    for i in range(300):
        delayed_obj = Combiner().delayed_results()
        res = compute(delayed_obj, scheduler=threaded_get)[0]  # run on the local threaded scheduler
        print("#%d, mem: %s mb" % (i, res['mem_usage_mb']))
        gc.collect()


def main_distributed_client():
    from distributed import Client
    client = Client(processes=True, n_workers=1, threads_per_worker=1)

    # up to 7 stacktrace lines
    client.run(lambda: tracemalloc.start(7))

    for i in range(1000):
        client.run(lambda: snapshot())

        delayed_obj = Combiner().delayed_results()
        future = client.compute(delayed_obj)
        res = future.result()
        print("#%d, mem: %s mb" % (i, res['mem_usage_mb']))

        client.run(lambda: top_stats(do_print_trace=False))
        # client.run(lambda: top_stats(do_print_trace=True))  # print call stack as well

        client.run(lambda: gc.collect()) # doesn't help

        print("---")


if __name__ == "__main__":
    main_distributed_client()

The modified script uses tracemalloc to compare memory usage on the worker before and after computing the delayed function.

If I’m interpreting the tracemalloc results correctly, it looks like memory usage grows when pickle.loads is called.

Run: python -X tracemalloc mleak.py
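
The snapshot/diff pattern used in the script can be exercised on its own. A standalone sketch (stdlib only) of what top_stats() does on the worker:

import tracemalloc

tracemalloc.start(7)                         # keep up to 7 frames per allocation
before = tracemalloc.take_snapshot()

junk = [bytes(1000) for _ in range(1000)]    # stand-in for the delayed computation

after = tracemalloc.take_snapshot()
for stat in after.compare_to(before, 'traceback')[:5]:
    print(stat)                              # "<file>:<line>: size=... (+...), count=... (+...)"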

Top memory increases per invocation:

[..]/python3.6/site-packages/distributed/protocol/pickle.py:59: size=188 KiB (+5044 B), count=2019 (+50), average=95 B
[..]/python3.6/site-packages/distributed/worker.py:2130: size=11.3 KiB (+3176 B), count=16 (+5), average=721 B
[..]/python3.6/site-packages/tornado/gen.py:1046: size=3560 B (+2848 B), count=5 (+4), average=712 B
[..]/python3.6/site-packages/distributed/protocol/core.py:188: size=97.5 KiB (+2482 B), count=1042 (+25), average=96 B
[..]/python3.6/asyncio/events.py:145: size=2880 B (+2304 B), count=5 (+4), average=576 B

Call stack for distributed/protocol/pickle.py (different invocation)

[..]/python3.6/site-packages/distributed/protocol/pickle.py:59: size=41.0 KiB (+2468 B), count=360 (+21), average=117 B
  File "[..]/python3.6/site-packages/distributed/protocol/pickle.py", line 59
    return pickle.loads(x)
  File "[..]/python3.6/site-packages/distributed/worker.py", line 720
    function = pickle.loads(function)
  File "[..]/python3.6/site-packages/distributed/worker.py", line 1275
    self.tasks[key] = _deserialize(function, args, kwargs, task)
  File "[..]/python3.6/site-packages/distributed/core.py", line 375
    handler(**merge(extra, msg))
  File "[..]/python3.6/site-packages/tornado/gen.py", line 1113
    yielded = self.gen.send(value)
  File "[..]/python3.6/site-packages/tornado/gen.py", line 1199
    self.run()
  File "[..]/python3.6/site-packages/tornado/stack_context.py", line 276
    return fn(*args, **kwargs)
Read more comments on GitHub.

