gRPC broken pipe when using multiple loggers in PL with a remote Aim server
🐛 Bug
My trainer:

    import pytorch_lightning as pl
    from aim.pytorch_lightning import AimLogger

    # log locally:
    tb_logger = pl.loggers.TensorBoardLogger(
        save_dir=str(EXP_ROOT),
        name='my_model',
        default_hp_metric=False,
    )

    # log remotely on the Aim server
    aim_logger = AimLogger(
        repo='aim://127.0.0.1:53800',
        experiment='my_model',
        train_metric_prefix='train_',
        test_metric_prefix='test_',
        val_metric_prefix='val_',
    )

    trainer = pl.Trainer(
        logger=[tb_logger, aim_logger],
    )
My training and validation step:

    def training_step(self, batch, batch_idx):
        t0, t1, mask_gt = batch
        mask_pred = self.forward(t0, t1)
        loss = self.criterion(mask_pred, mask_gt.long())
        metrics = {
            'train_loss': loss,
        }
        self.log_dict(metrics, logger=False, on_step=False, on_epoch=True, prog_bar=True, rank_zero_only=True)
        self.logger[1].log_metrics({'train_loss1': loss})
        return loss

    def validation_step(self, batch, batch_idx):
        t0, t1, mask_gt = batch
        mask_pred = self.forward(t0, t1)
        loss = self.criterion(mask_pred, mask_gt.long())
        metrics = {
            'val_loss': loss,
        }
        self.log_dict(metrics, logger=False, on_step=False, on_epoch=True, prog_bar=True, rank_zero_only=True)
        self.logger[1].log_metrics({'val_loss1': loss})
        return loss
It then crashes with the following traceback:
0207 14:42:36.177987824 17936 chttp2_transport.cc:1111] Received a GOAWAY with error code ENHANCE_YOUR_CALM and debug data equal to "too_many_pings" | 4/6 [00:01<00:00, 4.07it/s]
E0207 14:42:36.178162323 17936 client_channel.cc:647] chand=0x8fc9e40: Illegal keepalive throttling value 9223372036854775807
Traceback (most recent call last):
File "/.../lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 685, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/.../lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 777, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/.../lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1199, in _run
self._dispatch()
File "/.../lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1279, in _dispatch
self.training_type_plugin.start_training(self)
File "/.../lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/.../lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1289, in run_stage
return self._run_train()
File "/.../lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1319, in _run_train
self.fit_loop.run()
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/.../python3.6/site-packages/pytorch_lightning/loops/base.py", line 146, in run
self.on_advance_end()
File "/.../python3.6/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 242, in on_advance_end
self._run_validation()
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 337, in _run_validation
self.val_loop.run()
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py", line 110, in advance
dl_outputs = self.epoch_loop.run(dataloader, dataloader_idx, dl_max_batches, self.num_dataloaders)
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 122, in advance
output = self._evaluation_step(batch, batch_idx, dataloader_idx)
File "/.../lib/python3.6/site-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py", line 217, in _evaluation_step
output = self.trainer.accelerator.validation_step(step_kwargs)
File "/.../lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 236, in validation_step
return self.training_type_plugin.validation_step(*step_kwargs.values())
File "/.../lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 444, in validation_step
return self.model(*args, **kwargs)
File "/.../lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/.../lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/.../lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/..../lib/python3.6/site-packages/pytorch_lightning/overrides/base.py", line 92, in forward
output = self.module.validation_step(*inputs, **kwargs)
File "/.../architectures/tanet_arch/tanet.py", line 135, in validation_step
self.logger[1].log_metrics(metrics, self.current_epoch)
File "/.../lib/python3.6/site-packages/pytorch_lightning/utilities/distributed.py", line 50, in wrapped_fn
return fn(*args, **kwargs)
File "/.../lib/python3.6/site-packages/aim/sdk/adapters/pytorch_lightning.py", line 99, in log_metrics
self.experiment.track(v, name=name, context=context)
File "/.../lib/python3.6/site-packages/aim/sdk/run.py", line 412, in track
self._track_impl(value, track_time, name, step, epoch, context=context)
File "/.../lib/python3.6/site-packages/aim/sdk/run.py", line 445, in _track_impl
self.meta_run_tree['traces', ctx.idx, name, 'last'] = val
File "/.../lib/python3.6/site-packages/aim/storage/treeviewproxy.py", line 187, in __setitem__
self.tree[self.absolute_path(path)] = value
File "/.../lib/python3.6/site-packages/aim/storage/treeviewproxy.py", line 83, in __setitem__
self._rpc_client.run_instruction(self._handler, '__setitem__', (path, value))
File "/.../lib/python3.6/site-packages/aim/ext/transport/client.py", line 66, in run_instruction
status_msg = next(resp)
File "/.../lib/python3.6/site-packages/grpc/_channel.py", line 426, in __next__
return self._next()
File "/.../lib/python3.6/site-packages/grpc/_channel.py", line 826, in _next
raise self
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.UNAVAILABLE
details = "Broken pipe"
debug_error_string = "{"created":"@1644244955.352788922","description":"Error received from peer ipv4:127.0.0.1:53800","file":"src/core/lib/surface/call.cc","file_line":1063,"grpc_message":"Broken pipe","grpc_status":14}"
>
During handling of the above exception, another exception occurred:
...
Environment
- Aim Version: 3.5.1
- Python version: 3.6.9
- pip version: 21.3.0
- Linux 18.04
Additional context
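The direct self.logger[1].log_metrics(...) calls in the steps above run on every DDP rank, so each worker process opens its own gRPC stream to the remote Aim server. Below is a minimal sketch of keeping that call on rank zero only; it assumes trainer.is_global_zero is a suitable guard here, and it is an illustration rather than the setup that produced the traceback above:

    def validation_step(self, batch, batch_idx):
        t0, t1, mask_gt = batch
        mask_pred = self.forward(t0, t1)
        loss = self.criterion(mask_pred, mask_gt.long())
        self.log_dict({'val_loss': loss}, logger=False, on_step=False,
                      on_epoch=True, prog_bar=True, rank_zero_only=True)
        # Hypothetical guard: only the global-zero process issues the remote
        # Aim call, so the other DDP ranks never touch the gRPC channel.
        if self.trainer.is_global_zero:
            self.logger[1].log_metrics({'val_loss1': loss})
        return loss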
Top GitHub Comments
@devfox-se thank you for still investigating this issue! I’ve tried the environment variables you suggested with fork enabled and can report that it fixes this issue in my case. However, there seems to be a drawback: the training gets stuck when spinning up the DataLoader workers if the parallelization is set to num_workers > 6. Below that I did not encounter any issues. I tried num_workers = 0, 1, 2, 4, 6, 8.

Hi, I will hopefully find some time tonight to give it a try!
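For anyone wanting to reproduce the workaround from the first comment above: the thread does not spell out which environment variables were suggested, so the names below are an assumption (gRPC's fork-support settings), combined with the reported num_workers limit:

    import os

    # Assumed variables (not named explicitly in the thread); gRPC reads them at
    # import time, so they must be set before aim / grpc are first imported.
    os.environ.setdefault('GRPC_ENABLE_FORK_SUPPORT', 'true')
    os.environ.setdefault('GRPC_POLL_STRATEGY', 'poll')

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Placeholder dataset; per the comment above, num_workers > 6 hung while the
    # workers were spinning up, so the worker count is capped at 6 here.
    dataset = TensorDataset(torch.zeros(64, 3, 32, 32), torch.zeros(64, dtype=torch.long))
    train_loader = DataLoader(dataset, batch_size=8, num_workers=6)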