Bug report: Cannot resume training from a model trained with ddp_spawn strategy.
🐛 Bug
Encountered this bug when trying to continue training from a model trained with the ddp_spawn strategy.
To Reproduce
Please note two things: 1) this requires multi-GPU training, and 2) you need a stateful optimizer such as RAdam or AdamW to replicate the bug (plain SGD does not trigger it; a short check of this difference follows the script below).
```python
import os

import torch
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint


class RandomDataset(Dataset):
    def __init__(self, size, num_samples):
        self.len = num_samples
        self.data = torch.randn(num_samples, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


num_samples = 10000


class BoringModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.RAdam(self.layer.parameters(), lr=0.1)


def run():
    train_data = DataLoader(RandomDataset(32, 64), batch_size=2)
    val_data = DataLoader(RandomDataset(32, 64), batch_size=2)

    checkpoint_callback = ModelCheckpoint(
        save_last=True, save_top_k=1, monitor="valid_loss", mode="min", filename="best"
    )

    model = BoringModel()

    # First run: train for one epoch and write last.ckpt.
    trainer1 = Trainer(
        gpus=-1,
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        limit_test_batches=1,
        num_sanity_val_steps=0,
        max_epochs=1,
        enable_model_summary=False,
        strategy="ddp_spawn",
        callbacks=[checkpoint_callback],
    )
    trainer1.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)

    # Second run: resume from the checkpoint written by the first run.
    trainer2 = Trainer(
        gpus=-1,
        default_root_dir=os.getcwd(),
        limit_train_batches=1,
        limit_val_batches=1,
        limit_test_batches=1,
        num_sanity_val_steps=0,
        max_epochs=2,
        enable_model_summary=False,
        strategy="ddp_spawn",
        callbacks=[checkpoint_callback],
    )
    trainer2.fit(
        model,
        train_dataloaders=train_data,
        val_dataloaders=val_data,
        ckpt_path=os.path.join(os.getcwd(), "lightning_logs/version_0/checkpoints/last.ckpt"),
    )


if __name__ == "__main__":
    run()
```
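Side note, in case it helps triage: as far as I can tell, RAdam/AdamW reproduce the bug while plain SGD does not because stateful optimizers keep per-parameter tensors (exp_avg, exp_avg_sq, ...) that are saved in the checkpoint and must be restored onto the correct device, whereas SGD without momentum has no such tensors. A minimal check of that difference, assuming stock PyTorch >= 1.10 (not part of the repro itself):

```python
import torch

# One parameter with a gradient, so a single optimizer step can run.
p = torch.nn.Parameter(torch.randn(2, 2))
p.grad = torch.randn(2, 2)

radam = torch.optim.RAdam([p])
radam.step()
# Per-parameter state tensors end up in the checkpoint,
# e.g. ['exp_avg', 'exp_avg_sq', 'step'].
print(sorted(radam.state[p].keys()))

sgd = torch.optim.SGD([p], lr=0.1)
sgd.step()
# [] -> no state tensors that could be restored onto the wrong device.
print(list(sgd.state[p].keys()))
```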
Expected behavior
The second run resumes from last.ckpt and both runs complete without any error message.
Error message
```
-- Process 3 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
fn(i, *args)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 208, in _wrapped_function
result = function(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/training_type/ddp_spawn.py", line 236, in new_process
results = trainer.run_stage()
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py", line 1289, in run_stage
return self._run_train()
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py", line 1319, in _run_train
self.fit_loop.run()
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 193, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 88, in advance
outputs = self.optimizer_loop.run(split_batch, optimizers, batch_idx)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 219, in advance
self.optimizer_idx,
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 266, in _run_optimization
self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/optimization/optimizer_loop.py", line 386, in _optimizer_step
using_lbfgs=is_lbfgs,
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/lightning.py", line 1652, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/optimizer.py", line 164, in step
trainer.accelerator.optimizer_step(self._optimizer, self._optimizer_idx, closure, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/accelerators/accelerator.py", line 339, in optimizer_step
self.precision_plugin.optimizer_step(model, optimizer, opt_idx, closure, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 163, in optimizer_step
optimizer.step(closure=closure, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/optim/optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/autograd/grad_mode.py", line 28, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/torch/optim/radam.py", line 128, in step
eps=group['eps'])
File "/usr/local/lib/python3.7/dist-packages/torch/optim/_functional.py", line 436, in radam
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:3!
```
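A possible workaround until the root cause is fixed (my own sketch, not verified against the ddp_spawn restore path; the helper name is hypothetical): move every tensor in the restored optimizer state onto the device the local process actually uses.

```python
import torch


def move_optimizer_state(optimizer, device):
    """Move all tensors in an optimizer's state onto `device`.

    Sketch of a workaround: after resuming under ddp_spawn, the restored
    optimizer state (exp_avg, exp_avg_sq, ...) may sit on cuda:0 while the
    local process trains on a different GPU.
    """
    for state in optimizer.state.values():
        for key, value in state.items():
            if torch.is_tensor(value):
                state[key] = value.to(device)
```

It could be called, for example, from `on_train_start` in `BoringModel` via `for opt in self.trainer.optimizers: move_optimizer_state(opt, self.device)`.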
cc @awaelchli @ananthsub @ninginthecloud @rohitgr7 @justusschock @kaushikb11 @akihironitta

I've isolated the issue, replicated it with minimal code, and updated the post according to the template.
Hi @circlecrystal, could you fill out the bug report template? https://github.com/PyTorchLightning/pytorch-lightning/blob/master/.github/ISSUE_TEMPLATE/bug_report.md