Using pytorch-lightning to train PixelCL on multi-gpu
Hello everyone,
I'm trying to use pytorch-lightning to train PixelCL on 2 GPUs with the ddp2 accelerator.
I followed this example:
```python
import pytorch_lightning as pl
from torch.optim.lr_scheduler import OneCycleLR
from pixel_level_contrastive_learning import PixelCL
from lars import LARS  # LARS and get_lr are my own helpers (swav_src/lars.py)


class SelfSupervisedLearner(pl.LightningModule):
    def __init__(self, net, n_epochs, steps_per_epoch, **kwargs):
        super().__init__()
        self.learner = PixelCL(net, **kwargs)
        self.n_epochs = n_epochs
        self.steps_per_epoch = steps_per_epoch

    def forward(self, images):
        return self.learner(images)

    def training_step(self, batch, _):
        images, _ = batch
        loss, _ = self.forward(images)
        self.log('loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return {'loss': loss}

    def configure_optimizers(self):
        opt = LARS(self.parameters(), lr=1e-3, weight_decay=1e-3)  # torch.optim.Adam(self.parameters(), lr=1e-3)
        scheduler = OneCycleLR(opt, max_lr=get_lr(opt), epochs=self.n_epochs, steps_per_epoch=self.steps_per_epoch)
        return [opt], [scheduler]

    def on_before_zero_grad(self, _):
        self.learner.update_moving_average()
```
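As a side note, unrelated to the error below: OneCycleLR is meant to be stepped every batch, but with the plain `[opt], [scheduler]` return Lightning steps schedulers once per epoch by default. If that matters, the scheduler can be returned with an explicit per-step interval, e.g. (a hypothetical variant of the method above):

```python
def configure_optimizers(self):
    opt = LARS(self.parameters(), lr=1e-3, weight_decay=1e-3)
    scheduler = OneCycleLR(opt, max_lr=get_lr(opt),
                           epochs=self.n_epochs, steps_per_epoch=self.steps_per_epoch)
    # 'interval': 'step' tells Lightning to call scheduler.step() after every batch
    return [opt], [{'scheduler': scheduler, 'interval': 'step'}]
```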
```python
learner = SelfSupervisedLearner(
    resnet,
    n_epochs = 200,
    steps_per_epoch = len(train_loader),
    image_size = 244,
    hidden_layer = 'layer4',        # leads to output of 8x8 feature map
    projection_size = 256,          # size of projection output, 256 was used in the paper
    projection_hidden_size = 2048,  # size of projection hidden dimension, paper used 2048
    moving_average_decay = 0.99,    # exponential moving average decay of target encoder
    ppm_num_layers = 1,             # number of layers for transform function in the pixel propagation module, 1 was optimal
    ppm_gamma = 2,                  # sharpness of the similarity in the pixel propagation module, already at optimal value of 2
    distance_thres = 0.7,           # ideal value is 0.7, as indicated in the paper, which makes the assumption of each feature map's pixel diagonal distance to be …
    similarity_temperature = 0.3,   # temperature for the cosine similarity for the pixel contrastive loss
    alpha = 1.                      # weight of the pixel propagation loss (pixpro) vs pixel CL loss
).cuda()

trainer = pl.Trainer(
    gpus = 2,
    max_epochs = 200,
    accumulate_grad_batches = 1,
    sync_batchnorm = False,
    callbacks = [ckpt_callback],
    accelerator = 'ddp2'
)

trainer.fit(learner, train_loader)
```
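`ckpt_callback` is not shown above; in my script it is just a standard checkpoint callback, roughly along these lines (the exact arguments here are an assumption):

```python
from pytorch_lightning.callbacks import ModelCheckpoint

# Hypothetical checkpoint callback; argument values are placeholders.
ckpt_callback = ModelCheckpoint(
    dirpath='checkpoints/',  # where checkpoints are written
    monitor='loss',          # the quantity logged in training_step
    save_top_k=3,
    mode='min',
)
```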
When I try to run this, I get the following error:
```
0: Traceback (most recent call last):
0: File "../swav_src/light_seg_byol.py", line 166, in <module>
0: trainer.fit(learner, train_loader)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
0: results = self.accelerator_backend.train()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 64, in train
0: return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 202, in ddp_train
0: results = self.train_or_test()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
0: results = self.trainer.train()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
0: self.train_loop.run_training_epoch()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 549, in run_training_epoch
0: batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 704, in run_training_batch
0: self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 490, in optimizer_step
0: using_lbfgs=is_lbfgs,
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1296, in optimizer_step
0: optimizer.step(closure=optimizer_closure)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 286, in step
0: self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 144, in __optimizer_step
0: optimizer.step(closure=closure, *args, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 67, in wrapper
0: return wrapped(*args, **kwargs)
0: File "/gpfs/users/bensaad/namr/swav_src/lars.py", line 83, in step
0: loss = closure()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 699, in train_step_and_backward_closure
0: self.trainer.hiddens
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 792, in training_step_and_backward
0: result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 316, in training_step
0: training_step_output = self.trainer.accelerator_backend.training_step(args)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 67, in training_step
0: return self._step(args)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 81, in _step
0: output = self.trainer.model(*args)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 188, in forward
0: outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 161, in parallel_apply
0: return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 321, in parallel_apply
0: raise output
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 274, in _worker
0: output = module.training_step(*input, **kwargs)
0: File "../swav_src/light_seg_byol.py", line 66, in training_step
0: loss, _ = self.forward(images)
0: File "../swav_src/light_seg_byol.py", line 62, in forward
0: return self.learner(images)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pixel_level_contrastive_learning/pixel_level_contrastive_learning.py", line 294, in forward
0: proj_one = self.online_encoder(image_one_cutout)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pixel_level_contrastive_learning/pixel_level_contrastive_learning.py", line 201, in forward
0: projection = projector(representation)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pixel_level_contrastive_learning/pixel_level_contrastive_learning.py", line 107, in forward
0: return self.net(x)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/container.py", line 100, in forward
0: input = module(input)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 353, in forward
0: return self._conv_forward(input, self.weight)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 350, in _conv_forward
0: self.padding, self.dilation, self.groups)
0: RuntimeError: Expected tensor for argument #1 'input' to have the same device as tensor for argument #2 'weight'; but device 1 does not equal 0 (while checking arguments for cudnn_convolution)
0: Traceback (most recent call last):
0: File "../swav_src/light_seg_byol.py", line 166, in <module>
0: trainer.fit(learner, train_loader)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 510, in fit
0: results = self.accelerator_backend.train()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 64, in train
0: return self.ddp_train(process_idx=self.task_idx, mp_queue=None, model=model)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 202, in ddp_train
0: results = self.train_or_test()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/accelerator.py", line 74, in train_or_test
0: results = self.trainer.train()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/trainer.py", line 561, in train
0: self.train_loop.run_training_epoch()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 549, in run_training_epoch
0: batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 704, in run_training_batch
0: self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 490, in optimizer_step
0: using_lbfgs=is_lbfgs,
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/core/lightning.py", line 1296, in optimizer_step
0: optimizer.step(closure=optimizer_closure)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 286, in step
0: self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/core/optimizer.py", line 144, in __optimizer_step
0: optimizer.step(closure=closure, *args, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/optim/lr_scheduler.py", line 67, in wrapper
0: return wrapped(*args, **kwargs)
0: File "/gpfs/users/bensaad/namr/swav_src/lars.py", line 83, in step
0: loss = closure()
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 699, in train_step_and_backward_closure
0: self.trainer.hiddens
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 792, in training_step_and_backward
0: result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/trainer/training_loop.py", line 316, in training_step
0: training_step_output = self.trainer.accelerator_backend.training_step(args)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 67, in training_step
0: return self._step(args)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/accelerators/ddp2_accelerator.py", line 81, in _step
0: output = self.trainer.model(*args)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 188, in forward
0: outputs = self.parallel_apply(self._module_copies[:len(inputs)], inputs, kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 161, in parallel_apply
0: return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 321, in parallel_apply
0: raise output
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pytorch_lightning/overrides/data_parallel.py", line 274, in _worker
0: output = module.training_step(*input, **kwargs)
0: File "../swav_src/light_seg_byol.py", line 66, in training_step
0: loss, _ = self.forward(images)
0: File "../swav_src/light_seg_byol.py", line 62, in forward
0: return self.learner(images)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pixel_level_contrastive_learning/pixel_level_contrastive_learning.py", line 294, in forward
0: proj_one = self.online_encoder(image_one_cutout)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/torch/nn/modules/module.py", line 550, in __call__
0: result = self.forward(*input, **kwargs)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pixel_level_contrastive_learning/pixel_level_contrastive_learning.py", line 199, in forward
0: representation = self.get_representation(x)
0: File "/gpfs/users/bensaad/.local/lib/python3.7/site-packages/pixel_level_contrastive_learning/pixel_level_contrastive_learning.py", line 195, in get_representation
0: assert hidden is not None, f'hidden layer {self.layer} never emitted an output'
0: AssertionError: hidden layer layer4 never emitted an output
```
According to this issue, something needs to be done to register the forward hook, but I cannot understand what it is. Could someone help me, please?
Thanks
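For context, my understanding of the hook mechanism that the assertion refers to (a minimal, self-contained sketch, not the package's actual code): the learner wraps the backbone, registers a forward hook on the chosen hidden layer ('layer4' here), and stashes that layer's output so it can be projected afterwards.

```python
import torch
import torchvision

class HiddenLayerWrapper(torch.nn.Module):
    """Illustrative stand-in for the hook-based wrapper used for representation extraction."""
    def __init__(self, net, layer_name='layer4'):
        super().__init__()
        self.net = net
        self.hidden = None
        # The hook is registered once, on this particular module instance,
        # and its closure captures `self` of this wrapper.
        dict(net.named_modules())[layer_name].register_forward_hook(self._capture)

    def _capture(self, module, inputs, output):
        self.hidden = output  # stash the feature map of the hooked layer

    def forward(self, x):
        self.hidden = None
        self.net(x)  # running the backbone triggers the hook
        assert self.hidden is not None, 'hidden layer never emitted an output'
        return self.hidden

wrapper = HiddenLayerWrapper(torchvision.models.resnet50(), 'layer4')
feats = wrapper(torch.randn(2, 3, 224, 224))  # -> (2, 2048, 7, 7) feature map
```

If I read the traceback correctly, ddp2 sends the forward pass through Lightning's DataParallel-style `parallel_apply`, which replicates the module onto each GPU for every batch. The hook's closure still points at the original wrapper, so the captured hidden output ends up attached to the wrong copy (or not at all): on one replica this surfaces as the device-0/device-1 mismatch in the convolution, on the other as the "hidden layer layer4 never emitted an output" assertion. That seems to be why the hook has to be (re)registered when this kind of replication is used.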
Top GitHub Comments
Hello,
I think it is working on my side, but I just need to finish a few training epochs to be sure that it works 100%.
I will push my code example shortly afterwards.
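For reference, a minimal sketch of the kind of setup that sidesteps the per-node replication (this is an assumption about the direction of the fix, not necessarily the code that gets pushed): plain DDP launches one process per GPU, so the module is never copied inside a process and the forward hook keeps pointing at the copy that actually runs.

```python
# Hedged sketch: same learner, but with plain DDP (one process per GPU) instead of ddp2.
trainer = pl.Trainer(
    gpus = 2,
    max_epochs = 200,
    accumulate_grad_batches = 1,
    sync_batchnorm = True,        # sync BN statistics across the two processes
    callbacks = [ckpt_callback],
    accelerator = 'ddp'           # no intra-process module replication, unlike 'ddp2'
)
trainer.fit(learner, train_loader)
```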
> ----- Original message -----
> From: "Phil Wang" notifications@github.com
> To: "lucidrains/pixel-level-contrastive-learning" pixel-level-contrastive-learning@noreply.github.com
> Cc: "Ahmed Ben Saad" ahmed.bensaad@telecom-paristech.fr, "Mention" mention@noreply.github.com
> Sent: Friday, 5 February 2021, 07:26:54
> Subject: Re: [lucidrains/pixel-level-contrastive-learning] Using pytorch-lightning to train PixelCL on multi-gpu (#11)
> https://github.com/lucidrains/pixel-level-contrastive-learning/issues/11#issuecomment-773822649
>
> did you ever get this to work?
Of course I will. With another (very) minor change to the package.