PT 1.6.0 cannot resume a training run when callbacks monitor metrics
🐛 Bug
As mentioned in the title: with PyTorch Lightning 1.6.0, resuming a run whose callbacks (EarlyStopping, ModelCheckpoint) monitor a logged metric fails.
To Reproduce
The code (boring.py) is provided below:
from typing import List

import torch
from pytorch_lightning import LightningDataModule, LightningModule
from pytorch_lightning.utilities.cli import LightningArgumentParser, LightningCLI
from torch.utils.data import DataLoader, Dataset
from jsonargparse import lazy_instance


class Arch(torch.nn.Linear):
    def __init__(self, input_size: int = 10, output_size: int = 2) -> None:
        super().__init__(input_size, output_size)


class RandomDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class MyDataModule(LightningDataModule):
    def __init__(self, input_size: int = 10, train_transforms=None, val_transforms=None, test_transforms=None, dims=None):
        super().__init__(train_transforms=train_transforms, val_transforms=val_transforms, test_transforms=test_transforms, dims=dims)
        self.input_size = input_size

    def train_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.input_size, 64), batch_size=2)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.input_size, 64), batch_size=2)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(RandomDataset(self.input_size, 64), batch_size=2)


class BoringModel(LightningModule):
    def __init__(self, arch: Arch = lazy_instance(Arch), channels: List[int] = [0, 1]):
        super().__init__()
        self.arch = arch

    def forward(self, x):
        print(x.shape)
        return self.arch(x)

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("valid_loss", loss)

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()
        self.log("test_loss", loss)

    def configure_optimizers(self):
        return torch.optim.SGD(self.arch.parameters(), lr=0.1)


class MyCLI(LightningCLI):
    def add_arguments_to_parser(self, parser: LightningArgumentParser) -> None:
        from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

        parser.add_lightning_class_args(EarlyStopping, "early_stopping")
        parser.set_defaults({
            "early_stopping.monitor": "valid_loss",
            "early_stopping.min_delta": 0.01,
            "early_stopping.patience": 10,
            "early_stopping.mode": "min",
        })
        # ModelCheckpoint
        parser.add_lightning_class_args(ModelCheckpoint, "model_checkpoint")
        model_checkpoint_defaults = {
            "model_checkpoint.filename": "epoch{epoch}_valid_loss{valid_loss:.4f}",
            "model_checkpoint.monitor": "valid_loss",
            "model_checkpoint.mode": "min",
            "model_checkpoint.every_n_epochs": 1,
            "model_checkpoint.save_top_k": 5,
            "model_checkpoint.auto_insert_metric_name": False,
            "model_checkpoint.save_last": True,
        }
        parser.set_defaults(model_checkpoint_defaults)
        return super().add_arguments_to_parser(parser)


if __name__ == '__main__':
    cli = MyCLI(BoringModel, MyDataModule, seed_everything_default=None, save_config_overwrite=True, parser_kwargs={"parser_mode": "omegaconf"})
- Fit, then kill the training program once checkpoints have been written:
python boring.py fit
- Resume (with something like the following command):
python boring.py fit --config lightning_logs/version_7/config.yaml --ckpt_path=lightning_logs/version_7/checkpoints/last.ckpt
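For context, the same resume path can be exercised without LightningCLI by passing ckpt_path to Trainer.fit directly. A minimal sketch, assuming the classes from boring.py above are importable; the checkpoint path is an example:

import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

model = BoringModel()
dm = MyDataModule()
callbacks = [
    EarlyStopping(monitor="valid_loss", min_delta=0.01, patience=10, mode="min"),
    ModelCheckpoint(monitor="valid_loss", mode="min", save_top_k=5, save_last=True),
]
trainer = pl.Trainer(callbacks=callbacks)
# Resume from the last checkpoint; this fails the same way when the callbacks
# monitor `valid_loss` and `trainer.callback_metrics` is empty after restore.
trainer.fit(model, datamodule=dm, ckpt_path="lightning_logs/version_7/checkpoints/last.ckpt")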
Expected behavior
The training resumes successfully.
Environment
- PyTorch Lightning Version (e.g., 1.5.0): 1.6.0
- PyTorch Version (e.g., 1.10): 1.10
- Python version (e.g., 3.9): 3.9
- OS (e.g., Linux):
- CUDA/cuDNN version:
- GPU models and configuration:
- How you installed PyTorch (conda, pip, source):
- If compiling from source, the output of torch.__config__.show():
- Any other relevant information:
Additional context
The error:
python boring.py fit --config lightning_logs/version_7/config.yaml --ckpt_path=lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/utilities.py:91: PossibleUserWarning: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
rank_zero_warn(
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1823: PossibleUserWarning: GPU available but not used. Set `accelerator` and `devices` using `Trainer(accelerator='gpu', devices=8)`.
rank_zero_warn(
Restoring states from the checkpoint path at lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:342: UserWarning: The dirpath has changed from '/data/home/quancs/projects/NBSS_pmt/lightning_logs/version_7/checkpoints' to '/data/home/quancs/projects/NBSS_pmt/lightning_logs/version_8/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.
warnings.warn(
| Name | Type | Params
------------------------------
0 | arch | Arch | 22
------------------------------
22 Trainable params
0 Non-trainable params
22 Total params
0.000 Total estimated model params size (MB)
Restored all states from the checkpoint file at lightning_logs/version_7/checkpoints/last.ckpt
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 128 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1938: PossibleUserWarning: The number of training samples (32) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
rank_zero_warn(
/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:240: PossibleUserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 128 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
rank_zero_warn(
Epoch 19:  50%|██████████████████████████                           | 32/64 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/data/home/x/projects/NBSS_pmt/boring.py", line 100, in <module>
cli = MyCLI(BoringModel, MyDataModule, seed_everything_default=None, save_config_overwrite=True, parser_kwargs={"parser_mode": "omegaconf"})
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/cli.py", line 564, in __init__
self._run_subcommand(self.subcommand)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/utilities/cli.py", line 835, in _run_subcommand
fn(**fn_kwargs)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 771, in fit
self._call_and_handle_interrupt(
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 724, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 812, in _fit_impl
results = self._run(model, ckpt_path=self.ckpt_path)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1237, in _run
results = self._run_stage()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1324, in _run_stage
return self._run_train()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1354, in _run_train
self.fit_loop.run()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 205, in run
self.on_advance_end()
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 297, in on_advance_end
self.trainer._call_callback_hooks("on_train_epoch_end")
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1637, in _call_callback_hooks
fn(self, self.lightning_module, *args, **kwargs)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 179, in on_train_epoch_end
self._run_early_stopping_check(trainer)
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 190, in _run_early_stopping_check
if trainer.fast_dev_run or not self._validate_condition_metric( # disable early_stopping with fast_dev_run
File "/data/home/x/miniconda3/lib/python3.9/site-packages/pytorch_lightning/callbacks/early_stopping.py", line 145, in _validate_condition_metric
raise RuntimeError(error_msg)
RuntimeError: Early stopping conditioned on metric `valid_loss` which is not available. Pass in or modify your `EarlyStopping` callback to use any of the following: ``
Epoch 19:  50%|█████     | 32/64 [00:00<?, ?it/s]
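Why it fails: at on_train_epoch_end, EarlyStopping looks the monitored key up in trainer.callback_metrics, and that dict is empty right after the states are restored, which is why even the list of alternatives in the error message is blank. A simplified paraphrase of the check (illustrative only, not the exact 1.6 source):

# Simplified paraphrase of the check in
# pytorch_lightning/callbacks/early_stopping.py (PL 1.6); illustrative only.
def validate_condition_metric(trainer, monitor: str) -> None:
    logs = trainer.callback_metrics  # empty dict right after states are restored
    if logs.get(monitor) is None:
        raise RuntimeError(
            f"Early stopping conditioned on metric `{monitor}` which is not available. "
            "Pass in or modify your `EarlyStopping` callback to use any of the "
            f"following: `{', '.join(logs.keys())}`"  # empty -> the trailing `` above
        )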
Comments
Even for 1.6.1, this bug is still unsolved. UPDATE: Tried to print out related variables, and found `trainer.callback_metrics` in this line is ...
Prioritizing it. Will be fixed soon.
Same here! It is not only EarlyStopping that breaks; any callback or LR scheduler that monitors a metric is affected.
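The LR-scheduler case can be set up with a scheduler that monitors a logged metric; a hypothetical variant of configure_optimizers (not part of the original repro) would look like:

# Hypothetical variant of BoringModel.configure_optimizers: ReduceLROnPlateau
# monitors `valid_loss`, so resuming hits the same empty-callback_metrics problem.
def configure_optimizers(self):
    opt = torch.optim.SGD(self.arch.parameters(), lr=0.1)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="min")
    return {"optimizer": opt, "lr_scheduler": {"scheduler": sched, "monitor": "valid_loss"}}

Until the fix lands, one possible mitigation (an assumption based on EarlyStopping's public check_on_train_epoch_end argument, not something confirmed in this thread) is to defer the check to the end of validation, so the monitored metric has been logged by the time the check runs:

from pytorch_lightning.callbacks import EarlyStopping

# Assumption: deferring the check avoids the crash at on_train_epoch_end.
early_stopping = EarlyStopping(
    monitor="valid_loss",
    min_delta=0.01,
    patience=10,
    mode="min",
    check_on_train_epoch_end=False,  # run the check in on_validation_end instead
)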