Permission Error when creating a trainer
See original GitHub issue
Bug description
pytorch_lightning version: 1.8.0 Platform: WSL python version: 3.8.10
I get a PermissionError: [Errno 13] Permission denied: 'command'
error when creating a Trainer object by doing
trainer = pl.Trainer()
This bug was most likely introduced by recent changes to the `SLURMEnvironment`, specifically the call to `subprocess` here: https://github.com/Lightning-AI/lightning/blob/master/src/lightning_lite/plugins/environments/slurm.py#L164
I checked that when I run, e.g., `subprocess.call(["command", "-v", "srun"])` in an IPython terminal, I get the same error. However, when I pass `shell=True`, i.e. `subprocess.call(["command", "-v", "srun"], shell=True)`, it works. Maybe the fix is as simple as passing `shell=True` here: https://github.com/Lightning-AI/lightning/blob/master/src/lightning_lite/plugins/environments/slurm.py#L164
How to reproduce the bug
trainer = pl.Trainer()
Error messages and logs
PermissionError Traceback (most recent call last)
/tmp/ipykernel_189/60910405.py in <cell line: 5>()
3 tb_logger = TensorBoardLogger(save_dir=LOG_SAVE_DIR)
4 # trainer = pl.Trainer(accelerator="cpu", devices=1, gradient_clip_val=1e-1, plugins=None)
----> 5 trainer = pl.Trainer(gpus=None, gradient_clip_val=1e-1, logger=tb_logger)
6 net = DeepAR.from_dataset(
7 training, learning_rate=3e-2, hidden_size=N_HIDDEN, rnn_layers=2, loss=NormalDistributionLoss()
~/python_venvs/my_env/lib/python3.8/site-packages/pytorch_lightning/utilities/argparse.py in insert_env_defaults(self, *args, **kwargs)
338
339 # all args were already moved to kwargs
--> 340 return fn(self, **kwargs)
341
342 return cast(_T, insert_env_defaults)
~/python_venvs/my_env/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py in __init__(self, logger, enable_checkpointing, callbacks, default_root_dir, gradient_clip_val, gradient_clip_algorithm, num_nodes, num_processes, devices, gpus, auto_select_gpus, tpu_cores, ipus, enable_progress_bar, overfit_batches, track_grad_norm, check_val_every_n_epoch, fast_dev_run, accumulate_grad_batches, max_epochs, min_epochs, max_steps, min_steps, max_time, limit_train_batches, limit_val_batches, limit_test_batches, limit_predict_batches, val_check_interval, log_every_n_steps, accelerator, strategy, sync_batchnorm, precision, enable_model_summary, num_sanity_val_steps, resume_from_checkpoint, profiler, benchmark, deterministic, reload_dataloaders_every_n_epochs, auto_lr_find, replace_sampler_ddp, detect_anomaly, auto_scale_batch_size, plugins, amp_backend, amp_level, move_metrics_to_cpu, multiple_trainloader_mode, inference_mode)
403 self._data_connector = DataConnector(self, multiple_trainloader_mode)
404
--> 405 self._accelerator_connector = AcceleratorConnector(
406 num_processes=num_processes,
407 devices=devices,
~/python_venvs/my_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py in __init__(self, devices, num_nodes, accelerator, strategy, plugins, precision, amp_type, amp_level, sync_batchnorm, benchmark, replace_sampler_ddp, deterministic, auto_select_gpus, num_processes, tpu_cores, ipus, gpus)
214
215 # 3. Instantiate ClusterEnvironment
--> 216 self.cluster_environment: ClusterEnvironment = self._choose_and_init_cluster_environment()
217
218 # 4. Instantiate Strategy - Part 1
~/python_venvs/my_env/lib/python3.8/site-packages/pytorch_lightning/trainer/connectors/accelerator_connector.py in _choose_and_init_cluster_environment(self)
568 LSFEnvironment,
569 ):
--> 570 if env_type.detect():
571 return env_type()
572 return LightningEnvironment()
~/python_venvs/my_env/lib/python3.8/site-packages/lightning_lite/plugins/environments/slurm.py in detect()
97 automatically.
98 """
---> 99 SLURMEnvironment._validate_srun_used()
100 return _is_srun_used()
101
~/python_venvs/my_env/lib/python3.8/site-packages/lightning_lite/plugins/environments/slurm.py in _validate_srun_used()
162 return
163 try:
--> 164 srun_exists = subprocess.call(["command", "-v", "srun"]) == 0
165 except FileNotFoundError:
166 srun_exists = False
/usr/lib/python3.8/subprocess.py in call(timeout, *popenargs, **kwargs)
338 retcode = call(["ls", "-l"])
339 """
--> 340 with Popen(*popenargs, **kwargs) as p:
341 try:
342 return p.wait(timeout=timeout)
/usr/lib/python3.8/subprocess.py in __init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, encoding, errors, text)
856 encoding=encoding, errors=errors)
857
--> 858 self._execute_child(args, executable, preexec_fn, close_fds,
859 pass_fds, cwd, env,
860 startupinfo, creationflags, shell,
/usr/lib/python3.8/subprocess.py in _execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, start_new_session)
1702 if errno_num != 0:
1703 err_msg = os.strerror(errno_num)
-> 1704 raise child_exception_type(errno_num, err_msg, err_filename)
1705 raise child_exception_type(err_msg)
1706
PermissionError: [Errno 13] Permission denied: 'command'
Environment
#- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow):
#- PyTorch Lightning Version (e.g., 1.5.0):
#- Lightning App Version (e.g., 0.5.2):
#- PyTorch Version (e.g., 1.10):
#- Python version (e.g., 3.9):
#- OS (e.g., Linux):
#- CUDA/cuDNN version:
#- GPU models and configuration:
#- How you installed Lightning(`conda`, `pip`, source):
#- Running environment of LightningApp (e.g. local, cloud):
More info
No response
Issue Analytics
- State:
- Created a year ago
- Comments:9 (4 by maintainers)
Thanks for fixing this.
@awaelchli I’m not familiar with the SLURM platform, so I don’t know if this would be an appropriate answer. This happened after I updated pl to v1.8.0, so I looked for an issue about the permission error. The first thing I did was downgrade to the previous version (v1.7.7), and the problem no longer appeared.