Error building extension 'fused_adam' with DeepSpeed==0.3.13
Hi,
I upgraded DeepSpeed to 0.3.13 and Torch to 1.8.0, and while using DeepSpeed with HF (HuggingFace) I'm getting the error below:
RuntimeError: Error building extension 'fused_adam'
Here is the stack trace:
[2021-03-23 07:03:49,374] [INFO] [logging.py:60:log_dist] [Rank 0] DeepSpeed info: version=0.3.13, git-hash=unknown, git-branch=unknown
[2021-03-23 07:03:49,407] [INFO] [engine.py:77:_initialize_parameter_parallel_groups] data_parallel_size: 1, parameter_parallel_size: 1
Using /home/jovyan/.cache/torch_extensions as PyTorch extensions root...
Creating extension directory /home/jovyan/.cache/torch_extensions/fused_adam...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/jovyan/.cache/torch_extensions/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
---------------------------------------------------------------------------
CalledProcessError Traceback (most recent call last)
~/.local/lib/python3.6/site-packages/torch/utils/cpp_extension.py in _run_ninja_build(build_directory, verbose, error_prefix)
1672 check=True,
-> 1673 env=env)
1674 except subprocess.CalledProcessError as e:
/usr/lib/python3.6/subprocess.py in run(input, timeout, check, *popenargs, **kwargs)
437 raise CalledProcessError(retcode, process.args,
--> 438 output=stdout, stderr=stderr)
439 return CompletedProcess(process.args, retcode, stdout, stderr)
CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.
The above exception was the direct cause of the following exception:
RuntimeError Traceback (most recent call last)
<ipython-input-24-3435b262f1ae> in <module>
----> 1 trainer.train()
~/.local/lib/python3.6/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, **kwargs)
901 delay_optimizer_creation = self.sharded_ddp is not None and self.sharded_ddp != ShardedDDPOption.SIMPLE
902 if self.args.deepspeed:
--> 903 model, optimizer, lr_scheduler = init_deepspeed(self, num_training_steps=max_steps)
904 self.model = model.module
905 self.model_wrapped = model # will get further wrapped in DDP
~/.local/lib/python3.6/site-packages/transformers/integrations.py in init_deepspeed(trainer, num_training_steps)
416 model=model,
417 model_parameters=model_parameters,
--> 418 config_params=config,
419 )
420
~/.local/lib/python3.6/site-packages/deepspeed/__init__.py in initialize(args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config_params)
123 dist_init_required=dist_init_required,
124 collate_fn=collate_fn,
--> 125 config_params=config_params)
126 else:
127 assert mpu is None, "mpu must be None with pipeline parallelism"
~/.local/lib/python3.6/site-packages/deepspeed/runtime/engine.py in __init__(self, args, model, optimizer, model_parameters, training_data, lr_scheduler, mpu, dist_init_required, collate_fn, config_params, dont_change_device)
181 self.lr_scheduler = None
182 if model_parameters or optimizer:
--> 183 self._configure_optimizer(optimizer, model_parameters)
184 self._configure_lr_scheduler(lr_scheduler)
185 self._report_progress(0)
~/.local/lib/python3.6/site-packages/deepspeed/runtime/engine.py in _configure_optimizer(self, client_optimizer, model_parameters)
596 logger.info('Using client Optimizer as basic optimizer')
597 else:
--> 598 basic_optimizer = self._configure_basic_optimizer(model_parameters)
599 if self.global_rank == 0:
600 logger.info(
~/.local/lib/python3.6/site-packages/deepspeed/runtime/engine.py in _configure_basic_optimizer(self, model_parameters)
670 optimizer = FusedAdam(model_parameters,
671 **optimizer_parameters,
--> 672 adam_w_mode=effective_adam_w_mode)
673
674 elif self.optimizer_name() == LAMB_OPTIMIZER:
~/.local/lib/python3.6/site-packages/deepspeed/ops/adam/fused_adam.py in __init__(self, params, lr, bias_correction, betas, eps, adam_w_mode, weight_decay, amsgrad, set_grad_none)
70 self.set_grad_none = set_grad_none
71
---> 72 fused_adam_cuda = FusedAdamBuilder().load()
73 # Skip buffer
74 self._dummy_overflow_buf = torch.cuda.IntTensor([0])
~/.local/lib/python3.6/site-packages/deepspeed/ops/op_builder/builder.py in load(self, verbose)
213 return importlib.import_module(self.absolute_name())
214 else:
--> 215 return self.jit_load(verbose)
216
217 def jit_load(self, verbose=True):
~/.local/lib/python3.6/site-packages/deepspeed/ops/op_builder/builder.py in jit_load(self, verbose)
250 extra_cuda_cflags=self.nvcc_args(),
251 extra_ldflags=self.extra_ldflags(),
--> 252 verbose=verbose)
253 build_duration = time.time() - start_build
254 if verbose:
~/.local/lib/python3.6/site-packages/torch/utils/cpp_extension.py in load(name, sources, extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths, build_directory, verbose, with_cuda, is_python_module, is_standalone, keep_intermediates)
1089 is_python_module,
1090 is_standalone,
-> 1091 keep_intermediates=keep_intermediates)
1092
1093
~/.local/lib/python3.6/site-packages/torch/utils/cpp_extension.py in _jit_compile(name, sources, extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths, build_directory, verbose, with_cuda, is_python_module, is_standalone, keep_intermediates)
1300 verbose=verbose,
1301 with_cuda=with_cuda,
-> 1302 is_standalone=is_standalone)
1303 finally:
1304 baton.release()
~/.local/lib/python3.6/site-packages/torch/utils/cpp_extension.py in _write_ninja_file_and_build_library(name, sources, extra_cflags, extra_cuda_cflags, extra_ldflags, extra_include_paths, build_directory, verbose, with_cuda, is_standalone)
1405 build_directory,
1406 verbose,
-> 1407 error_prefix=f"Error building extension '{name}'")
1408
1409
~/.local/lib/python3.6/site-packages/torch/utils/cpp_extension.py in _run_ninja_build(build_directory, verbose, error_prefix)
1681 if hasattr(error, 'output') and error.output: # type: ignore
1682 message += f": {error.output.decode()}" # type: ignore
-> 1683 raise RuntimeError(message) from e
1684
1685
RuntimeError: Error building extension 'fused_adam'
The versions I'm using are:
Collecting environment information...
PyTorch version: 1.8.0+cu101
Is debug build: False
CUDA used to build PyTorch: 10.1
ROCM used to build PyTorch: N/A
OS: Ubuntu 18.04.5 LTS (x86_64)
GCC version: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Clang version: Could not collect
CMake version: Could not collect
Python version: 3.6 (64-bit runtime)
Is CUDA available: True
CUDA runtime version: 10.1.243
GPU models and configuration:
GPU 0: Tesla V100-SXM2-32GB
GPU 1: Tesla V100-SXM2-32GB
Nvidia driver version: 450.51.06
cuDNN version: /usr/lib/x86_64-linux-gnu/libcudnn.so.7.6.4
HIP runtime version: N/A
MIOpen runtime version: N/A
Versions of relevant libraries:
[pip3] kubeflow-pytorchjob==0.1.3
[pip3] numpy==1.18.5
[pip3] torch==1.8.0+cu101
[pip3] torchvision==0.8.1
[conda] Could not collect
transformers==4.4.2
DeepSpeed==0.3.13
gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
But I was able to run DeepSpeed 0.3.10 with HuggingFace 4.3.2 and Torch 1.7.1+cu101 without any issue.
Please suggest how to proceed.
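One way to surface the actual compiler error hidden behind this RuntimeError is to re-run the failed build by hand in the extension directory shown in the log (a minimal sketch; the cache path is taken from the log above):

# Re-run the JIT build verbosely to see the underlying nvcc/gcc error
cd /home/jovyan/.cache/torch_extensions/fused_adam
ninja -v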
Top GitHub Comments
Thanks a lot @stas00!
Finally it worked. Since Colab has python-3.7, I replicated what you suggested on my AWS EC2 instance, where I have multiple CUDA versions installed, including 10.1 and 11.1.
Reiterating the steps I followed so that they can help someone with similar issues (sketches of the commands follow the list):

1. Created a conda environment with python-3.6.9 (because the target machine where I want to run DeepSpeed has 3.6.9).
2. Changed PATH and LD_LIBRARY_PATH to point to CUDA-10.1 (again, because of my target machine), as suggested in HF's installation notes here.
3. Verified the torch versions with python -m torch.utils.collect_env.
4. Checked whether compatible ops were installed with ds_report.
5. Built a DeepSpeed wheel (which ends up in dist/) and installed it on the target machine using pip install deepspeed-0.3.13+7fcc891-cp36-cp36m-linux_x86_64.whl.
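For step 2, the exports look roughly like this (a minimal sketch; the toolkit prefix /usr/local/cuda-10.1 is an assumption and depends on where CUDA 10.1 is installed on your machine):

# Make CUDA 10.1 the toolkit found by the extension build (adjust the prefix to your install)
export PATH=/usr/local/cuda-10.1/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-10.1/lib64:$LD_LIBRARY_PATH

For step 5, the wheel can be prebuilt from the DeepSpeed source tree roughly like this (a sketch assuming the DS_BUILD_OPS=1 pre-build flow from DeepSpeed's installation docs; the -j8 job count is just an example):

# Pre-compile all C++/CUDA ops and package them into a wheel under dist/
git clone https://github.com/microsoft/DeepSpeed
cd DeepSpeed
DS_BUILD_OPS=1 python setup.py build_ext -j8 bdist_wheel
# then copy the wheel from dist/ to the target machine and pip install it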
That's the correct way: major=7, minor=0 => 7.0.
You can also find the full list of all archs at https://developer.nvidia.com/cuda-gpus
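One way to get those major/minor numbers for your own GPU is torch.cuda.get_device_capability, which returns the (major, minor) pair (a minimal sketch):

# Prints e.g. "7.0" on a V100; this is the value TORCH_CUDA_ARCH_LIST expects
python -c "import torch; print('.'.join(map(str, torch.cuda.get_device_capability())))"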
Incidentally, I have just added all this information to the docs; hopefully it will be merged in the next few days.