Sharded Multi-GPU MT5 training with the Seq2SeqTrainer fails (4.21.0)
System Info
- transformers version: 4.21.0
- Platform: Linux
- Python version: 3.7.6
- Huggingface_hub version: 0.8.1
- PyTorch version (GPU?): 1.10.2 (Yes)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: Yes (2+ Tesla V100)
- Using distributed or parallel set-up in script?: Yes
When trying to fine-tune an MT5ForConditionalGeneration model using a Seq2SeqTrainer on multiple GPUs, I get an internal assert error. I am running the script with torchrun --nproc_per_node=$NUM_GPUS script.py. The issue only appears when $NUM_GPUS is greater than 1, and only when sharded_ddp=["zero_dp_3"] is passed in the training arguments.
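For quick reference, here is the relevant configuration condensed from the full script in the Reproduction section below (a minimal sketch; only the arguments needed to show the trigger are kept):

# Launched with: torchrun --nproc_per_node=$NUM_GPUS script.py
# The crash only occurs when more than one process is launched AND zero_dp_3 sharding is enabled.
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    "script_debug",
    per_device_train_batch_size=4,
    sharded_ddp=["zero_dp_3"],  # dropping this argument (or using a single GPU) avoids the error
)

The crash then produces the following traceback: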
Traceback (most recent call last):
File "script.py", line 475, in <module>
fire.Fire(main)
File "/miniconda/lib/python3.7/site-packages/fire/core.py", line 141, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/miniconda/lib/python3.7/site-packages/fire/core.py", line 471, in _Fire
target=component.__name__)
File "/miniconda/lib/python3.7/site-packages/fire/core.py", line 681, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "script.py", line 447, in main
train_model(model, tokenizer, cli_arguments)
File "script.py", line 357, in train_model
trainer.train()
File "/miniconda/lib/python3.7/site-packages/transformers/trainer.py", line 1502, in train
ignore_keys_for_eval=ignore_keys_for_eval,
File "/miniconda/lib/python3.7/site-packages/transformers/trainer.py", line 1740, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/miniconda/lib/python3.7/site-packages/transformers/trainer.py", line 2488, in training_step
loss.backward()
File "/miniconda/lib/python3.7/site-packages/torch/_tensor.py", line 307, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/miniconda/lib/python3.7/site-packages/torch/autograd/__init__.py", line 156, in backward
allow_unreachable=True, accumulate_grad=True) # allow_unreachable flag
RuntimeError: grad.numel() == bucket_view.numel()INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1640811797118/work/torch/csrc/distributed/c10d/reducer.cpp":328, please report a bug to PyTorch.
0%| | 0/100000 [00:06<?, ?it/s]
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 660 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 662 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 663 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 661) of binary: /miniconda/bin/python
Traceback (most recent call last):
File "/miniconda/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.10.2', 'console_scripts', 'torchrun')())
File "/miniconda/lib/python3.7/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 345, in wrapper
return f(*args, **kwargs)
File "/miniconda/lib/python3.7/site-packages/torch/distributed/run.py", line 719, in main
run(args)
File "/miniconda/lib/python3.7/site-packages/torch/distributed/run.py", line 713, in run
)(*cmd_args)
File "/miniconda/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/miniconda/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
script.py FAILED
------------------------------------------------------------
The script fails on transformers[deepspeed]==4.21.0 but runs without issues on transformers[deepspeed]==4.20.1. The versions of DeepSpeed and Fairscale are deepspeed==0.6.5 or deepspeed==0.6.7 and fairscale==0.4.6, and the code was run on a Linux machine.
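As a quick sanity check of the environment before launching, the installed versions can be printed (a minimal sketch; the comments simply restate the versions reported above):

import fairscale
import torch
import transformers

# Versions reported in this issue: the assert fires on transformers 4.21.0
# but not on 4.20.1, with torch 1.10.2 and fairscale 0.4.6.
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("fairscale:", fairscale.__version__)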
Who can help?
Information
- The official example scripts
- My own modified scripts
Tasks
- An officially supported task in the examples folder (such as GLUE/SQuAD, …)
- My own task or dataset (give details below)
Reproduction
# The simplified contents of script.py
# Running torchrun --nproc_per_node=1 script.py should work
# Running torchrun --nproc_per_node=4 script.py should fail with a RuntimeError: grad.numel() == bucket_view.numel()INTERNAL ASSERT FAILED error.
from __future__ import annotations
import functools
import typing as tp
import datasets
import transformers
from transformers import (
    DataCollatorForSeq2Seq,
    PreTrainedTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

increment_en = [
    {"input": "One", "target": "Two"},
    {"input": "Three", "target": "Four"},
    {"input": "Five", "target": "Six"},
    {"input": "Seven", "target": "Eight"},
    {"input": "Nine", "target": "Ten"},
]
increment_en = increment_en * 100
def lod_to_dol(list_of_dicts: tp.List[tp.Dict[str, tp.Any]]) -> tp.Dict[str, list]:
    dict_of_lists = {
        key: [dct[key] for dct in list_of_dicts] for key in list_of_dicts[0]
    }
    return dict_of_lists
increment_en = lod_to_dol(increment_en)
def preprocess_function_(
    examples,
    tokenizer: PreTrainedTokenizer,
    max_input_length: int,
    max_target_length: int,
):
    inputs = examples["input"]
    targets = examples["target"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
def main():
    tokenizer = transformers.MT5Tokenizer.from_pretrained("google/mt5-base")
    model = transformers.MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
    args = Seq2SeqTrainingArguments(
        "script_debug",
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        fp16=False,
        push_to_hub=False,
        sharded_ddp=["zero_dp_3"],  # removing this argument (or using a single GPU) avoids the crash
        max_steps=10000,
        logging_steps=5000,
        save_steps=5000,
    )
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
    dataset = datasets.DatasetDict(
        {
            "train": datasets.Dataset.from_dict(increment_en),
            "test": datasets.Dataset.from_dict(increment_en),
        }
    )
    preprocess_function = functools.partial(
        preprocess_function_,
        tokenizer=tokenizer,
        max_input_length=512,
        max_target_length=512,
    )
    processed_ds = dataset.map(preprocess_function, batched=True)
    processed_ds.set_format(
        type="torch", columns=["input_ids", "attention_mask", "labels"]
    )
    trainer = Seq2SeqTrainer(
        model,
        args,
        train_dataset=processed_ds["train"],
        eval_dataset=processed_ds["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()

if __name__ == "__main__":
    main()
Expected behavior
The training code should not crash.
Top GitHub Comments
Yes, I can confirm that it works!
I guess I don’t need to file a FairScale issue after all!
Probably, I suppose.