VITS voice conversion fails [Bug]
Describe the bug
The following error occurs when I use VITS for voice conversion:
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
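This is the generic PyTorch constraint that torch.nn.Embedding only accepts integer (Long/Int) index tensors; a minimal standalone sketch (not taken from TTS) reproduces the same message:

import torch

# Minimal illustration: embedding lookups need integer indices.
emb = torch.nn.Embedding(num_embeddings=2, embedding_dim=4)
print(emb(torch.tensor([0, 1])))      # works: int64 indices
print(emb(torch.tensor([0.0, 1.0])))  # raises "Expected tensor ... 'indices' ... got torch.FloatTensor"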
To Reproduce
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, CharactersConfig, VitsArgs
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.speakers import SpeakerManager

output_path = os.path.dirname(os.path.abspath(__file__))

dataset_config = BaseDatasetConfig(
    name="baker_old_2", path="/datasets/temp-bznsyp", language="zh-cn"
)

audio_config = BaseAudioConfig(
    sample_rate=48000,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    preemphasis=0.0,
    ref_level_db=20,
    log_func="np.log",
    do_trim_silence=True,
    trim_db=45,
    mel_fmin=0,
    mel_fmax=None,
    spec_gain=1.0,
    signal_norm=False,
    do_amp_to_db_linear=False,
)

vitsArgs = VitsArgs(
    use_speaker_embedding=True,
    use_sdp=False,
    use_speaker_encoder_as_loss=True,
    speaker_encoder_config_path="/TTS/models/tts_models--multilingual--multi-dataset--your_tts/config_se.json",
    speaker_encoder_model_path="/TTS/models/tts_models--multilingual--multi-dataset--your_tts/model_se.pth",
)

config = VitsConfig(
    model_args=vitsArgs,
    audio=audio_config,
    run_name="vits_baker_temp",
    batch_size=48,
    eval_batch_size=24,
    batch_group_size=5,
    num_loader_workers=0,
    num_eval_loader_workers=8,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="chinese_mandarin_cleaners",
    use_phonemes=True,
    phoneme_language="zh-cn",
    phonemizer="zh_cn_phonemizer",
    add_blank=False,
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=False,
    print_step=25,
    print_eval=True,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    characters=CharactersConfig(
        characters_class=None,
        vocab_dict=None,
        pad="_",
        eos="~",
        bos="^",
        blank=None,
        characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),.:;? ",
        punctuations="\uff0c\u3002\uff1f\uff01\uff5e\uff1a\uff1b*\u2014\u2014-\uff08\uff09\u3010\u3011!'(),-.:;? “”",
        phonemes="12345giy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u025a\u02de\u026b",
        is_unique=False,
        is_sorted=True
    ),
    test_sentences=[
        ["你在做什么?", "baker", None, "zh-cn"],
        ["篮球场上没有人", "baker", None, "zh-cn"],
    ],
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves to the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# config is updated with the default characters if not defined in the config.
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

speaker_manager = SpeakerManager()
speaker_manager.use_cuda = True
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.model_args.num_speakers = speaker_manager.num_speakers

# init model
model = Vits(config, ap, tokenizer, speaker_manager=speaker_manager)

# init the trainer and run training
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)
trainer.fit()
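Before launching training, a quick sanity check on the speaker table may help (illustrative only; it assumes the BZNSYP/"baker" corpus resolves to a single speaker):

# Illustrative check, assuming the dataset maps every sample to the single "baker" speaker.
print("num_speakers:", speaker_manager.num_speakers)
assert config.model_args.num_speakers == speaker_manager.num_speakers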
Voice conversion command:
tts --model_path ./vits_baker_temp-June-20-2022_02+48PM-0000000/best_model.pth --config_path ./vits_baker_temp-June-20-2022_02+48PM-0000000/config.json --speaker_idx "baker" --out_path output.wav --reference_wav 006637.wav
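A possible workaround to try, stated as an assumption rather than a confirmed fix: pass the reference speaker explicitly so the synthesizer can resolve it to an integer speaker ID instead of computing a float speaker-encoder embedding from the wav. This assumes the installed TTS build supports the --reference_speaker_idx flag:

tts --model_path ./vits_baker_temp-June-20-2022_02+48PM-0000000/best_model.pth --config_path ./vits_baker_temp-June-20-2022_02+48PM-0000000/config.json --speaker_idx "baker" --reference_speaker_idx "baker" --out_path output.wav --reference_wav 006637.wav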
Expected behavior
Voice conversion should succeed.
Logs
/opt/conda/lib/python3.8/site-packages/torch/functional.py:695: UserWarning: stft will soon require the return_complex parameter be given for real inputs, and will further require that return_complex=True in a future PyTorch release. (Triggered internally at ../aten/src/ATen/native/SpectralOps.cpp:798.)
return _VF.stft(input, n_fft, hop_length, win_length, window, # type: ignore[attr-defined]
Traceback (most recent call last):
File "/opt/conda/bin/tts", line 33, in <module>
sys.exit(load_entry_point('TTS', 'console_scripts', 'tts')())
File "/TTS/TTS/bin/synthesize.py", line 309, in main
wav = synthesizer.tts(
File "/TTS/TTS/utils/synthesizer.py", line 339, in tts
outputs = transfer_voice(
File "/TTS/TTS/tts/utils/synthesis.py", line 304, in transfer_voice
model_outputs = _func(reference_wav, speaker_id, d_vector, reference_speaker_id, reference_d_vector)
File "/opt/conda/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/TTS/TTS/tts/models/vits.py", line 1140, in inference_voice_conversion
wav, _, _ = self.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)
File "/TTS/TTS/tts/models/vits.py", line 1157, in voice_conversion
g_src = self.emb_g(speaker_cond_src).unsqueeze(-1)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/nn/modules/sparse.py", line 158, in forward
return F.embedding(
File "/opt/conda/lib/python3.8/site-packages/torch/nn/functional.py", line 2183, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
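Reading the traceback: emb_g is the learned speaker-embedding table (the model was trained with use_speaker_embedding=True), so speaker_cond_src must hold integer speaker IDs. A plausible cause, offered as an assumption rather than a confirmed diagnosis, is that with no reference speaker ID supplied the synthesizer derives a float speaker-encoder embedding from the reference wav and that tensor reaches emb_g. A hedged sketch of what the call at vits.py:1157 expects, based only on the call sites shown above:

import torch

# Hedged sketch: with a learned speaker-embedding table, both conditioning
# tensors should be Long speaker IDs, not float d-vectors.
speaker_cond_src = torch.tensor([0], dtype=torch.long)  # hypothetical reference-speaker ID
speaker_cond_tgt = torch.tensor([0], dtype=torch.long)  # hypothetical target ("baker") ID
# wav, _, _ = model.voice_conversion(y, y_lengths, speaker_cond_src, speaker_cond_tgt)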
Environment
{
"CUDA": {
"GPU": [
"NVIDIA GeForce RTX 3090",
"NVIDIA GeForce RTX 3090",
"NVIDIA GeForce RTX 3090",
"NVIDIA GeForce RTX 3090"
],
"available": true,
"version": "11.3"
},
"Packages": {
"PyTorch_debug": false,
"PyTorch_version": "1.11.0+cu113",
"TTS": "0.6.2",
"numpy": "1.21.6"
},
"System": {
"OS": "Linux",
"architecture": [
"64bit",
""
],
"processor": "x86_64",
"python": "3.8.12",
"version": "#91-Ubuntu SMP Thu Jul 15 19:09:17 UTC 2021"
}
}
Additional context
No response
Top GitHub Comments
Let me try the multi-speaker dataset and see what happens.
All four commands can be executed successfully! 👍