[Bug] Speaker adaptation for VITS is not working properly
Describe the bug
I am attempting to add Indian English speaker adaptation on top of a VITS model trained on VCTK, following the zero-shot YourTTS approach. I created d-vectors using the speaker encoder model from YourTTS. After training for 150k steps, speaker switching is happening. Is there something wrong with my config, or should I keep training for more steps? The sampling rate used in the speaker encoder is 16000, while the VITS model's sampling rate is 22050. I am attaching training logs, sample audio outputs, and the config files for the speaker encoder and VITS: https://soundcloud.com/manmay-nakhashi/sets/tts-outputs
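Since the speaker encoder runs at 16000 Hz while VITS trains at 22050 Hz, the reference audio has to be resampled before d-vectors are extracted. A minimal sketch of that step (librosa-based; compute_embedding is a placeholder, not the exact Coqui API):

import librosa

SE_SAMPLE_RATE = 16000   # rate the speaker encoder was trained at
TTS_SAMPLE_RATE = 22050  # rate the VITS model trains at (audio.sample_rate)

def load_for_encoder(wav_path):
    # librosa resamples on load when an explicit sr is given
    wav, _ = librosa.load(wav_path, sr=SE_SAMPLE_RATE)
    return wav

# d_vector = compute_embedding(load_for_encoder("ref.wav"))  # placeholder call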
config.json
To Reproduce
{ "model": "vits", "run_name": "vits_1", "run_description": "", "epochs": 1000, "batch_size": 24, "eval_batch_size": 8, "mixed_precision": false, "scheduler_after_epoch": true, "run_eval": true, "test_delay_epochs": -1, "print_eval": false, "dashboard_logger": "tensorboard", "print_step": 25, "plot_step": 100, "model_param_stats": false, "project_name": null, "log_model_step": 10000, "wandb_entity": null, "save_step": 10000, "checkpoint": true, "keep_all_best": false, "keep_after": 10000, "num_loader_workers": 4, "num_eval_loader_workers": 4, "use_noise_augment": false, "use_language_weighted_sampler": false, "output_path": "/home/anchaljaiswal/tts-coqui", "distributed_backend": "nccl", "distributed_url": "tcp://localhost:54321", "audio": { "fft_size": 1024, "win_length": 1024, "hop_length": 256, "frame_shift_ms": null, "frame_length_ms": null, "stft_pad_mode": "reflect", "sample_rate": 22050, "resample": true, "preemphasis": 0.0, "ref_level_db": 20, "do_sound_norm": false, "log_func": "np.log", "do_trim_silence": true, "trim_db": 23.0, "do_rms_norm": false, "db_level": null, "power": 1.5, "griffin_lim_iters": 60, "num_mels": 80, "mel_fmin": 0, "mel_fmax": null, "spec_gain": 1.0, "do_amp_to_db_linear": false, "do_amp_to_db_mel": true, "signal_norm": false, "min_level_db": -100, "symmetric_norm": true, "max_norm": 4.0, "clip_norm": true, "stats_path": null }, "use_phonemes": false, "use_espeak_phonemes": true, "phoneme_language": "en-in", "compute_input_seq_cache": true, "text_cleaner": "english_cleaners", "enable_eos_bos_chars": false, "test_sentences_file": "", "phoneme_cache_path": "/home/anchaljaiswal/tts-coqui/phoneme_cache", "characters": { "pad": "_", "eos": "~", "bos": "^", "characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ", "punctuations": "!'(),-.:;? ", "phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b" }, "batch_group_size": 4, "loss_masking": null, "sort_by_audio_len": true, "min_seq_len": 32768, "max_seq_len": 1500000, "compute_f0": false, "compute_linear_spec": true, "add_blank": true, "datasets": [ { "name": "vctk", "path": "/home/anchaljaiswal/tts-coqui/vctk_/output_path", "meta_file_train": "", "ignored_speakers": null, "language": "en-in", "meta_file_val": "", "meta_file_attn_mask": "" } ], "optimizer": "AdamW", "optimizer_params": { "betas": [ 0.8, 0.99 ], "eps": 1e-09, "weight_decay": 0.01 }, "lr_scheduler": "", "lr_scheduler_params": {}, "test_sentences": [ [ "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent." ], [ "Be a voice, not an echo." ], [ "I'm sorry Dave. I'm afraid I can't do that." ], [ "This cake is great. It's so delicious and moist." ], [ "Prior to November 22, 1963." 
] ], "model_args": { "num_chars": 76, "out_channels": 513, "spec_segment_size": 32, "hidden_channels": 192, "hidden_channels_ffn_text_encoder": 768, "num_heads_text_encoder": 2, "num_layers_text_encoder": 6, "kernel_size_text_encoder": 3, "dropout_p_text_encoder": 0.1, "dropout_p_duration_predictor": 0.5, "kernel_size_posterior_encoder": 5, "dilation_rate_posterior_encoder": 1, "num_layers_posterior_encoder": 16, "kernel_size_flow": 5, "dilation_rate_flow": 1, "num_layers_flow": 4, "resblock_type_decoder": "1", "resblock_kernel_sizes_decoder": [ 3, 7, 11 ], "resblock_dilation_sizes_decoder": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "upsample_rates_decoder": [ 8, 8, 2, 2 ], "upsample_initial_channel_decoder": 512, "upsample_kernel_sizes_decoder": [ 16, 16, 4, 4 ], "use_sdp": true, "noise_scale": 1.0, "inference_noise_scale": 0.667, "length_scale": 1, "noise_scale_dp": 1.0, "inference_noise_scale_dp": 1.0, "max_inference_len": null, "init_discriminator": true, "use_spectral_norm_disriminator": false, "use_speaker_embedding": false, "num_speakers": 15, "speakers_file": "/home/anchaljaiswal/tts-coqui/vits_-May-29-2022_04+35PM-33560189/speakers.json", "d_vector_file": "/home/anchaljaiswal/tts-coqui/d_vectors/speakers.json", "speaker_embedding_channels": 256, "use_d_vector_file": true, "d_vector_dim": 512, "detach_dp_input": true, "use_language_embedding": false, "embedded_language_dim": 4, "num_languages": 0, "language_ids_file": null, "use_speaker_encoder_as_loss": false, "speaker_encoder_config_path": "/home/anchaljaiswal/tts-coqui/se/config_se.json", "speaker_encoder_model_path": "/home/anchaljaiswal/tts-coqui/se/SE_checkpoint.pth.tar", "freeze_encoder": false, "freeze_DP": false, "freeze_PE": false, "freeze_flow_decoder": false, "freeze_waveform_decoder": false }, "grad_clip": [ 1000, 1000 ], "lr_gen": 0.0002, "lr_disc": 0.0002, "lr_scheduler_gen": "ExponentialLR", "lr_scheduler_gen_params": { "gamma": 0.999875, "last_epoch": -1 }, "lr_scheduler_disc": "ExponentialLR", "lr_scheduler_disc_params": { "gamma": 0.999875, "last_epoch": -1 }, "kl_loss_alpha": 1.0, "disc_loss_alpha": 1.0, "gen_loss_alpha": 1.0, "feat_loss_alpha": 1.0, "mel_loss_alpha": 45.0, "dur_loss_alpha": 1.0, "speaker_encoder_loss_alpha": 1.0, "return_wav": true, "r": 1, "num_speakers": 0, "use_speaker_embedding": false, "speakers_file": "/home/anchaljaiswal/tts-coqui/vits_1-May-29-2022_04+35PM-33560189/speakers.json", "speaker_embedding_channels": 256, "language_ids_file": null, "use_language_embedding": false, "use_d_vector_file": true, "d_vector_file": "/home/anchaljaiswal/tts-coqui/d_vectors/speakers.json", "d_vector_dim": 512 }
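One quick sanity check on the config above: d_vector_dim (512) must match the size of the embeddings stored in the d_vector_file. A minimal sketch, assuming the usual Coqui d-vector file layout of {clip_id: {"name": speaker, "embedding": [floats]}} (adjust if the file differs):

import json
from collections import Counter

with open("/home/anchaljaiswal/tts-coqui/d_vectors/speakers.json") as f:
    dvecs = json.load(f)

dims = Counter(len(v["embedding"]) for v in dvecs.values())
names = Counter(v["name"] for v in dvecs.values())
print("embedding dims found:", dict(dims))  # expect {512: num clips}
print("clips per speaker:", dict(names))    # expect one entry per speaker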
Expected behavior
No response
Logs
No response
Environment
{
  "CUDA": {
    "GPU": [
      "NVIDIA A100-SXM4-40GB"
    ],
    "available": true,
    "version": "11.1"
  },
  "Packages": {
    "PyTorch_debug": false,
    "PyTorch_version": "1.9.0+cu111",
    "TTS": "0.5.0",
    "numpy": "1.21.6"
  },
  "System": {
    "OS": "Linux",
    "architecture": [
      "64bit",
      ""
    ],
    "processor": "",
    "python": "3.7.12",
    "version": "#1 SMP Debian 4.19.232-1 (2022-03-07)"
  }
}
Additional context
No response
Issue Analytics
- Created a year ago
- Comments: 10 (10 by maintainers)
Top GitHub Comments
No, the speaker_idx is sufficient. I think there must be something wrong in the training data; maybe a few samples from different speakers got jumbled up and labeled as the same speaker. Is there any way to confirm that? On a quick note, if I were you, I would try plotting the embeddings using this notebook, modifying the code for your own dataset. If the jumbling hypothesis is right, you might see wav files from different speakers in the same cluster, which would give you a good starting point to investigate further.
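For reference, a rough sketch of the kind of plot that notebook produces: project the stored d-vectors to 2-D with t-SNE and color by speaker label (same assumed d-vector file layout as above; the paths are illustrative). Clips that land inside another speaker's cluster are the ones to listen to first:

import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

with open("d_vectors/speakers.json") as f:  # illustrative path
    dvecs = json.load(f)

embeddings = np.array([v["embedding"] for v in dvecs.values()])
labels = [v["name"] for v in dvecs.values()]

# 2-D projection of the 512-dim d-vectors
points = TSNE(n_components=2, perplexity=30, init="pca").fit_transform(embeddings)

# one scatter per speaker so mislabeled clips stand out visually
for speaker in sorted(set(labels)):
    mask = np.array([lab == speaker for lab in labels])
    plt.scatter(points[mask, 0], points[mask, 1], s=8, label=speaker)
plt.legend(fontsize=6)
plt.title("d-vector clusters per speaker")
plt.savefig("dvector_clusters.png", dpi=150)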