[Bug] Not able to replicate quality of the pretrained VITS LJSpeech model
Description
I trained the VITS LJSpeech recipe all the way through, but the resulting quality is significantly worse than the pretrained model's.
Here's how the training looks. (Note: I've had to stop/continue it a few times, so it's gone well past the 1000 epochs / 270k steps it was supposed to run for.)
Here's how it sounds at checkpoint 270k:
And here is how the pretrained model sounds in comparison:
As you can hear, mine has trouble with the following words, while the pretrained model pronounces them without issue:
- @2s: "about" pronounced as "bout"
- @4s: "little" pronounced as "livle"
- @6s: weird stuttery audio
- @10s: "involved" pronounced as "invo-ho-lved"
- @15s: "confirmed" pronounced as "confimed"
- @18s: "warfare" pronounced as "wa-ha-rfare"
- @22s: "armed forces" butchered
How can I make a model as good as the pretrained one?
To Reproduce
Make a copy of the VITS LJSpeech recipe, but modify `path` to point to the dataset and `phoneme_cache_path` to point wherever, for example:
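For concreteness, here is a minimal sketch of those two edits against the recipe, assuming the config classes as they exist in TTS 0.5.0 (import paths may vary slightly between versions); everything else stays at the recipe's defaults:

```python
from TTS.tts.configs import BaseDatasetConfig, VitsConfig

# Only the two fields marked "changed" differ from the stock recipe.
dataset_config = BaseDatasetConfig(
    name="ljspeech",
    meta_file_train="metadata.csv",
    path="/home/fijipants/datasets/Female/LJSpeech-1.1/",  # changed: local dataset
)

config = VitsConfig(
    # ...recipe defaults elided...
    datasets=[dataset_config],
    phoneme_cache_path="/home/fijipants/phoneme-cache",  # changed: any writable dir
)
```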
Run it:
```
CUDA_VISIBLE_DEVICES=0 python path/to/modified/recipe.py
```
Expected behavior
We get a model that's comparable in quality to the pretrained one.
Environment
{
"CUDA": {
"GPU": [
"NVIDIA GeForce RTX 3090"
],
"available": true,
"version": "11.3"
},
"Packages": {
"PyTorch_debug": false,
"PyTorch_version": "1.10.2",
"TTS": "0.5.0",
"numpy": "1.21.2"
},
"System": {
"OS": "Linux",
"architecture": [
"64bit",
""
],
"processor": "x86_64",
"python": "3.7.11",
"version": "#202202230823 SMP PREEMPT Wed Feb 23 14:53:24 UTC 2022"
}
}
Additional context 1
The config.json in the output folder:
{
"model": "vits",
"run_name": "vits_ljspeech",
"run_description": "",
"epochs": 1000,
"batch_size": 48,
"eval_batch_size": 16,
"mixed_precision": true,
"scheduler_after_epoch": true,
"run_eval": true,
"test_delay_epochs": -1,
"print_eval": true,
"dashboard_logger": "tensorboard",
"print_step": 25,
"plot_step": 100,
"model_param_stats": false,
"project_name": null,
"log_model_step": null,
"wandb_entity": null,
"save_step": 10000,
"checkpoint": true,
"keep_all_best": false,
"keep_after": 10000,
"num_loader_workers": 4,
"num_eval_loader_workers": 4,
"use_noise_augment": false,
"use_language_weighted_sampler": false,
"output_path": "/home/fijipants/repo/coqui-0.5.0/runs2",
"distributed_backend": "nccl",
"distributed_url": "tcp://localhost:54321",
"audio": {
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_shift_ms": null,
"frame_length_ms": null,
"stft_pad_mode": "reflect",
"sample_rate": 22050,
"resample": false,
"preemphasis": 0.0,
"ref_level_db": 20,
"do_sound_norm": false,
"log_func": "np.log",
"do_trim_silence": true,
"trim_db": 45,
"do_rms_norm": false,
"db_level": null,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 80,
"mel_fmin": 0,
"mel_fmax": null,
"spec_gain": 1.0,
"do_amp_to_db_linear": false,
"do_amp_to_db_mel": true,
"signal_norm": false,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": null
},
"use_phonemes": true,
"use_espeak_phonemes": true,
"phoneme_language": "en-us",
"compute_input_seq_cache": true,
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false,
"test_sentences_file": "",
"phoneme_cache_path": "/home/fijipants/phoneme-cache",
"characters": {
"pad": "_",
"eos": "~",
"bos": "^",
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!'(),-.:;? ",
"punctuations": "!'(),-.:;? ",
"phonemes": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b"
},
"batch_group_size": 5,
"loss_masking": null,
"sort_by_audio_len": true,
"min_seq_len": 0,
"max_seq_len": 500000,
"compute_f0": false,
"compute_linear_spec": true,
"add_blank": true,
"datasets": [
{
"name": "ljspeech",
"path": "/home/fijipants/datasets/Female/LJSpeech-1.1/",
"meta_file_train": "metadata.csv",
"ignored_speakers": null,
"language": "",
"meta_file_val": "",
"meta_file_attn_mask": ""
}
],
"optimizer": "AdamW",
"optimizer_params": {
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"weight_decay": 0.01
},
"lr_scheduler": "",
"lr_scheduler_params": {},
"test_sentences": [
[
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
],
[
"Be a voice, not an echo."
],
[
"I'm sorry Dave. I'm afraid I can't do that."
],
[
"This cake is great. It's so delicious and moist."
],
[
"Prior to November 22, 1963."
]
],
"model_args": {
"num_chars": 192,
"out_channels": 513,
"spec_segment_size": 32,
"hidden_channels": 192,
"hidden_channels_ffn_text_encoder": 768,
"num_heads_text_encoder": 2,
"num_layers_text_encoder": 6,
"kernel_size_text_encoder": 3,
"dropout_p_text_encoder": 0.1,
"dropout_p_duration_predictor": 0.5,
"kernel_size_posterior_encoder": 5,
"dilation_rate_posterior_encoder": 1,
"num_layers_posterior_encoder": 16,
"kernel_size_flow": 5,
"dilation_rate_flow": 1,
"num_layers_flow": 4,
"resblock_type_decoder": "1",
"resblock_kernel_sizes_decoder": [
3,
7,
11
],
"resblock_dilation_sizes_decoder": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates_decoder": [
8,
8,
2,
2
],
"upsample_initial_channel_decoder": 512,
"upsample_kernel_sizes_decoder": [
16,
16,
4,
4
],
"use_sdp": true,
"noise_scale": 1.0,
"inference_noise_scale": 0.667,
"length_scale": 1,
"noise_scale_dp": 1.0,
"inference_noise_scale_dp": 1.0,
"max_inference_len": null,
"init_discriminator": true,
"use_spectral_norm_disriminator": false,
"use_speaker_embedding": false,
"num_speakers": 0,
"speakers_file": null,
"d_vector_file": null,
"speaker_embedding_channels": 256,
"use_d_vector_file": false,
"d_vector_dim": 0,
"detach_dp_input": true,
"use_language_embedding": false,
"embedded_language_dim": 4,
"num_languages": 0,
"language_ids_file": null,
"use_speaker_encoder_as_loss": false,
"speaker_encoder_config_path": "",
"speaker_encoder_model_path": "",
"freeze_encoder": false,
"freeze_DP": false,
"freeze_PE": false,
"freeze_flow_decoder": false,
"freeze_waveform_decoder": false
},
"grad_clip": [
1000,
1000
],
"lr_gen": 0.0002,
"lr_disc": 0.0002,
"lr_scheduler_gen": "ExponentialLR",
"lr_scheduler_gen_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"lr_scheduler_disc": "ExponentialLR",
"lr_scheduler_disc_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"kl_loss_alpha": 1.0,
"disc_loss_alpha": 1.0,
"gen_loss_alpha": 1.0,
"feat_loss_alpha": 1.0,
"mel_loss_alpha": 45.0,
"dur_loss_alpha": 1.0,
"speaker_encoder_loss_alpha": 1.0,
"return_wav": true,
"r": 1,
"num_speakers": 0,
"use_speaker_embedding": false,
"speakers_file": null,
"speaker_embedding_channels": 256,
"language_ids_file": null,
"use_language_embedding": false,
"use_d_vector_file": false,
"d_vector_file": null,
"d_vector_dim": 0
}
Additional context 2
My goal is to create a VITS LJSpeech checkpoint with the same quality as the pretrained model.
I tried training with the pretrained model's config, but it's missing a bunch of values like `batch_size`.
As an alternative to the recipe, could you suggest values to plug into the pretrained config so I could try with that? Namely `batch_size`, `eval_batch_size`, `num_loader_workers`, `num_eval_loader_workers`, `min_seq_len`, and `max_seq_len`. I have two 3090s at my disposal.
Top GitHub Comments
We fixed some bugs in the dev branch. Can you try using it?
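(For anyone else landing here: the dev branch can be installed with something like `pip install git+https://github.com/coqui-ai/TTS@dev`; the exact branch layout is an assumption based on the repo's usual conventions.)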
It looks like some characters are missing from your character list, and the model skips those.
I have similar issues; sometimes a whole word is not pronounced. I'm also wondering how to check the mapping of characters to IDs.
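Not a maintainer answer, but one rough way to check coverage is to rebuild a symbol table from the `characters` section of the run's config.json and test phonemized text against it. A minimal sketch (`CONFIG_PATH` is hypothetical, and the real ordering of IDs inside TTS may differ):

```python
import json

# Hypothetical path; point it at the config.json of your run.
CONFIG_PATH = "runs2/vits_ljspeech/config.json"

with open(CONFIG_PATH) as f:
    chars = json.load(f)["characters"]

# Rebuild an approximate symbol table the way most TTS front ends do:
# special tokens first, then graphemes, then the phoneme inventory.
# The exact ordering inside TTS may differ, but a symbol missing here
# is missing there too.
symbols = [chars["pad"], chars["eos"], chars["bos"]]
symbols += list(chars["characters"]) + list(chars["phonemes"])
symbol_to_id = {s: i for i, s in enumerate(symbols)}

def report_missing(phonemized: str) -> None:
    """Print any character the model would silently skip."""
    missing = sorted({c for c in phonemized if c not in symbol_to_id})
    print("missing symbols:", missing if missing else "none")

# Example with espeak-style output for "the armed forces".
report_missing("ðə ˈɑːɹmd ˈfɔːɹsᵻz")
```

Anything it reports would be dropped from the input sequence, which lines up with the maintainer's note above about skipped characters.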