---
# Training configuration that starts from the LibriTTS checkpoint referenced
# by `pretrained_model` and writes its outputs under `log_dir`.
#
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose line breaks had been lost; confirm the section boundaries against the
# training script that consumes this file.

log_dir: "Models/tm_tel_ft_24k"
first_stage_path: "first_stage.pth"
save_freq: 2
log_interval: 10
device: "cuda"
epochs_1st: 30
epochs_2nd: 20

# Keep at 2 with filtering.
batch_size: 2
# Refers to audio frames, not phonemes.
max_len: 200

pretrained_model: "/home/purview/Documents/TextToSpeech_Backup/StyleTTS2/Models/LibriTTS/epochs_2nd_00020.pth"
second_stage_load_pretrained: true
load_only_params: true

F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/epoch_00080.pth"
PLBERT_dir: "Utils/PLBERT/"

data_params:
  train_data: "Data_custom/train_list.txt"
  val_data: "Data_custom/val_list.txt"
  root_path: "/home/purview/Documents/TextToSpeech_Backup/Processed_Dataset_24k/wavs"
  OOD_data: "Data_custom/OOD_texts.txt"
  # Measured in phonemes; keep it low.
  min_length: 50

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  # Matches the LibriTTS checkpoint setting (it was trained with
  # multispeaker: true). Fine-tuning with only speaker_id=0 in
  # train_list.txt is still possible.
  multispeaker: true

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 178
  max_dur: 50
  style_dim: 128

  dropout: 0.2

  # Must match the LibriTTS checkpoint.
  decoder:
    type: "hifigan"
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    resblock_kernel_sizes: [3, 7, 11]
    upsample_initial_channel: 512
    upsample_rates: [10, 5, 3, 2]
    upsample_kernel_sizes: [20, 10, 6, 4]

  slm:
    model: "microsoft/wavlm-base-plus"
    sr: 16000
    hidden: 768
    nlayers: 13
    initial_channel: 64

  diffusion:
    embedding_mask_proba: 0.1
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2
    dist:
      sigma_data: 0.19926648961191362
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0

loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0
  lambda_slm: 1.0

  lambda_mono: 1.0
  lambda_s2s: 1.0
  TMA_epoch: 4

  lambda_F0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0
  lambda_sty: 1.0
  lambda_diff: 1.0

  # For a safe first run, diffusion and joint/SLM-adversarial training are
  # delayed: 999 is larger than both epochs_1st and epochs_2nd above.
  # After a successful run, these can be set back to 0 as in the LibriTTS
  # configuration.
  diff_epoch: 999
  joint_epoch: 999

optimizer_params:
  lr: 0.0001
  bert_lr: 0.00001
  ft_lr: 0.00001

slmadv_params:
  min_len: 400
  max_len: 500
  batch_percentage: 0.5
  iter: 20
  thresh: 5
  scale: 0.01
  sig: 1.5