---
# Training configuration. The key layout appears to follow the StyleTTS2
# config format; the script that consumes this file is not visible from here.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# parseable YAML (and made each `#` comment swallow every key after it). The
# nesting below is reconstructed from the key order and section names --
# confirm it against the loader before relying on it.

# --- Run / logging settings -------------------------------------------------
log_dir: "Models/hindi_english_multispeaker_finetuned"
first_stage_path: "first_stage.pth"
save_freq: 1
log_interval: 10
device: "cuda"
epochs_1st: 15
epochs_2nd: 15
batch_size: 2
max_len: 200
# Empty string here; presumably means "no checkpoint to resume from" --
# TODO confirm how the loader treats "".
pretrained_model: ""
second_stage_load_pretrained: true
load_only_params: true

# --- Paths to auxiliary model assets ----------------------------------------
F0_path: "Utils/JDC/bst.t7"
ASR_config: "Utils/ASR/config.yml"
ASR_path: "Utils/ASR/epoch_00080.pth"
PLBERT_dir: "Utils/PLBERT/"

# --- Dataset locations ------------------------------------------------------
# All four paths are empty strings in this file; presumably they must be
# filled in before training -- TODO confirm.
data_params:
  train_data: ""
  val_data: ""
  root_path: ""
  OOD_data: ""
  min_length: 50

# Audio preprocessing (24kHz)
preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

# Model architecture
model_params:
  multispeaker: true
  num_speakers: 5

  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 178
  max_dur: 50
  style_dim: 128

  dropout: 0.2
  # NOTE(review): placed directly under model_params because it precedes
  # `decoder:` in the original key order -- confirm.
  speaker_embed_dim: 256

  decoder:
    type: "hifigan"
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    resblock_kernel_sizes: [3, 7, 11]
    upsample_initial_channel: 512
    # 10 * 5 * 3 * 2 = 300, the same value as spect_params.hop_length above;
    # presumably these must stay in sync -- TODO confirm.
    upsample_rates: [10, 5, 3, 2]
    upsample_kernel_sizes: [20, 10, 6, 4]

  slm:
    model: "microsoft/wavlm-base-plus"
    sr: 16000
    hidden: 768
    nlayers: 13
    initial_channel: 64

  diffusion:
    embedding_mask_proba: 0.1
    transformer:
      num_layers: 3
      num_heads: 8
      head_features: 64
      multiplier: 2
    dist:
      sigma_data: 0.19926648961191362
      estimate_sigma_data: true
      mean: -3.0
      std: 1.0

# --- Loss weights and stage schedule ----------------------------------------
loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0
  lambda_slm: 1.0
  lambda_mono: 1.0
  lambda_s2s: 1.0
  lambda_F0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0
  lambda_sty: 1.0
  lambda_diff: 1.0

  # NOTE(review): the *_epoch keys are kept under loss_params because they
  # follow the lambda_* keys and precede `optimizer_params:` -- confirm.
  TMA_epoch: 2
  diff_epoch: 0
  joint_epoch: 0

# --- Optimizer learning rates -----------------------------------------------
optimizer_params:
  lr: 0.00005
  bert_lr: 0.000005
  ft_lr: 0.000005

# --- SLM adversarial training settings --------------------------------------
slmadv_params:
  min_len: 400
  max_len: 500
  batch_percentage: 0.5
  iter: 20
  thresh: 5
  scale: 0.01
  sig: 1.5