{
  "gpt": {
    "model_dim": 1280,
    "heads": 20,
    "layers": 24,
    "max_mel_tokens": 1815,
    "max_text_tokens": 600,
    "number_text_tokens": 12000,
    "number_mel_codes": 8194,
    "start_mel_token": 8192,
    "stop_mel_token": 8193,
    "start_text_token": 0,
    "stop_text_token": 1,
    "use_mel_codes_as_input": true,
    "mel_length_compression": 1024,
    "condition_type": "conformer_perceiver",
    "condition_num_latent": 32,
    "max_conditioning_inputs": 1,
    "condition_module": {
      "input_size": 100,
      "output_size": 512,
      "linear_units": 2048,
      "attention_heads": 8,
      "num_blocks": 6,
      "dropout_rate": 0.0,
      "input_layer": "conv2d2",
      "pos_enc_layer_type": "rel_pos",
      "normalize_before": true,
      "use_cnn_module": true,
      "cnn_module_kernel": 15,
      "perceiver_mult": 2
    },
    "emo_condition_module": {
      "input_size": 100,
      "output_size": 512,
      "linear_units": 1024,
      "attention_heads": 4,
      "num_blocks": 4,
      "dropout_rate": 0.0,
      "input_layer": "conv2d2",
      "pos_enc_layer_type": "rel_pos",
      "normalize_before": true,
      "use_cnn_module": true,
      "cnn_module_kernel": 15,
      "perceiver_mult": 2
    }
  },
  "bigvgan": {
    "resblock": "1",
    "upsample_rates": [4, 4, 4, 4, 2, 2],
    "upsample_kernel_sizes": [8, 8, 4, 4, 4, 4],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "gpt_dim": 1024,
    "num_mels": 100,
    "speaker_embedding_dim": 512,
    "cond_d_vector_in_each_upsampling_layer": true,
    "activation": "snakebeta",
    "snake_logscale": true,
    "feat_upsample": false,
    "use_tanh_at_final": true
  },
  "mel": {
    "sample_rate": 22050,
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "normalize": false
  },
  "bpe_model": "bpe.model",
  "gpt_checkpoint": "gpt.pth",
  "bigvgan_checkpoint": "",
  "version": 2.0,
  "sample_rate": 22050,
  "s2mel": {
    "sr": 22050,
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mels": 80
  },
  "precision": "fp16",
  "fp16_conversion": {
    "floating_weights": "cast_to_float16",
    "source": "mlx fp32/fp16 safetensors"
  }
}