{
  "gpt": {
    "model_dim": 1280,
    "heads": 20,
    "layers": 24,
    "max_mel_tokens": 1815,
    "max_text_tokens": 600,
    "number_text_tokens": 12000,
    "number_mel_codes": 8194,
    "start_mel_token": 8192,
    "stop_mel_token": 8193,
    "start_text_token": 0,
    "stop_text_token": 1,
    "use_mel_codes_as_input": true,
    "mel_length_compression": 1024,
    "condition_type": "conformer_perceiver",
    "condition_num_latent": 32,
    "max_conditioning_inputs": 1,
    "condition_module": {
      "input_size": 100,
      "output_size": 512,
      "linear_units": 2048,
      "attention_heads": 8,
      "num_blocks": 6,
      "dropout_rate": 0.0,
      "input_layer": "conv2d2",
      "pos_enc_layer_type": "rel_pos",
      "normalize_before": true,
      "use_cnn_module": true,
      "cnn_module_kernel": 15,
      "perceiver_mult": 2
    },
    "emo_condition_module": {
      "input_size": 100,
      "output_size": 512,
      "linear_units": 1024,
      "attention_heads": 4,
      "num_blocks": 4,
      "dropout_rate": 0.0,
      "input_layer": "conv2d2",
      "pos_enc_layer_type": "rel_pos",
      "normalize_before": true,
      "use_cnn_module": true,
      "cnn_module_kernel": 15,
      "perceiver_mult": 2
    }
  },
  "bigvgan": {
    "resblock": "1",
    "upsample_rates": [
      4,
      4,
      4,
      4,
      2,
      2
    ],
    "upsample_kernel_sizes": [
      8,
      8,
      4,
      4,
      4,
      4
    ],
    "upsample_initial_channel": 1536,
    "resblock_kernel_sizes": [
      3,
      7,
      11
    ],
    "resblock_dilation_sizes": [
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ],
      [
        1,
        3,
        5
      ]
    ],
    "gpt_dim": 1024,
    "num_mels": 100,
    "speaker_embedding_dim": 512,
    "cond_d_vector_in_each_upsampling_layer": true,
    "activation": "snakebeta",
    "snake_logscale": true,
    "feat_upsample": false,
    "use_tanh_at_final": true
  },
  "mel": {
    "sample_rate": 22050,
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mels": 80,
    "mel_fmin": 0.0,
    "mel_fmax": null,
    "normalize": false
  },
  "bpe_model": "bpe.model",
  "gpt_checkpoint": "gpt.pth",
  "bigvgan_checkpoint": "",
  "version": 2.0,
  "sample_rate": 22050,
  "s2mel": {
    "sr": 22050,
    "n_fft": 1024,
    "hop_length": 256,
    "win_length": 1024,
    "n_mels": 80
  },
  "precision": "fp16",
  "fp16_conversion": {
    "floating_weights": "cast_to_float16",
    "source": "mlx fp32/fp16 safetensors"
  }
}