{
  "model_type": "khala-music-generation",
  "name": "Khala-MusicGeneration-v1.0-MPS",
  "description": "Apple Silicon / MPS port of the Khala text-to-music model. Two Megatron-LM transformer stacks (backbone autoregressive token model + super-resolution token model) feeding a DAC-style residual-vector-quantized audio decoder.",
  "library_name": null,
  "pipeline_tag": "text-to-audio",
  "torch_dtype": "bfloat16",
  "transformers_version": null,
  "components": {
    "backbone": {
      "role": "autoregressive RVQ-token language model",
      "weights": "khala_backbone.safetensors",
      "source_args": "backbone_megatron_args.json",
      "architecture": "megatron-gpt",
      "num_layers": 24,
      "hidden_size": 2048,
      "ffn_hidden_size": 5632,
      "num_attention_heads": 32,
      "kv_channels": 64,
      "group_query_attention": true,
      "num_query_groups": 8,
      "normalization": "RMSNorm",
      "norm_epsilon": 1e-06,
      "activation": "swiglu",
      "position_embedding_type": "rope",
      "rotary_base": 500000,
      "max_position_embeddings": 16384,
      "seq_length": 16384,
      "vocab_size": 130304,
      "padded_vocab_size": 130432,
      "untie_embeddings_and_output_weights": true,
      "add_bias_linear": true,
      "add_qkv_bias": true,
      "num_quantizers": 64
    },
    "superres": {
      "role": "super-resolution RVQ-token model",
      "weights": "khala_superres.safetensors",
      "source_args": "superres_megatron_args.json",
      "architecture": "megatron-gpt",
      "num_layers": 24,
      "hidden_size": 2048,
      "ffn_hidden_size": 5632,
      "num_attention_heads": 32,
      "kv_channels": 64,
      "group_query_attention": true,
      "num_query_groups": 8,
      "normalization": "RMSNorm",
      "norm_epsilon": 1e-06,
      "activation": "swiglu",
      "position_embedding_type": "rope",
      "rotary_base": 500000,
      "max_position_embeddings": 8192,
      "seq_length": 8192,
      "vocab_size": 193792,
      "padded_vocab_size": 193920,
      "untie_embeddings_and_output_weights": true,
      "num_quantizers": 64
    },
    "decoder": {
      "role": "DAC-style RVQ audio decoder",
      "weights": "decoder_weights.pt",
      "source_config": "decoder_config.yaml",
      "d_latent": 128,
      "codebook_size": 1024,
      "num_quantizers": 64,
      "encoder_n_filters": 128,
      "encoder_strides": [4, 8, 8, 8],
      "decoder_n_filters": 192,
      "decoder_rates": [8, 8, 8, 4],
      "channels": 2
    }
  }
}