Vinpolar's picture
Add config.json to enable download tracking
68e26e0 verified
Raw
History Blame Contribute Delete
2.51 kB
{
"model_type": "khala-music-generation",
"name": "Khala-MusicGeneration-v1.0-MPS",
"description": "Apple Silicon / MPS port of the Khala text-to-music model. Two Megatron-LM transformer stacks (an autoregressive backbone token model and a super-resolution token model) feed a DAC-style residual-vector-quantization (RVQ) audio decoder.",
"library_name": null,
"pipeline_tag": "text-to-audio",
"torch_dtype": "bfloat16",
"transformers_version": null,
"components": {
"backbone": {
"role": "autoregressive RVQ-token language model",
"weights": "khala_backbone.safetensors",
"source_args": "backbone_megatron_args.json",
"architecture": "megatron-gpt",
"num_layers": 24,
"hidden_size": 2048,
"ffn_hidden_size": 5632,
"num_attention_heads": 32,
"kv_channels": 64,
"group_query_attention": true,
"num_query_groups": 8,
"normalization": "RMSNorm",
"norm_epsilon": 1e-06,
"activation": "swiglu",
"position_embedding_type": "rope",
"rotary_base": 500000,
"max_position_embeddings": 16384,
"seq_length": 16384,
"vocab_size": 130304,
"padded_vocab_size": 130432,
"untie_embeddings_and_output_weights": true,
"add_bias_linear": true,
"add_qkv_bias": true,
"num_quantizers": 64
},
"superres": {
"role": "super-resolution RVQ-token model",
"weights": "khala_superres.safetensors",
"source_args": "superres_megatron_args.json",
"architecture": "megatron-gpt",
"num_layers": 24,
"hidden_size": 2048,
"ffn_hidden_size": 5632,
"num_attention_heads": 32,
"kv_channels": 64,
"group_query_attention": true,
"num_query_groups": 8,
"normalization": "RMSNorm",
"norm_epsilon": 1e-06,
"activation": "swiglu",
"position_embedding_type": "rope",
"rotary_base": 500000,
"max_position_embeddings": 8192,
"seq_length": 8192,
"vocab_size": 193792,
"padded_vocab_size": 193920,
"untie_embeddings_and_output_weights": true,
"num_quantizers": 64
},
"decoder": {
"role": "DAC-style RVQ audio decoder",
"weights": "decoder_weights.pt",
"source_config": "decoder_config.yaml",
"d_latent": 128,
"codebook_size": 1024,
"num_quantizers": 64,
"encoder_n_filters": 128,
"encoder_strides": [4, 8, 8, 8],
"decoder_n_filters": 192,
"decoder_rates": [8, 8, 8, 4],
"channels": 2
}
}
}