{
  "model_type": "khala-music-generation",
  "name": "Khala-MusicGeneration-v1.0-MPS",
  "description": "Apple Silicon / MPS port of the Khala text-to-music model. Two Megatron-LM transformer stacks (backbone autoregressive token model + super-resolution token model) feeding a DAC-style residual-vector-quantized audio decoder.",
  "library_name": null,
  "pipeline_tag": "text-to-audio",
  "torch_dtype": "bfloat16",
  "transformers_version": null,
  "components": {
    "backbone": {
      "role": "autoregressive RVQ-token language model",
      "weights": "khala_backbone.safetensors",
      "source_args": "backbone_megatron_args.json",
      "architecture": "megatron-gpt",
      "num_layers": 24,
      "hidden_size": 2048,
      "ffn_hidden_size": 5632,
      "num_attention_heads": 32,
      "kv_channels": 64,
      "group_query_attention": true,
      "num_query_groups": 8,
      "normalization": "RMSNorm",
      "norm_epsilon": 1e-06,
      "activation": "swiglu",
      "position_embedding_type": "rope",
      "rotary_base": 500000,
      "max_position_embeddings": 16384,
      "seq_length": 16384,
      "vocab_size": 130304,
      "padded_vocab_size": 130432,
      "untie_embeddings_and_output_weights": true,
      "add_bias_linear": true,
      "add_qkv_bias": true,
      "num_quantizers": 64
    },
    "superres": {
      "role": "super-resolution RVQ-token model",
      "weights": "khala_superres.safetensors",
      "source_args": "superres_megatron_args.json",
      "architecture": "megatron-gpt",
      "num_layers": 24,
      "hidden_size": 2048,
      "ffn_hidden_size": 5632,
      "num_attention_heads": 32,
      "kv_channels": 64,
      "group_query_attention": true,
      "num_query_groups": 8,
      "normalization": "RMSNorm",
      "norm_epsilon": 1e-06,
      "activation": "swiglu",
      "position_embedding_type": "rope",
      "rotary_base": 500000,
      "max_position_embeddings": 8192,
      "seq_length": 8192,
      "vocab_size": 193792,
      "padded_vocab_size": 193920,
      "untie_embeddings_and_output_weights": true,
      "num_quantizers": 64
    },
    "decoder": {
      "role": "DAC-style RVQ audio decoder",
      "weights": "decoder_weights.pt",
      "source_config": "decoder_config.yaml",
      "d_latent": 128,
      "codebook_size": 1024,
      "num_quantizers": 64,
      "encoder_n_filters": 128,
      "encoder_strides": [4, 8, 8, 8],
      "decoder_n_filters": 192,
      "decoder_rates": [8, 8, 8, 4],
      "channels": 2
    }
  }
}
|