{
  "model_type": "khala-music-generation",
  "name": "Khala-MusicGeneration-v1.0-MPS",
  "description": "Apple Silicon / MPS port of the Khala text-to-music model. Two Megatron-LM transformer stacks (backbone autoregressive token model + super-resolution token model) feeding a DAC-style residual-vector-quantized audio decoder.",
  "library_name": null,
  "pipeline_tag": "text-to-audio",
  "torch_dtype": "bfloat16",
  "transformers_version": null,
  "components": {
    "backbone": {
      "role": "autoregressive RVQ-token language model",
      "weights": "khala_backbone.safetensors",
      "source_args": "backbone_megatron_args.json",
      "architecture": "megatron-gpt",
      "num_layers": 24,
      "hidden_size": 2048,
      "ffn_hidden_size": 5632,
      "num_attention_heads": 32,
      "kv_channels": 64,
      "group_query_attention": true,
      "num_query_groups": 8,
      "normalization": "RMSNorm",
      "norm_epsilon": 1e-06,
      "activation": "swiglu",
      "position_embedding_type": "rope",
      "rotary_base": 500000,
      "max_position_embeddings": 16384,
      "seq_length": 16384,
      "vocab_size": 130304,
      "padded_vocab_size": 130432,
      "untie_embeddings_and_output_weights": true,
      "add_bias_linear": true,
      "add_qkv_bias": true,
      "num_quantizers": 64
    },
    "superres": {
      "role": "super-resolution RVQ-token model",
      "weights": "khala_superres.safetensors",
      "source_args": "superres_megatron_args.json",
      "architecture": "megatron-gpt",
      "num_layers": 24,
      "hidden_size": 2048,
      "ffn_hidden_size": 5632,
      "num_attention_heads": 32,
      "kv_channels": 64,
      "group_query_attention": true,
      "num_query_groups": 8,
      "normalization": "RMSNorm",
      "norm_epsilon": 1e-06,
      "activation": "swiglu",
      "position_embedding_type": "rope",
      "rotary_base": 500000,
      "max_position_embeddings": 8192,
      "seq_length": 8192,
      "vocab_size": 193792,
      "padded_vocab_size": 193920,
      "untie_embeddings_and_output_weights": true,
      "num_quantizers": 64
    },
    "decoder": {
      "role": "DAC-style RVQ audio decoder",
      "weights": "decoder_weights.pt",
      "source_config": "decoder_config.yaml",
      "d_latent": 128,
      "codebook_size": 1024,
      "num_quantizers": 64,
      "encoder_n_filters": 128,
      "encoder_strides": [4, 8, 8, 8],
      "decoder_n_filters": 192,
      "decoder_rates": [8, 8, 8, 4],
      "channels": 2
    }
  }
}