Vinpolar
/

Khala-MusicGeneration-v1.0-MPS

khala-music-generation

music-generation

acoustic-tokens

Model card | Files and versions

Khala-MusicGeneration-v1.0-MPS / config.json

Vinpolar's picture

Add config.json to enable download tracking

68e26e0 verified 17 days ago

History Blame Contribute Delete

2.51 kB

	{
	"model_type": "khala-music-generation",
	"name": "Khala-MusicGeneration-v1.0-MPS",
	"description": "Apple Silicon (MPS) port of the Khala text-to-music model. Two Megatron-LM transformer stacks (an autoregressive backbone token model and a super-resolution token model) feed a DAC-style residual vector quantization (RVQ) audio decoder.",
	"library_name": null,
	"pipeline_tag": "text-to-audio",
	"torch_dtype": "bfloat16",
	"transformers_version": null,
	"components": {
	"backbone": {
	"role": "autoregressive RVQ-token language model",
	"weights": "khala_backbone.safetensors",
	"source_args": "backbone_megatron_args.json",
	"architecture": "megatron-gpt",
	"num_layers": 24,
	"hidden_size": 2048,
	"ffn_hidden_size": 5632,
	"num_attention_heads": 32,
	"kv_channels": 64,
	"group_query_attention": true,
	"num_query_groups": 8,
	"normalization": "RMSNorm",
	"norm_epsilon": 1e-06,
	"activation": "swiglu",
	"position_embedding_type": "rope",
	"rotary_base": 500000,
	"max_position_embeddings": 16384,
	"seq_length": 16384,
	"vocab_size": 130304,
	"padded_vocab_size": 130432,
	"untie_embeddings_and_output_weights": true,
	"add_bias_linear": true,
	"add_qkv_bias": true,
	"num_quantizers": 64
	},
	"superres": {
	"role": "super-resolution RVQ-token model",
	"weights": "khala_superres.safetensors",
	"source_args": "superres_megatron_args.json",
	"architecture": "megatron-gpt",
	"num_layers": 24,
	"hidden_size": 2048,
	"ffn_hidden_size": 5632,
	"num_attention_heads": 32,
	"kv_channels": 64,
	"group_query_attention": true,
	"num_query_groups": 8,
	"normalization": "RMSNorm",
	"norm_epsilon": 1e-06,
	"activation": "swiglu",
	"position_embedding_type": "rope",
	"rotary_base": 500000,
	"max_position_embeddings": 8192,
	"seq_length": 8192,
	"vocab_size": 193792,
	"padded_vocab_size": 193920,
	"untie_embeddings_and_output_weights": true,
	"num_quantizers": 64
	},
	"decoder": {
	"role": "DAC-style RVQ audio decoder",
	"weights": "decoder_weights.pt",
	"source_config": "decoder_config.yaml",
	"d_latent": 128,
	"codebook_size": 1024,
	"num_quantizers": 64,
	"encoder_n_filters": 128,
	"encoder_strides": [4, 8, 8, 8],
	"decoder_n_filters": 192,
	"decoder_rates": [8, 8, 8, 4],
	"channels": 2
	}
	}
	}