import os
import gradio as gr
import torch
import librosa
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from encodec import EncodecModel

# ----------------------------------------------------------------------
# Constants (must match training configuration)
# ----------------------------------------------------------------------
SAMPLE_RATE = 24000
NUM_CODEBOOKS = 8
CODEBOOK_SIZE = 1024
# Audio tokens appended to the vocabulary: one start marker plus one token
# for every (codebook, code value) pair.
NUM_AUDIO_TOKENS_ADDED = 1 + NUM_CODEBOOKS * CODEBOOK_SIZE  # 8193


# ----------------------------------------------------------------------
# Load EnCodec once
# ----------------------------------------------------------------------
def load_encodec(device):
    """Create the 24 kHz EnCodec model and move it to `device`.

    The target bandwidth is set to 6.0 before the model is moved.

    Args:
        device: torch device the codec should live on.

    Returns:
        The configured EncodecModel instance.
    """
    codec = EncodecModel.encodec_model_24khz()
    codec.set_target_bandwidth(6.0)
    # nn.Module.to() moves parameters in place and returns the module itself.
    return codec.to(device)


# Prefer the GPU when one is available; the codec is loaded exactly once at
# import time and shared by every request.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
encodec_model = load_encodec(device)

# ----------------------------------------------------------------------
# Load fine-tuned Qwen model from Hugging Face Hub
# ----------------------------------------------------------------------
MODEL_ID = "michsethowusu/twi-symptoms-predict"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,  # newer keyword; avoids a deprecation warning
    device_map="auto",
    trust_remote_code=True,
)
model.eval()

# The audio tokens are taken to be the last NUM_AUDIO_TOKENS_ADDED entries of
# the tokenizer, so everything below this boundary is the original vocabulary.
old_vocab_size = len(tokenizer) - NUM_AUDIO_TOKENS_ADDED
audio_start_id = old_vocab_size  # first added token is the start marker


def audio_token_id(cb, val):
    """Return the global token ID for code value `val` of codebook `cb`.

    IDs are laid out right after the start marker: codebook 0 occupies the
    first CODEBOOK_SIZE IDs, codebook 1 the next CODEBOOK_SIZE, and so on.
    No range checking is performed on either argument.
    """
    first_code_id = old_vocab_size + 1
    return first_code_id + cb * CODEBOOK_SIZE + val


# ----------------------------------------------------------------------
# Audio preprocessing
---------------------------------------------------------------------- def audio_to_tokens(audio_path): """Convert a WAV file to interleaved discrete token list.""" wav, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True) wav = torch.tensor(wav, device=device).unsqueeze(0).unsqueeze(0) # (1,1,T) with torch.no_grad(): encoded_frames = encodec_model.encode(wav) codes = encoded_frames[0][0].cpu().numpy() # (8, T') interleaved = codes.T.flatten().tolist() # (T' * 8,) return interleaved # ---------------------------------------------------------------------- # Classification function # ---------------------------------------------------------------------- def classify_audio(audio_filepath): if audio_filepath is None: return ( "
Body Part: {body_part}
" f"SubβIssue: {sub_issue}
" f"{prediction}
" f"