# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Quick test script for audio inference with NVIDIA Nemotron Nano Omni model.

This script demonstrates how to use the model for audio understanding tasks.

Usage:
    python quick_test_audio.py --model_path /path/to/model --audio_path /path/to/audio.wav

The audio file should be a WAV file sampled at 16kHz. If your audio is in a different
format or sample rate, it will be automatically resampled.
"""

import argparse

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer


def load_model(model_path: str, device: str = "cuda:0") -> tuple:
    """Load the Omni model, tokenizer and processor.

    Args:
        model_path: Path to the pretrained model
        device: Device to load the model on

    Returns:
        Tuple of (model, tokenizer, processor). The model is returned in
        eval mode with bfloat16 weights.
    """
    print(f"Loading model from {model_path}...")

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        trust_remote_code=True,
        device_map=device,
        dtype=torch.bfloat16,
    ).eval()

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    print("Model loaded successfully!")
    return model, tokenizer, processor


def test_audio_transcription(
    model,
    tokenizer,
    processor,
    audio_path: str,
    prompt_text: str = "Transcribe the audio.",
    device: str = "cuda:0",
    max_new_tokens: int = 1024,
    do_sample: bool = False,
) -> str:
    """Run one prompt against an audio file and print the model's answer.

    Args:
        model: The Omni model with audio support
        tokenizer: The tokenizer
        processor: The processor
        audio_path: Path to the audio file
        prompt_text: Text prompt for the model
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling for generation

    Returns:
        The decoded text generated for the prompt (prompt tokens removed).
    """
    print(f"\nProcessing audio: {audio_path}")

    # Prepare messages with the audio token embedded directly in the text.
    # The chat template does not expand {"type": "audio"} content blocks into
    # audio placeholder tokens — it just stringifies the list. So we place the
    # audio placeholder token in the user message text and let the processor
    # expand it to the correct number of repeated tokens.
    #
    # `or ""` also covers a tokenizer whose `audio_token` attribute exists but
    # is None; interpolating that directly would put the literal text "None"
    # into the prompt.
    audio_token = getattr(tokenizer, "audio_token", None) or ""
    if not audio_token:
        print(
            "Warning: tokenizer does not define an audio_token; "
            "the prompt will not contain an audio placeholder."
        )

    messages = [
        {"role": "system", "content": "/no_think"},
        {"role": "user", "content": f"{audio_token}\n{prompt_text}"},
    ]

    # Generate prompt
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Process inputs - processor handles audio loading and token expansion
    inputs = processor(
        text=[prompt],
        audio=[audio_path],
        return_tensors="pt",
    )

    # Get sound clips before moving to device (they're numpy arrays, not tensors)
    sound_clips = inputs.pop("sound_clips", None)

    # Move tensor inputs to device
    inputs = inputs.to(device)

    print(f"Input ids shape: {inputs.input_ids.shape}")
    if sound_clips is not None:
        if isinstance(sound_clips, list):
            if sound_clips:
                print(
                    f"Sound clips: {len(sound_clips)} clips, "
                    f"first clip length: {len(sound_clips[0])} samples"
                )
            else:
                # An empty list has no first clip to measure.
                print("Sound clips: 0 clips")
        else:
            print(f"Sound clips: {type(sound_clips)}")

    # Generate output - model handles feature extraction from raw waveforms
    generated_ids = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        sound_clips=sound_clips,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode output — trim to only generated tokens (drop the echoed prompt)
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    print(f"\n{'='*50}")
    print(f"Prompt: {prompt_text}")
    print(f"{'='*50}")
    print(f"Output: {output_text}\n")

    return output_text


def test_audio_understanding(
    model,
    tokenizer,
    processor,
    audio_path: str,
    device: str = "cuda:0",
    max_new_tokens: int = 1024,
) -> list:
    """Run a fixed set of audio understanding prompts on one audio file.

    Args:
        model: The Omni model with audio support
        tokenizer: The tokenizer
        processor: The processor
        audio_path: Path to the audio file
        device: Device to run inference on
        max_new_tokens: Maximum number of tokens to generate

    Returns:
        The generated texts, one per built-in prompt, in prompt order.
    """
    prompts = [
        "Transcribe the audio.",
        "What is being said in this audio?",
        "Describe the audio content.",
    ]

    return [
        test_audio_transcription(
            model,
            tokenizer,
            processor,
            audio_path,
            prompt_text=prompt,
            device=device,
            max_new_tokens=max_new_tokens,
        )
        for prompt in prompts
    ]


def main():
    """Parse command-line arguments, load the model and run the audio test(s)."""
    parser = argparse.ArgumentParser(description="Test audio inference with Omni model")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="Path to the pretrained model",
    )
    parser.add_argument(
        "--audio_path",
        type=str,
        required=True,
        help="Path to the audio file (WAV format, preferably 16kHz)",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device to run inference on (e.g., cuda:0, cpu)",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=1024,
        help="Maximum number of tokens to generate",
    )
    parser.add_argument(
        "--prompt",
        type=str,
        default=None,
        help="Custom prompt for audio understanding (default: runs multiple prompts)",
    )

    args = parser.parse_args()

    # Load model
    model, tokenizer, processor = load_model(args.model_path, args.device)

    print("=" * 50)
    print("Testing Audio Inference")
    print("=" * 50)

    if args.prompt:
        # Single custom prompt
        test_audio_transcription(
            model,
            tokenizer,
            processor,
            args.audio_path,
            prompt_text=args.prompt,
            device=args.device,
            max_new_tokens=args.max_new_tokens,
        )
    else:
        # Multiple prompts
        test_audio_understanding(
            model,
            tokenizer,
            processor,
            args.audio_path,
            device=args.device,
            max_new_tokens=args.max_new_tokens,
        )


if __name__ == "__main__":
    main()