SreyanG-NVIDIA committed on
Commit
a748c2f
·
1 Parent(s): a34b3bc

Add inference timing logs

Browse files
Files changed (1) hide show
  1. app.py +15 -0
app.py CHANGED
@@ -435,6 +435,7 @@ def download_youtube_audio(url, force_reload=False):
435
 
436
  @spaces.GPU
437
  def infer(audio_path, youtube_url, prompt_text):
 
438
  try:
439
  device = "cuda" if torch.cuda.is_available() else "cpu"
440
  _log_cuda_runtime("infer")
@@ -481,14 +482,28 @@ def infer(audio_path, youtube_url, prompt_text):
481
  ).to(model.device)
482
  batch["input_features"] = batch["input_features"].to(model.dtype)
483
 
 
 
484
  gen_ids = model.generate(**batch, max_new_tokens=4096, repetition_penalty=1.2)
 
485
  inp_len = batch["input_ids"].shape[1]
486
  new_tokens = gen_ids[:, inp_len:]
 
487
  texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
488
 
489
  result = texts[0] if texts else ""
 
 
 
 
 
 
 
 
490
  return f"{status_message}\n\n{result}"
491
  except Exception as e:
 
 
492
  return f"❌ Error: {str(e)}"
493
 
494
 
 
435
 
436
  @spaces.GPU
437
  def infer(audio_path, youtube_url, prompt_text):
438
+ infer_start = time.perf_counter()
439
  try:
440
  device = "cuda" if torch.cuda.is_available() else "cpu"
441
  _log_cuda_runtime("infer")
 
482
  ).to(model.device)
483
  batch["input_features"] = batch["input_features"].to(model.dtype)
484
 
485
+ input_token_count = int(batch["input_ids"].shape[1])
486
+ generation_start = time.perf_counter()
487
  gen_ids = model.generate(**batch, max_new_tokens=4096, repetition_penalty=1.2)
488
+ generation_elapsed = time.perf_counter() - generation_start
489
  inp_len = batch["input_ids"].shape[1]
490
  new_tokens = gen_ids[:, inp_len:]
491
+ generated_token_count = int(new_tokens.shape[1])
492
  texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
493
 
494
  result = texts[0] if texts else ""
495
+ total_elapsed = time.perf_counter() - infer_start
496
+ tokens_per_second = generated_token_count / generation_elapsed if generation_elapsed > 0 else 0.0
497
+ print(
498
+ f"[infer] total_time_s={total_elapsed:.2f} generation_time_s={generation_elapsed:.2f} "
499
+ f"input_tokens={input_token_count} generated_tokens={generated_token_count} "
500
+ f"tokens_per_second={tokens_per_second:.2f}",
501
+ flush=True,
502
+ )
503
  return f"{status_message}\n\n{result}"
504
  except Exception as e:
505
+ total_elapsed = time.perf_counter() - infer_start
506
+ print(f"[infer] failed_after_s={total_elapsed:.2f} error={e}", flush=True)
507
  return f"❌ Error: {str(e)}"
508
 
509