Commit · a748c2f
1 Parent(s): a34b3bc
Add inference timing logs
Browse files
app.py
CHANGED
|
@@ -435,6 +435,7 @@ def download_youtube_audio(url, force_reload=False):
|
|
| 435 |
|
| 436 |
@spaces.GPU
|
| 437 |
def infer(audio_path, youtube_url, prompt_text):
|
|
|
|
| 438 |
try:
|
| 439 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 440 |
_log_cuda_runtime("infer")
|
|
@@ -481,14 +482,28 @@ def infer(audio_path, youtube_url, prompt_text):
|
|
| 481 |
).to(model.device)
|
| 482 |
batch["input_features"] = batch["input_features"].to(model.dtype)
|
| 483 |
|
|
|
|
|
|
|
| 484 |
gen_ids = model.generate(**batch, max_new_tokens=4096, repetition_penalty=1.2)
|
|
|
|
| 485 |
inp_len = batch["input_ids"].shape[1]
|
| 486 |
new_tokens = gen_ids[:, inp_len:]
|
|
|
|
| 487 |
texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 488 |
|
| 489 |
result = texts[0] if texts else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 490 |
return f"{status_message}\n\n{result}"
|
| 491 |
except Exception as e:
|
|
|
|
|
|
|
| 492 |
return f"❌ Error: {str(e)}"
|
| 493 |
|
| 494 |
|
|
|
|
| 435 |
|
| 436 |
@spaces.GPU
|
| 437 |
def infer(audio_path, youtube_url, prompt_text):
|
| 438 |
+
infer_start = time.perf_counter()
|
| 439 |
try:
|
| 440 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 441 |
_log_cuda_runtime("infer")
|
|
|
|
| 482 |
).to(model.device)
|
| 483 |
batch["input_features"] = batch["input_features"].to(model.dtype)
|
| 484 |
|
| 485 |
+
input_token_count = int(batch["input_ids"].shape[1])
|
| 486 |
+
generation_start = time.perf_counter()
|
| 487 |
gen_ids = model.generate(**batch, max_new_tokens=4096, repetition_penalty=1.2)
|
| 488 |
+
generation_elapsed = time.perf_counter() - generation_start
|
| 489 |
inp_len = batch["input_ids"].shape[1]
|
| 490 |
new_tokens = gen_ids[:, inp_len:]
|
| 491 |
+
generated_token_count = int(new_tokens.shape[1])
|
| 492 |
texts = processor.batch_decode(new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
| 493 |
|
| 494 |
result = texts[0] if texts else ""
|
| 495 |
+
total_elapsed = time.perf_counter() - infer_start
|
| 496 |
+
tokens_per_second = generated_token_count / generation_elapsed if generation_elapsed > 0 else 0.0
|
| 497 |
+
print(
|
| 498 |
+
f"[infer] total_time_s={total_elapsed:.2f} generation_time_s={generation_elapsed:.2f} "
|
| 499 |
+
f"input_tokens={input_token_count} generated_tokens={generated_token_count} "
|
| 500 |
+
f"tokens_per_second={tokens_per_second:.2f}",
|
| 501 |
+
flush=True,
|
| 502 |
+
)
|
| 503 |
return f"{status_message}\n\n{result}"
|
| 504 |
except Exception as e:
|
| 505 |
+
total_elapsed = time.perf_counter() - infer_start
|
| 506 |
+
print(f"[infer] failed_after_s={total_elapsed:.2f} error={e}", flush=True)
|
| 507 |
return f"❌ Error: {str(e)}"
|
| 508 |
|
| 509 |
|