MedGRPO Team committed on
Commit ·
05d6c42
1
Parent(s): e2b1040
update
Browse files
app.py
CHANGED
|
@@ -90,6 +90,8 @@ if not GROUND_TRUTH_FILE.exists():
|
|
| 90 |
json.dump(GROUND_TRUTH, f)
|
| 91 |
print(f"β Ground truth saved to {GROUND_TRUTH_FILE}")
|
| 92 |
|
|
|
|
|
|
|
| 93 |
# MedVidBench Metrics Definitions (10 metrics from 8 tasks)
|
| 94 |
# Note: TAL has 2 metrics, DVC has 2 metrics, others have 1 metric each
|
| 95 |
METRICS = {
|
|
@@ -129,18 +131,18 @@ METRICS = {
|
|
| 129 |
"higher_better": True,
|
| 130 |
"description": "Mean IoU at threshold 0.5 for temporal localization"
|
| 131 |
},
|
| 132 |
-
"dvc_llm": {
|
| 133 |
-
"name": "DVC_llm",
|
| 134 |
-
"full_name": "Dense Video Captioning LLM Score",
|
| 135 |
-
"higher_better": True,
|
| 136 |
-
"description": "Caption quality score (LLM judge or semantic similarity)"
|
| 137 |
-
},
|
| 138 |
"dvc_f1": {
|
| 139 |
"name": "DVC_F1",
|
| 140 |
"full_name": "Dense Video Captioning F1",
|
| 141 |
"higher_better": True,
|
| 142 |
"description": "F1 score for temporal segment localization"
|
| 143 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
"vs_llm": {
|
| 145 |
"name": "VS_llm",
|
| 146 |
"full_name": "Video Summary LLM Score",
|
|
@@ -391,6 +393,166 @@ def backup_results_to_repo(model_name: str, results_dir: Path):
|
|
| 391 |
print(f"β οΈ Failed to backup results: {e}")
|
| 392 |
|
| 393 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 394 |
# ============================================================================
|
| 395 |
# Admin Functions
|
| 396 |
# ============================================================================
|
|
@@ -1181,7 +1343,7 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", pr
|
|
| 1181 |
new_entry = {
|
| 1182 |
"model_name": model_name,
|
| 1183 |
"organization": organization,
|
| 1184 |
-
**{metric: round(metrics.get(metric, 0.0),
|
| 1185 |
"date": datetime.now().strftime("%Y-%m-%d"),
|
| 1186 |
"contact": contact
|
| 1187 |
}
|
|
@@ -1220,7 +1382,7 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", pr
|
|
| 1220 |
success_msg += "\n### π Metric Scores\n"
|
| 1221 |
for metric_key, metric_info in METRICS.items():
|
| 1222 |
score = metrics.get(metric_key, 0.0)
|
| 1223 |
-
success_msg += f"- **{metric_info['name']}**: {score:.
|
| 1224 |
|
| 1225 |
rank = df[df['model_name'] == model_name].index[0] + 1
|
| 1226 |
success_msg += f"\n### π Ranking\n**Rank**: #{rank} out of {len(df)} models\n"
|
|
@@ -1230,7 +1392,8 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", pr
|
|
| 1230 |
|
| 1231 |
|
| 1232 |
def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
|
| 1233 |
-
"""Format leaderboard dataframe for display with 10 metrics (no average).
|
|
|
|
| 1234 |
if df.empty:
|
| 1235 |
return df
|
| 1236 |
|
|
@@ -1255,6 +1418,13 @@ def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
|
|
| 1255 |
# Rename columns for display
|
| 1256 |
display_df = df[display_cols].copy()
|
| 1257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1258 |
# Build column names
|
| 1259 |
column_names = []
|
| 1260 |
for col in display_cols:
|
|
@@ -1536,9 +1706,9 @@ Evaluation logs are being written to:
|
|
| 1536 |
if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
|
| 1537 |
# Update leaderboard
|
| 1538 |
df = load_leaderboard()
|
| 1539 |
-
df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm,
|
| 1540 |
-
df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm,
|
| 1541 |
-
df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm,
|
| 1542 |
df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
|
| 1543 |
save_leaderboard(df)
|
| 1544 |
|
|
@@ -1546,7 +1716,7 @@ Evaluation logs are being written to:
|
|
| 1546 |
update_llm_judge_status(
|
| 1547 |
model_name,
|
| 1548 |
'completed',
|
| 1549 |
-
f"DVC: {dvc_llm:.
|
| 1550 |
)
|
| 1551 |
else:
|
| 1552 |
update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
|
|
@@ -1619,9 +1789,9 @@ def check_llm_judge_evaluation_status(model_name: str) -> str:
|
|
| 1619 |
**Completed**: {msg}
|
| 1620 |
|
| 1621 |
### π Caption Metrics
|
| 1622 |
-
- **DVC_llm**: {dvc:.
|
| 1623 |
-
- **VS_llm**: {vs:.
|
| 1624 |
-
- **RC_llm**: {rc:.
|
| 1625 |
|
| 1626 |
β Leaderboard has been updated!
|
| 1627 |
|
|
@@ -1643,6 +1813,13 @@ Please check the logs or try running the evaluation again.
|
|
| 1643 |
return f"βΉοΈ **Status**: {status}\n\n{msg}"
|
| 1644 |
|
| 1645 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1646 |
# Create Gradio interface
|
| 1647 |
with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
| 1648 |
|
|
|
|
| 90 |
json.dump(GROUND_TRUTH, f)
|
| 91 |
print(f"β Ground truth saved to {GROUND_TRUTH_FILE}")
|
| 92 |
|
| 93 |
+
# Note: Default leaderboard data is populated after all functions are defined (see below).
|
| 94 |
+
|
| 95 |
# MedVidBench Metrics Definitions (10 metrics from 8 tasks)
|
| 96 |
# Note: TAL has 2 metrics, DVC has 2 metrics, others have 1 metric each
|
| 97 |
METRICS = {
|
|
|
|
| 131 |
"higher_better": True,
|
| 132 |
"description": "Mean IoU at threshold 0.5 for temporal localization"
|
| 133 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
"dvc_f1": {
|
| 135 |
"name": "DVC_F1",
|
| 136 |
"full_name": "Dense Video Captioning F1",
|
| 137 |
"higher_better": True,
|
| 138 |
"description": "F1 score for temporal segment localization"
|
| 139 |
},
|
| 140 |
+
"dvc_llm": {
|
| 141 |
+
"name": "DVC_llm",
|
| 142 |
+
"full_name": "Dense Video Captioning LLM Score",
|
| 143 |
+
"higher_better": True,
|
| 144 |
+
"description": "Caption quality score (LLM judge or semantic similarity)"
|
| 145 |
+
},
|
| 146 |
"vs_llm": {
|
| 147 |
"name": "VS_llm",
|
| 148 |
"full_name": "Video Summary LLM Score",
|
|
|
|
| 393 |
print(f"β οΈ Failed to backup results: {e}")
|
| 394 |
|
| 395 |
|
| 396 |
+
# ============================================================================
|
| 397 |
+
# Default Data Population
|
| 398 |
+
# ============================================================================
|
| 399 |
+
|
| 400 |
+
# Default baseline entries from MedGRPO project page results table.
|
| 401 |
+
# These are pre-computed evaluation results for known models.
|
| 402 |
+
DEFAULT_LEADERBOARD_ENTRIES = [
|
| 403 |
+
# --- 2025 Off-the-shelf Baselines ---
|
| 404 |
+
{
|
| 405 |
+
"model_name": "GPT-4.1",
|
| 406 |
+
"organization": "OpenAI",
|
| 407 |
+
"cvs_acc": 0.018, "nap_acc": 0.250, "sa_acc": 0.087, "stg_miou": 0.014,
|
| 408 |
+
"tag_miou_03": 0.096, "tag_miou_05": 0.005,
|
| 409 |
+
"dvc_f1": 0.101, "dvc_llm": 2.438, "vs_llm": 2.490, "rc_llm": 2.080,
|
| 410 |
+
"date": "2025-01-14", "contact": "",
|
| 411 |
+
},
|
| 412 |
+
{
|
| 413 |
+
"model_name": "Gemini-2.5-Flash",
|
| 414 |
+
"organization": "Google",
|
| 415 |
+
"cvs_acc": 0.101, "nap_acc": 0.228, "sa_acc": 0.107, "stg_miou": 0.047,
|
| 416 |
+
"tag_miou_03": 0.045, "tag_miou_05": 0.021,
|
| 417 |
+
"dvc_f1": 0.084, "dvc_llm": 2.387, "vs_llm": 2.352, "rc_llm": 1.912,
|
| 418 |
+
"date": "2025-01-14", "contact": "",
|
| 419 |
+
},
|
| 420 |
+
{
|
| 421 |
+
"model_name": "VideoChat-R1.5-7B",
|
| 422 |
+
"organization": "OpenGVLab",
|
| 423 |
+
"cvs_acc": 0.000, "nap_acc": 0.270, "sa_acc": 0.006, "stg_miou": 0.000,
|
| 424 |
+
"tag_miou_03": 0.009, "tag_miou_05": 0.005,
|
| 425 |
+
"dvc_f1": 0.026, "dvc_llm": 1.723, "vs_llm": 3.034, "rc_llm": 3.086,
|
| 426 |
+
"date": "2025-01-14", "contact": "",
|
| 427 |
+
},
|
| 428 |
+
# --- 2025 Qwen2.5VL-7B ---
|
| 429 |
+
{
|
| 430 |
+
"model_name": "Qwen2.5VL-7B",
|
| 431 |
+
"organization": "Alibaba",
|
| 432 |
+
"cvs_acc": 0.105, "nap_acc": 0.151, "sa_acc": 0.010, "stg_miou": 0.020,
|
| 433 |
+
"tag_miou_03": 0.006, "tag_miou_05": 0.068,
|
| 434 |
+
"dvc_f1": 0.075, "dvc_llm": 2.512, "vs_llm": 2.452, "rc_llm": 2.090,
|
| 435 |
+
"date": "2025-01-14", "contact": "",
|
| 436 |
+
},
|
| 437 |
+
{
|
| 438 |
+
"model_name": "Qwen2.5VL-7B-Surg-CholecT50",
|
| 439 |
+
"organization": "NVIDIA",
|
| 440 |
+
"cvs_acc": 0.000, "nap_acc": 0.302, "sa_acc": 0.000, "stg_miou": 0.000,
|
| 441 |
+
"tag_miou_03": 0.019, "tag_miou_05": 0.013,
|
| 442 |
+
"dvc_f1": 0.051, "dvc_llm": 1.945, "vs_llm": 2.101, "rc_llm": 2.986,
|
| 443 |
+
"date": "2025-01-14", "contact": "",
|
| 444 |
+
},
|
| 445 |
+
{
|
| 446 |
+
"model_name": "Qwen2.5VL-7B-SFT",
|
| 447 |
+
"organization": "UII America (Ours)",
|
| 448 |
+
"cvs_acc": 0.894, "nap_acc": 0.442, "sa_acc": 0.218, "stg_miou": 0.177,
|
| 449 |
+
"tag_miou_03": 0.142, "tag_miou_05": 0.091,
|
| 450 |
+
"dvc_f1": 0.165, "dvc_llm": 3.665, "vs_llm": 3.596, "rc_llm": 2.757,
|
| 451 |
+
"date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"model_name": "Qwen2.5VL-7B-MedGRPO",
|
| 455 |
+
"organization": "UII America (Ours)",
|
| 456 |
+
"cvs_acc": 0.896, "nap_acc": 0.405, "sa_acc": 0.254, "stg_miou": 0.202,
|
| 457 |
+
"tag_miou_03": 0.216, "tag_miou_05": 0.156,
|
| 458 |
+
"dvc_f1": 0.214, "dvc_llm": 3.797, "vs_llm": 4.184, "rc_llm": 3.442,
|
| 459 |
+
"date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
|
| 460 |
+
},
|
| 461 |
+
# --- 2025 Qwen3-VL-4B ---
|
| 462 |
+
{
|
| 463 |
+
"model_name": "Qwen3VL-4B",
|
| 464 |
+
"organization": "Alibaba",
|
| 465 |
+
"cvs_acc": 0.000, "nap_acc": 0.178, "sa_acc": 0.006, "stg_miou": 0.000,
|
| 466 |
+
"tag_miou_03": 0.039, "tag_miou_05": 0.034,
|
| 467 |
+
"dvc_f1": 0.128, "dvc_llm": 1.939, "vs_llm": 2.926, "rc_llm": 2.853,
|
| 468 |
+
"date": "2025-01-14", "contact": "",
|
| 469 |
+
},
|
| 470 |
+
{
|
| 471 |
+
"model_name": "Qwen3VL-4B-SFT",
|
| 472 |
+
"organization": "UII America (Ours)",
|
| 473 |
+
"cvs_acc": 0.895, "nap_acc": 0.466, "sa_acc": 0.270, "stg_miou": 0.133,
|
| 474 |
+
"tag_miou_03": 0.465, "tag_miou_05": 0.403,
|
| 475 |
+
"dvc_f1": 0.435, "dvc_llm": 3.862, "vs_llm": 4.180, "rc_llm": 3.752,
|
| 476 |
+
"date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
|
| 477 |
+
},
|
| 478 |
+
{
|
| 479 |
+
"model_name": "Qwen3VL-4B-MedGRPO",
|
| 480 |
+
"organization": "UII America (Ours)",
|
| 481 |
+
"cvs_acc": 0.898, "nap_acc": 0.473, "sa_acc": 0.285, "stg_miou": 0.176,
|
| 482 |
+
"tag_miou_03": 0.504, "tag_miou_05": 0.441,
|
| 483 |
+
"dvc_f1": 0.480, "dvc_llm": 3.950, "vs_llm": 4.227, "rc_llm": 3.861,
|
| 484 |
+
"date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
|
| 485 |
+
},
|
| 486 |
+
# --- 2026 Off-the-shelf Baselines & Qwen3.5-4B ---
|
| 487 |
+
{
|
| 488 |
+
"model_name": "GPT-5.4",
|
| 489 |
+
"organization": "OpenAI",
|
| 490 |
+
"cvs_acc": 0.164, "nap_acc": 0.393, "sa_acc": 0.267, "stg_miou": 0.004,
|
| 491 |
+
"tag_miou_03": 0.086, "tag_miou_05": 0.055,
|
| 492 |
+
"dvc_f1": 0.178, "dvc_llm": 3.403, "vs_llm": 3.976, "rc_llm": 3.714,
|
| 493 |
+
"date": "2026-04-13", "contact": "",
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"model_name": "Gemini-3.1-flash-lite",
|
| 497 |
+
"organization": "Google",
|
| 498 |
+
"cvs_acc": 0.242, "nap_acc": 0.406, "sa_acc": 0.225, "stg_miou": 0.059,
|
| 499 |
+
"tag_miou_03": 0.072, "tag_miou_05": 0.049,
|
| 500 |
+
"dvc_f1": 0.174, "dvc_llm": 3.198, "vs_llm": 3.737, "rc_llm": 3.492,
|
| 501 |
+
"date": "2026-04-13", "contact": "",
|
| 502 |
+
},
|
| 503 |
+
{
|
| 504 |
+
"model_name": "Qwen3.5-4B",
|
| 505 |
+
"organization": "Alibaba",
|
| 506 |
+
"cvs_acc": 0.309, "nap_acc": 0.231, "sa_acc": 0.276, "stg_miou": 0.051,
|
| 507 |
+
"tag_miou_03": 0.074, "tag_miou_05": 0.040,
|
| 508 |
+
"dvc_f1": 0.142, "dvc_llm": 2.699, "vs_llm": 3.491, "rc_llm": 3.037,
|
| 509 |
+
"date": "2026-04-13", "contact": "",
|
| 510 |
+
},
|
| 511 |
+
{
|
| 512 |
+
"model_name": "Qwen3.5-4B-SFT",
|
| 513 |
+
"organization": "UII America (Ours)",
|
| 514 |
+
"cvs_acc": 0.897, "nap_acc": 0.576, "sa_acc": 0.354, "stg_miou": 0.190,
|
| 515 |
+
"tag_miou_03": 0.482, "tag_miou_05": 0.429,
|
| 516 |
+
"dvc_f1": 0.451, "dvc_llm": 3.741, "vs_llm": 4.238, "rc_llm": 3.746,
|
| 517 |
+
"date": "2026-04-13", "contact": "gaozhongpai@gmail.com",
|
| 518 |
+
},
|
| 519 |
+
]
|
| 520 |
+
|
| 521 |
+
|
| 522 |
+
def populate_default_data():
|
| 523 |
+
"""
|
| 524 |
+
Populate leaderboard with default baseline entries if they are missing.
|
| 525 |
+
Called at startup to ensure all known models are present.
|
| 526 |
+
Only adds entries that don't already exist (by model_name).
|
| 527 |
+
"""
|
| 528 |
+
df = load_leaderboard()
|
| 529 |
+
existing_names = set(df['model_name'].values) if not df.empty else set()
|
| 530 |
+
|
| 531 |
+
new_entries = []
|
| 532 |
+
for entry in DEFAULT_LEADERBOARD_ENTRIES:
|
| 533 |
+
if entry["model_name"] not in existing_names:
|
| 534 |
+
new_entries.append(entry)
|
| 535 |
+
|
| 536 |
+
if not new_entries:
|
| 537 |
+
print(f"β Default data check: all {len(DEFAULT_LEADERBOARD_ENTRIES)} baseline entries already present")
|
| 538 |
+
return
|
| 539 |
+
|
| 540 |
+
print(f"π Populating {len(new_entries)} default baseline entries...")
|
| 541 |
+
|
| 542 |
+
new_df = pd.DataFrame(new_entries)
|
| 543 |
+
if df.empty:
|
| 544 |
+
df = new_df
|
| 545 |
+
else:
|
| 546 |
+
df = pd.concat([df, new_df], ignore_index=True)
|
| 547 |
+
|
| 548 |
+
# Sort by cvs_acc descending (consistent with load_leaderboard)
|
| 549 |
+
if 'cvs_acc' in df.columns:
|
| 550 |
+
df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
|
| 551 |
+
|
| 552 |
+
save_leaderboard(df)
|
| 553 |
+
print(f"β Populated {len(new_entries)} default entries. Total: {len(df)} models")
|
| 554 |
+
|
| 555 |
+
|
| 556 |
# ============================================================================
|
| 557 |
# Admin Functions
|
| 558 |
# ============================================================================
|
|
|
|
| 1343 |
new_entry = {
|
| 1344 |
"model_name": model_name,
|
| 1345 |
"organization": organization,
|
| 1346 |
+
**{metric: round(metrics.get(metric, 0.0), 3) for metric in METRICS.keys()},
|
| 1347 |
"date": datetime.now().strftime("%Y-%m-%d"),
|
| 1348 |
"contact": contact
|
| 1349 |
}
|
|
|
|
| 1382 |
success_msg += "\n### π Metric Scores\n"
|
| 1383 |
for metric_key, metric_info in METRICS.items():
|
| 1384 |
score = metrics.get(metric_key, 0.0)
|
| 1385 |
+
success_msg += f"- **{metric_info['name']}**: {score:.3f}\n"
|
| 1386 |
|
| 1387 |
rank = df[df['model_name'] == model_name].index[0] + 1
|
| 1388 |
success_msg += f"\n### π Ranking\n**Rank**: #{rank} out of {len(df)} models\n"
|
|
|
|
| 1392 |
|
| 1393 |
|
| 1394 |
def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
|
| 1395 |
+
"""Format leaderboard dataframe for display with 10 metrics (no average).
|
| 1396 |
+
All metric values are rounded to 3 decimal places to match the project page table."""
|
| 1397 |
if df.empty:
|
| 1398 |
return df
|
| 1399 |
|
|
|
|
| 1418 |
# Rename columns for display
|
| 1419 |
display_df = df[display_cols].copy()
|
| 1420 |
|
| 1421 |
+
# Round all metric columns to 3 decimal places for consistent display
|
| 1422 |
+
for metric_key in METRICS.keys():
|
| 1423 |
+
if metric_key in display_df.columns:
|
| 1424 |
+
display_df[metric_key] = display_df[metric_key].apply(
|
| 1425 |
+
lambda x: round(float(x), 3) if pd.notna(x) else 0.0
|
| 1426 |
+
)
|
| 1427 |
+
|
| 1428 |
# Build column names
|
| 1429 |
column_names = []
|
| 1430 |
for col in display_cols:
|
|
|
|
| 1706 |
if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
|
| 1707 |
# Update leaderboard
|
| 1708 |
df = load_leaderboard()
|
| 1709 |
+
df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 3)
|
| 1710 |
+
df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 3)
|
| 1711 |
+
df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 3)
|
| 1712 |
df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
|
| 1713 |
save_leaderboard(df)
|
| 1714 |
|
|
|
|
| 1716 |
update_llm_judge_status(
|
| 1717 |
model_name,
|
| 1718 |
'completed',
|
| 1719 |
+
f"DVC: {dvc_llm:.3f}, VS: {vs_llm:.3f}, RC: {rc_llm:.3f}"
|
| 1720 |
)
|
| 1721 |
else:
|
| 1722 |
update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
|
|
|
|
| 1789 |
**Completed**: {msg}
|
| 1790 |
|
| 1791 |
### π Caption Metrics
|
| 1792 |
+
- **DVC_llm**: {dvc:.3f}
|
| 1793 |
+
- **VS_llm**: {vs:.3f}
|
| 1794 |
+
- **RC_llm**: {rc:.3f}
|
| 1795 |
|
| 1796 |
β Leaderboard has been updated!
|
| 1797 |
|
|
|
|
| 1813 |
return f"βΉοΈ **Status**: {status}\n\n{msg}"
|
| 1814 |
|
| 1815 |
|
| 1816 |
+
# Populate default baseline data on startup
|
| 1817 |
+
print("=" * 60)
|
| 1818 |
+
print("POPULATING DEFAULT LEADERBOARD DATA")
|
| 1819 |
+
print("=" * 60)
|
| 1820 |
+
populate_default_data()
|
| 1821 |
+
print("=" * 60)
|
| 1822 |
+
|
| 1823 |
# Create Gradio interface
|
| 1824 |
with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
| 1825 |
|