MedGRPO Team committed on
Commit
05d6c42
·
1 Parent(s): e2b1040
Files changed (1) hide show
  1. app.py +193 -16
app.py CHANGED
@@ -90,6 +90,8 @@ if not GROUND_TRUTH_FILE.exists():
90
  json.dump(GROUND_TRUTH, f)
91
  print(f"✓ Ground truth saved to {GROUND_TRUTH_FILE}")
92
 
 
 
93
  # MedVidBench Metrics Definitions (10 metrics from 8 tasks)
94
  # Note: TAL has 2 metrics, DVC has 2 metrics, others have 1 metric each
95
  METRICS = {
@@ -129,18 +131,18 @@ METRICS = {
129
  "higher_better": True,
130
  "description": "Mean IoU at threshold 0.5 for temporal localization"
131
  },
132
- "dvc_llm": {
133
- "name": "DVC_llm",
134
- "full_name": "Dense Video Captioning LLM Score",
135
- "higher_better": True,
136
- "description": "Caption quality score (LLM judge or semantic similarity)"
137
- },
138
  "dvc_f1": {
139
  "name": "DVC_F1",
140
  "full_name": "Dense Video Captioning F1",
141
  "higher_better": True,
142
  "description": "F1 score for temporal segment localization"
143
  },
 
 
 
 
 
 
144
  "vs_llm": {
145
  "name": "VS_llm",
146
  "full_name": "Video Summary LLM Score",
@@ -391,6 +393,166 @@ def backup_results_to_repo(model_name: str, results_dir: Path):
391
  print(f"⚠️ Failed to backup results: {e}")
392
 
393
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394
  # ============================================================================
395
  # Admin Functions
396
  # ============================================================================
@@ -1181,7 +1343,7 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", pr
1181
  new_entry = {
1182
  "model_name": model_name,
1183
  "organization": organization,
1184
- **{metric: round(metrics.get(metric, 0.0), 4) for metric in METRICS.keys()},
1185
  "date": datetime.now().strftime("%Y-%m-%d"),
1186
  "contact": contact
1187
  }
@@ -1220,7 +1382,7 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", pr
1220
  success_msg += "\n### 📈 Metric Scores\n"
1221
  for metric_key, metric_info in METRICS.items():
1222
  score = metrics.get(metric_key, 0.0)
1223
- success_msg += f"- **{metric_info['name']}**: {score:.4f}\n"
1224
 
1225
  rank = df[df['model_name'] == model_name].index[0] + 1
1226
  success_msg += f"\n### 🏆 Ranking\n**Rank**: #{rank} out of {len(df)} models\n"
@@ -1230,7 +1392,8 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", pr
1230
 
1231
 
1232
  def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
1233
- """Format leaderboard dataframe for display with 10 metrics (no average)."""
 
1234
  if df.empty:
1235
  return df
1236
 
@@ -1255,6 +1418,13 @@ def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
1255
  # Rename columns for display
1256
  display_df = df[display_cols].copy()
1257
 
 
 
 
 
 
 
 
1258
  # Build column names
1259
  column_names = []
1260
  for col in display_cols:
@@ -1536,9 +1706,9 @@ Evaluation logs are being written to:
1536
  if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
1537
  # Update leaderboard
1538
  df = load_leaderboard()
1539
- df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 4)
1540
- df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 4)
1541
- df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 4)
1542
  df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
1543
  save_leaderboard(df)
1544
 
@@ -1546,7 +1716,7 @@ Evaluation logs are being written to:
1546
  update_llm_judge_status(
1547
  model_name,
1548
  'completed',
1549
- f"DVC: {dvc_llm:.4f}, VS: {vs_llm:.4f}, RC: {rc_llm:.4f}"
1550
  )
1551
  else:
1552
  update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
@@ -1619,9 +1789,9 @@ def check_llm_judge_evaluation_status(model_name: str) -> str:
1619
  **Completed**: {msg}
1620
 
1621
  ### 📈 Caption Metrics
1622
- - **DVC_llm**: {dvc:.4f}
1623
- - **VS_llm**: {vs:.4f}
1624
- - **RC_llm**: {rc:.4f}
1625
 
1626
  ✓ Leaderboard has been updated!
1627
 
@@ -1643,6 +1813,13 @@ Please check the logs or try running the evaluation again.
1643
  return f"ℹ️ **Status**: {status}\n\n{msg}"
1644
 
1645
 
 
 
 
 
 
 
 
1646
  # Create Gradio interface
1647
  with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1648
 
 
90
  json.dump(GROUND_TRUTH, f)
91
  print(f"✓ Ground truth saved to {GROUND_TRUTH_FILE}")
92
 
93
+ # Note: Default leaderboard data is populated after all functions are defined (see below).
94
+
95
  # MedVidBench Metrics Definitions (10 metrics from 8 tasks)
96
  # Note: TAL has 2 metrics, DVC has 2 metrics, others have 1 metric each
97
  METRICS = {
 
131
  "higher_better": True,
132
  "description": "Mean IoU at threshold 0.5 for temporal localization"
133
  },
 
 
 
 
 
 
134
  "dvc_f1": {
135
  "name": "DVC_F1",
136
  "full_name": "Dense Video Captioning F1",
137
  "higher_better": True,
138
  "description": "F1 score for temporal segment localization"
139
  },
140
+ "dvc_llm": {
141
+ "name": "DVC_llm",
142
+ "full_name": "Dense Video Captioning LLM Score",
143
+ "higher_better": True,
144
+ "description": "Caption quality score (LLM judge or semantic similarity)"
145
+ },
146
  "vs_llm": {
147
  "name": "VS_llm",
148
  "full_name": "Video Summary LLM Score",
 
393
  print(f"⚠️ Failed to backup results: {e}")
394
 
395
 
396
+ # ============================================================================
397
+ # Default Data Population
398
+ # ============================================================================
399
+
400
+ # Default baseline entries from MedGRPO project page results table.
401
+ # These are pre-computed evaluation results for known models.
402
+ DEFAULT_LEADERBOARD_ENTRIES = [
403
+ # --- 2025 Off-the-shelf Baselines ---
404
+ {
405
+ "model_name": "GPT-4.1",
406
+ "organization": "OpenAI",
407
+ "cvs_acc": 0.018, "nap_acc": 0.250, "sa_acc": 0.087, "stg_miou": 0.014,
408
+ "tag_miou_03": 0.096, "tag_miou_05": 0.005,
409
+ "dvc_f1": 0.101, "dvc_llm": 2.438, "vs_llm": 2.490, "rc_llm": 2.080,
410
+ "date": "2025-01-14", "contact": "",
411
+ },
412
+ {
413
+ "model_name": "Gemini-2.5-Flash",
414
+ "organization": "Google",
415
+ "cvs_acc": 0.101, "nap_acc": 0.228, "sa_acc": 0.107, "stg_miou": 0.047,
416
+ "tag_miou_03": 0.045, "tag_miou_05": 0.021,
417
+ "dvc_f1": 0.084, "dvc_llm": 2.387, "vs_llm": 2.352, "rc_llm": 1.912,
418
+ "date": "2025-01-14", "contact": "",
419
+ },
420
+ {
421
+ "model_name": "VideoChat-R1.5-7B",
422
+ "organization": "OpenGVLab",
423
+ "cvs_acc": 0.000, "nap_acc": 0.270, "sa_acc": 0.006, "stg_miou": 0.000,
424
+ "tag_miou_03": 0.009, "tag_miou_05": 0.005,
425
+ "dvc_f1": 0.026, "dvc_llm": 1.723, "vs_llm": 3.034, "rc_llm": 3.086,
426
+ "date": "2025-01-14", "contact": "",
427
+ },
428
+ # --- 2025 Qwen2.5VL-7B ---
429
+ {
430
+ "model_name": "Qwen2.5VL-7B",
431
+ "organization": "Alibaba",
432
+ "cvs_acc": 0.105, "nap_acc": 0.151, "sa_acc": 0.010, "stg_miou": 0.020,
433
+ "tag_miou_03": 0.006, "tag_miou_05": 0.068,
434
+ "dvc_f1": 0.075, "dvc_llm": 2.512, "vs_llm": 2.452, "rc_llm": 2.090,
435
+ "date": "2025-01-14", "contact": "",
436
+ },
437
+ {
438
+ "model_name": "Qwen2.5VL-7B-Surg-CholecT50",
439
+ "organization": "NVIDIA",
440
+ "cvs_acc": 0.000, "nap_acc": 0.302, "sa_acc": 0.000, "stg_miou": 0.000,
441
+ "tag_miou_03": 0.019, "tag_miou_05": 0.013,
442
+ "dvc_f1": 0.051, "dvc_llm": 1.945, "vs_llm": 2.101, "rc_llm": 2.986,
443
+ "date": "2025-01-14", "contact": "",
444
+ },
445
+ {
446
+ "model_name": "Qwen2.5VL-7B-SFT",
447
+ "organization": "UII America (Ours)",
448
+ "cvs_acc": 0.894, "nap_acc": 0.442, "sa_acc": 0.218, "stg_miou": 0.177,
449
+ "tag_miou_03": 0.142, "tag_miou_05": 0.091,
450
+ "dvc_f1": 0.165, "dvc_llm": 3.665, "vs_llm": 3.596, "rc_llm": 2.757,
451
+ "date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
452
+ },
453
+ {
454
+ "model_name": "Qwen2.5VL-7B-MedGRPO",
455
+ "organization": "UII America (Ours)",
456
+ "cvs_acc": 0.896, "nap_acc": 0.405, "sa_acc": 0.254, "stg_miou": 0.202,
457
+ "tag_miou_03": 0.216, "tag_miou_05": 0.156,
458
+ "dvc_f1": 0.214, "dvc_llm": 3.797, "vs_llm": 4.184, "rc_llm": 3.442,
459
+ "date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
460
+ },
461
+ # --- 2025 Qwen3-VL-4B ---
462
+ {
463
+ "model_name": "Qwen3VL-4B",
464
+ "organization": "Alibaba",
465
+ "cvs_acc": 0.000, "nap_acc": 0.178, "sa_acc": 0.006, "stg_miou": 0.000,
466
+ "tag_miou_03": 0.039, "tag_miou_05": 0.034,
467
+ "dvc_f1": 0.128, "dvc_llm": 1.939, "vs_llm": 2.926, "rc_llm": 2.853,
468
+ "date": "2025-01-14", "contact": "",
469
+ },
470
+ {
471
+ "model_name": "Qwen3VL-4B-SFT",
472
+ "organization": "UII America (Ours)",
473
+ "cvs_acc": 0.895, "nap_acc": 0.466, "sa_acc": 0.270, "stg_miou": 0.133,
474
+ "tag_miou_03": 0.465, "tag_miou_05": 0.403,
475
+ "dvc_f1": 0.435, "dvc_llm": 3.862, "vs_llm": 4.180, "rc_llm": 3.752,
476
+ "date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
477
+ },
478
+ {
479
+ "model_name": "Qwen3VL-4B-MedGRPO",
480
+ "organization": "UII America (Ours)",
481
+ "cvs_acc": 0.898, "nap_acc": 0.473, "sa_acc": 0.285, "stg_miou": 0.176,
482
+ "tag_miou_03": 0.504, "tag_miou_05": 0.441,
483
+ "dvc_f1": 0.480, "dvc_llm": 3.950, "vs_llm": 4.227, "rc_llm": 3.861,
484
+ "date": "2025-01-14", "contact": "gaozhongpai@gmail.com",
485
+ },
486
+ # --- 2026 Off-the-shelf Baselines & Qwen3.5-4B ---
487
+ {
488
+ "model_name": "GPT-5.4",
489
+ "organization": "OpenAI",
490
+ "cvs_acc": 0.164, "nap_acc": 0.393, "sa_acc": 0.267, "stg_miou": 0.004,
491
+ "tag_miou_03": 0.086, "tag_miou_05": 0.055,
492
+ "dvc_f1": 0.178, "dvc_llm": 3.403, "vs_llm": 3.976, "rc_llm": 3.714,
493
+ "date": "2026-04-13", "contact": "",
494
+ },
495
+ {
496
+ "model_name": "Gemini-3.1-flash-lite",
497
+ "organization": "Google",
498
+ "cvs_acc": 0.242, "nap_acc": 0.406, "sa_acc": 0.225, "stg_miou": 0.059,
499
+ "tag_miou_03": 0.072, "tag_miou_05": 0.049,
500
+ "dvc_f1": 0.174, "dvc_llm": 3.198, "vs_llm": 3.737, "rc_llm": 3.492,
501
+ "date": "2026-04-13", "contact": "",
502
+ },
503
+ {
504
+ "model_name": "Qwen3.5-4B",
505
+ "organization": "Alibaba",
506
+ "cvs_acc": 0.309, "nap_acc": 0.231, "sa_acc": 0.276, "stg_miou": 0.051,
507
+ "tag_miou_03": 0.074, "tag_miou_05": 0.040,
508
+ "dvc_f1": 0.142, "dvc_llm": 2.699, "vs_llm": 3.491, "rc_llm": 3.037,
509
+ "date": "2026-04-13", "contact": "",
510
+ },
511
+ {
512
+ "model_name": "Qwen3.5-4B-SFT",
513
+ "organization": "UII America (Ours)",
514
+ "cvs_acc": 0.897, "nap_acc": 0.576, "sa_acc": 0.354, "stg_miou": 0.190,
515
+ "tag_miou_03": 0.482, "tag_miou_05": 0.429,
516
+ "dvc_f1": 0.451, "dvc_llm": 3.741, "vs_llm": 4.238, "rc_llm": 3.746,
517
+ "date": "2026-04-13", "contact": "gaozhongpai@gmail.com",
518
+ },
519
+ ]
520
+
521
+
522
+ def populate_default_data():
523
+ """
524
+ Populate leaderboard with default baseline entries if they are missing.
525
+ Called at startup to ensure all known models are present.
526
+ Only adds entries that don't already exist (by model_name).
527
+ """
528
+ df = load_leaderboard()
529
+ existing_names = set(df['model_name'].values) if not df.empty else set()
530
+
531
+ new_entries = []
532
+ for entry in DEFAULT_LEADERBOARD_ENTRIES:
533
+ if entry["model_name"] not in existing_names:
534
+ new_entries.append(entry)
535
+
536
+ if not new_entries:
537
+ print(f"✓ Default data check: all {len(DEFAULT_LEADERBOARD_ENTRIES)} baseline entries already present")
538
+ return
539
+
540
+ print(f"📋 Populating {len(new_entries)} default baseline entries...")
541
+
542
+ new_df = pd.DataFrame(new_entries)
543
+ if df.empty:
544
+ df = new_df
545
+ else:
546
+ df = pd.concat([df, new_df], ignore_index=True)
547
+
548
+ # Sort by cvs_acc descending (consistent with load_leaderboard)
549
+ if 'cvs_acc' in df.columns:
550
+ df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
551
+
552
+ save_leaderboard(df)
553
+ print(f"✓ Populated {len(new_entries)} default entries. Total: {len(df)} models")
554
+
555
+
556
  # ============================================================================
557
  # Admin Functions
558
  # ============================================================================
 
1343
  new_entry = {
1344
  "model_name": model_name,
1345
  "organization": organization,
1346
+ **{metric: round(metrics.get(metric, 0.0), 3) for metric in METRICS.keys()},
1347
  "date": datetime.now().strftime("%Y-%m-%d"),
1348
  "contact": contact
1349
  }
 
1382
  success_msg += "\n### 📈 Metric Scores\n"
1383
  for metric_key, metric_info in METRICS.items():
1384
  score = metrics.get(metric_key, 0.0)
1385
+ success_msg += f"- **{metric_info['name']}**: {score:.3f}\n"
1386
 
1387
  rank = df[df['model_name'] == model_name].index[0] + 1
1388
  success_msg += f"\n### 🏆 Ranking\n**Rank**: #{rank} out of {len(df)} models\n"
 
1392
 
1393
 
1394
  def format_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
1395
+ """Format leaderboard dataframe for display with 10 metrics (no average).
1396
+ All metric values are rounded to 3 decimal places to match the project page table."""
1397
  if df.empty:
1398
  return df
1399
 
 
1418
  # Rename columns for display
1419
  display_df = df[display_cols].copy()
1420
 
1421
+ # Round all metric columns to 3 decimal places for consistent display
1422
+ for metric_key in METRICS.keys():
1423
+ if metric_key in display_df.columns:
1424
+ display_df[metric_key] = display_df[metric_key].apply(
1425
+ lambda x: round(float(x), 3) if pd.notna(x) else 0.0
1426
+ )
1427
+
1428
  # Build column names
1429
  column_names = []
1430
  for col in display_cols:
 
1706
  if dvc_llm > 0.0 or vs_llm > 0.0 or rc_llm > 0.0:
1707
  # Update leaderboard
1708
  df = load_leaderboard()
1709
+ df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 3)
1710
+ df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 3)
1711
+ df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 3)
1712
  df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
1713
  save_leaderboard(df)
1714
 
 
1716
  update_llm_judge_status(
1717
  model_name,
1718
  'completed',
1719
+ f"DVC: {dvc_llm:.3f}, VS: {vs_llm:.3f}, RC: {rc_llm:.3f}"
1720
  )
1721
  else:
1722
  update_llm_judge_status(model_name, 'failed', 'Failed to extract metrics')
 
1789
  **Completed**: {msg}
1790
 
1791
  ### 📈 Caption Metrics
1792
+ - **DVC_llm**: {dvc:.3f}
1793
+ - **VS_llm**: {vs:.3f}
1794
+ - **RC_llm**: {rc:.3f}
1795
 
1796
  ✓ Leaderboard has been updated!
1797
 
 
1813
  return f"ℹ️ **Status**: {status}\n\n{msg}"
1814
 
1815
 
1816
+ # Populate default baseline data on startup
1817
+ print("=" * 60)
1818
+ print("POPULATING DEFAULT LEADERBOARD DATA")
1819
+ print("=" * 60)
1820
+ populate_default_data()
1821
+ print("=" * 60)
1822
+
1823
  # Create Gradio interface
1824
  with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
1825