MedGRPO Team Claude Opus 4.7 (1M context) committed on
Commit ·
faf76da
1
Parent(s): a690dc6
Rank leaderboard by average rank across all 10 metrics
Browse files
Previously sorted by cvs_acc only, which is misleading when top models
cluster within 0.004 on that single metric. The new sort computes each
model's rank per metric (1 = best; ties share smaller rank; NaN → last),
averages those ranks, and sorts ascending (lower avg rank = better).
- Add sort_by_avg_rank helper
- Replace all 7 sort_values('cvs_acc', ascending=False) call sites
- Add "How Models Are Ranked" section to the About tab
- Refresh stale CVS-sort comments
Robust to metric-scale differences (accuracy 0–1 vs LLM-judge 1–5) and
rewards models that are strong across tasks rather than exceptional
on one.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
app.py
CHANGED
|
@@ -216,6 +216,34 @@ TEST_SET_STATS = {
|
|
| 216 |
}
|
| 217 |
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
def load_leaderboard() -> pd.DataFrame:
|
| 220 |
"""
|
| 221 |
Load leaderboard from private HuggingFace repo.
|
|
@@ -245,9 +273,9 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 245 |
if 'average' in df.columns:
|
| 246 |
df = df.drop('average', axis=1)
|
| 247 |
|
| 248 |
-
# Sort by
|
| 249 |
if 'cvs_acc' in df.columns:
|
| 250 |
-
df =
|
| 251 |
|
| 252 |
print(f"β Loaded leaderboard from private repo: {len(df)} entries")
|
| 253 |
return df
|
|
@@ -268,9 +296,9 @@ def load_leaderboard() -> pd.DataFrame:
|
|
| 268 |
if 'average' in df.columns:
|
| 269 |
df = df.drop('average', axis=1)
|
| 270 |
|
| 271 |
-
# Sort by
|
| 272 |
if 'cvs_acc' in df.columns:
|
| 273 |
-
df =
|
| 274 |
|
| 275 |
print(f"β Loaded leaderboard from local file: {len(df)} entries")
|
| 276 |
return df
|
|
@@ -365,7 +393,7 @@ def load_official_leaderboard() -> pd.DataFrame:
|
|
| 365 |
if data:
|
| 366 |
df = pd.DataFrame(data)
|
| 367 |
if 'cvs_acc' in df.columns:
|
| 368 |
-
df =
|
| 369 |
print(f"β Loaded official leaderboard from private repo: {len(df)} entries")
|
| 370 |
return df
|
| 371 |
except Exception as e:
|
|
@@ -380,7 +408,7 @@ def load_official_leaderboard() -> pd.DataFrame:
|
|
| 380 |
if data:
|
| 381 |
df = pd.DataFrame(data)
|
| 382 |
if 'cvs_acc' in df.columns:
|
| 383 |
-
df =
|
| 384 |
print(f"β Loaded official leaderboard from local file: {len(df)} entries")
|
| 385 |
return df
|
| 386 |
|
|
@@ -443,7 +471,7 @@ def add_to_official_leaderboard(model_name: str, organization: str, metrics: Dic
|
|
| 443 |
}
|
| 444 |
|
| 445 |
df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
|
| 446 |
-
df =
|
| 447 |
save_official_leaderboard(df)
|
| 448 |
|
| 449 |
return True, f"β Added '{model_name}' to official leaderboard (rank #{df[df['model_name'] == model_name].index[0] + 1})"
|
|
@@ -1452,8 +1480,8 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", mo
|
|
| 1452 |
}
|
| 1453 |
|
| 1454 |
df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
|
| 1455 |
-
# Sort by
|
| 1456 |
-
df =
|
| 1457 |
|
| 1458 |
save_leaderboard(df)
|
| 1459 |
|
|
@@ -1946,7 +1974,7 @@ Evaluation logs are being written to:
|
|
| 1946 |
df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 3)
|
| 1947 |
df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 3)
|
| 1948 |
df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 3)
|
| 1949 |
-
df =
|
| 1950 |
save_leaderboard(df)
|
| 1951 |
|
| 1952 |
# Update status to completed
|
|
@@ -2306,6 +2334,12 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 2306 |
|
| 2307 |
---
|
| 2308 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2309 |
### Benchmark Tasks
|
| 2310 |
""")
|
| 2311 |
|
|
|
|
| 216 |
}
|
| 217 |
|
| 218 |
|
| 219 |
+
def sort_by_avg_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Order leaderboard rows by their mean per-metric rank, best first.

    Only columns that are both keys of ``METRICS`` and present in ``df``
    take part. Each such column is coerced to numeric (values that cannot
    be parsed become NaN) and ranked so that the largest value gets rank 1;
    tied values share the smallest rank of their group, and NaN is ranked
    after every valid value. A row's score is the mean of its per-metric
    ranks, and rows are returned in ascending score order with a fresh
    0..n-1 index.

    Rows with equal scores keep their incoming relative order (stable
    sort). The caller's frame is not modified. When ``df`` is empty or
    contains no metric columns, it is returned unsorted with its index
    reset.
    """
    if df.empty:
        return df.reset_index(drop=True)

    present_metrics = [name for name in METRICS.keys() if name in df.columns]
    if not present_metrics:
        # Nothing to rank on: hand the rows back as-is, renumbered.
        return df.reset_index(drop=True)

    # One rank column per metric: descending, ties -> smallest rank, NaN last.
    rank_columns = {
        name: pd.to_numeric(df[name], errors="coerce").rank(
            ascending=False, method="min", na_option="bottom"
        )
        for name in present_metrics
    }
    mean_rank = pd.DataFrame(rank_columns).mean(axis=1)

    # Attach the score as a scratch column, sort on it with a stable
    # algorithm so equal scores preserve input order, then discard it.
    return (
        df.assign(_avg_rank=mean_rank)
        .sort_values("_avg_rank", ascending=True, kind="mergesort")
        .reset_index(drop=True)
        .drop(columns=["_avg_rank"])
    )
|
| 245 |
+
|
| 246 |
+
|
| 247 |
def load_leaderboard() -> pd.DataFrame:
|
| 248 |
"""
|
| 249 |
Load leaderboard from private HuggingFace repo.
|
|
|
|
| 273 |
if 'average' in df.columns:
|
| 274 |
df = df.drop('average', axis=1)
|
| 275 |
|
| 276 |
+
# Sort by average rank across all metrics (lower avg rank = better)
|
| 277 |
if 'cvs_acc' in df.columns:
|
| 278 |
+
df = sort_by_avg_rank(df)
|
| 279 |
|
| 280 |
print(f"β Loaded leaderboard from private repo: {len(df)} entries")
|
| 281 |
return df
|
|
|
|
| 296 |
if 'average' in df.columns:
|
| 297 |
df = df.drop('average', axis=1)
|
| 298 |
|
| 299 |
+
# Sort by average rank across all metrics (lower avg rank = better)
|
| 300 |
if 'cvs_acc' in df.columns:
|
| 301 |
+
df = sort_by_avg_rank(df)
|
| 302 |
|
| 303 |
print(f"β Loaded leaderboard from local file: {len(df)} entries")
|
| 304 |
return df
|
|
|
|
| 393 |
if data:
|
| 394 |
df = pd.DataFrame(data)
|
| 395 |
if 'cvs_acc' in df.columns:
|
| 396 |
+
df = sort_by_avg_rank(df)
|
| 397 |
print(f"β Loaded official leaderboard from private repo: {len(df)} entries")
|
| 398 |
return df
|
| 399 |
except Exception as e:
|
|
|
|
| 408 |
if data:
|
| 409 |
df = pd.DataFrame(data)
|
| 410 |
if 'cvs_acc' in df.columns:
|
| 411 |
+
df = sort_by_avg_rank(df)
|
| 412 |
print(f"β Loaded official leaderboard from local file: {len(df)} entries")
|
| 413 |
return df
|
| 414 |
|
|
|
|
| 471 |
}
|
| 472 |
|
| 473 |
df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
|
| 474 |
+
df = sort_by_avg_rank(df)
|
| 475 |
save_official_leaderboard(df)
|
| 476 |
|
| 477 |
return True, f"β Added '{model_name}' to official leaderboard (rank #{df[df['model_name'] == model_name].index[0] + 1})"
|
|
|
|
| 1480 |
}
|
| 1481 |
|
| 1482 |
df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
|
| 1483 |
+
# Sort by average rank across all metrics (lower avg rank = better)
|
| 1484 |
+
df = sort_by_avg_rank(df)
|
| 1485 |
|
| 1486 |
save_leaderboard(df)
|
| 1487 |
|
|
|
|
| 1974 |
df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 3)
|
| 1975 |
df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 3)
|
| 1976 |
df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 3)
|
| 1977 |
+
df = sort_by_avg_rank(df)
|
| 1978 |
save_leaderboard(df)
|
| 1979 |
|
| 1980 |
# Update status to completed
|
|
|
|
| 2334 |
|
| 2335 |
---
|
| 2336 |
|
| 2337 |
+
### How Models Are Ranked
|
| 2338 |
+
|
| 2339 |
+
Models are ranked by **average rank across all 10 metrics** — lower average rank = better. For each metric we rank every model (1 = best; ties share the smaller rank), then average those per-metric ranks. This is robust to different metric scales (accuracy 0–1 vs. LLM-judge 1–5) and rewards models that are strong across tasks rather than exceptional on one.
|
| 2340 |
+
|
| 2341 |
+
---
|
| 2342 |
+
|
| 2343 |
### Benchmark Tasks
|
| 2344 |
""")
|
| 2345 |
|