MedGRPO Team Claude Opus 4.7 (1M context) committed on
Commit
faf76da
·
1 Parent(s): a690dc6

Rank leaderboard by average rank across all 10 metrics

Browse files

Previously sorted by cvs_acc only, which is misleading when top models
cluster within 0.004 on that single metric. The new sort computes each
model's rank per metric (1 = best; ties share the smaller rank; NaN → last),
averages those ranks, and sorts ascending (lower avg rank = better).

- Add sort_by_avg_rank helper
- Replace all 7 sort_values('cvs_acc', ascending=False) call sites
- Add "How Models Are Ranked" section to the About tab
- Refresh stale CVS-sort comments

Robust to metric-scale differences (accuracy 0–1 vs LLM-judge 1–5) and
rewards models that are strong across tasks rather than exceptional
on one.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +44 -10
app.py CHANGED
@@ -216,6 +216,34 @@ TEST_SET_STATS = {
216
  }
217
 
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  def load_leaderboard() -> pd.DataFrame:
220
  """
221
  Load leaderboard from private HuggingFace repo.
@@ -245,9 +273,9 @@ def load_leaderboard() -> pd.DataFrame:
245
  if 'average' in df.columns:
246
  df = df.drop('average', axis=1)
247
 
248
- # Sort by first metric (CVS_acc) descending
249
  if 'cvs_acc' in df.columns:
250
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
251
 
252
  print(f"βœ“ Loaded leaderboard from private repo: {len(df)} entries")
253
  return df
@@ -268,9 +296,9 @@ def load_leaderboard() -> pd.DataFrame:
268
  if 'average' in df.columns:
269
  df = df.drop('average', axis=1)
270
 
271
- # Sort by first metric (CVS_acc) descending - no overall average
272
  if 'cvs_acc' in df.columns:
273
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
274
 
275
  print(f"βœ“ Loaded leaderboard from local file: {len(df)} entries")
276
  return df
@@ -365,7 +393,7 @@ def load_official_leaderboard() -> pd.DataFrame:
365
  if data:
366
  df = pd.DataFrame(data)
367
  if 'cvs_acc' in df.columns:
368
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
369
  print(f"βœ“ Loaded official leaderboard from private repo: {len(df)} entries")
370
  return df
371
  except Exception as e:
@@ -380,7 +408,7 @@ def load_official_leaderboard() -> pd.DataFrame:
380
  if data:
381
  df = pd.DataFrame(data)
382
  if 'cvs_acc' in df.columns:
383
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
384
  print(f"βœ“ Loaded official leaderboard from local file: {len(df)} entries")
385
  return df
386
 
@@ -443,7 +471,7 @@ def add_to_official_leaderboard(model_name: str, organization: str, metrics: Dic
443
  }
444
 
445
  df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
446
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
447
  save_official_leaderboard(df)
448
 
449
  return True, f"βœ“ Added '{model_name}' to official leaderboard (rank #{df[df['model_name'] == model_name].index[0] + 1})"
@@ -1452,8 +1480,8 @@ def submit_model(file, model_name: str, organization: str, contact: str = "", mo
1452
  }
1453
 
1454
  df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
1455
- # Sort by first metric (CVS_acc)
1456
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
1457
 
1458
  save_leaderboard(df)
1459
 
@@ -1946,7 +1974,7 @@ Evaluation logs are being written to:
1946
  df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 3)
1947
  df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 3)
1948
  df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 3)
1949
- df = df.sort_values('cvs_acc', ascending=False).reset_index(drop=True)
1950
  save_leaderboard(df)
1951
 
1952
  # Update status to completed
@@ -2306,6 +2334,12 @@ with gr.Blocks(title="MedVidBench Leaderboard", theme=gr.themes.Soft()) as demo:
2306
 
2307
  ---
2308
 
 
 
 
 
 
 
2309
  ### Benchmark Tasks
2310
  """)
2311
 
 
216
  }
217
 
218
 
219
def sort_by_avg_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Sort the leaderboard by average rank across all metrics.

    Every column of ``df`` whose name is a key of ``METRICS`` is coerced to
    numeric (unparseable values become NaN) and ranked descending, so the
    largest value gets rank 1. Ties within a metric share the smaller rank
    (competition ranking) and missing values are ranked at the bottom of
    that metric. A model's score is the mean of its per-metric ranks; rows
    are ordered by that mean, ascending (lower is better).

    Args:
        df: Leaderboard table, one row per model.

    Returns:
        A new DataFrame with the same columns as ``df``, rows reordered by
        average rank and the index reset to ``0..n-1``. If ``df`` is empty
        or contains none of the ``METRICS`` columns, the rows keep their
        original order and only the index is reset.
    """
    metric_keys = [k for k in METRICS if k in df.columns]
    if df.empty or not metric_keys:
        return df.reset_index(drop=True)

    # Coerce first so strings / None in a metric column rank as missing
    # instead of raising.
    scores = df[metric_keys].apply(pd.to_numeric, errors="coerce")
    # Column-wise ranking: descending, ties share the smaller rank
    # (method="min"), NaN goes last.
    ranks = scores.rank(ascending=False, method="min", na_option="bottom")

    # Sort positionally instead of through a scratch column, so an existing
    # column in ``df`` can never be overwritten or dropped by accident.
    avg_rank = ranks.mean(axis=1).reset_index(drop=True)
    # mergesort is stable: rows with equal average rank keep their input order.
    order = avg_rank.sort_values(ascending=True, kind="mergesort").index.to_numpy()
    return df.iloc[order].reset_index(drop=True)
245
+
246
+
247
  def load_leaderboard() -> pd.DataFrame:
248
  """
249
  Load leaderboard from private HuggingFace repo.
 
273
  if 'average' in df.columns:
274
  df = df.drop('average', axis=1)
275
 
276
+ # Sort by average rank across all metrics (lower avg rank = better)
277
  if 'cvs_acc' in df.columns:
278
+ df = sort_by_avg_rank(df)
279
 
280
  print(f"βœ“ Loaded leaderboard from private repo: {len(df)} entries")
281
  return df
 
296
  if 'average' in df.columns:
297
  df = df.drop('average', axis=1)
298
 
299
+ # Sort by average rank across all metrics (lower avg rank = better)
300
  if 'cvs_acc' in df.columns:
301
+ df = sort_by_avg_rank(df)
302
 
303
  print(f"βœ“ Loaded leaderboard from local file: {len(df)} entries")
304
  return df
 
393
  if data:
394
  df = pd.DataFrame(data)
395
  if 'cvs_acc' in df.columns:
396
+ df = sort_by_avg_rank(df)
397
  print(f"βœ“ Loaded official leaderboard from private repo: {len(df)} entries")
398
  return df
399
  except Exception as e:
 
408
  if data:
409
  df = pd.DataFrame(data)
410
  if 'cvs_acc' in df.columns:
411
+ df = sort_by_avg_rank(df)
412
  print(f"βœ“ Loaded official leaderboard from local file: {len(df)} entries")
413
  return df
414
 
 
471
  }
472
 
473
  df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
474
+ df = sort_by_avg_rank(df)
475
  save_official_leaderboard(df)
476
 
477
  return True, f"βœ“ Added '{model_name}' to official leaderboard (rank #{df[df['model_name'] == model_name].index[0] + 1})"
 
1480
  }
1481
 
1482
  df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)
1483
+ # Sort by average rank across all metrics (lower avg rank = better)
1484
+ df = sort_by_avg_rank(df)
1485
 
1486
  save_leaderboard(df)
1487
 
 
1974
  df.loc[df['model_name'] == model_name, 'dvc_llm'] = round(dvc_llm, 3)
1975
  df.loc[df['model_name'] == model_name, 'vs_llm'] = round(vs_llm, 3)
1976
  df.loc[df['model_name'] == model_name, 'rc_llm'] = round(rc_llm, 3)
1977
+ df = sort_by_avg_rank(df)
1978
  save_leaderboard(df)
1979
 
1980
  # Update status to completed
 
2334
 
2335
  ---
2336
 
2337
+ ### How Models Are Ranked
2338
+
2339
+ Models are ranked by **average rank across all 10 metrics** — lower average rank = better. For each metric we rank every model (1 = best; ties share the smaller rank), then average those per-metric ranks. This is robust to different metric scales (accuracy 0–1 vs. LLM-judge 1–5) and rewards models that are strong across tasks rather than exceptional on one.
2340
+
2341
+ ---
2342
+
2343
  ### Benchmark Tasks
2344
  """)
2345