MedGRPO Team committed on
Commit
1a7ba72
·
1 Parent(s): 68d9818
Files changed (2) hide show
  1. app.py +28 -4
  2. evaluation/evaluate_predictions.py +1 -1
app.py CHANGED
@@ -1884,14 +1884,17 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
1884
  eval_wrapper = Path("evaluation/evaluate_predictions.py")
1885
  log_file = model_dir / "eval_llm_judge_log.txt"
1886
 
1887
- # Build command for background execution
 
 
1888
  cmd = [
1889
  sys.executable,
1890
  "-u",
1891
  str(eval_wrapper),
1892
  str(input_file),
1893
  "--grouping", "overall",
1894
- "--ground-truth", str(GROUND_TRUTH_FILE)
 
1895
  # NOTE: No --skip-llm-judge flag, so LLM judge will run
1896
  ]
1897
 
@@ -2076,13 +2079,34 @@ Refresh the Leaderboard tab to see updated rankings.
2076
  return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
2077
 
2078
  elif status == 'failed':
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2079
  return f"""
2080
  ## ❌ Evaluation Failed
2081
 
2082
  **Model**: {model_name}
2083
  **Error**: {msg}
2084
-
2085
- Please check the logs or try running the evaluation again.
2086
  """
2087
 
2088
  return f"ℹ️ **Status**: {status}\n\n{msg}"
 
1884
  eval_wrapper = Path("evaluation/evaluate_predictions.py")
1885
  log_file = model_dir / "eval_llm_judge_log.txt"
1886
 
1887
+ # Build command for background execution.
1888
+ # Scope to caption tasks only — rule-based metrics were already computed
1889
+ # in Step 1 and are deterministic, so recomputing them here is wasted work.
1890
  cmd = [
1891
  sys.executable,
1892
  "-u",
1893
  str(eval_wrapper),
1894
  str(input_file),
1895
  "--grouping", "overall",
1896
+ "--ground-truth", str(GROUND_TRUTH_FILE),
1897
+ "--tasks", "dvc", "vs", "rc",
1898
  # NOTE: No --skip-llm-judge flag, so LLM judge will run
1899
  ]
1900
 
 
2079
  return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
2080
 
2081
  elif status == 'failed':
2082
+ model_dir = RESULTS_DIR / model_name.replace(" ", "_")
2083
+ log_file = model_dir / "eval_llm_judge_log.txt"
2084
+
2085
+ log_section = ""
2086
+ try:
2087
+ if log_file.exists():
2088
+ with open(log_file, 'r') as f:
2089
+ lines = f.readlines()
2090
+ log_preview = ''.join(lines[-50:]) if lines else "(log file is empty)"
2091
+ log_section = f"""
2092
+ ### 📝 Recent Logs (last 50 lines)
2093
+
2094
+ ```
2095
+ {log_preview}
2096
+ ```
2097
+ """
2098
+ else:
2099
+ log_section = f"\n⚠️ Log file not found: `{log_file}`\n"
2100
+ except Exception as e:
2101
+ log_section = f"\n⚠️ Unable to read logs: {e}\n"
2102
+
2103
  return f"""
2104
  ## ❌ Evaluation Failed
2105
 
2106
  **Model**: {model_name}
2107
  **Error**: {msg}
2108
+ {log_section}
2109
+ Please review the logs above and try running the evaluation again.
2110
  """
2111
 
2112
  return f"ℹ️ **Status**: {status}\n\n{msg}"
evaluation/evaluate_predictions.py CHANGED
@@ -309,7 +309,7 @@ def main():
309
  help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
310
  parser.add_argument("--analyze-only", action="store_true",
311
  help="Only analyze the file structure without running evaluations")
312
- parser.add_argument("--skip-llm-judge", default=True, action="store_true",
313
  help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
314
 
315
  args = parser.parse_args()
 
309
  help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
310
  parser.add_argument("--analyze-only", action="store_true",
311
  help="Only analyze the file structure without running evaluations")
312
+ parser.add_argument("--skip-llm-judge", default=False, action="store_true",
313
  help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
314
 
315
  args = parser.parse_args()