Spaces:

UII-AI
/

MedVidBench-Leaderboard

Running

App Files Community

MedGRPO Team committed about 1 month ago

Commit

1a7ba72

1 Parent(s): 68d9818

update

Browse files

Files changed (2) hide show

app.py +28 -4
evaluation/evaluate_predictions.py +1 -1

app.py CHANGED Viewed

@@ -1884,14 +1884,17 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
         eval_wrapper = Path("evaluation/evaluate_predictions.py")
         log_file = model_dir / "eval_llm_judge_log.txt"
-        # Build command for background execution
         cmd = [
             sys.executable,
             "-u",
             str(eval_wrapper),
             str(input_file),
             "--grouping", "overall",
-            "--ground-truth", str(GROUND_TRUTH_FILE)
             # NOTE: No --skip-llm-judge flag, so LLM judge will run
         ]
@@ -2076,13 +2079,34 @@ Refresh the Leaderboard tab to see updated rankings.
             return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
     elif status == 'failed':
         return f"""
 ## ❌ Evaluation Failed
 **Model**: {model_name}
 **Error**: {msg}
-Please check the logs or try running the evaluation again.
 """
     return f"ℹ️ **Status**: {status}\n\n{msg}"

         eval_wrapper = Path("evaluation/evaluate_predictions.py")
         log_file = model_dir / "eval_llm_judge_log.txt"
+        # Build command for background execution.
+        # Scope to caption tasks only — rule-based metrics were already computed
+        # in Step 1 and are deterministic, so recomputing them here is wasted work.
         cmd = [
             sys.executable,
             "-u",
             str(eval_wrapper),
             str(input_file),
             "--grouping", "overall",
+            "--ground-truth", str(GROUND_TRUTH_FILE),
+            "--tasks", "dvc", "vs", "rc",
             # NOTE: No --skip-llm-judge flag, so LLM judge will run
         ]
             return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
     elif status == 'failed':
+        model_dir = RESULTS_DIR / model_name.replace(" ", "_")
+        log_file = model_dir / "eval_llm_judge_log.txt"
+        log_section = ""
+        try:
+            if log_file.exists():
+                with open(log_file, 'r') as f:
+                    lines = f.readlines()
+                log_preview = ''.join(lines[-50:]) if lines else "(log file is empty)"
+                log_section = f"""
+### 📝 Recent Logs (last 50 lines)
+```
+{log_preview}
+```
+"""
+            else:
+                log_section = f"\n⚠️ Log file not found: `{log_file}`\n"
+        except Exception as e:
+            log_section = f"\n⚠️ Unable to read logs: {e}\n"
         return f"""
 ## ❌ Evaluation Failed
 **Model**: {model_name}
 **Error**: {msg}
+{log_section}
+Please review the logs above and try running the evaluation again.
 """
     return f"ℹ️ **Status**: {status}\n\n{msg}"

evaluation/evaluate_predictions.py CHANGED Viewed

@@ -309,7 +309,7 @@ def main():
                        help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
     parser.add_argument("--analyze-only", action="store_true",
                        help="Only analyze the file structure without running evaluations")
-    parser.add_argument("--skip-llm-judge", default=True, action="store_true",
                        help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
     args = parser.parse_args()

                        help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
     parser.add_argument("--analyze-only", action="store_true",
                        help="Only analyze the file structure without running evaluations")
+    parser.add_argument("--skip-llm-judge", default=False, action="store_true",
                        help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
     args = parser.parse_args()