MedGRPO Team committed on
Commit ·
1a7ba72
1
Parent(s): 68d9818
update
Browse files
- app.py +28 -4
- evaluation/evaluate_predictions.py +1 -1
app.py
CHANGED
|
@@ -1884,14 +1884,17 @@ def run_llm_judge_evaluation(model_name: str, progress=gr.Progress()) -> str:
|
|
| 1884 |
eval_wrapper = Path("evaluation/evaluate_predictions.py")
|
| 1885 |
log_file = model_dir / "eval_llm_judge_log.txt"
|
| 1886 |
|
| 1887 |
-
# Build command for background execution
|
|
|
|
|
|
|
| 1888 |
cmd = [
|
| 1889 |
sys.executable,
|
| 1890 |
"-u",
|
| 1891 |
str(eval_wrapper),
|
| 1892 |
str(input_file),
|
| 1893 |
"--grouping", "overall",
|
| 1894 |
-
"--ground-truth", str(GROUND_TRUTH_FILE)
|
|
|
|
| 1895 |
# NOTE: No --skip-llm-judge flag, so LLM judge will run
|
| 1896 |
]
|
| 1897 |
|
|
@@ -2076,13 +2079,34 @@ Refresh the Leaderboard tab to see updated rankings.
|
|
| 2076 |
return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
|
| 2077 |
|
| 2078 |
elif status == 'failed':
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2079 |
return f"""
|
| 2080 |
## ❌ Evaluation Failed
|
| 2081 |
|
| 2082 |
**Model**: {model_name}
|
| 2083 |
**Error**: {msg}
|
| 2084 |
-
|
| 2085 |
-
Please
|
| 2086 |
"""
|
| 2087 |
|
| 2088 |
return f"ℹ️ **Status**: {status}\n\n{msg}"
|
|
|
|
| 1884 |
eval_wrapper = Path("evaluation/evaluate_predictions.py")
|
| 1885 |
log_file = model_dir / "eval_llm_judge_log.txt"
|
| 1886 |
|
| 1887 |
+
# Build command for background execution.
|
| 1888 |
+
# Scope to caption tasks only — rule-based metrics were already computed
|
| 1889 |
+
# in Step 1 and are deterministic, so recomputing them here is wasted work.
|
| 1890 |
cmd = [
|
| 1891 |
sys.executable,
|
| 1892 |
"-u",
|
| 1893 |
str(eval_wrapper),
|
| 1894 |
str(input_file),
|
| 1895 |
"--grouping", "overall",
|
| 1896 |
+
"--ground-truth", str(GROUND_TRUTH_FILE),
|
| 1897 |
+
"--tasks", "dvc", "vs", "rc",
|
| 1898 |
# NOTE: No --skip-llm-judge flag, so LLM judge will run
|
| 1899 |
]
|
| 1900 |
|
|
|
|
| 2079 |
return f"✓ **Evaluation Complete**\n\n{msg}\n\n⚠️ Model not found in leaderboard"
|
| 2080 |
|
| 2081 |
elif status == 'failed':
|
| 2082 |
+
model_dir = RESULTS_DIR / model_name.replace(" ", "_")
|
| 2083 |
+
log_file = model_dir / "eval_llm_judge_log.txt"
|
| 2084 |
+
|
| 2085 |
+
log_section = ""
|
| 2086 |
+
try:
|
| 2087 |
+
if log_file.exists():
|
| 2088 |
+
with open(log_file, 'r') as f:
|
| 2089 |
+
lines = f.readlines()
|
| 2090 |
+
log_preview = ''.join(lines[-50:]) if lines else "(log file is empty)"
|
| 2091 |
+
log_section = f"""
|
| 2092 |
+
### 📝 Recent Logs (last 50 lines)
|
| 2093 |
+
|
| 2094 |
+
```
|
| 2095 |
+
{log_preview}
|
| 2096 |
+
```
|
| 2097 |
+
"""
|
| 2098 |
+
else:
|
| 2099 |
+
log_section = f"\n⚠️ Log file not found: `{log_file}`\n"
|
| 2100 |
+
except Exception as e:
|
| 2101 |
+
log_section = f"\n⚠️ Unable to read logs: {e}\n"
|
| 2102 |
+
|
| 2103 |
return f"""
|
| 2104 |
## ❌ Evaluation Failed
|
| 2105 |
|
| 2106 |
**Model**: {model_name}
|
| 2107 |
**Error**: {msg}
|
| 2108 |
+
{log_section}
|
| 2109 |
+
Please review the logs above and try running the evaluation again.
|
| 2110 |
"""
|
| 2111 |
|
| 2112 |
return f"ℹ️ **Status**: {status}\n\n{msg}"
|
evaluation/evaluate_predictions.py
CHANGED
|
@@ -309,7 +309,7 @@ def main():
|
|
| 309 |
help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
|
| 310 |
parser.add_argument("--analyze-only", action="store_true",
|
| 311 |
help="Only analyze the file structure without running evaluations")
|
| 312 |
-
parser.add_argument("--skip-llm-judge", default=
|
| 313 |
help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
|
| 314 |
|
| 315 |
args = parser.parse_args()
|
|
|
|
| 309 |
help="Grouping strategy: 'per-dataset' or 'overall' (default: overall)")
|
| 310 |
parser.add_argument("--analyze-only", action="store_true",
|
| 311 |
help="Only analyze the file structure without running evaluations")
|
| 312 |
+
parser.add_argument("--skip-llm-judge", default=False, action="store_true",
|
| 313 |
help="Skip LLM judge evaluation for caption tasks (use when LLM scores are pre-computed)")
|
| 314 |
|
| 315 |
args = parser.parse_args()
|