MedGRPO Team committed on
Commit ·
83aad2b
1
Parent(s): 4510cf8
Make validation flexible: accept both 'answer'/'response' and 'gnd'/'ground_truth' field names
Browse files
app.py
CHANGED
|
@@ -190,8 +190,8 @@ def validate_results_file(file_path: str) -> Tuple[bool, str]:
|
|
| 190 |
[
|
| 191 |
{
|
| 192 |
"question": "...",
|
| 193 |
-
"response": "...",
|
| 194 |
-
"ground_truth": "...",
|
| 195 |
"qa_type": "tal/stg/next_action/dvc/vs/rc/skill_assessment/cvs_assessment",
|
| 196 |
"metadata": {"video_id": "...", "fps": "...", ...},
|
| 197 |
"data_source": "AVOS/CholecT50/...",
|
|
@@ -199,6 +199,11 @@ def validate_results_file(file_path: str) -> Tuple[bool, str]:
|
|
| 199 |
},
|
| 200 |
...
|
| 201 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
"""
|
| 203 |
try:
|
| 204 |
with open(file_path, 'r') as f:
|
|
@@ -215,12 +220,24 @@ def validate_results_file(file_path: str) -> Tuple[bool, str]:
|
|
| 215 |
if len(records) == 0:
|
| 216 |
return False, "Empty results file"
|
| 217 |
|
| 218 |
-
# Check first record has required fields
|
| 219 |
sample = records[0]
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
if
|
| 223 |
-
return False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
# Check qa_type is valid
|
| 226 |
valid_qa_types = ["tal", "stg", "next_action", "dense_captioning", "video_summary", "region_caption",
|
|
@@ -571,8 +588,8 @@ with gr.Blocks(title="MedGRPO Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 571 |
[
|
| 572 |
{
|
| 573 |
"question": "<video>\\nQuestion text...",
|
| 574 |
-
"response": "Model's answer...",
|
| 575 |
-
"ground_truth": "Correct answer...",
|
| 576 |
"qa_type": "tal",
|
| 577 |
"metadata": {"video_id": "...", "fps": "1.0", ...},
|
| 578 |
"data_source": "AVOS",
|
|
@@ -582,6 +599,11 @@ with gr.Blocks(title="MedGRPO Leaderboard", theme=gr.themes.Soft()) as demo:
|
|
| 582 |
]
|
| 583 |
```
|
| 584 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 585 |
**Valid qa_types**: `tal`, `stg`, `next_action`, `dense_captioning`, `video_summary`, `region_caption`, `skill_assessment`, `cvs_assessment`
|
| 586 |
|
| 587 |
#### ⚙️ Evaluation Process
|
|
|
|
| 190 |
[
|
| 191 |
{
|
| 192 |
"question": "...",
|
| 193 |
+
"response": "..." or "answer": "...", # Either field name accepted
|
| 194 |
+
"ground_truth": "..." or "gnd": "...", # Either field name accepted
|
| 195 |
"qa_type": "tal/stg/next_action/dvc/vs/rc/skill_assessment/cvs_assessment",
|
| 196 |
"metadata": {"video_id": "...", "fps": "...", ...},
|
| 197 |
"data_source": "AVOS/CholecT50/...",
|
|
|
|
| 199 |
},
|
| 200 |
...
|
| 201 |
]
|
| 202 |
+
|
| 203 |
+
Notes:
|
| 204 |
+
- Accepts both 'response' and 'answer' for model output
|
| 205 |
+
- Accepts both 'ground_truth' and 'gnd' for reference answer
|
| 206 |
+
- Can be either a list or dict (dict values will be extracted)
|
| 207 |
"""
|
| 208 |
try:
|
| 209 |
with open(file_path, 'r') as f:
|
|
|
|
| 220 |
if len(records) == 0:
|
| 221 |
return False, "Empty results file"
|
| 222 |
|
| 223 |
+
# Check first record has required fields (flexible field names)
|
| 224 |
sample = records[0]
|
| 225 |
+
|
| 226 |
+
# Check for question field
|
| 227 |
+
if "question" not in sample:
|
| 228 |
+
return False, "Missing required field: 'question'"
|
| 229 |
+
|
| 230 |
+
# Check for response (accept 'response' or 'answer')
|
| 231 |
+
if "response" not in sample and "answer" not in sample:
|
| 232 |
+
return False, "Missing required field: 'response' or 'answer'"
|
| 233 |
+
|
| 234 |
+
# Check for ground truth (accept 'ground_truth' or 'gnd')
|
| 235 |
+
if "ground_truth" not in sample and "gnd" not in sample:
|
| 236 |
+
return False, "Missing required field: 'ground_truth' or 'gnd'"
|
| 237 |
+
|
| 238 |
+
# Check for qa_type
|
| 239 |
+
if "qa_type" not in sample:
|
| 240 |
+
return False, "Missing required field: 'qa_type'"
|
| 241 |
|
| 242 |
# Check qa_type is valid
|
| 243 |
valid_qa_types = ["tal", "stg", "next_action", "dense_captioning", "video_summary", "region_caption",
|
|
|
|
| 588 |
[
|
| 589 |
{
|
| 590 |
"question": "<video>\\nQuestion text...",
|
| 591 |
+
"response": "Model's answer...", // or "answer"
|
| 592 |
+
"ground_truth": "Correct answer...", // or "gnd"
|
| 593 |
"qa_type": "tal",
|
| 594 |
"metadata": {"video_id": "...", "fps": "1.0", ...},
|
| 595 |
"data_source": "AVOS",
|
|
|
|
| 599 |
]
|
| 600 |
```
|
| 601 |
|
| 602 |
+
**Note**: Both field naming conventions are accepted:
|
| 603 |
+
- Model output: `response` or `answer`
|
| 604 |
+
- Reference answer: `ground_truth` or `gnd`
|
| 605 |
+
- Format can be list or dict (dict values will be extracted)
|
| 606 |
+
|
| 607 |
**Valid qa_types**: `tal`, `stg`, `next_action`, `dense_captioning`, `video_summary`, `region_caption`, `skill_assessment`, `cvs_assessment`
|
| 608 |
|
| 609 |
#### ⚙️ Evaluation Process
|