MedGRPO Team committed on
Commit
83aad2b
·
1 Parent(s): 4510cf8

Make validation flexible: accept both 'answer'/'response' and 'gnd'/'ground_truth' field names

Browse files
Files changed (1) hide show
  1. app.py +31 -9
app.py CHANGED
@@ -190,8 +190,8 @@ def validate_results_file(file_path: str) -> Tuple[bool, str]:
190
  [
191
  {
192
  "question": "...",
193
- "response": "...",
194
- "ground_truth": "...",
195
  "qa_type": "tal/stg/next_action/dvc/vs/rc/skill_assessment/cvs_assessment",
196
  "metadata": {"video_id": "...", "fps": "...", ...},
197
  "data_source": "AVOS/CholecT50/...",
@@ -199,6 +199,11 @@ def validate_results_file(file_path: str) -> Tuple[bool, str]:
199
  },
200
  ...
201
  ]
 
 
 
 
 
202
  """
203
  try:
204
  with open(file_path, 'r') as f:
@@ -215,12 +220,24 @@ def validate_results_file(file_path: str) -> Tuple[bool, str]:
215
  if len(records) == 0:
216
  return False, "Empty results file"
217
 
218
- # Check first record has required fields
219
  sample = records[0]
220
- required_fields = ["question", "response", "qa_type"]
221
- missing = [f for f in required_fields if f not in sample]
222
- if missing:
223
- return False, f"Missing required fields: {missing}"
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
  # Check qa_type is valid
226
  valid_qa_types = ["tal", "stg", "next_action", "dense_captioning", "video_summary", "region_caption",
@@ -571,8 +588,8 @@ with gr.Blocks(title="MedGRPO Leaderboard", theme=gr.themes.Soft()) as demo:
571
  [
572
  {
573
  "question": "<video>\\nQuestion text...",
574
- "response": "Model's answer...",
575
- "ground_truth": "Correct answer...",
576
  "qa_type": "tal",
577
  "metadata": {"video_id": "...", "fps": "1.0", ...},
578
  "data_source": "AVOS",
@@ -582,6 +599,11 @@ with gr.Blocks(title="MedGRPO Leaderboard", theme=gr.themes.Soft()) as demo:
582
  ]
583
  ```
584
 
 
 
 
 
 
585
  **Valid qa_types**: `tal`, `stg`, `next_action`, `dense_captioning`, `video_summary`, `region_caption`, `skill_assessment`, `cvs_assessment`
586
 
587
  #### ⚙️ Evaluation Process
 
190
  [
191
  {
192
  "question": "...",
193
+ "response": "..." or "answer": "...", # Either field name accepted
194
+ "ground_truth": "..." or "gnd": "...", # Either field name accepted
195
  "qa_type": "tal/stg/next_action/dvc/vs/rc/skill_assessment/cvs_assessment",
196
  "metadata": {"video_id": "...", "fps": "...", ...},
197
  "data_source": "AVOS/CholecT50/...",
 
199
  },
200
  ...
201
  ]
202
+
203
+ Notes:
204
+ - Accepts both 'response' and 'answer' for model output
205
+ - Accepts both 'ground_truth' and 'gnd' for reference answer
206
+ - Can be either a list or dict (dict values will be extracted)
207
  """
208
  try:
209
  with open(file_path, 'r') as f:
 
220
  if len(records) == 0:
221
  return False, "Empty results file"
222
 
223
+ # Check first record has required fields (flexible field names)
224
  sample = records[0]
225
+
226
+ # Check for question field
227
+ if "question" not in sample:
228
+ return False, "Missing required field: 'question'"
229
+
230
+ # Check for response (accept 'response' or 'answer')
231
+ if "response" not in sample and "answer" not in sample:
232
+ return False, "Missing required field: 'response' or 'answer'"
233
+
234
+ # Check for ground truth (accept 'ground_truth' or 'gnd')
235
+ if "ground_truth" not in sample and "gnd" not in sample:
236
+ return False, "Missing required field: 'ground_truth' or 'gnd'"
237
+
238
+ # Check for qa_type
239
+ if "qa_type" not in sample:
240
+ return False, "Missing required field: 'qa_type'"
241
 
242
  # Check qa_type is valid
243
  valid_qa_types = ["tal", "stg", "next_action", "dense_captioning", "video_summary", "region_caption",
 
588
  [
589
  {
590
  "question": "<video>\\nQuestion text...",
591
+ "response": "Model's answer...", // or "answer"
592
+ "ground_truth": "Correct answer...", // or "gnd"
593
  "qa_type": "tal",
594
  "metadata": {"video_id": "...", "fps": "1.0", ...},
595
  "data_source": "AVOS",
 
599
  ]
600
  ```
601
 
602
+ **Note**: Both field naming conventions are accepted:
603
+ - Model output: `response` or `answer`
604
+ - Reference answer: `ground_truth` or `gnd`
605
+ - Format can be list or dict (dict values will be extracted)
606
+
607
  **Valid qa_types**: `tal`, `stg`, `next_action`, `dense_captioning`, `video_summary`, `region_caption`, `skill_assessment`, `cvs_assessment`
608
 
609
  #### ⚙️ Evaluation Process