MedGRPO Team committed on
Commit
18339c0
·
1 Parent(s): 5f41159

Fix evaluate_all_pai to pass --skip-llm-judge to task main() functions

Browse files

CRITICAL FIX: Addresses the root cause of the LLM judge still running despite the --skip-llm-judge flag.

Problem:
- _run_task_eval() was calling module.main() without any parameters
- main() functions had no way to receive the skip_llm_judge flag
- Result: Per-dataset evaluations ran LLM judge even when flag was set

Solution:
- Add skip_llm_judge parameter to _run_task_eval()
- Pass flag via sys.argv to main() functions
- main() functions (e.g., in eval_dvc.py) now parse --skip-llm-judge from sys.argv

Flow now works correctly:
1. app.py calls evaluate_predictions.py with --skip-llm-judge
2. evaluate_predictions.py calls evaluate_all_pai.run_evaluation(skip_llm_judge=True)
3. run_evaluation() calls _run_task_eval(task, file, skip_llm_judge=True)
4. _run_task_eval() sets sys.argv with --skip-llm-judge
5. module.main() parses sys.argv and correctly skips LLM judge

Tested: Per-dataset DVC evaluations now show 'Skipping LLM judge' message

Files changed (1) hide show
  1. evaluation/evaluate_all_pai.py +51 -41
evaluation/evaluate_all_pai.py CHANGED
@@ -701,55 +701,68 @@ def print_overall_evaluation_results(output_file, tasks, all_task_results, skip_
701
  traceback.print_exc()
702
 
703
 
704
- def _run_task_eval(task, output_file):
705
  """Helper function to run a single task evaluation.
706
 
707
  Args:
708
  task: Task name (e.g., 'tal', 'stg')
709
  output_file: Path to results JSON
 
710
 
711
  Returns:
712
  Dictionary of evaluation results
713
  """
714
  import sys
715
 
716
- if task == "dvc":
717
- module = load_eval_module("eval_dvc")
718
- task_results = module.main()
719
- elif task == "tal":
720
- module = load_eval_module("eval_tal")
721
- task_results = module.main()
722
- elif task == "next_action":
723
- module = load_eval_module("eval_next_action")
724
- task_results = module.main()
725
- elif task == "stg":
726
- module = load_eval_module("eval_stg")
727
- task_results = module.main()
728
- elif task == "rc":
729
- module = load_eval_module("eval_caption_llm_judge")
730
- # Evaluate region caption using LLM judge
731
- task_results = module.evaluate_caption_task(output_file, "region_caption")
732
- elif task == "vs":
733
- module = load_eval_module("eval_caption_llm_judge")
734
- # Evaluate video summary using LLM judge
735
- task_results = module.evaluate_caption_task(output_file, "video_summary")
736
- elif task == "skill_assessment":
737
- module = load_eval_module("eval_skill_assessment")
738
- task_results = module.main()
739
- elif task == "cvs_assessment":
740
- module = load_eval_module("eval_cvs_assessment")
741
- task_results = module.main()
742
- elif task == "gemini_structured":
743
- module = load_eval_module("eval_gemini_structured")
744
- task_results = module.main()
745
- elif task == "gpt_structured":
746
- module = load_eval_module("eval_gpt_structured")
747
- task_results = module.main()
748
- else:
749
- print(f"Unknown task: {task}")
750
- task_results = {}
751
 
752
- return task_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
753
 
754
 
755
  def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=False, skip_llm_judge=False):
@@ -848,14 +861,11 @@ def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=
848
  # Even in silent mode, show progress
849
  print(f"Evaluating {task.upper()}...", flush=True)
850
 
851
- # Set sys.argv for the task-specific main function
852
- sys.argv = ["eval_script", output_file]
853
-
854
  # Load the module dynamically and call main to get results
855
  try:
856
  # Optionally suppress output from eval modules
857
  # Note: Disabled redirect to show metrics even in silent mode
858
- task_results = _run_task_eval(task, output_file)
859
 
860
  # Store the results for this task
861
  all_task_results[task] = task_results if task_results else {}
 
701
  traceback.print_exc()
702
 
703
 
704
+ def _run_task_eval(task, output_file, skip_llm_judge=False):
705
  """Helper function to run a single task evaluation.
706
 
707
  Args:
708
  task: Task name (e.g., 'tal', 'stg')
709
  output_file: Path to results JSON
710
+ skip_llm_judge: If True, skip LLM judge for caption tasks (DVC, VS, RC)
711
 
712
  Returns:
713
  Dictionary of evaluation results
714
  """
715
  import sys
716
 
717
+ # Save original sys.argv
718
+ original_argv = sys.argv.copy()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
719
 
720
+ try:
721
+ # Set sys.argv for main() functions
722
+ sys.argv = ["eval_script", output_file]
723
+ if skip_llm_judge:
724
+ sys.argv.append("--skip-llm-judge")
725
+
726
+ if task == "dvc":
727
+ module = load_eval_module("eval_dvc")
728
+ task_results = module.main()
729
+ elif task == "tal":
730
+ module = load_eval_module("eval_tal")
731
+ task_results = module.main()
732
+ elif task == "next_action":
733
+ module = load_eval_module("eval_next_action")
734
+ task_results = module.main()
735
+ elif task == "stg":
736
+ module = load_eval_module("eval_stg")
737
+ task_results = module.main()
738
+ elif task == "rc":
739
+ module = load_eval_module("eval_caption_llm_judge")
740
+ # Evaluate region caption using LLM judge
741
+ task_results = module.evaluate_caption_task(output_file, "region_caption")
742
+ elif task == "vs":
743
+ module = load_eval_module("eval_caption_llm_judge")
744
+ # Evaluate video summary using LLM judge
745
+ task_results = module.evaluate_caption_task(output_file, "video_summary")
746
+ elif task == "skill_assessment":
747
+ module = load_eval_module("eval_skill_assessment")
748
+ task_results = module.main()
749
+ elif task == "cvs_assessment":
750
+ module = load_eval_module("eval_cvs_assessment")
751
+ task_results = module.main()
752
+ elif task == "gemini_structured":
753
+ module = load_eval_module("eval_gemini_structured")
754
+ task_results = module.main()
755
+ elif task == "gpt_structured":
756
+ module = load_eval_module("eval_gpt_structured")
757
+ task_results = module.main()
758
+ else:
759
+ print(f"Unknown task: {task}")
760
+ task_results = {}
761
+
762
+ return task_results
763
+ finally:
764
+ # Restore original sys.argv
765
+ sys.argv = original_argv
766
 
767
 
768
  def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=False, skip_llm_judge=False):
 
861
  # Even in silent mode, show progress
862
  print(f"Evaluating {task.upper()}...", flush=True)
863
 
 
 
 
864
  # Load the module dynamically and call main to get results
865
  try:
866
  # Optionally suppress output from eval modules
867
  # Note: Disabled redirect to show metrics even in silent mode
868
+ task_results = _run_task_eval(task, output_file, skip_llm_judge=skip_llm_judge)
869
 
870
  # Store the results for this task
871
  all_task_results[task] = task_results if task_results else {}