Fix evaluate_all_pai to pass --skip-llm-judge to task main() functions
Browse files
CRITICAL FIX: This addresses the root cause of the LLM judge still running despite the --skip-llm-judge flag.
Problem:
- _run_task_eval() was calling module.main() without any parameters
- main() functions had no way to receive the skip_llm_judge flag
- Result: Per-dataset evaluations ran LLM judge even when flag was set
Solution:
- Add skip_llm_judge parameter to _run_task_eval()
- Pass flag via sys.argv to main() functions
- main() functions (e.g. the one in eval_dvc.py) now parse --skip-llm-judge from sys.argv
Flow now works correctly:
1. app.py calls evaluate_predictions.py with --skip-llm-judge
2. evaluate_predictions.py calls evaluate_all_pai.run_evaluation(skip_llm_judge=True)
3. run_evaluation() calls _run_task_eval(task, file, skip_llm_judge=True)
4. _run_task_eval() sets sys.argv with --skip-llm-judge
5. module.main() parses sys.argv and correctly skips LLM judge
Tested: Per-dataset DVC evaluations now show 'Skipping LLM judge' message
- evaluation/evaluate_all_pai.py +51 -41
|
@@ -701,55 +701,68 @@ def print_overall_evaluation_results(output_file, tasks, all_task_results, skip_
|
|
| 701 |
traceback.print_exc()
|
| 702 |
|
| 703 |
|
| 704 |
-
def _run_task_eval(task, output_file):
|
| 705 |
"""Helper function to run a single task evaluation.
|
| 706 |
|
| 707 |
Args:
|
| 708 |
task: Task name (e.g., 'tal', 'stg')
|
| 709 |
output_file: Path to results JSON
|
|
|
|
| 710 |
|
| 711 |
Returns:
|
| 712 |
Dictionary of evaluation results
|
| 713 |
"""
|
| 714 |
import sys
|
| 715 |
|
| 716 |
-
|
| 717 |
-
|
| 718 |
-
task_results = module.main()
|
| 719 |
-
elif task == "tal":
|
| 720 |
-
module = load_eval_module("eval_tal")
|
| 721 |
-
task_results = module.main()
|
| 722 |
-
elif task == "next_action":
|
| 723 |
-
module = load_eval_module("eval_next_action")
|
| 724 |
-
task_results = module.main()
|
| 725 |
-
elif task == "stg":
|
| 726 |
-
module = load_eval_module("eval_stg")
|
| 727 |
-
task_results = module.main()
|
| 728 |
-
elif task == "rc":
|
| 729 |
-
module = load_eval_module("eval_caption_llm_judge")
|
| 730 |
-
# Evaluate region caption using LLM judge
|
| 731 |
-
task_results = module.evaluate_caption_task(output_file, "region_caption")
|
| 732 |
-
elif task == "vs":
|
| 733 |
-
module = load_eval_module("eval_caption_llm_judge")
|
| 734 |
-
# Evaluate video summary using LLM judge
|
| 735 |
-
task_results = module.evaluate_caption_task(output_file, "video_summary")
|
| 736 |
-
elif task == "skill_assessment":
|
| 737 |
-
module = load_eval_module("eval_skill_assessment")
|
| 738 |
-
task_results = module.main()
|
| 739 |
-
elif task == "cvs_assessment":
|
| 740 |
-
module = load_eval_module("eval_cvs_assessment")
|
| 741 |
-
task_results = module.main()
|
| 742 |
-
elif task == "gemini_structured":
|
| 743 |
-
module = load_eval_module("eval_gemini_structured")
|
| 744 |
-
task_results = module.main()
|
| 745 |
-
elif task == "gpt_structured":
|
| 746 |
-
module = load_eval_module("eval_gpt_structured")
|
| 747 |
-
task_results = module.main()
|
| 748 |
-
else:
|
| 749 |
-
print(f"Unknown task: {task}")
|
| 750 |
-
task_results = {}
|
| 751 |
|
| 752 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
|
| 754 |
|
| 755 |
def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=False, skip_llm_judge=False):
|
|
@@ -848,14 +861,11 @@ def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=
|
|
| 848 |
# Even in silent mode, show progress
|
| 849 |
print(f"Evaluating {task.upper()}...", flush=True)
|
| 850 |
|
| 851 |
-
# Set sys.argv for the task-specific main function
|
| 852 |
-
sys.argv = ["eval_script", output_file]
|
| 853 |
-
|
| 854 |
# Load the module dynamically and call main to get results
|
| 855 |
try:
|
| 856 |
# Optionally suppress output from eval modules
|
| 857 |
# Note: Disabled redirect to show metrics even in silent mode
|
| 858 |
-
task_results = _run_task_eval(task, output_file)
|
| 859 |
|
| 860 |
# Store the results for this task
|
| 861 |
all_task_results[task] = task_results if task_results else {}
|
|
|
|
| 701 |
traceback.print_exc()
|
| 702 |
|
| 703 |
|
| 704 |
+
def _run_task_eval(task, output_file, skip_llm_judge=False):
    """Run the evaluation for one task and return its results.

    The per-task eval modules are driven through ``sys.argv``: for the
    duration of the call, ``sys.argv`` is replaced with
    ``["eval_script", output_file]`` (plus ``"--skip-llm-judge"`` when
    requested) and the caller's ``sys.argv`` is put back afterwards, even
    if the evaluation raises.

    Args:
        task: Task name (e.g., 'tal', 'stg').
        output_file: Path to results JSON.
        skip_llm_judge: If True, ``--skip-llm-judge`` is appended to the
            temporary ``sys.argv`` seen by the eval module.

    Returns:
        Whatever the task's eval module returns (expected to be a
        dictionary of evaluation results); an empty dict for an
        unrecognised task name.
    """
    import sys

    # Tasks evaluated by calling the module's argument-less main(), which
    # picks up its inputs from sys.argv.
    main_driven_modules = {
        "dvc": "eval_dvc",
        "tal": "eval_tal",
        "next_action": "eval_next_action",
        "stg": "eval_stg",
        "skill_assessment": "eval_skill_assessment",
        "cvs_assessment": "eval_cvs_assessment",
        "gemini_structured": "eval_gemini_structured",
        "gpt_structured": "eval_gpt_structured",
    }
    # Caption tasks share one module and are selected by category name.
    caption_categories = {
        "rc": "region_caption",
        "vs": "video_summary",
    }

    # Keep a copy of the caller's argv so it can be reinstated on exit.
    saved_argv = sys.argv.copy()
    try:
        eval_argv = ["eval_script", output_file]
        if skip_llm_judge:
            # NOTE(review): the flag is appended for every task, not only
            # the caption ones; confirm that each module's main() tolerates
            # an argument it may not know about.
            eval_argv.append("--skip-llm-judge")
        sys.argv = eval_argv

        if task in main_driven_modules:
            return load_eval_module(main_driven_modules[task]).main()

        if task in caption_categories:
            # NOTE(review): skip_llm_judge is not passed explicitly here; it
            # only reaches evaluate_caption_task if that function consults
            # sys.argv itself -- verify against eval_caption_llm_judge.
            judge_module = load_eval_module("eval_caption_llm_judge")
            return judge_module.evaluate_caption_task(
                output_file, caption_categories[task]
            )

        print(f"Unknown task: {task}")
        return {}
    finally:
        # Reinstate the caller's argv regardless of how the evaluation ended.
        sys.argv = saved_argv
|
| 766 |
|
| 767 |
|
| 768 |
def run_evaluation(output_file, tasks=None, grouping="per-dataset", silent_eval=False, skip_llm_judge=False):
|
|
|
|
| 861 |
# Even in silent mode, show progress
|
| 862 |
print(f"Evaluating {task.upper()}...", flush=True)
|
| 863 |
|
|
|
|
|
|
|
|
|
|
| 864 |
# Load the module dynamically and call main to get results
|
| 865 |
try:
|
| 866 |
# Optionally suppress output from eval modules
|
| 867 |
# Note: Disabled redirect to show metrics even in silent mode
|
| 868 |
+
task_results = _run_task_eval(task, output_file, skip_llm_judge=skip_llm_judge)
|
| 869 |
|
| 870 |
# Store the results for this task
|
| 871 |
all_task_results[task] = task_results if task_results else {}
|