MedVidBench-Leaderboard / test_llm_judge.py
MedGRPO Team
Add --skip-llm-judge flag for faster evaluation
3487a07
#!/usr/bin/env python3
"""
Test script to verify LLM judge is working correctly.
Run this after setting OPENAI_API_KEY.
"""
import os
import sys
import json
# Add the evaluation directory to the import path so the judge module can be
# imported by test_llm_judge(). The entry is anchored to this script's own
# location instead of the current working directory, so the import works no
# matter where the script is launched from.
# NOTE(review): assumes evaluation/ sits next to this script -- confirm.
sys.path.insert(
    0,
    os.path.join(os.path.dirname(os.path.abspath(__file__)), 'evaluation'),
)
def test_api_key():
    """Check that OPENAI_API_KEY is set to a non-blank value.

    Reads the key from the environment and prints a status line. A value that
    is empty or contains only whitespace is treated the same as a missing
    variable, since it cannot be a usable key.

    Returns:
        bool: True when a non-blank key is present, False otherwise.
    """
    # Strip so a stray-whitespace value does not pass as a valid key.
    api_key = (os.getenv('OPENAI_API_KEY') or '').strip()
    if not api_key:
        print("❌ OPENAI_API_KEY not set")
        print("\nSet it with:")
        print(" export OPENAI_API_KEY='your-key-here'")
        return False

    # Only the length is printed; the key itself is never echoed.
    print(f"✓ API key found (length: {len(api_key)})")
    return True
def test_llm_judge():
    """Run the LLM judge once on a hard-coded caption pair.

    Imports evaluate_single_caption_llm from eval_caption_llm_judge, calls it
    with one sample (task type "video_summary") and the key read from
    OPENAI_API_KEY, then prints the average and per-aspect scores.

    Returns:
        bool: True when the judge returns a truthy result; False when the
        result is falsy or any exception (including a failed import) occurs.
    """
    sample = {
        "prediction": "The surgeon performs a laparoscopic cholecystectomy using graspers and scissors to dissect the gallbladder from the liver bed.",
        "answer": "The surgeon removes the gallbladder using minimally invasive techniques with specialized instruments.",
        "metadata": {"video_id": "test_video"}
    }

    try:
        # Imported lazily so a missing module is reported like any other failure.
        from eval_caption_llm_judge import evaluate_single_caption_llm

        print("\nTesting LLM judge with sample caption...")
        preview = sample['prediction'][:100]
        print(f"Prediction: {preview}...")

        verdict = evaluate_single_caption_llm(
            sample,
            task_type="video_summary",
            api_key=os.getenv('OPENAI_API_KEY'),
        )

        if not verdict:
            print("❌ LLM judge returned no result")
            return False

        print("\n✅ LLM judge working!")
        average = verdict.get('average_score', 0)
        print(f" Average Score: {average:.3f}/5.0")
        print(" Aspect Scores:")
        per_aspect = verdict.get('aspect_scores', {})
        for name, value in sorted(per_aspect.items()):
            print(f" {name}: {value:.1f}/5.0")
        return True
    except Exception as err:
        print(f"❌ Error testing LLM judge: {err}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run the API-key check, then the LLM judge check.

    Prints a banner, runs test_api_key() and test_llm_judge() in order, and
    calls sys.exit(1) as soon as either returns a falsy value. When both
    succeed, a closing success banner is printed.
    """
    banner = "=" * 60
    print(banner)
    print("MedVidBench LLM Judge Test")
    print(banner)

    # Step 1: the judge cannot be exercised without an API key.
    key_ok = test_api_key()
    if not key_ok:
        sys.exit(1)

    # Step 2: one round-trip through the judge itself.
    judge_ok = test_llm_judge()
    if not judge_ok:
        print("\n⚠️ LLM judge test failed")
        print("This could mean:")
        likely_causes = (
            " 1. Invalid API key",
            " 2. No OpenAI credits",
            " 3. Network connectivity issue",
            " 4. Import error",
        )
        for cause in likely_causes:
            print(cause)
        sys.exit(1)

    print("\n" + banner)
    print("✅ All tests passed! LLM judge is ready.")
    print(banner)
# Run the checks only when executed as a script, so that importing this
# module does not trigger main() (which may call sys.exit).
if __name__ == "__main__":
    main()