Spaces:

UII-AI
/

MedVidBench-Leaderboard

Running

MedVidBench-Leaderboard / test_llm_judge.py

MedGRPO Team

Add --skip-llm-judge flag for faster evaluation

3487a07 4 months ago

2.66 kB

	#!/usr/bin/env python3
	"""
	Test script to verify LLM judge is working correctly.
	Run this after setting OPENAI_API_KEY.
	"""
	import os
	import sys
	import json

	# Put the 'evaluation' directory at the front of the module search path so the
	# import of eval_caption_llm_judge inside test_llm_judge() can be resolved from
	# there (presumably that is where the module lives -- confirm against the repo).
	# NOTE(review): the path is relative, so it depends on the working directory
	# the script is launched from.
	sys.path.insert(0, 'evaluation')

	def test_api_key():
	    """Check that the OPENAI_API_KEY environment variable holds a usable value.

	    A value that is missing, empty, or made up only of whitespace is treated
	    as "not set"; in that case instructions for exporting the key are printed.

	    Returns:
	        bool: True when a non-blank key is present, False otherwise.
	    """
	    api_key = os.getenv('OPENAI_API_KEY', '')

	    # A whitespace-only value (e.g. `export OPENAI_API_KEY=" "`) can never be a
	    # valid key, so reject it here instead of failing later at the API call.
	    if not api_key.strip():
	        print("❌ OPENAI_API_KEY not set")
	        print("\nSet it with:")
	        print(" export OPENAI_API_KEY='your-key-here'")
	        return False

	    # Only the length is printed so the secret itself never reaches the console.
	    print(f"✓ API key found (length: {len(api_key)})")
	    return True

	def test_llm_judge():
	    """Run one hard-coded caption sample through the LLM judge and report the outcome.

	    The judge function is imported inside the ``try`` block, so an import
	    failure is reported the same way as any other error.

	    Returns:
	        bool: True if the judge returned a truthy result; False if it returned
	        nothing or if any exception was raised along the way.
	    """
	    try:
	        from eval_caption_llm_judge import evaluate_single_caption_llm

	        # Hand-written prediction / reference pair used as the single probe.
	        sample = {
	            "prediction": "The surgeon performs a laparoscopic cholecystectomy using graspers and scissors to dissect the gallbladder from the liver bed.",
	            "answer": "The surgeon removes the gallbladder using minimally invasive techniques with specialized instruments.",
	            "metadata": {"video_id": "test_video"}
	        }

	        print("\nTesting LLM judge with sample caption...")
	        preview = sample['prediction'][:100]
	        print(f"Prediction: {preview}...")

	        verdict = evaluate_single_caption_llm(
	            sample,
	            task_type="video_summary",
	            api_key=os.getenv('OPENAI_API_KEY')
	        )

	        # Guard clause: an empty / falsy result counts as a failure.
	        if not verdict:
	            print("❌ LLM judge returned no result")
	            return False

	        print("\n✅ LLM judge working!")
	        average = verdict.get('average_score', 0)
	        print(f" Average Score: {average:.3f}/5.0")
	        print(" Aspect Scores:")
	        per_aspect = sorted(verdict.get('aspect_scores', {}).items())
	        for name, value in per_aspect:
	            print(f" {name}: {value:.1f}/5.0")
	        return True

	    except Exception as err:
	        # Diagnostic boundary: show the message and the full traceback, then
	        # signal failure to the caller instead of propagating.
	        print(f"❌ Error testing LLM judge: {err}")
	        import traceback
	        traceback.print_exc()
	        return False

	def main():
	    """Run the API-key check, then the LLM judge check.

	    Exits the process with status 1 as soon as either check fails; otherwise
	    prints a success banner and returns normally.
	    """
	    rule = "=" * 60
	    print(rule)
	    print("MedVidBench LLM Judge Test")
	    print(rule)

	    # Step 1: the key must be present before the judge can be exercised.
	    key_ok = test_api_key()
	    if not key_ok:
	        sys.exit(1)

	    # Step 2: exercise the judge on the sample caption.
	    judge_ok = test_llm_judge()
	    if not judge_ok:
	        failure_hints = (
	            "\n⚠️ LLM judge test failed",
	            "This could mean:",
	            " 1. Invalid API key",
	            " 2. No OpenAI credits",
	            " 3. Network connectivity issue",
	            " 4. Import error",
	        )
	        for hint in failure_hints:
	            print(hint)
	        sys.exit(1)

	    print(f"\n{rule}")
	    print("✅ All tests passed! LLM judge is ready.")
	    print(rule)

	# Script entry point: run the checks only when this file is executed directly.
	if __name__ == "__main__":
	main()