Spaces:

UII-AI
/

MedVidBench-Leaderboard

Running

File size: 2,660 Bytes

3487a07

#!/usr/bin/env python3
"""
Test script to verify LLM judge is working correctly.
Run this after setting OPENAI_API_KEY.
"""
import os
import sys
import json

# Add evaluation directory to path
# NOTE: 'evaluation' is a relative entry, so it is resolved against the
# current working directory at import time; run this script from the
# directory that contains `evaluation/` (presumably where
# `eval_caption_llm_judge` lives -- confirm against the repo layout).
sys.path.insert(0, 'evaluation')

def test_api_key():
    """Check that the ``OPENAI_API_KEY`` environment variable holds a value.

    Prints a status line to stdout; on failure it also prints a hint showing
    how to export the variable. A value that is empty or made up only of
    whitespace is treated as unset, since a blank string is not a usable key
    and would otherwise only surface as a failure during the later API call.

    Returns:
        bool: True if a non-blank key is present, False otherwise.
    """
    # `or ''` covers the unset case (None); strip() rejects blank values such
    # as `export OPENAI_API_KEY=" "`.
    api_key = (os.getenv('OPENAI_API_KEY') or '').strip()

    if not api_key:
        print("❌ OPENAI_API_KEY not set")
        print("\nSet it with:")
        print("  export OPENAI_API_KEY='your-key-here'")
        return False

    # Only the length is shown so the secret itself never reaches the log.
    print(f"✓ API key found (length: {len(api_key)})")
    return True

def test_llm_judge():
    """Run the LLM judge once on a hard-coded caption pair and report the outcome.

    Imports ``evaluate_single_caption_llm`` from ``eval_caption_llm_judge``,
    calls it with a sample prediction/answer pair for the ``video_summary``
    task type (passing the key read from ``OPENAI_API_KEY``), and prints the
    scores found in the returned mapping. Any exception raised along the way,
    including a failed import, is printed together with its traceback.

    Returns:
        bool: True when the judge returned a truthy result, False when the
        result was falsy or an exception occurred.
    """
    try:
        from eval_caption_llm_judge import evaluate_single_caption_llm

        # Fixed prediction / reference pair used as the probe input.
        sample = dict(
            prediction="The surgeon performs a laparoscopic cholecystectomy using graspers and scissors to dissect the gallbladder from the liver bed.",
            answer="The surgeon removes the gallbladder using minimally invasive techniques with specialized instruments.",
            metadata=dict(video_id="test_video"),
        )

        print("\nTesting LLM judge with sample caption...")
        preview = sample['prediction'][:100]
        print(f"Prediction: {preview}...")

        verdict = evaluate_single_caption_llm(
            sample,
            task_type="video_summary",
            api_key=os.getenv('OPENAI_API_KEY'),
        )

        # Guard clause: a falsy return value counts as "no result".
        if not verdict:
            print("❌ LLM judge returned no result")
            return False

        print("\n✅ LLM judge working!")
        print(f"  Average Score: {verdict.get('average_score', 0):.3f}/5.0")
        print("  Aspect Scores:")
        per_aspect = verdict.get('aspect_scores', {})
        for aspect_name, aspect_value in sorted(per_aspect.items()):
            print(f"    {aspect_name}: {aspect_value:.1f}/5.0")
        return True

    except Exception as exc:
        # Diagnostic boundary: report everything (import, network, formatting)
        # instead of letting the script die with a bare stack trace.
        import traceback
        print(f"❌ Error testing LLM judge: {exc}")
        traceback.print_exc()
        return False

def main():
    """Run the API-key check and then the LLM judge check, printing a report.

    Exits the process with status 1 (via ``sys.exit``) as soon as either
    check returns a falsy value; when the judge check fails, a short list of
    likely causes is printed first. If both checks pass, a success banner is
    printed and the function returns normally.
    """
    rule = "=" * 60

    print(rule)
    print("MedVidBench LLM Judge Test")
    print(rule)

    # Step 1: the key must be present before any request is attempted.
    if not test_api_key():
        sys.exit(1)

    # Step 2: exercise the judge itself.
    if not test_llm_judge():
        failure_lines = (
            "\n⚠️  LLM judge test failed",
            "This could mean:",
            "  1. Invalid API key",
            "  2. No OpenAI credits",
            "  3. Network connectivity issue",
            "  4. Import error",
        )
        for line in failure_lines:
            print(line)
        sys.exit(1)

    print("\n" + rule)
    print("✅ All tests passed! LLM judge is ready.")
    print(rule)

# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()