Spaces:

ashish1265659565
/

pharmaspine-backend

Sleeping

File size: 3,987 Bytes

08fd094

import json
import time
from typing import Any, Callable, Dict, List, Optional, Tuple

from src.retrieval import hybrid_search

"""
evaluation.py

What it does:
Evaluates the performance of the retrieval system against a set of predefined test cases.
Calculates Precision, Recall, F1-Score, and Latency.

How it works:
It loads test cases from a JSON file. For each test case, it runs the `hybrid_search` function
and measures the time taken (Latency). It then compares the retrieved chunks against the 
expected keywords to determine true positives, false positives, and false negatives.

Key formulas:
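- Precision = True Positives / (True Positives + False Positives), computed per chunk: a retrieved chunk counts as relevant if it contains at least one expected keyword. (Are the retrieved chunks actually relevant?)
- Recall = True Positives / (True Positives + False Negatives), computed per keyword: an expected keyword counts as found if it appears in the retrieved chunks. (Did we find all the relevant keywords?)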
- F1-Score = 2 * (Precision * Recall) / (Precision + Recall)

Example input:
- A JSON file with test cases (e.g., data/test_cases.json)

Example output:
- Prints the aggregate metrics to the console and returns them as a dict.
"""

def _score_case(
    chunk_texts: List[str], expected_keywords: List[str]
) -> Tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for a single test case.

    Both arguments must already be lower-cased; matching is plain substring
    containment.
    """
    # Recall is keyword-level. A keyword only counts as found when it occurs
    # inside a single chunk; matching against the space-joined text would let
    # a multi-word keyword "match" across the boundary between two chunks.
    keywords_found = sum(
        1 for kw in expected_keywords if any(kw in text for text in chunk_texts)
    )
    # Precision is chunk-level: a chunk is relevant if it contains at least
    # one expected keyword.
    relevant_chunks = sum(
        1 for text in chunk_texts if any(kw in text for kw in expected_keywords)
    )

    precision = relevant_chunks / len(chunk_texts) if chunk_texts else 0.0
    recall = keywords_found / len(expected_keywords) if expected_keywords else 0.0

    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
    return precision, recall, f1


def evaluate_retrieval(
    test_cases_path: str,
    *,
    top_k: int = 5,
    search_fn: Optional[Callable[..., List[Any]]] = None,
) -> Dict[str, float]:
    """Run the evaluation suite and return aggregate metrics.

    Each test case is a mapping with a required ``"question"`` key and an
    optional ``"expected_key_words"`` list. For every case the search
    function is timed, then the retrieved chunks are scored against the
    lower-cased expected keywords. Per-case metrics are averaged with equal
    weight (macro average) and printed to the console.

    Args:
        test_cases_path: Path to a JSON file holding a list of test cases.
        top_k: Number of chunks requested from the search function.
        search_fn: Callable invoked as ``search_fn(query, top_k=top_k)`` that
            returns a list of items whose first element is the chunk text.
            Defaults to ``hybrid_search``.

    Returns:
        A dict with the keys ``"Precision"``, ``"Recall"``, ``"F1_Score"`` and
        ``"Latency_ms"``. An empty dict is returned when the file does not
        exist or contains no test cases.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a test case has no ``"question"`` entry.
    """
    try:
        # JSON is UTF-8 by specification; do not rely on the locale default.
        with open(test_cases_path, "r", encoding="utf-8") as f:
            test_cases = json.load(f)
    except FileNotFoundError:
        print(f"Test cases file not found at {test_cases_path}")
        return {}

    num_cases = len(test_cases)
    if num_cases == 0:
        return {}

    # Resolved lazily so callers may inject any retriever with the same
    # calling convention.
    if search_fn is None:
        search_fn = hybrid_search

    print(f"Running evaluation on {num_cases} test cases...")

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_latency = 0.0

    for case in test_cases:
        query = case["question"]
        expected_keywords = [kw.lower() for kw in case.get("expected_key_words", [])]

        # perf_counter is monotonic and high-resolution, unlike the wall
        # clock, which can jump if the system time is adjusted mid-run.
        start_time = time.perf_counter()
        retrieved = search_fn(query, top_k=top_k)
        total_latency += (time.perf_counter() - start_time) * 1000

        # Lower-case each chunk once and reuse it for both metrics.
        chunk_texts = [item[0].lower() for item in retrieved]
        precision, recall, f1 = _score_case(chunk_texts, expected_keywords)

        total_precision += precision
        total_recall += recall
        total_f1 += f1

    # Aggregate
    avg_precision = total_precision / num_cases
    avg_recall = total_recall / num_cases
    avg_f1 = total_f1 / num_cases
    avg_latency = total_latency / num_cases

    results = {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F1_Score": avg_f1,
        "Latency_ms": avg_latency,
    }

    print("\n--- Evaluation Results ---")
    print(f"Precision: {avg_precision:.2%} (Target: >85%)")
    print(f"Recall:    {avg_recall:.2%} (Target: >90%)")
    print(f"F1 Score:  {avg_f1:.2%} (Target: >87%)")
    print(f"Latency:   {avg_latency:.2f}ms (Target: <500ms)")

    return results

if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default
    # test-case file.
    cli_args = sys.argv[1:]
    evaluate_retrieval(cli_args[0] if cli_args else "data/test_cases.json")