File size: 3,987 Bytes
08fd094
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
import time
from typing import Any, Callable, Dict, List, Optional

from src.retrieval import hybrid_search

"""
evaluation.py

What it does:
Evaluates the performance of the retrieval system against a set of predefined test cases.
Calculates Precision, Recall, F1-Score, and Latency.

How it works:
It loads test cases from a JSON file. For each test case, it runs the `hybrid_search` function
and measures the time taken (Latency). It then compares the retrieved chunks against the 
expected keywords to determine true positives, false positives, and false negatives.

Key formulas:
- Precision = True Positives / (True Positives + False Positives) (Are the retrieved chunks actually relevant?)
- Recall = True Positives / (True Positives + False Negatives) (Did we find all the relevant keywords?)
- F1-Score = 2 * (Precision * Recall) / (Precision + Recall)

Example input:
- A JSON file with test cases (e.g., data/test_cases.json)

Example output:
- Prints aggregate metrics to the console and returns them as a dict.
"""

def evaluate_retrieval(
    test_cases_path: str,
    top_k: int = 5,
    search_fn: Optional[Callable[..., List[Any]]] = None,
) -> Dict[str, float]:
    """
    Runs the evaluation suite and returns aggregate metrics.

    Each test case is a JSON object with a required "question" key and an
    optional "expected_key_words" list. For every case the retriever is
    called, timed, and scored:

    - Precision: fraction of retrieved chunks that contain at least one
      expected keyword.
    - Recall: fraction of expected keywords found inside at least one single
      retrieved chunk.
    - F1: harmonic mean of the two (0.0 when both are 0).

    Keyword matching is a case-insensitive substring test.

    Args:
        test_cases_path: Path to a JSON file holding a list of test cases.
        top_k: Number of chunks requested from the retriever per query.
        search_fn: Retriever called as ``search_fn(query, top_k=top_k)``.
            Each returned item must be a sequence whose first element is the
            chunk text; remaining elements are ignored. Defaults to
            ``hybrid_search``.

    Returns:
        A dict with the averaged "Precision", "Recall", "F1_Score" and
        "Latency_ms" values, or an empty dict when the file is missing or
        contains no test cases.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        KeyError: If a test case has no "question" key.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(test_cases_path, 'r', encoding='utf-8') as f:
            test_cases = json.load(f)
    except FileNotFoundError:
        print(f"Test cases file not found at {test_cases_path}")
        return {}

    num_cases = len(test_cases)
    if num_cases == 0:
        return {}

    # Resolve the default lazily so an explicit search_fn never touches the
    # project-level retriever.
    if search_fn is None:
        search_fn = hybrid_search

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_latency = 0.0

    print(f"Running evaluation on {num_cases} test cases...")

    for case in test_cases:
        query = case["question"]
        expected_keywords = [kw.lower() for kw in case.get("expected_key_words", [])]

        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump when the system clock is adjusted.
        start_time = time.perf_counter()
        retrieved = search_fn(query, top_k=top_k)
        total_latency += (time.perf_counter() - start_time) * 1000

        # Lower-case every chunk once and reuse it for both metrics.
        chunk_texts = [item[0].lower() for item in retrieved]

        # A keyword is a true positive only if it occurs inside one chunk.
        # Matching against the space-joined text would let a multi-word
        # keyword match across the boundary between two unrelated chunks.
        true_positives = sum(
            1
            for kw in expected_keywords
            if any(kw in text for text in chunk_texts)
        )

        # A chunk is relevant if it contains at least one expected keyword.
        relevant_chunks_retrieved = sum(
            1
            for text in chunk_texts
            if any(kw in text for kw in expected_keywords)
        )

        precision = relevant_chunks_retrieved / len(chunk_texts) if chunk_texts else 0.0
        recall = true_positives / len(expected_keywords) if expected_keywords else 0.0

        f1 = 0.0
        if precision + recall > 0:
            f1 = 2 * (precision * recall) / (precision + recall)

        total_precision += precision
        total_recall += recall
        total_f1 += f1

    # Aggregate: plain (macro) average over all test cases.
    avg_precision = total_precision / num_cases
    avg_recall = total_recall / num_cases
    avg_f1 = total_f1 / num_cases
    avg_latency = total_latency / num_cases

    results = {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F1_Score": avg_f1,
        "Latency_ms": avg_latency
    }

    print("\n--- Evaluation Results ---")
    print(f"Precision: {avg_precision:.2%} (Target: >85%)")
    print(f"Recall:    {avg_recall:.2%} (Target: >90%)")
    print(f"F1 Score:  {avg_f1:.2%} (Target: >87%)")
    print(f"Latency:   {avg_latency:.2f}ms (Target: <500ms)")

    return results

if __name__ == "__main__":
    import sys

    # Script entry point: an optional first command-line argument overrides
    # the default location of the test-case file.
    cli_args = sys.argv[1:]
    evaluate_retrieval(cli_args[0] if cli_args else "data/test_cases.json")