"""
evaluation.py

What it does:
    Evaluates the performance of the retrieval system against a set of
    predefined test cases. Calculates Precision, Recall, F1-Score, and Latency.

How it works:
    It loads test cases from a JSON file. For each test case, it runs the
    `hybrid_search` function and measures the time taken (Latency). It then
    compares the retrieved chunks against the expected keywords to determine
    precision and recall.

Key formulas (as implemented here):
    - Precision = retrieved chunks containing at least one expected keyword
                  / retrieved chunks
      (Are the retrieved chunks actually relevant?)
    - Recall    = expected keywords found anywhere in the retrieved text
                  / expected keywords
      (Did we find all the relevant keywords?)
    - F1-Score  = 2 * (Precision * Recall) / (Precision + Recall)

    Note that precision is counted over chunks while recall is counted over
    keywords, so the two are keyword-match proxies rather than classical
    set-based metrics.

Example input:
    - A JSON file with test cases (e.g., data/test_cases.json). Each case is
      read via the keys "question" and (optionally) "expected_key_words".

Example output:
    - Prints aggregate metrics to the console.
"""

import json
import time
from typing import Any, Dict, List, Tuple

from src.retrieval import hybrid_search


def _score_case(
    chunk_texts: List[str], expected_keywords: List[str]
) -> Tuple[float, float, float]:
    """Return ``(precision, recall, f1)`` for a single test case.

    Both arguments are expected to be lower-cased already; matching is plain
    substring containment.

    Args:
        chunk_texts: Text of each retrieved chunk.
        expected_keywords: Keywords that a good retrieval should surface.

    Returns:
        A ``(precision, recall, f1)`` tuple. Each value is 0.0 when its
        denominator would be zero (no chunks, no keywords, or P + R == 0).
    """
    # Recall side: a keyword counts as found if it occurs anywhere in the
    # space-joined text of all retrieved chunks.
    combined_text = " ".join(chunk_texts)
    keywords_found = sum(1 for kw in expected_keywords if kw in combined_text)

    # Precision side: a chunk is relevant if it contains at least one
    # expected keyword.
    relevant_chunks = sum(
        1
        for text in chunk_texts
        if any(kw in text for kw in expected_keywords)
    )

    precision = relevant_chunks / len(chunk_texts) if chunk_texts else 0.0
    recall = keywords_found / len(expected_keywords) if expected_keywords else 0.0

    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
    return precision, recall, f1


def evaluate_retrieval(test_cases_path: str, top_k: int = 5) -> Dict[str, float]:
    """Run the evaluation suite and return aggregate metrics.

    Args:
        test_cases_path: Path to a JSON file holding the test cases. Each case
            must provide "question" and may provide "expected_key_words".
        top_k: Number of results requested from ``hybrid_search`` per query.
            Defaults to 5.

    Returns:
        A dict with the keys "Precision", "Recall", "F1_Score" and
        "Latency_ms", each averaged over all test cases. An empty dict is
        returned when the file does not exist or contains no test cases.

    Raises:
        KeyError: If a test case has no "question" key.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    try:
        # Explicit encoding keeps parsing independent of the platform locale.
        with open(test_cases_path, "r", encoding="utf-8") as f:
            test_cases = json.load(f)
    except FileNotFoundError:
        print(f"Test cases file not found at {test_cases_path}")
        return {}

    num_cases = len(test_cases)
    if num_cases == 0:
        return {}

    print(f"Running evaluation on {num_cases} test cases...")

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_latency = 0.0

    for case in test_cases:
        query = case["question"]
        expected_keywords = [kw.lower() for kw in case.get("expected_key_words", [])]

        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        retrieved = hybrid_search(query, top_k=top_k)
        total_latency += (time.perf_counter() - start_time) * 1000

        # Only the first element of each result is used as the chunk text
        # (presumably the rest is a score — confirm against hybrid_search).
        # Indexing instead of 2-tuple unpacking tolerates extra fields.
        chunk_texts = [result[0].lower() for result in retrieved]

        precision, recall, f1 = _score_case(chunk_texts, expected_keywords)
        total_precision += precision
        total_recall += recall
        total_f1 += f1

    # Aggregate
    avg_precision = total_precision / num_cases
    avg_recall = total_recall / num_cases
    avg_f1 = total_f1 / num_cases
    avg_latency = total_latency / num_cases

    results = {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F1_Score": avg_f1,
        "Latency_ms": avg_latency,
    }

    print("\n--- Evaluation Results ---")
    print(f"Precision: {avg_precision:.2%} (Target: >85%)")
    print(f"Recall: {avg_recall:.2%} (Target: >90%)")
    print(f"F1 Score: {avg_f1:.2%} (Target: >87%)")
    print(f"Latency: {avg_latency:.2f}ms (Target: <500ms)")

    return results


if __name__ == "__main__":
    import sys

    # Optional first CLI argument overrides the default test-case file.
    test_path = "data/test_cases.json"
    if len(sys.argv) > 1:
        test_path = sys.argv[1]
    evaluate_retrieval(test_path)