| import json |
| import time |
| from typing import List, Dict, Any |
| from src.retrieval import hybrid_search |
|
|
| """ |
| evaluation.py |
| |
| What it does: |
| Evaluates the performance of the retrieval system against a set of predefined test cases. |
| Calculates Precision, Recall, F1-Score, and Latency. |
| |
| How it works: |
| It loads test cases from a JSON file. For each test case, it runs the `hybrid_search` function |
| and measures the time taken (Latency). It then compares the retrieved chunks against the |
| expected keywords to determine true positives, false positives, and false negatives. |
| |
| Key formulas: |
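| - Precision = True Positives / (True Positives + False Positives), computed per test case as the share of retrieved chunks that contain at least one expected keyword. (Are the retrieved chunks actually relevant?) |
| - Recall = True Positives / (True Positives + False Negatives), computed per test case as the share of expected keywords found anywhere in the retrieved text. (Did we find all the relevant keywords?) |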
| - F1-Score = 2 * (Precision * Recall) / (Precision + Recall) |
| |
| Example input: |
| - A JSON file with test cases (e.g., data/test_cases.json) |
| |
| Example output: |
| - Prints aggregate metrics to the console and returns them as a dict (keys: Precision, Recall, F1_Score, Latency_ms). |
| """ |
|
|
def _score_case(chunk_texts: List[str], expected_keywords: List[str]) -> Dict[str, float]:
    """Compute precision, recall and F1 for a single test case.

    Both arguments are expected to be lowercased already, so plain substring
    checks are effectively case-insensitive.

    Args:
        chunk_texts: Text of each retrieved chunk.
        expected_keywords: Keywords the retrieved text should contain.

    Returns:
        A dict with the keys "precision", "recall" and "f1".
    """
    # Recall is keyword-level: a keyword counts as found when it occurs
    # anywhere in the space-joined text of all retrieved chunks.
    combined_text = " ".join(chunk_texts)
    keywords_found = sum(1 for kw in expected_keywords if kw in combined_text)

    # Precision is chunk-level: a chunk is relevant when it contains at
    # least one expected keyword.
    relevant_chunks = sum(
        1 for text in chunk_texts if any(kw in text for kw in expected_keywords)
    )

    precision = relevant_chunks / len(chunk_texts) if chunk_texts else 0.0
    recall = keywords_found / len(expected_keywords) if expected_keywords else 0.0

    denominator = precision + recall
    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0

    return {"precision": precision, "recall": recall, "f1": f1}


def evaluate_retrieval(test_cases_path: str) -> Dict[str, float]:
    """Run the evaluation suite and return aggregate metrics.

    Loads the test cases from a JSON file, runs `hybrid_search` (top 5
    results) for every case's "question", times each call, and scores the
    results against the case's optional "expected_key_words".

    Args:
        test_cases_path: Path to the JSON file holding the test cases.

    Returns:
        A dict with the per-case averages under the keys "Precision",
        "Recall", "F1_Score" and "Latency_ms". An empty dict is returned
        when the file does not exist or contains no test cases.

    Raises:
        KeyError: If a test case has no "question" entry.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # default locale encoding.
        with open(test_cases_path, "r", encoding="utf-8") as f:
            test_cases = json.load(f)
    except FileNotFoundError:
        print(f"Test cases file not found at {test_cases_path}")
        return {}

    num_cases = len(test_cases)
    if num_cases == 0:
        return {}

    print(f"Running evaluation on {num_cases} test cases...")

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_latency = 0.0

    for case in test_cases:
        query = case["question"]
        expected_keywords = [kw.lower() for kw in case.get("expected_key_words", [])]

        # perf_counter is monotonic and high-resolution, so the measured
        # latency cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        retrieved = hybrid_search(query, top_k=5)
        total_latency += (time.perf_counter() - start) * 1000

        # The first element of each result is treated as the chunk text; the
        # rest is ignored (presumably a relevance score - confirm against
        # hybrid_search). Lowercase once and reuse for both metrics.
        chunk_texts = [item[0].lower() for item in retrieved]

        scores = _score_case(chunk_texts, expected_keywords)
        total_precision += scores["precision"]
        total_recall += scores["recall"]
        total_f1 += scores["f1"]

    avg_precision = total_precision / num_cases
    avg_recall = total_recall / num_cases
    avg_f1 = total_f1 / num_cases
    avg_latency = total_latency / num_cases

    results = {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F1_Score": avg_f1,
        "Latency_ms": avg_latency,
    }

    print("\n--- Evaluation Results ---")
    print(f"Precision: {avg_precision:.2%} (Target: >85%)")
    print(f"Recall: {avg_recall:.2%} (Target: >90%)")
    print(f"F1 Score: {avg_f1:.2%} (Target: >87%)")
    print(f"Latency: {avg_latency:.2f}ms (Target: <500ms)")

    return results
|
|
if __name__ == "__main__":
    import sys

    # An optional first command-line argument overrides the default
    # location of the test-case file.
    cli_args = sys.argv[1:]
    evaluate_retrieval(cli_args[0] if cli_args else "data/test_cases.json")
|
|