# pharmaspine-backend/src/evaluation.py
# Web-viewer residue (not part of the program): uploaded by ashish1265659565
# via huggingface_hub, commit 08fd094 (verified), file size 3.99 kB.
import json
import time
from typing import List, Dict, Any
from src.retrieval import hybrid_search
"""
evaluation.py
What it does:
Evaluates the performance of the retrieval system against a set of predefined test cases.
Calculates Precision, Recall, F1-Score, and Latency.
How it works:
It loads test cases from a JSON file. For each test case, it runs the `hybrid_search` function
and measures the time taken (Latency). It then compares the retrieved chunks against the
expected keywords to determine true positives, false positives, and false negatives.
Key formulas:
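- Precision = Relevant Chunks Retrieved / Total Chunks Retrieved (Are the retrieved chunks actually relevant? A chunk counts as relevant when it contains at least one expected keyword.)
- Recall = Expected Keywords Found / Total Expected Keywords (Did we find all the relevant keywords? A keyword counts as found when it appears anywhere in the retrieved text.)
- F1-Score = 2 * (Precision * Recall) / (Precision + Recall), taken as 0 when Precision + Recall is 0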
Example input:
- A JSON file with test cases (e.g., data/test_cases.json)
Example output:
- Prints aggregate metrics to the console.
"""
def _score_case(retrieved: List[Any], expected_keywords: List[str]) -> tuple:
    """Score one test case and return ``(precision, recall, f1)``.

    Args:
        retrieved: Search results. Only the first element of each item is
            read, and it is treated as the chunk text (a string).
        expected_keywords: Already lower-cased keywords to look for.

    Returns:
        A ``(precision, recall, f1)`` tuple of floats. Precision is the share
        of chunks containing at least one keyword; recall is the share of
        keywords found anywhere in the retrieved text. Each falls back to 0.0
        when its denominator would be zero.
    """
    # Lower-case every chunk once and reuse it for both metrics.
    chunk_texts = [item[0].lower() for item in retrieved]

    # A keyword is a hit when it appears anywhere in the space-joined text
    # of all retrieved chunks.
    combined_text = " ".join(chunk_texts)
    keywords_found = sum(1 for kw in expected_keywords if kw in combined_text)

    # A chunk is relevant when it contains at least one expected keyword.
    relevant_chunks = sum(
        1
        for text in chunk_texts
        if any(kw in text for kw in expected_keywords)
    )

    precision = relevant_chunks / len(chunk_texts) if chunk_texts else 0.0
    recall = keywords_found / len(expected_keywords) if expected_keywords else 0.0
    f1 = 0.0
    if precision + recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1


def evaluate_retrieval(test_cases_path: str) -> Dict[str, float]:
    """Run the retrieval evaluation suite and return the averaged metrics.

    Each test case is sent through ``hybrid_search(query, top_k=5)``; the call
    is timed and the results are scored against the case's expected keywords.
    The averaged metrics are printed to the console and returned.

    Args:
        test_cases_path: Path to a JSON file holding a collection of test
            cases. Every case must provide a ``"question"`` entry and may
            provide ``"expected_key_words"`` (a list of strings; treated as
            empty when absent).

    Returns:
        A dict with the keys ``"Precision"``, ``"Recall"``, ``"F1_Score"`` and
        ``"Latency_ms"``, each averaged over all cases. An empty dict is
        returned when the file does not exist or contains no test cases.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a test case has no ``"question"`` entry.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(test_cases_path, 'r', encoding="utf-8") as f:
            test_cases = json.load(f)
    except FileNotFoundError:
        print(f"Test cases file not found at {test_cases_path}")
        return {}

    num_cases = len(test_cases)
    if num_cases == 0:
        return {}

    print(f"Running evaluation on {num_cases} test cases...")

    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0
    total_latency = 0.0

    for case in test_cases:
        query = case["question"]
        expected_keywords = [kw.lower() for kw in case.get("expected_key_words", [])]

        # perf_counter is monotonic and high resolution, so the measured
        # interval cannot be distorted by system clock adjustments.
        start_time = time.perf_counter()
        retrieved = hybrid_search(query, top_k=5)
        total_latency += (time.perf_counter() - start_time) * 1000

        precision, recall, f1 = _score_case(retrieved, expected_keywords)
        total_precision += precision
        total_recall += recall
        total_f1 += f1

    # Aggregate: plain arithmetic mean over all test cases.
    avg_precision = total_precision / num_cases
    avg_recall = total_recall / num_cases
    avg_f1 = total_f1 / num_cases
    avg_latency = total_latency / num_cases

    results = {
        "Precision": avg_precision,
        "Recall": avg_recall,
        "F1_Score": avg_f1,
        "Latency_ms": avg_latency
    }

    print("\n--- Evaluation Results ---")
    print(f"Precision: {avg_precision:.2%} (Target: >85%)")
    print(f"Recall: {avg_recall:.2%} (Target: >90%)")
    print(f"F1 Score: {avg_f1:.2%} (Target: >87%)")
    print(f"Latency: {avg_latency:.2f}ms (Target: <500ms)")
    return results
if __name__ == "__main__":
    import sys

    # The first command-line argument, when given, overrides the default
    # location of the test-case file.
    test_path = sys.argv[1] if len(sys.argv) > 1 else "data/test_cases.json"
    evaluate_retrieval(test_path)