Spaces:

ashish1265659565
/

pharmaspine-backend

Sleeping

App Files Files Community

pharmaspine-backend / src /evaluation.py

ashish1265659565

Upload folder using huggingface_hub

08fd094 verified 3 days ago

Raw

History Blame Contribute Delete

3.99 kB

	import json
	import time
	from typing import List, Dict, Any
	from src.retrieval import hybrid_search

	"""
	evaluation.py

	What it does:
	Evaluates the performance of the retrieval system against a set of predefined test cases.
	Calculates Precision, Recall, F1-Score, and Latency.

	How it works:
	It loads test cases from a JSON file. For each test case, it runs the `hybrid_search` function
	and measures the time taken (Latency). It then compares the retrieved chunks against the
	expected keywords to determine true positives, false positives, and false negatives.

	Key formulas:
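	- Precision = True Positives / (True Positives + False Positives), counted over chunks: retrieved chunks containing at least one expected keyword / all retrieved chunks (Are the retrieved chunks actually relevant?)
	- Recall = True Positives / (True Positives + False Negatives), counted over keywords: expected keywords found in a retrieved chunk / all expected keywords (Did we find all the relevant keywords?)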
	- F1-Score = 2 * (Precision * Recall) / (Precision + Recall)

	Example input:
	- A JSON file with test cases (e.g., data/test_cases.json)

	Example output:
	- Prints aggregate metrics to the console.
	"""

	def evaluate_retrieval(
	    test_cases_path: str,
	    *,
	    top_k: int = 5,
	    search_fn: Any = None,
	) -> Dict[str, float]:
	    """
	    Runs the evaluation suite and returns aggregate metrics.

	    Args:
	        test_cases_path: Path to a JSON file holding a list of test cases.
	            Each case is read through case["question"] and the optional
	            "expected_key_words" list.
	        top_k: Number of results requested from the search function per query.
	        search_fn: Callable invoked as search_fn(query, top_k=top_k). When
	            None, `hybrid_search` is used. Every returned item must expose the
	            chunk text at index 0.

	    Returns:
	        Dict with the averaged "Precision", "Recall", "F1_Score" and
	        "Latency_ms" over all cases, or an empty dict when the file is missing
	        or holds no test cases.

	    Raises:
	        json.JSONDecodeError: If the file does not contain valid JSON.
	        KeyError: If a test case has no "question" entry.
	    """
	    try:
	        # JSON is UTF-8 by specification; do not depend on the locale default.
	        with open(test_cases_path, 'r', encoding="utf-8") as f:
	            test_cases = json.load(f)
	    except FileNotFoundError:
	        print(f"Test cases file not found at {test_cases_path}")
	        return {}

	    num_cases = len(test_cases)
	    if num_cases == 0:
	        return {}

	    # Resolved at call time so the default retriever is only looked up when
	    # the caller did not supply one.
	    search = hybrid_search if search_fn is None else search_fn

	    print(f"Running evaluation on {num_cases} test cases...")

	    total_precision = 0.0
	    total_recall = 0.0
	    total_f1 = 0.0
	    total_latency = 0.0

	    for case in test_cases:
	        query = case["question"]
	        # Blank keywords are dropped: "" is a substring of every chunk and
	        # would count as a hit everywhere, inflating both metrics.
	        expected_keywords = [
	            kw.lower() for kw in case.get("expected_key_words", []) if kw.strip()
	        ]

	        # perf_counter is monotonic, so clock adjustments cannot skew latency.
	        start_time = time.perf_counter()
	        retrieved = search(query, top_k=top_k)
	        total_latency += (time.perf_counter() - start_time) * 1000

	        # Index access (rather than 2-tuple unpacking) tolerates hits that
	        # carry more fields than (text, score).
	        chunk_texts = [hit[0].lower() for hit in retrieved]

	        precision, recall, f1 = _score_case(chunk_texts, expected_keywords)
	        total_precision += precision
	        total_recall += recall
	        total_f1 += f1

	    # Aggregate
	    avg_precision = total_precision / num_cases
	    avg_recall = total_recall / num_cases
	    avg_f1 = total_f1 / num_cases
	    avg_latency = total_latency / num_cases

	    results = {
	        "Precision": avg_precision,
	        "Recall": avg_recall,
	        "F1_Score": avg_f1,
	        "Latency_ms": avg_latency,
	    }

	    print("\n--- Evaluation Results ---")
	    print(f"Precision: {avg_precision:.2%} (Target: >85%)")
	    print(f"Recall: {avg_recall:.2%} (Target: >90%)")
	    print(f"F1 Score: {avg_f1:.2%} (Target: >87%)")
	    print(f"Latency: {avg_latency:.2f}ms (Target: <500ms)")

	    return results


	def _score_case(chunk_texts: List[str], keywords: List[str]):
	    """
	    Returns (precision, recall, f1) for one test case.

	    Both arguments are expected to be lowercased already. Precision is the
	    share of chunks containing at least one keyword; recall is the share of
	    keywords contained in at least one chunk.
	    """
	    # Each keyword is matched inside a single chunk, never across the boundary
	    # between two chunks.
	    keywords_found = sum(
	        1 for kw in keywords if any(kw in text for text in chunk_texts)
	    )
	    relevant_chunks = sum(
	        1 for text in chunk_texts if any(kw in text for kw in keywords)
	    )

	    precision = relevant_chunks / len(chunk_texts) if chunk_texts else 0.0
	    recall = keywords_found / len(keywords) if keywords else 0.0

	    denominator = precision + recall
	    f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
	    return precision, recall, f1

	if __name__ == "__main__":
	    import sys

	    # Script entry point: evaluate the file named by the first command-line
	    # argument, falling back to the default test-case path when none is given.
	    test_path = sys.argv[1] if len(sys.argv) > 1 else "data/test_cases.json"
	    evaluate_retrieval(test_path)