#!/usr/bin/env python3
"""Quality eval harness — runs the chat pipeline on a fixed test set and scores
answers on four rubrics that don't need a human:

  1. Citation rate        — answers that cite [1][2] when problem_type ∈ REQUIRES_CITATION
  2. Banned-chemical rate — answers that mention any banned chemical (lower is better)
  3. Latency p50/p95      — retrieval + generation
  4. Top-1 score          — average rerank score of #1 doc

Run BEFORE and AFTER any retrieval/LLM change to prove regressions.

Usage:
    python -m pipelines.eval_chatbot --test eval/test_queries.json --out eval/results.json
"""
from __future__ import annotations

import argparse
import json
import statistics
import time
from pathlib import Path

from kcc_core import classify, citation_guard, config, prompt
from kcc_core import llm as llmmod
from kcc_core import retrieval as retr
from kcc_core.prompt import BANNED_CHEMICALS, REQUIRES_CITATION


def _top_score(docs) -> float:
    """Return the score of the first retrieved doc, or 0.0 when nothing was retrieved.

    The first doc's ``rerank_score`` is used when it is truthy; otherwise the
    doc's plain ``score`` is used.
    """
    if not docs:
        return 0.0
    first = docs[0]
    return first.rerank_score if first.rerank_score else first.score


def _p95(values: list[float]) -> float | None:
    """Return the 95th percentile of *values* rounded to 0.1, or None below 20 samples.

    ``statistics.quantiles(n=20)`` yields 19 cut points; the last one is the
    95th percentile. With fewer than 20 samples the figure is not reported.
    """
    if len(values) < 20:
        return None
    return round(statistics.quantiles(values, n=20)[-1], 1)


def _summarize(rows: list[dict]) -> dict:
    """Aggregate per-query rows into the summary dict written to the results file.

    Any metric that cannot be computed (no rows at all, no rows requiring a
    citation, too few samples for a p95) is reported as None instead of raising.
    """
    # Citation rate is measured only over queries whose problem type requires one.
    needs_cite = [r for r in rows if r["needs_cite"]]
    cite_rate = (
        sum(1 for r in needs_cite if r["cited"]) / len(needs_cite)
        if needs_cite
        else None
    )

    ret_ms = [r["ret_ms"] for r in rows]
    gen_ms = [r["gen_ms"] for r in rows]
    top_scores = [r["top_score"] for r in rows]

    # statistics.median/mean raise StatisticsError on empty data and the banned
    # rate would divide by zero, so guard every aggregate on having rows.
    has_rows = bool(rows)
    banned_rate = (
        sum(1 for r in rows if r["banned"]) / len(rows) if has_rows else None
    )

    return {
        "n": len(rows),
        "citation_rate": round(cite_rate, 3) if cite_rate is not None else None,
        "banned_rate": round(banned_rate, 3) if banned_rate is not None else None,
        "ret_ms_p50": round(statistics.median(ret_ms), 1) if has_rows else None,
        "ret_ms_p95": _p95(ret_ms),
        "gen_ms_p50": round(statistics.median(gen_ms), 1) if has_rows else None,
        "gen_ms_p95": _p95(gen_ms),
        "top_score_mean": round(statistics.mean(top_scores), 3) if has_rows else None,
    }


def main() -> None:
    """Run every test query through the pipeline, score it, and write a JSON report.

    Command-line arguments:
        --test  Path to a JSON file containing a list of test entries. Each
                entry must have a "query" key and may have "state" and
                "district" keys (both default to "").
        --out   Path of the JSON report to write; parent directories are
                created if missing.

    The report has the shape ``{"summary": {...}, "rows": [...]}``. Progress
    and the final summary are also printed to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--test", default="eval/test_queries.json")
    parser.add_argument("--out", default="eval/results.json")
    args = parser.parse_args()

    # Use a context manager so the test file is closed deterministically.
    with open(args.test, "r", encoding="utf-8") as fh:
        tests = json.load(fh)
    print(f"[eval] loaded {len(tests)} test queries")

    retriever = retr.get_retriever()
    golden = retr.get_golden_retriever()

    rows = []
    for t in tests:
        q = t["query"]
        crop = classify.detect_crop(q)
        ptype = classify.classify_problem(q)
        normalized = classify.normalize_query(q)

        # Retrieval latency covers only the multi_step_retrieve call.
        t0 = time.perf_counter()
        docs = retr.multi_step_retrieve(
            retriever, golden, q, normalized, crop, ptype,
            top_k=5,
            state=t.get("state", ""),
            district=t.get("district", ""),
            run_hyde=True,
        )
        ret_ms = (time.perf_counter() - t0) * 1000

        ctx = retr.KCCRetriever.format_context(docs)
        prm = prompt.build_prompt(
            q, ctx,
            problem_type=ptype,
            language=classify.detect_language(q),
            detected_crop=crop,
        )

        # Generation latency covers only the LLM call, not the citation review.
        t1 = time.perf_counter()
        ans = llmmod.generate(prm, max_tokens=400, temperature=0.1)
        gen_ms = (time.perf_counter() - t1) * 1000
        ans, _warnings = citation_guard.review(ans, problem_type=ptype)

        top_score = _top_score(docs)
        cited = bool(citation_guard.has_citations(ans))
        # Negation-aware: "do NOT use Endosulfan" is correct, not a leak.
        banned = citation_guard.banned_chemical_check(ans or "")

        rows.append({
            "query": q,
            "crop": crop,
            "problem_type": ptype,
            "top_score": round(top_score, 3),
            "ret_ms": round(ret_ms, 1),
            "gen_ms": round(gen_ms, 1),
            "cited": cited,
            "needs_cite": ptype in REQUIRES_CITATION,
            "banned": banned,
            "answer": ans,
        })
        print(f" {len(rows)}/{len(tests)} {q[:60]}... "
              f"top={top_score:.2f} ret={ret_ms:.0f}ms cited={cited}")

    # Aggregate
    summary = _summarize(rows)
    out = {"summary": summary, "rows": rows}

    Path(args.out).parent.mkdir(parents=True, exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

    print(f"\n[eval] summary: {json.dumps(summary, indent=2)}")
    print(f"[eval] wrote {args.out}")


if __name__ == "__main__":
    raise SystemExit(main())