import json import time from collections.abc import Iterator from _core.llm import LLMClient from _core.models import estimate_cost from _core.tracer import Step DEFAULT_DATASET = [ {"q": "What is the capital of France?", "ref": "Paris"}, {"q": "What is 2 + 2?", "ref": "4"}, {"q": "Who wrote 'Romeo and Juliet'?", "ref": "William Shakespeare"}, {"q": "What is the chemical symbol for water?", "ref": "H2O"}, {"q": "How many continents are there?", "ref": "7"}, ] SUT_PROMPT = "Answer the question concisely." JUDGE_PROMPT = ( "You are a strict grader. Given a question, a reference answer, and a candidate " 'answer, reply with JSON {"score": 1-5, "pass": bool, "reason": str}. ' "Output ONLY JSON." ) class EvalHarness: def __init__( self, llm: LLMClient, dataset: list[dict] | None = None, pass_threshold: int = 4 ): self.llm = llm self.dataset = dataset if dataset is not None else DEFAULT_DATASET self.pass_threshold = pass_threshold def _ask(self, system: str, user: str): start = time.monotonic() resp = self.llm.chat( [{"role": "system", "content": system}, {"role": "user", "content": user}] ) latency = int((time.monotonic() - start) * 1000) cost = estimate_cost(self.llm.model, resp.prompt_tokens, resp.completion_tokens) return resp, latency, cost def run(self, _user_input: str = "") -> Iterator[Step]: scores: list[int] = [] for case in self.dataset: aresp, alat, acost = self._ask(SUT_PROMPT, case["q"]) yield Step(kind="action", content=f"Q: {case['q']}") yield Step( kind="observation", content=f"Answer: {aresp.content}", tokens=aresp.prompt_tokens + aresp.completion_tokens, cost_usd=acost, latency_ms=alat, ) jresp, jlat, jcost = self._ask( JUDGE_PROMPT, f"Question: {case['q']}\nReference: {case['ref']}\nCandidate: {aresp.content}", ) try: verdict = json.loads(jresp.content or "{}") except json.JSONDecodeError: verdict = {"score": 0, "pass": False, "reason": "unparseable judge output"} scores.append(int(verdict.get("score", 0))) yield Step( kind="thought", content=( f"Judge: score={verdict.get('score')} pass={verdict.get('pass')} " f"— {verdict.get('reason', '')}" ), tokens=jresp.prompt_tokens + jresp.completion_tokens, cost_usd=jcost, latency_ms=jlat, ) avg = sum(scores) / len(scores) if scores else 0.0 passed = sum(1 for s in scores if s >= self.pass_threshold) yield Step( kind="final", content=( f"Avg score: {avg:.2f}/5 · Passed {passed}/{len(scores)} " f"(threshold {self.pass_threshold})" ), )