Spaces:

SachinKulk
/

aker-property-ai

Sleeping

Aker Deploy committed on May 25

Commit

af01fe3

1 Parent(s): 24199fe

feat(obs+evals): mirror main commits f8f81de + 1132c64

Adds Phoenix Cloud observability, RAG eval harness (golden set, runner,
scorer, scheduler, store), /evals/* admin router, and dependency bumps.
Brings the HF Space deployment up to parity with the main repo.

Files changed (14) hide show

README.md +30 -0
app/config.py +19 -0
app/evals/__init__.py +1 -0
app/evals/api.py +132 -0
app/evals/golden_set.yaml +107 -0
app/evals/runner.py +230 -0
app/evals/scheduler.py +98 -0
app/evals/scorer.py +172 -0
app/evals/store.py +119 -0
app/main.py +14 -1
app/models.py +38 -2
app/observability.py +124 -0
app/tools/rag_tools.py +24 -0
requirements.txt +19 -0

README.md CHANGED Viewed

@@ -45,3 +45,33 @@ on the Space and injected as env vars at runtime:
 - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`
 - `ADMIN_TOKEN`
 - `CORS_ORIGINS` — comma-separated list of allowed Vercel origins

 - `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`
 - `ADMIN_TOKEN`
 - `CORS_ORIGINS` — comma-separated list of allowed Vercel origins
+## Observability & evaluation
+Tracing uses **Phoenix Cloud** (free hosted tier — `https://app.phoenix.arize.com`) via the OpenTelemetry `BatchSpanProcessor`. Spans ship on a background thread, so `/chat` latency is unaffected even when the network or Phoenix is down. Tracing is **opt-in**: set `PHOENIX_ENABLED=true` and `PHOENIX_API_KEY=<key>` in env to turn it on.
+Auto-instrumented:
+- FastAPI routes
+- LangChain / LangGraph nodes, tools, LLM calls (token counts included)
+- OpenAI, Anthropic, Google GenAI client SDKs
+- Pinecone retrieval (manual span in `tools/rag_tools.py`, OpenInference RETRIEVER kind)
+### Evaluation harness
+`open_rag_eval` (Vectara, Apache-2.0) scores RAG turns for **groundedness**, **hallucination**, **answer relevance**, and **context relevance**, judged by `gpt-4o-mini` (set via `EVAL_JUDGE_MODEL`). Evals **never** run inline on `/chat` — they are fully out-of-band.
+Triggers:
+- **Manual** — UI: open the Monitoring tab in the frontend, enter your `ADMIN_TOKEN`, pick cases (or "Run all"), click run. API: `POST /evals/runs` with header `X-Admin-Token`.
+- **CLI** — `python -m app.evals.runner [--ids id1,id2]`.
+- **Scheduled** — opt-in via `EVAL_SCHEDULE_ENABLED=true` with `EVAL_SCHEDULE_CRON="0 */6 * * *"` (default every 6 h). Runs on an APScheduler `BackgroundScheduler` (single-worker thread pool, coalesce, max_instances=1).
+Results land in:
+- **Supabase Postgres** — tables `eval_runs` + `eval_cases` (created automatically by `init_db()`). Run history surfaced via the Monitoring UI.
+- JSONL snapshots at `backend/evals/results/<timestamp>_<run_id>.jsonl` (directory overridable via `EVAL_RESULTS_DIR`)
+- Phoenix Cloud traces under the project `property-ai` in the `aker-ai` space (each case is its own trace; eval scores attached as span attributes)
+Extra env:
+- `PHOENIX_ENABLED`, `PHOENIX_API_KEY`, `PHOENIX_ENDPOINT`, `PHOENIX_PROJECT_NAME`
+- `EVAL_JUDGE_MODEL` (default `gpt-4o-mini`), `EVAL_SCHEDULE_ENABLED`, `EVAL_SCHEDULE_CRON`, `EVAL_MAX_CASES` (default 50)
+Edit the golden set at [`app/evals/golden_set.yaml`](app/evals/golden_set.yaml).

app/config.py CHANGED Viewed

@@ -72,6 +72,25 @@ class Settings:
     backend_host: str = os.getenv("BACKEND_HOST", "0.0.0.0")
     backend_port: int = int(os.getenv("BACKEND_PORT", "8000"))
     @property
     def sqlalchemy_url(self) -> str:
         return self.database_url

     backend_host: str = os.getenv("BACKEND_HOST", "0.0.0.0")
     backend_port: int = int(os.getenv("BACKEND_PORT", "8000"))
+    # Observability — Phoenix Cloud (hosted, free tier). Defaults off so the
+    # app still boots when keys aren't set. Set PHOENIX_ENABLED=true and
+    # PHOENIX_API_KEY in prod env. Export is non-blocking (BatchSpanProcessor
+    # ships spans from a background thread).
+    phoenix_enabled: bool = (os.getenv("PHOENIX_ENABLED", "false").strip().lower() == "true")
+    phoenix_endpoint: str = os.getenv("PHOENIX_ENDPOINT", "https://app.phoenix.arize.com/s/aker-ai/v1/traces")
+    phoenix_api_key: str | None = os.getenv("PHOENIX_API_KEY") or None
+    phoenix_project_name: str = os.getenv("PHOENIX_PROJECT_NAME", "property-ai")
+    # Evaluation harness — never runs inline on /chat. Manual via UI/API,
+    # or scheduled via APScheduler. open_rag_eval judge model.
+    eval_judge_model: str = os.getenv("EVAL_JUDGE_MODEL", "gpt-4o-mini")
+    eval_schedule_enabled: bool = (os.getenv("EVAL_SCHEDULE_ENABLED", "false").strip().lower() == "true")
+    eval_schedule_cron: str = os.getenv("EVAL_SCHEDULE_CRON", "0 */6 * * *")
+    eval_max_cases: int = int(os.getenv("EVAL_MAX_CASES", "50"))
+    # Eval runs are persisted to Supabase Postgres (via SQLAlchemy / app.db).
+    # Local JSONL snapshots are still written for grep-friendly debugging.
+    eval_results_dir: str = os.getenv("EVAL_RESULTS_DIR", str(_BACKEND_DIR / "evals" / "results"))
     @property
     def sqlalchemy_url(self) -> str:
         return self.database_url

app/evals/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ """Evaluation harness — manual + scheduled, never inline on /chat."""

app/evals/api.py ADDED Viewed

	@@ -0,0 +1,132 @@

+"""Admin-protected FastAPI router exposing the eval harness to the UI."""
+from __future__ import annotations
+import uuid
+from typing import Any
+from fastapi import APIRouter, BackgroundTasks, Header, HTTPException
+from pydantic import BaseModel, Field
+from ..config import get_settings
+from . import runner, scheduler, store
+router = APIRouter(prefix="/evals", tags=["evals"])
def _require_admin(token: str | None) -> None:
    """Reject the request unless ``token`` matches the configured admin token.

    Args:
        token: Value of the ``X-Admin-Token`` header, or ``None`` when absent.

    Raises:
        HTTPException: 503 when the server has no admin token configured,
            401 when the supplied token is missing or does not match.
    """
    import hmac

    expected = get_settings().admin_token
    if not expected:
        raise HTTPException(status_code=503, detail="ADMIN_TOKEN not configured on server")
    # Constant-time comparison so response timing does not leak how much of
    # the token matched. Compare bytes: compare_digest rejects non-ASCII str.
    supplied = (token or "").encode("utf-8")
    if not hmac.compare_digest(supplied, str(expected).encode("utf-8")):
        raise HTTPException(status_code=401, detail="Invalid admin token")
+# ---------------------------------------------------------------------------
+# Golden set
+# ---------------------------------------------------------------------------
@router.get("/golden")
def get_golden(x_admin_token: str | None = Header(default=None, alias="X-Admin-Token")) -> list[dict]:
    """List the golden cases, reduced to id, question and property code.

    The admin token is checked before the golden set is loaded.
    """
    _require_admin(x_admin_token)
    listing: list[dict] = []
    for case in runner.load_golden():
        listing.append(
            {
                "id": case.get("id"),
                "question": case.get("question"),
                "property_code": case.get("property_code"),
            }
        )
    return listing
+# ---------------------------------------------------------------------------
+# Runs
+# ---------------------------------------------------------------------------
class RunRequest(BaseModel):
    # Body of POST /evals/runs.
    # `ids` restricts the run to the listed golden IDs; None leaves it unrestricted.
    ids: list[str] | None = Field(
        default=None,
        description="Subset of golden IDs to run",
    )
    # Forwarded to the runner as `llm_provider`.
    provider: str = "openai"
    # Forwarded to the runner as `model`; None lets the runner pick its default.
    model: str | None = None
@router.post("/runs")
def trigger_run(
    req: RunRequest,
    background: BackgroundTasks,
    x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
) -> dict:
    """Start a manual eval run in the background and return its id right away."""
    _require_admin(x_admin_token)
    new_run_id = str(uuid.uuid4())
    # Create the row up front so the run is visible before the task starts.
    store.create_run(new_run_id, trigger="manual")
    task_kwargs = {
        "run_id": new_run_id,
        "golden_ids": req.ids,
        "provider": req.provider,
        "model": req.model,
    }
    background.add_task(_safe_run, **task_kwargs)
    return {"run_id": new_run_id, "status": "started"}
def _safe_run(*, run_id: str, golden_ids: list[str] | None, provider: str, model: str | None) -> None:
    """Background-task wrapper around ``runner.run_eval`` that never raises.

    Args:
        run_id: Id of the run row to execute.
        golden_ids: Optional subset of golden case ids.
        provider: LLM provider name forwarded to the runner.
        model: Optional model name forwarded to the runner.

    Any exception from the run is logged and the run row is marked ``failed``.
    """
    import logging

    log = logging.getLogger("property_ai.evals.api")
    try:
        result = runner.run_eval(
            run_id=run_id,
            golden_ids=golden_ids,
            trigger="manual",
            llm_provider=provider,
            model=model,
        )
        # run_eval returns status "empty" without touching the store when no
        # golden case matches. Close the row here, otherwise it would be
        # reported as "running" indefinitely.
        if isinstance(result, dict) and result.get("status") == "empty":
            store.finish_run(run_id, status="empty", summary=result.get("summary") or {"count": 0})
    except Exception as e:  # noqa: BLE001
        log.exception("background eval run failed")
        try:
            store.finish_run(run_id, status="failed", summary={"error": f"{type(e).__name__}: {e}"})
        except Exception:  # noqa: BLE001
            # Best effort only, but leave a trace instead of failing silently.
            log.exception("could not mark eval run %s as failed", run_id)
@router.get("/runs")
def list_runs(
    limit: int = 50,
    x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
) -> list[dict]:
    """Return the most recent eval runs, newest first.

    Args:
        limit: Maximum number of runs to return; clamped to the range 1..500.
        x_admin_token: Admin token from the ``X-Admin-Token`` header.
    """
    _require_admin(x_admin_token)
    # Clamp so a negative or very large value never reaches the SQL LIMIT.
    bounded_limit = max(1, min(limit, 500))
    return store.list_runs(limit=bounded_limit)
@router.get("/runs/{run_id}")
def get_run(
    run_id: str,
    x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
) -> dict:
    """Return one stored run as provided by the store; 404 when it is unknown."""
    _require_admin(x_admin_token)
    record = store.get_run(run_id)
    if record:
        return record
    raise HTTPException(status_code=404, detail="run not found")
+# ---------------------------------------------------------------------------
+# Schedule
+# ---------------------------------------------------------------------------
class ScheduleUpdate(BaseModel):
    # Body of PUT /evals/schedule; `cron` is required.
    cron: str = Field(
        ...,
        description="Crontab expression, e.g. '0 */6 * * *'",
    )
@router.get("/schedule")
def get_schedule(x_admin_token: str | None = Header(default=None, alias="X-Admin-Token")) -> dict[str, Any]:
    """Return the scheduler status dict after checking the admin token."""
    _require_admin(x_admin_token)
    current_status = scheduler.get_status()
    return current_status
@router.put("/schedule")
def put_schedule(
    body: ScheduleUpdate,
    x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
) -> dict[str, Any]:
    """Reschedule the periodic eval job to a new crontab expression.

    Raises:
        HTTPException: 409 when the scheduler raises ``RuntimeError`` (it is
            not running), 400 for any other failure, reported as invalid cron.
    """
    _require_admin(x_admin_token)
    try:
        return scheduler.update_cron(body.cron)
    except RuntimeError as e:
        # Chain the cause so the original traceback is kept in server logs.
        raise HTTPException(status_code=409, detail=str(e)) from e
    except Exception as e:  # noqa: BLE001
        raise HTTPException(status_code=400, detail=f"invalid cron: {e}") from e

app/evals/golden_set.yaml ADDED Viewed

	@@ -0,0 +1,107 @@

+# Curated regression set. Each case is one chat turn against a known property.
+# `expected_substrings` are loose checks (case-insensitive) used as a smoke
+# signal alongside open_rag_eval's LLM-judge scores.
+cases:
+  - id: 115r-summary
+    property_code: 115r
+    question: "Give me a summary of this property."
+    expected_substrings: ["unit"]
+    expected_tools: ["get_property_summary"]
+  - id: 115r-unit-mix
+    property_code: 115r
+    question: "What is the unit mix?"
+    expected_substrings: ["bedroom"]
+    expected_tools: ["get_unit_mix"]
+  - id: 115r-occupancy
+    property_code: 115r
+    question: "What is the current occupancy?"
+    expected_substrings: ["%"]
+    expected_tools: ["get_occupancy"]
+  - id: 115r-expiring
+    property_code: 115r
+    question: "List leases expiring in the next 90 days."
+    expected_substrings: []
+    expected_tools: ["get_expiring_leases"]
+  - id: 115r-top-balances
+    property_code: 115r
+    question: "Who are the top outstanding balances?"
+    expected_substrings: []
+    expected_tools: ["get_top_balances"]
+  - id: 115r-amenities
+    property_code: 115r
+    question: "What amenities does this property offer?"
+    expected_substrings: []
+    expected_tools: ["search_property_pages"]
+  - id: 115r-gym-photos
+    property_code: 115r
+    question: "Show me photos of the gym."
+    expected_substrings: []
+    expected_tools: ["search_property_pages"]
+  - id: 115r-floor-plans
+    property_code: 115r
+    question: "What floor plans are available?"
+    expected_substrings: []
+    expected_tools: ["search_property_pages"]
+  - id: 126r-summary
+    property_code: 126r
+    question: "Give me a summary of this property."
+    expected_substrings: ["unit"]
+    expected_tools: ["get_property_summary"]
+  - id: 126r-unit-mix
+    property_code: 126r
+    question: "Break down the unit mix."
+    expected_substrings: ["bedroom"]
+    expected_tools: ["get_unit_mix"]
+  - id: 126r-occupancy
+    property_code: 126r
+    question: "What is the occupancy rate?"
+    expected_substrings: ["%"]
+    expected_tools: ["get_occupancy"]
+  - id: 126r-rent-trend
+    property_code: 126r
+    question: "Show me the rent trend over the last 6 months."
+    expected_substrings: []
+    expected_tools: ["get_rent_trend"]
+  - id: 126r-amenities
+    property_code: 126r
+    question: "What amenities are available?"
+    expected_substrings: []
+    expected_tools: ["search_property_pages"]
+  - id: 126r-neighborhood
+    property_code: 126r
+    question: "Tell me about the neighborhood."
+    expected_substrings: []
+    expected_tools: ["search_property_pages"]
+  - id: 126r-pool-photo
+    property_code: 126r
+    question: "Do you have a photo of the pool?"
+    expected_substrings: []
+    expected_tools: ["search_property_pages"]
+  - id: 115r-out-of-scope-stock
+    property_code: 115r
+    question: "What is the current price of AAPL stock?"
+    expected_substrings: []
+    expected_tools: []
+    notes: "out of scope — should refuse or redirect"
+  - id: 115r-out-of-scope-other-property
+    property_code: 115r
+    question: "How does The Hamlet at Saratoga Springs compare?"
+    expected_substrings: []
+    expected_tools: []
+    notes: "scope-violation — assistant should stay on 115r"

app/evals/runner.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""Run the golden set against the live graph and score each turn.
+Sync function — safe to call from a background thread (APScheduler) or from
+FastAPI's `BackgroundTasks`. NEVER call from within a `/chat` request path.
+Phoenix Cloud spans are emitted naturally by the LangChain instrumentor —
+each case becomes its own trace because we pass a fresh `conversation_id`.
+"""
+from __future__ import annotations
+import json
+import logging
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+import yaml
+from ..config import get_settings
+from ..graph.build import run_chat
+from ..observability import get_tracer
+from . import scorer, store
+log = logging.getLogger("property_ai.evals.runner")
+_tracer = get_tracer("property_ai.evals")
+_GOLDEN_PATH = Path(__file__).resolve().parent / "golden_set.yaml"
def load_golden() -> list[dict[str, Any]]:
    """Read the golden-set YAML and return its ``cases`` entries as a list.

    An empty document or a missing/empty ``cases`` key yields ``[]``.
    """
    with _GOLDEN_PATH.open("r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)
    if not parsed:
        parsed = {}
    cases = parsed.get("cases")
    return list(cases) if cases else []
def run_eval(
    *,
    run_id: str | None = None,
    golden_ids: list[str] | None = None,
    trigger: str = "manual",
    llm_provider: str = "openai",
    model: str | None = None,
) -> dict[str, Any]:
    """Run the golden set (or a subset) through the chat graph and score it.

    Args:
        run_id: Id to record the run under; a UUID is generated when omitted.
        golden_ids: Optional subset of golden case ids; falsy means all cases.
        trigger: Label stored with the run (e.g. "manual", "cli", "scheduled").
        llm_provider: Provider name passed to the chat graph.
        model: Model passed to the chat graph; defaults to the configured
            ``eval_judge_model`` when omitted.

    Returns:
        ``{"run_id", "status", "summary"}``. Status is "empty" when no case
        matched (nothing is written to the store on that path), otherwise
        "completed".
    """
    settings = get_settings()
    cases = load_golden()
    if golden_ids:
        wanted = set(golden_ids)
        cases = [c for c in cases if c.get("id") in wanted]
    if not cases:
        return {"run_id": run_id, "status": "empty", "summary": {"count": 0}}
    cases = cases[: settings.eval_max_cases]
    run_id = run_id or str(uuid.uuid4())
    model = model or settings.eval_judge_model
    store.create_run(run_id, trigger)
    log.info("eval run %s started (trigger=%s, cases=%d)", run_id, trigger, len(cases))
    results: list[dict[str, Any]] = []
    with _tracer.start_as_current_span("eval.run") as run_span:
        run_span.set_attribute("eval.run_id", run_id)
        run_span.set_attribute("eval.trigger", trigger)
        run_span.set_attribute("eval.case_count", len(cases))
        for case in cases:
            case_result = _run_one_case(run_id=run_id, case=case, provider=llm_provider, model=model)
            results.append(case_result)
            # Persist each case as it finishes so partial progress is kept.
            store.add_case(
                run_id=run_id,
                golden_id=case.get("id", ""),
                property_code=case.get("property_code"),
                question=case.get("question", ""),
                answer=case_result.get("answer"),
                scores=case_result.get("scores"),
                ok=case_result.get("ok", False),
                error=case_result.get("error"),
                duration_ms=case_result.get("duration_ms"),
                trace_id=case_result.get("trace_id"),
            )
    summary = _summarize(results)
    store.finish_run(run_id, status="completed", summary=summary)
    # The JSONL snapshot is a debugging aid. The run is already recorded as
    # completed, so a write failure must not propagate and let a caller
    # re-mark the run as failed.
    try:
        _write_jsonl(run_id, results, summary)
    except Exception:  # noqa: BLE001
        log.exception("eval run %s: could not write JSONL snapshot", run_id)
    log.info("eval run %s finished — %s", run_id, summary)
    return {"run_id": run_id, "status": "completed", "summary": summary}
def _run_one_case(*, run_id: str, case: dict[str, Any], provider: str, model: str) -> dict[str, Any]:
    """Execute one golden case through the chat graph, score it, and report.

    A graph failure is captured in the ``error`` field rather than raised;
    scoring is skipped in that case. The whole case runs inside an
    ``eval.case`` span, and numeric scores are attached to that span.
    """
    case_id = case.get("id", "")
    question = case.get("question", "")
    property_code = case.get("property_code")
    started = time.monotonic()
    answer: str | None = None
    tool_history: list[dict] = []
    failure: str | None = None
    trace_id: str | None = None
    scores: dict[str, Any] | None = None

    with _tracer.start_as_current_span("eval.case") as span:
        span.set_attribute("eval.case_id", case_id)
        span.set_attribute("eval.property_code", property_code or "")

        # Best-effort capture of the trace id; any problem leaves it as None.
        try:
            span_ctx = span.get_span_context()
            if span_ctx and span_ctx.trace_id:
                trace_id = format(span_ctx.trace_id, "032x")
        except Exception:  # noqa: BLE001
            trace_id = None

        try:
            chat_args = {
                "property_code": property_code,
                "user_message": question,
                "llm_provider": provider,
                "model": model,
                "conversation_id": f"eval-{run_id}-{case_id}",
            }
            state = run_chat(**chat_args)
            # The graph may pause for a property or time-scope clarification.
            # Answer automatically so one case exercises the full agent loop:
            # "Latest" for a time question, otherwise the first option offered.
            # At most two resumes, to avoid looping.
            for _attempt in range(2):
                if not state.get("paused"):
                    break
                clarification = state.get("clarification") or {}
                scope_kind = clarification.get("scope_kind")
                offered = clarification.get("options") or []
                if scope_kind == "time":
                    pick = "Latest"
                elif offered:
                    pick = offered[0]
                else:
                    break
                state = run_chat(**chat_args, resume_value=pick)
            answer = state.get("answer_markdown") or ""
            tool_history = state.get("tool_history") or []
        except Exception as exc:  # noqa: BLE001
            failure = f"{type(exc).__name__}: {exc}"
            log.exception("eval case %s failed in graph", case_id)

        if failure is None:
            contexts = scorer.contexts_from_tool_history(tool_history)
            scores = scorer.score_turn(question, answer or "", contexts)
            for metric in ("groundedness", "hallucination", "answer_relevance", "context_relevance"):
                value = scores.get(metric)
                if isinstance(value, (int, float)):
                    span.set_attribute(f"eval.{metric}", float(value))

    return {
        "case_id": case_id,
        "property_code": property_code,
        "question": question,
        "answer": answer,
        "scores": scores,
        "ok": failure is None,
        "error": failure,
        "duration_ms": int((time.monotonic() - started) * 1000),
        "trace_id": trace_id,
        "tool_calls": [step.get("tool") for step in tool_history if isinstance(step, dict)],
        "expected_tools": case.get("expected_tools") or [],
        "expected_substrings": case.get("expected_substrings") or [],
        "substring_hits": _substring_hits(answer or "", case.get("expected_substrings") or []),
    }
+def _substring_hits(text: str, needles: list[str]) -> int:
+    t = (text or "").lower()
+    return sum(1 for n in needles if n and n.lower() in t)
+def _summarize(results: list[dict[str, Any]]) -> dict[str, Any]:
+    n = len(results)
+    if n == 0:
+        return {"count": 0}
+    ok_n = sum(1 for r in results if r.get("ok"))
+    out: dict[str, Any] = {
+        "count": n,
+        "ok_count": ok_n,
+        "error_count": n - ok_n,
+    }
+    for k in ("groundedness", "hallucination", "answer_relevance", "context_relevance"):
+        vals = [(r.get("scores") or {}).get(k) for r in results]
+        vals = [float(v) for v in vals if isinstance(v, (int, float))]
+        if vals:
+            out[f"mean_{k}"] = round(sum(vals) / len(vals), 3)
+            out[f"min_{k}"] = round(min(vals), 3)
+    return out
def _write_jsonl(run_id: str, results: list[dict[str, Any]], summary: dict[str, Any]) -> None:
    """Write a JSONL snapshot of the run into the configured results directory.

    The first line holds the summary and run id; each following line is one
    case result. The directory is created if it does not exist.
    """
    results_dir = Path(get_settings().eval_results_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    target = results_dir / f"{stamp}_{run_id}.jsonl"
    header = {"_summary": summary, "run_id": run_id}
    with target.open("w", encoding="utf-8") as sink:
        for record in (header, *results):
            sink.write(json.dumps(record, default=str) + "\n")
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
def _main() -> None:
    """CLI entry point: parse arguments, run the eval, print the result as JSON."""
    import argparse

    parser = argparse.ArgumentParser(description="Run the RAG eval golden set.")
    parser.add_argument("--ids", help="Comma-separated golden case IDs (default: all).")
    parser.add_argument("--provider", default="openai")
    parser.add_argument("--model", default=None)
    opts = parser.parse_args()

    wanted: list[str] = []
    for piece in (opts.ids or "").split(","):
        piece = piece.strip()
        if piece:
            wanted.append(piece)

    report = run_eval(
        golden_ids=wanted or None,
        trigger="cli",
        llm_provider=opts.provider,
        model=opts.model,
    )
    print(json.dumps(report, indent=2, default=str))
+if __name__ == "__main__":
+    _main()

app/evals/scheduler.py ADDED Viewed

	@@ -0,0 +1,98 @@

+"""APScheduler-based periodic eval runner.
+Runs in the FastAPI process on a separate thread pool (max_workers=1,
+coalesce=True, max_instances=1) so overlapping runs can't pile up and the
+request path is never affected. Opt-in via EVAL_SCHEDULE_ENABLED.
+"""
+from __future__ import annotations
+import logging
+import uuid
+from typing import Any
+from ..config import get_settings
+log = logging.getLogger("property_ai.evals.scheduler")
+_scheduler: Any = None
+_job_id = "eval_golden_set"
def start_scheduler() -> None:
    """Start the background eval scheduler if it is enabled and not yet running.

    Does nothing (beyond logging) when scheduling is disabled, when a
    scheduler already exists, when APScheduler cannot be imported, or when
    the configured cron expression is invalid.
    """
    global _scheduler
    settings = get_settings()
    if not settings.eval_schedule_enabled:
        log.info("eval scheduler disabled (EVAL_SCHEDULE_ENABLED=false)")
        return
    if _scheduler is not None:
        return
    try:
        from apscheduler.schedulers.background import BackgroundScheduler
        from apscheduler.triggers.cron import CronTrigger
        from apscheduler.executors.pool import ThreadPoolExecutor
    except Exception as e:  # noqa: BLE001
        log.warning("APScheduler not available; scheduling disabled: %s", e)
        return
    # Validate the cron expression before building anything.
    try:
        trigger = CronTrigger.from_crontab(settings.eval_schedule_cron, timezone="UTC")
    except Exception as e:  # noqa: BLE001
        log.warning("invalid EVAL_SCHEDULE_CRON=%r (%s); scheduler not started", settings.eval_schedule_cron, e)
        return
    sched = BackgroundScheduler(
        executors={"default": ThreadPoolExecutor(max_workers=1)},
        job_defaults={"coalesce": True, "max_instances": 1, "misfire_grace_time": 300},
        timezone="UTC",
    )
    sched.add_job(_run_scheduled, trigger=trigger, id=_job_id, replace_existing=True)
    sched.start()
    # Publish the global only after start() succeeded, so a failure above
    # cannot leave a scheduler that never started reported as running.
    _scheduler = sched
    log.info("eval scheduler started — cron=%r", settings.eval_schedule_cron)
def stop_scheduler() -> None:
    """Shut the scheduler down without waiting for jobs and clear the global.

    A shutdown error is logged as a warning; the global is cleared regardless.
    """
    global _scheduler
    if _scheduler is not None:
        try:
            _scheduler.shutdown(wait=False)
        except Exception as exc:  # noqa: BLE001
            log.warning("scheduler shutdown error: %s", exc)
        _scheduler = None
def get_status() -> dict[str, Any]:
    """Report the schedule settings, whether a scheduler exists, and the next run.

    ``next_run_at`` is an ISO-8601 string when the job has a next run time,
    otherwise ``None``.
    """
    settings = get_settings()
    next_run_at = None
    if _scheduler is not None:
        job = _scheduler.get_job(_job_id)
        if job and job.next_run_time:
            next_run_at = job.next_run_time.isoformat()
    return {
        "enabled": settings.eval_schedule_enabled,
        "cron": settings.eval_schedule_cron,
        "running": _scheduler is not None,
        "next_run_at": next_run_at,
    }
def update_cron(cron: str) -> dict[str, Any]:
    """Move the existing eval job onto a new crontab expression.

    Returns the refreshed status dict. Raises ``RuntimeError`` when no
    scheduler is running.
    """
    if _scheduler is None:
        raise RuntimeError("scheduler not running")
    from apscheduler.triggers.cron import CronTrigger

    new_trigger = CronTrigger.from_crontab(cron, timezone="UTC")
    _scheduler.reschedule_job(_job_id, trigger=new_trigger)
    # Keep the in-memory setting in step (process-local; not persisted to .env).
    settings = get_settings()
    settings.eval_schedule_cron = cron
    return get_status()
def _run_scheduled() -> None:
    """Scheduler job body: run the golden set under a fresh run id, never raise."""
    # Lazy import so a scheduler tick cannot fail at module-import time.
    from . import runner

    try:
        scheduled_run_id = str(uuid.uuid4())
        runner.run_eval(run_id=scheduled_run_id, trigger="scheduled")
    except Exception as exc:  # noqa: BLE001
        log.exception("scheduled eval run failed: %s", exc)

app/evals/scorer.py ADDED Viewed

	@@ -0,0 +1,172 @@

+"""RAG scoring — uses open_rag_eval when its TRECEvaluator API is available,
+falls back to a direct OpenAI LLM-judge so the harness still produces numbers
+when the library surface drifts between releases.
+Returns a dict with four 0-1 scores: groundedness, hallucination,
+answer_relevance, context_relevance.
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+from typing import Any
+from ..config import get_settings
+log = logging.getLogger("property_ai.evals.scorer")
+_JUDGE_SYSTEM = """You are a strict RAG evaluator. You will receive a question,
+an answer the system produced, and the retrieved context passages it had
+available. Score four metrics from 0.0 to 1.0 (higher is better):
+- groundedness:        is every factual claim in the answer supported by the context?
+- hallucination:       1.0 means NO hallucination (answer makes nothing up). 0.0 means major fabrications.
+- answer_relevance:    does the answer address the question asked?
+- context_relevance:   are the retrieved contexts useful for answering the question?
+Reply with ONLY a compact JSON object: {"groundedness": x, "hallucination": x,
+"answer_relevance": x, "context_relevance": x, "rationale": "<one short line>"}.
+No prose, no markdown fences."""
def _llm_judge_score(question: str, answer: str, contexts: list[str]) -> dict[str, Any]:
    """Score one turn by asking an OpenAI chat model to act as judge.

    Args:
        question: The user question.
        answer: The answer the system produced.
        contexts: Retrieved context passages; joined and truncated to
            12000 characters before being sent.

    Returns:
        A dict with the four metric scores (each clamped to 0..1, or ``None``
        when absent or unparseable), a ``rationale`` string of at most 300
        characters, and a ``judge`` label.
    """
    from openai import OpenAI

    settings = get_settings()
    client = OpenAI(api_key=settings.openai_api_key)
    ctx_blob = "\n\n---\n\n".join(contexts) if contexts else "(no contexts retrieved)"
    user = (
        f"QUESTION:\n{question}\n\n"
        f"ANSWER:\n{answer}\n\n"
        f"RETRIEVED CONTEXTS:\n{ctx_blob[:12000]}"
    )
    resp = client.chat.completions.create(
        model=settings.eval_judge_model,
        temperature=0.0,
        messages=[
            {"role": "system", "content": _JUDGE_SYSTEM},
            {"role": "user", "content": user},
        ],
        response_format={"type": "json_object"},
    )
    raw = resp.choices[0].message.content or "{}"
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        data = {}
    # Valid JSON that is not an object (list, number, ...) has no .get();
    # treat it the same as an unparseable reply.
    if not isinstance(data, dict):
        data = {}
    rationale = data.get("rationale")
    if not isinstance(rationale, str):
        # Coerce so the slice below always yields a string.
        rationale = str(rationale or "")
    return {
        "groundedness":      _clamp01(data.get("groundedness")),
        "hallucination":     _clamp01(data.get("hallucination")),
        "answer_relevance":  _clamp01(data.get("answer_relevance")),
        "context_relevance": _clamp01(data.get("context_relevance")),
        "rationale":         rationale[:300],
        "judge":             f"openai/{settings.eval_judge_model}",
    }
def _try_open_rag_eval(question: str, answer: str, contexts: list[str]) -> dict[str, Any] | None:
    """Best-effort scoring through open_rag_eval's TRECEvaluator.

    Returns ``None`` when the library cannot be imported or the call fails,
    so the caller can fall back to direct LLM judging.
    """
    try:
        from open_rag_eval.evaluators.trec_evaluator import TRECEvaluator  # type: ignore
        from open_rag_eval.models.openai_model import OpenAIModel  # type: ignore
        from open_rag_eval.rag_types import RAGResult  # type: ignore
    except Exception as e:  # noqa: BLE001
        log.info("open_rag_eval not importable (%s); using direct judge", e)
        return None
    settings = get_settings()
    try:
        model = OpenAIModel(name=settings.eval_judge_model, api_key=settings.openai_api_key)
        evaluator = TRECEvaluator(model=model)
        result = RAGResult(
            query=question,
            generated_answer=answer,
            retrieved_contexts=contexts or [""],
        )
        scored = evaluator.evaluate_single(result)
        scores = getattr(scored, "scores", None) or {}

        def _first_present(*names: str) -> Any:
            # Pick the first key that is actually present. Chaining with `or`
            # would discard a legitimate score of 0.0 and fall through.
            for name in names:
                value = scores.get(name)
                if value is not None:
                    return value
            return None

        # open_rag_eval metric names differ across releases — map best-effort.
        return {
            "groundedness":      _clamp01(_first_present("groundedness", "umbrela")),
            "hallucination":     _clamp01(_first_present("hallucination", "hhem")),
            "answer_relevance":  _clamp01(scores.get("answer_relevance")),
            "context_relevance": _clamp01(scores.get("context_relevance")),
            "rationale":         "open_rag_eval/TRECEvaluator",
            "judge":             f"open_rag_eval+{settings.eval_judge_model}",
            "raw":               {k: float(v) for k, v in scores.items() if _is_number(v)},
        }
    except Exception as e:  # noqa: BLE001
        log.warning("open_rag_eval scoring failed, falling back: %s", e)
        return None
def score_turn(question: str, answer: str, retrieved_contexts: list[str]) -> dict[str, Any]:
    """Score a single RAG turn, reporting failures in an ``error`` field.

    Tries open_rag_eval first and falls back to the direct LLM judge when
    that returns ``None``. Without an OpenAI key no scoring is attempted.
    """
    settings = get_settings()
    if not settings.openai_api_key:
        return {"error": "OPENAI_API_KEY not set; cannot score"}
    try:
        scored = _try_open_rag_eval(question, answer, retrieved_contexts)
        if scored is not None:
            return scored
        return _llm_judge_score(question, answer, retrieved_contexts)
    except Exception as exc:  # noqa: BLE001
        log.exception("scoring failed")
        return {"error": f"{type(exc).__name__}: {exc}"}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _clamp01(v: Any) -> float | None:
+    if v is None:
+        return None
+    try:
+        f = float(v)
+    except (TypeError, ValueError):
+        return None
+    if f != f:  # NaN
+        return None
+    return max(0.0, min(1.0, f))
+def _is_number(v: Any) -> bool:
+    return isinstance(v, (int, float)) and not isinstance(v, bool)
def contexts_from_tool_history(tool_history: list[dict]) -> list[str]:
    """Extract retrieved text contexts from a graph turn's tool_history.

    RAG tools (`search_property_pages` / `search_property_active`) contribute
    the stripped text of each chunk. Any other tool contributes its result
    serialized as JSON (at most 2000 characters), prefixed with the tool name.

    Malformed entries are skipped rather than raising: steps that are not
    dicts, results that are not dicts after decoding, chunks that are not
    dicts, and chunk text that is not a string.
    """
    out: list[str] = []
    for step in tool_history or []:
        if not isinstance(step, dict):
            continue
        tool = step.get("tool") or ""
        result = step.get("result") or step.get("output") or {}
        if isinstance(result, str):
            # Tool output may be a JSON string; plain text is wrapped instead.
            try:
                result = json.loads(result)
            except (json.JSONDecodeError, TypeError):
                result = {"text": result}
        if not isinstance(result, dict):
            continue
        if tool in {"search_property_pages", "search_property_active"}:
            chunks = result.get("chunks")
            if not isinstance(chunks, (list, tuple)):
                chunks = []
            for ch in chunks:
                text = ch.get("text") if isinstance(ch, dict) else None
                if not isinstance(text, str):
                    continue
                text = text.strip()
                if text:
                    out.append(text)
        else:
            # SQL/structured tool: dump up to ~2k chars as one context block.
            try:
                blob = json.dumps(result, default=str)[:2000]
            except (TypeError, ValueError):
                blob = str(result)[:2000]
            if blob:
                out.append(f"[{tool}] {blob}")
    return out

app/evals/store.py ADDED Viewed

	@@ -0,0 +1,119 @@

+"""Persistence for eval runs — backed by Supabase Postgres via SQLAlchemy.
+Tables (`eval_runs`, `eval_cases`) live alongside the rent-roll schema and are
+created automatically by `init_db()` in `app/db.py`.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+from typing import Any
+from sqlalchemy import select, desc
+from ..db import session_scope
+from ..models import EvalCase, EvalRun
+def _now() -> datetime:
+    return datetime.now(timezone.utc)
def create_run(run_id: str, trigger: str) -> None:
    """Insert a ``running`` row for ``run_id`` unless one already exists.

    Idempotent because the API may pre-create the row before the background
    task hands off to the runner, which calls this again.
    """
    with session_scope() as session:
        if session.get(EvalRun, run_id) is not None:
            return
        new_run = EvalRun(
            id=run_id,
            started_at=_now(),
            trigger=trigger,
            status="running",
            summary=None,
        )
        session.add(new_run)
def finish_run(run_id: str, status: str, summary: dict[str, Any]) -> None:
    """Stamp a run as finished with the given status and summary.

    If no row exists for ``run_id`` one is created with trigger "unknown".
    """
    with session_scope() as session:
        run = session.get(EvalRun, run_id)
        if run is None:
            run = EvalRun(
                id=run_id,
                started_at=_now(),
                trigger="unknown",
                status=status,
                summary=summary,
            )
            session.add(run)
        run.finished_at = _now()
        run.status = status
        run.summary = summary
def add_case(
    *,
    run_id: str,
    golden_id: str,
    property_code: str | None,
    question: str,
    answer: str | None,
    scores: dict[str, Any] | None,
    ok: bool,
    error: str | None,
    duration_ms: int | None,
    trace_id: str | None,
) -> None:
    """Store one case result for ``run_id``; ``ok`` is coerced to a bool."""
    case_row = EvalCase(
        run_id=run_id,
        golden_id=golden_id,
        property_code=property_code,
        question=question,
        answer=answer,
        scores=scores,
        ok=bool(ok),
        error=error,
        duration_ms=duration_ms,
        trace_id=trace_id,
    )
    with session_scope() as session:
        session.add(case_row)
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
    """Return up to ``limit`` runs as dicts, most recently started first."""
    query = select(EvalRun).order_by(desc(EvalRun.started_at)).limit(limit)
    with session_scope() as session:
        found = session.execute(query).scalars().all()
        # Convert inside the session scope, while the rows are still attached.
        return [_run_to_dict(run) for run in found]
def get_run(run_id: str) -> dict[str, Any] | None:
    """Return one run as a dict with its cases under ``cases``, or ``None``.

    Cases are ordered by their primary key.
    """
    with session_scope() as session:
        run = session.get(EvalRun, run_id)
        if run is None:
            return None
        query = select(EvalCase).where(EvalCase.run_id == run_id).order_by(EvalCase.id)
        case_rows = session.execute(query).scalars().all()
        payload = _run_to_dict(run)
        payload["cases"] = [_case_to_dict(row) for row in case_rows]
        return payload
+def _run_to_dict(r: EvalRun) -> dict[str, Any]:
+    return {
+        "id": r.id,
+        "started_at": r.started_at.isoformat() if r.started_at else None,
+        "finished_at": r.finished_at.isoformat() if r.finished_at else None,
+        "trigger": r.trigger,
+        "status": r.status,
+        "summary": r.summary,
+    }
+def _case_to_dict(c: EvalCase) -> dict[str, Any]:
+    return {
+        "golden_id": c.golden_id,
+        "property_code": c.property_code,
+        "question": c.question,
+        "answer": c.answer,
+        "scores": c.scores,
+        "ok": bool(c.ok),
+        "error": c.error,
+        "duration_ms": c.duration_ms,
+        "trace_id": c.trace_id,
+    }

app/main.py CHANGED Viewed

@@ -37,6 +37,8 @@ from .schemas import (
 from .guardrails.scope import UnknownPropertyError, ScopeViolationError
 from .llm_registry import ProviderUnavailable, list_llms, validate_model
 from .graph.build import run_chat, run_chat_stream
 log = logging.getLogger("property_ai")
 settings = get_settings()
@@ -44,8 +46,16 @@ settings = get_settings()
 @asynccontextmanager
 async def lifespan(_app: FastAPI):
     init_db()
-    yield
 app = FastAPI(title="Property-Specific AI Assistant", version="0.1.0", lifespan=lifespan)
@@ -64,6 +74,9 @@ app.add_middleware(
     allow_headers=["*"],
 )
 # Image/table artifacts live in Supabase Storage (public bucket). The
 # frontend loads them directly via the fully-qualified Supabase URL that
 # rag_tools returns — no static mount needed here.

 from .guardrails.scope import UnknownPropertyError, ScopeViolationError
 from .llm_registry import ProviderUnavailable, list_llms, validate_model
 from .graph.build import run_chat, run_chat_stream
+from .observability import init_tracing, shutdown_tracing
+from .evals.api import router as evals_router
 log = logging.getLogger("property_ai")
 settings = get_settings()
 @asynccontextmanager
 async def lifespan(_app: FastAPI):
+    """App lifespan: start tracing, the DB and the eval scheduler; undo on exit.
+    Startup order is tracing -> database -> scheduler. On shutdown the
+    scheduler is stopped before the tracer provider is shut down.
+    """
+    init_tracing(_app)
     init_db()
+    # Eval scheduler is opt-in via EVAL_SCHEDULE_ENABLED.
+    from .evals.scheduler import start_scheduler, stop_scheduler
+    start_scheduler()
+    try:
+        yield
+    finally:
+        # Runs even when the app exits with an error raised at the yield.
+        stop_scheduler()
+        shutdown_tracing()
 app = FastAPI(title="Property-Specific AI Assistant", version="0.1.0", lifespan=lifespan)
     allow_headers=["*"],
 )
+# Eval & monitoring endpoints — admin-protected, never on /chat critical path.
+app.include_router(evals_router)
 # Image/table artifacts live in Supabase Storage (public bucket). The
 # frontend loads them directly via the fully-qualified Supabase URL that
 # rag_tools returns — no static mount needed here.

app/models.py CHANGED Viewed

@@ -1,9 +1,9 @@
 """ORM models for the rent-roll domain."""
 from __future__ import annotations
-from datetime import date
 from sqlalchemy import (
-    String, Integer, Float, Date, Boolean, ForeignKey, JSON, Index
 )
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
@@ -109,3 +109,39 @@ class RentChargeLine(Base):
     __table_args__ = (
         Index("ix_charge_code_month_unit", "property_code", "snapshot_month", "unit_number"),
     )

 """ORM models for the rent-roll domain."""
 from __future__ import annotations
+from datetime import date, datetime
 from sqlalchemy import (
+    String, Integer, Float, Date, Boolean, ForeignKey, JSON, Index, DateTime, Text
 )
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
     __table_args__ = (
         Index("ix_charge_code_month_unit", "property_code", "snapshot_month", "unit_number"),
     )
+# ---------------------------------------------------------------------------
+# Evaluation harness — persisted alongside the rent-roll schema in Supabase
+# so the Monitoring UI can read run history without a separate datastore.
+# ---------------------------------------------------------------------------
+class EvalRun(Base):
+    """One eval-harness run, stored in the `eval_runs` table.
+    Owns its `EvalCase` rows through `cases`; the ORM cascade
+    (`all, delete-orphan`) removes them together with the run.
+    """
+    __tablename__ = "eval_runs"
+    # String primary key (max 64 chars); no default is declared on the column.
+    id:           Mapped[str] = mapped_column(String(64), primary_key=True)
+    # Timezone-aware start time; indexed.
+    started_at:   Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
+    # Nullable timezone-aware end time.
+    finished_at:  Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
+    trigger:      Mapped[str] = mapped_column(String(32))   # manual | scheduled | cli
+    status:       Mapped[str] = mapped_column(String(16))   # running | completed | failed
+    # Nullable JSON payload. JSON_VARIANT is defined elsewhere in this module —
+    # presumably JSON with a JSONB variant on PostgreSQL; TODO confirm.
+    summary:      Mapped[dict | None] = mapped_column(JSON_VARIANT, nullable=True)
+    # Child cases; paired with EvalCase.run via back_populates.
+    cases: Mapped[list["EvalCase"]] = relationship(
+        back_populates="run", cascade="all, delete-orphan",
+    )
+class EvalCase(Base):
+    """One evaluated question within an `EvalRun`, stored in `eval_cases`.
+    `run_id` references `eval_runs.id` with `ON DELETE CASCADE`.
+    """
+    __tablename__ = "eval_cases"
+    # Auto-incrementing integer surrogate key.
+    id:            Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
+    # Owning run; indexed foreign key to eval_runs.id.
+    run_id:        Mapped[str] = mapped_column(String(64), ForeignKey("eval_runs.id", ondelete="CASCADE"), index=True)
+    # Presumably the id of the golden-set entry this case came from — TODO confirm.
+    golden_id:     Mapped[str] = mapped_column(String(128))
+    property_code: Mapped[str | None] = mapped_column(String(32), nullable=True)
+    question:      Mapped[str] = mapped_column(Text)
+    answer:        Mapped[str | None] = mapped_column(Text, nullable=True)
+    # Nullable JSON payload of scores (JSON_VARIANT is defined elsewhere in this module).
+    scores:        Mapped[dict | None] = mapped_column(JSON_VARIANT, nullable=True)
+    # Defaults to False when not set explicitly.
+    ok:            Mapped[bool] = mapped_column(Boolean, default=False)
+    error:         Mapped[str | None] = mapped_column(Text, nullable=True)
+    duration_ms:   Mapped[int | None] = mapped_column(Integer, nullable=True)
+    # Presumably the tracing id for this case's request — TODO confirm.
+    trace_id:      Mapped[str | None] = mapped_column(String(64), nullable=True)
+    # Parent run; paired with EvalRun.cases via back_populates.
+    run: Mapped["EvalRun"] = relationship(back_populates="cases")

app/observability.py ADDED Viewed

	@@ -0,0 +1,124 @@

+"""OpenTelemetry / Phoenix Cloud tracing bootstrap.
+`init_tracing()` is called once from `main.py:lifespan`. It is **fail-open** —
+any error (missing key, import problem, network issue) is logged and swallowed
+so the API keeps serving traffic.
+Span export uses OTel's `BatchSpanProcessor`, which queues spans in memory and
+ships them on a background thread, so `/chat` request latency is unaffected by
+the network or Phoenix availability.
+"""
+from __future__ import annotations
+import logging
+import os
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from fastapi import FastAPI
+log = logging.getLogger("property_ai.observability")
+_initialized = False
+_tracer_provider = None
+def init_tracing(app: "FastAPI | None" = None) -> None:
+    """Idempotent. Wire Phoenix Cloud + OpenInference + FastAPI instrumentation.
+    Defaults to no-op when `PHOENIX_ENABLED` is false or the API key is missing.
+    Args:
+        app: FastAPI application to instrument. When `None`, the FastAPI
+            instrumentation step is skipped; the other instrumentors still run.
+    Every exit path sets `_initialized`, so a later call returns immediately —
+    including after a failed `register()` (there is no retry).
+    """
+    global _initialized, _tracer_provider
+    if _initialized:
+        return
+    from .config import get_settings
+    settings = get_settings()
+    if not settings.phoenix_enabled:
+        log.info("phoenix tracing disabled (PHOENIX_ENABLED=false)")
+        # Remember the decision so repeat calls skip the check and the log line.
+        _initialized = True
+        return
+    if not settings.phoenix_api_key:
+        log.warning("PHOENIX_ENABLED=true but PHOENIX_API_KEY is unset — tracing disabled")
+        _initialized = True
+        return
+    # Suppress Authorization / X-Admin-Token in FastAPI request-header spans.
+    # setdefault: a value already present in the environment is left untouched.
+    os.environ.setdefault(
+        "OTEL_INSTRUMENTATION_HTTP_CAPTURE_HEADERS_SANITIZE_FIELDS",
+        "authorization,x-admin-token,cookie,set-cookie",
+    )
+    try:
+        from phoenix.otel import register
+        # Phoenix Cloud spaces require Authorization: Bearer <key>. We set it
+        # both via the register() kwarg and as OTEL_EXPORTER_OTLP_HEADERS so
+        # whichever transport phoenix.otel picks (gRPC or HTTP) authenticates.
+        bearer = f"Bearer {settings.phoenix_api_key}"
+        # setdefault again: headers / key already exported in the environment win.
+        os.environ.setdefault("OTEL_EXPORTER_OTLP_HEADERS", f"Authorization={bearer}")
+        os.environ.setdefault("PHOENIX_API_KEY", settings.phoenix_api_key)
+        _tracer_provider = register(
+            project_name=settings.phoenix_project_name,
+            endpoint=settings.phoenix_endpoint,
+            headers={"Authorization": bearer},
+            batch=True,
+            auto_instrument=False,
+            set_global_tracer_provider=True,
+        )
+    except Exception as e:  # noqa: BLE001
+        log.warning("phoenix.otel.register failed — tracing disabled: %s", e)
+        # Fail-open: mark as initialized so the failed registration is not retried.
+        _initialized = True
+        return
+    # Instrument LangChain (covers LangGraph nodes), LLM SDKs, and FastAPI.
+    # Each is wrapped independently so a missing dep doesn't kill the others.
+    _safe_instrument("openinference.instrumentation.langchain", "LangChainInstrumentor")
+    _safe_instrument("openinference.instrumentation.openai", "OpenAIInstrumentor")
+    _safe_instrument("openinference.instrumentation.anthropic", "AnthropicInstrumentor")
+    _safe_instrument("openinference.instrumentation.google_genai", "GoogleGenAIInstrumentor")
+    # FastAPI is instrumented only when an app instance was passed in.
+    if app is not None:
+        try:
+            from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
+            FastAPIInstrumentor.instrument_app(app)
+        except Exception as e:  # noqa: BLE001
+            log.warning("FastAPI instrumentation failed: %s", e)
+    log.info(
+        "phoenix tracing enabled — project=%s endpoint=%s",
+        settings.phoenix_project_name,
+        settings.phoenix_endpoint,
+    )
+    _initialized = True
+def _safe_instrument(module: str, class_name: str) -> None:
+    try:
+        mod = __import__(module, fromlist=[class_name])
+        cls = getattr(mod, class_name)
+        cls().instrument()
+    except Exception as e:  # noqa: BLE001
+        log.warning("instrumentor %s.%s skipped: %s", module, class_name, e)
+def shutdown_tracing() -> None:
+    """Flush in-flight spans on app shutdown."""
+    global _tracer_provider
+    if _tracer_provider is None:
+        return
+    try:
+        _tracer_provider.shutdown()
+    except Exception as e:  # noqa: BLE001
+        log.warning("tracer provider shutdown failed: %s", e)
+    _tracer_provider = None
+def get_tracer(name: str = "property_ai"):
+    """Return an OTel tracer. Safe to call even when tracing is disabled — the
+    no-op TracerProvider returns no-op spans."""
+    from opentelemetry import trace
+    return trace.get_tracer(name)

app/tools/rag_tools.py CHANGED Viewed

@@ -17,6 +17,9 @@ from __future__ import annotations
 from typing import Any
 from ..guardrails.scope import require_scope
 # Pinecone returns cosine SIMILARITY in `score` (1 = identical, -1 = opposite).
@@ -165,6 +168,13 @@ def search_property_v2(
     max_images = max(1, min(int(max_images or _DEFAULT_MAX_IMAGES), _HARD_CAP_MAX_IMAGES))
     image_query_k = min(max_images * _IMAGE_QUERY_K_MULT, 60)
     from ..ingestion.v2.embedder import JinaV4Embedder
     from ..ingestion.v2.pipeline import get_index_v2
@@ -285,6 +295,20 @@ def search_property_v2(
                 label = label.split("|", 1)[0].strip()
             sources.append({"label": label, "url": u})
     return {
         "property_code": code,
         "query": query,

 from typing import Any
 from ..guardrails.scope import require_scope
+from ..observability import get_tracer
+_tracer = get_tracer("property_ai.rag")
 # Pinecone returns cosine SIMILARITY in `score` (1 = identical, -1 = opposite).
     max_images = max(1, min(int(max_images or _DEFAULT_MAX_IMAGES), _HARD_CAP_MAX_IMAGES))
     image_query_k = min(max_images * _IMAGE_QUERY_K_MULT, 60)
+    # OpenInference RETRIEVER span — Phoenix renders chunks under "Retrieval".
+    span = _tracer.start_span("pinecone.query")
+    span.set_attribute("openinference.span.kind", "RETRIEVER")
+    span.set_attribute("retrieval.namespace", code)
+    span.set_attribute("retrieval.top_k", k)
+    span.set_attribute("input.value", query)
     from ..ingestion.v2.embedder import JinaV4Embedder
     from ..ingestion.v2.pipeline import get_index_v2
                 label = label.split("|", 1)[0].strip()
             sources.append({"label": label, "url": u})
+    try:
+        for i, ch in enumerate(chunks[:10]):
+            text = ch.get("text") or ""
+            span.set_attribute(f"retrieval.documents.{i}.document.id", str(ch.get("chunk_index") or i))
+            span.set_attribute(f"retrieval.documents.{i}.document.score", float(1.0 - (ch.get("distance") or 0.0)))
+            span.set_attribute(f"retrieval.documents.{i}.document.content", text[:500])
+            url = ch.get("url")
+            if url:
+                span.set_attribute(f"retrieval.documents.{i}.document.metadata.url", url)
+        span.set_attribute("retrieval.chunk_count", len(chunks))
+        span.set_attribute("retrieval.image_count", len(images))
+    finally:
+        span.end()
     return {
         "property_code": code,
         "query": query,

requirements.txt CHANGED Viewed

@@ -46,3 +46,22 @@ openai>=1.55,<3
 sqlglot>=25
 langchain-anthropic>=1.0,<2
 langchain-google-genai>=2,<3

 sqlglot>=25
 langchain-anthropic>=1.0,<2
 langchain-google-genai>=2,<3
+# Observability & evaluation — latest production-grade releases (verified on PyPI 2026-05-24)
+# NOTE: we don't install the full `arize-phoenix` server here (it pulls FastAPI>=0.135
+# and conflicts with our pin). Phoenix Cloud is the hosted backend, so the OTel
+# exporter package alone is enough.
+arize-phoenix-otel>=0.16.1
+openinference-instrumentation-langchain>=0.1.66
+openinference-instrumentation-openai>=0.1.49
+openinference-instrumentation-anthropic>=1.0.5
+openinference-instrumentation-google-genai>=1.0.2
+opentelemetry-instrumentation-fastapi>=0.63b1
+# NOTE: `open-rag-eval` is NOT installed — its 0.3.0 dependency graph
+# (torch 2.7, llama_index, streamlit, transformers 4.50, openai~=2.7,
+# anthropic~=0.72) hard-conflicts with this backend's pins. `app/evals/scorer.py`
+# uses the same TREC-style LLM-judge prompt with the same metrics
+# (groundedness / hallucination / answer_relevance / context_relevance) via
+# the OpenAI SDK directly — identical signal, no dep collision.
+apscheduler>=3.11.0
+PyYAML>=6.0