Spaces:
Sleeping
Sleeping
Aker Deploy commited on
Commit ·
af01fe3
1
Parent(s): 24199fe
feat(obs+evals): mirror main commits f8f81de + 1132c64
Browse filesAdds Phoenix Cloud observability, RAG eval harness (golden set, runner,
scorer, scheduler, store), /evals/* admin router, and dependency bumps.
Brings the HF Space deployment up to parity with the main repo.
- README.md +30 -0
- app/config.py +19 -0
- app/evals/__init__.py +1 -0
- app/evals/api.py +132 -0
- app/evals/golden_set.yaml +107 -0
- app/evals/runner.py +230 -0
- app/evals/scheduler.py +98 -0
- app/evals/scorer.py +172 -0
- app/evals/store.py +119 -0
- app/main.py +14 -1
- app/models.py +38 -2
- app/observability.py +124 -0
- app/tools/rag_tools.py +24 -0
- requirements.txt +19 -0
README.md
CHANGED
|
@@ -45,3 +45,33 @@ on the Space and injected as env vars at runtime:
|
|
| 45 |
- `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`
|
| 46 |
- `ADMIN_TOKEN`
|
| 47 |
- `CORS_ORIGINS` — comma-separated list of allowed Vercel origins
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
- `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`
|
| 46 |
- `ADMIN_TOKEN`
|
| 47 |
- `CORS_ORIGINS` — comma-separated list of allowed Vercel origins
|
| 48 |
+
|
| 49 |
+
## Observability & evaluation
|
| 50 |
+
|
| 51 |
+
Tracing uses **Phoenix Cloud** (free hosted tier — `https://app.phoenix.arize.com`) via the OpenTelemetry `BatchSpanProcessor`. Spans ship on a background thread, so `/chat` latency is unaffected even when the network or Phoenix is down. Tracing is **opt-in**: set `PHOENIX_ENABLED=true` and `PHOENIX_API_KEY=<key>` in env to turn it on.
|
| 52 |
+
|
| 53 |
+
Auto-instrumented:
|
| 54 |
+
- FastAPI routes
|
| 55 |
+
- LangChain / LangGraph nodes, tools, LLM calls (token counts included)
|
| 56 |
+
- OpenAI, Anthropic, Google GenAI client SDKs
|
| 57 |
+
- Pinecone retrieval (manual span in `tools/rag_tools.py`, OpenInference RETRIEVER kind)
|
| 58 |
+
|
| 59 |
+
### Evaluation harness
|
| 60 |
+
|
| 61 |
+
`open_rag_eval` (Vectara, Apache-2.0) scores RAG turns for **groundedness**, **hallucination**, **answer relevance**, and **context relevance**, judged by `gpt-4o-mini` (set via `EVAL_JUDGE_MODEL`). Evals **never** run inline on `/chat` — they are fully out-of-band.
|
| 62 |
+
|
| 63 |
+
Triggers:
|
| 64 |
+
- **Manual** — UI: open the Monitoring tab in the frontend, enter your `ADMIN_TOKEN`, pick cases (or "Run all"), click run. API: `POST /evals/runs` with header `X-Admin-Token`.
|
| 65 |
+
- **CLI** — `python -m app.evals.runner [--ids id1,id2]`.
|
| 66 |
+
- **Scheduled** — opt-in via `EVAL_SCHEDULE_ENABLED=true` with `EVAL_SCHEDULE_CRON="0 */6 * * *"` (default every 6 h). Runs on an APScheduler `BackgroundScheduler` (single-worker thread pool, coalesce, max_instances=1).
|
| 67 |
+
|
| 68 |
+
Results land in:
|
| 69 |
+
- **Supabase Postgres** — tables `eval_runs` + `eval_cases` (created automatically by `init_db()`). Run history surfaced via the Monitoring UI.
|
| 70 |
+
- JSONL snapshots at `backend/evals/results/<timestamp>_<run_id>.jsonl`
|
| 71 |
+
- Phoenix Cloud traces under the project `property-ai` in the `aker-ai` space (each case is its own trace; eval scores attached as span attributes)
|
| 72 |
+
|
| 73 |
+
Extra env:
|
| 74 |
+
- `PHOENIX_ENABLED`, `PHOENIX_API_KEY`, `PHOENIX_ENDPOINT`, `PHOENIX_PROJECT_NAME`
|
| 75 |
+
- `EVAL_JUDGE_MODEL` (default `gpt-4o-mini`), `EVAL_SCHEDULE_ENABLED`, `EVAL_SCHEDULE_CRON`, `EVAL_MAX_CASES` (default 50)
|
| 76 |
+
|
| 77 |
+
Edit the golden set at [`app/evals/golden_set.yaml`](app/evals/golden_set.yaml).
|
app/config.py
CHANGED
|
@@ -72,6 +72,25 @@ class Settings:
|
|
| 72 |
backend_host: str = os.getenv("BACKEND_HOST", "0.0.0.0")
|
| 73 |
backend_port: int = int(os.getenv("BACKEND_PORT", "8000"))
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
@property
|
| 76 |
def sqlalchemy_url(self) -> str:
|
| 77 |
return self.database_url
|
|
|
|
| 72 |
backend_host: str = os.getenv("BACKEND_HOST", "0.0.0.0")
|
| 73 |
backend_port: int = int(os.getenv("BACKEND_PORT", "8000"))
|
| 74 |
|
| 75 |
+
# Observability — Phoenix Cloud (hosted, free tier). Defaults off so the
|
| 76 |
+
# app still boots when keys aren't set. Set PHOENIX_ENABLED=true and
|
| 77 |
+
# PHOENIX_API_KEY in prod env. Export is non-blocking (BatchSpanProcessor
|
| 78 |
+
# ships spans from a background thread).
|
| 79 |
+
phoenix_enabled: bool = (os.getenv("PHOENIX_ENABLED", "false").strip().lower() == "true")
|
| 80 |
+
phoenix_endpoint: str = os.getenv("PHOENIX_ENDPOINT", "https://app.phoenix.arize.com/s/aker-ai/v1/traces")
|
| 81 |
+
phoenix_api_key: str | None = os.getenv("PHOENIX_API_KEY") or None
|
| 82 |
+
phoenix_project_name: str = os.getenv("PHOENIX_PROJECT_NAME", "property-ai")
|
| 83 |
+
|
| 84 |
+
# Evaluation harness — never runs inline on /chat. Manual via UI/API,
|
| 85 |
+
# or scheduled via APScheduler. open_rag_eval judge model.
|
| 86 |
+
eval_judge_model: str = os.getenv("EVAL_JUDGE_MODEL", "gpt-4o-mini")
|
| 87 |
+
eval_schedule_enabled: bool = (os.getenv("EVAL_SCHEDULE_ENABLED", "false").strip().lower() == "true")
|
| 88 |
+
eval_schedule_cron: str = os.getenv("EVAL_SCHEDULE_CRON", "0 */6 * * *")
|
| 89 |
+
eval_max_cases: int = int(os.getenv("EVAL_MAX_CASES", "50"))
|
| 90 |
+
# Eval runs are persisted to Supabase Postgres (via SQLAlchemy / app.db).
|
| 91 |
+
# Local JSONL snapshots are still written for grep-friendly debugging.
|
| 92 |
+
eval_results_dir: str = os.getenv("EVAL_RESULTS_DIR", str(_BACKEND_DIR / "evals" / "results"))
|
| 93 |
+
|
| 94 |
@property
|
| 95 |
def sqlalchemy_url(self) -> str:
|
| 96 |
return self.database_url
|
app/evals/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Evaluation harness — manual + scheduled, never inline on /chat."""
|
app/evals/api.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Admin-protected FastAPI router exposing the eval harness to the UI."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
import uuid
|
| 5 |
+
from typing import Any
|
| 6 |
+
|
| 7 |
+
from fastapi import APIRouter, BackgroundTasks, Header, HTTPException
|
| 8 |
+
from pydantic import BaseModel, Field
|
| 9 |
+
|
| 10 |
+
from ..config import get_settings
|
| 11 |
+
from . import runner, scheduler, store
|
| 12 |
+
|
| 13 |
+
router = APIRouter(prefix="/evals", tags=["evals"])
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _require_admin(token: str | None) -> None:
|
| 17 |
+
expected = get_settings().admin_token
|
| 18 |
+
if not expected:
|
| 19 |
+
raise HTTPException(status_code=503, detail="ADMIN_TOKEN not configured on server")
|
| 20 |
+
if token != expected:
|
| 21 |
+
raise HTTPException(status_code=401, detail="Invalid admin token")
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ---------------------------------------------------------------------------
|
| 25 |
+
# Golden set
|
| 26 |
+
# ---------------------------------------------------------------------------
|
| 27 |
+
|
| 28 |
+
@router.get("/golden")
|
| 29 |
+
def get_golden(x_admin_token: str | None = Header(default=None, alias="X-Admin-Token")) -> list[dict]:
|
| 30 |
+
_require_admin(x_admin_token)
|
| 31 |
+
cases = runner.load_golden()
|
| 32 |
+
return [
|
| 33 |
+
{"id": c.get("id"), "question": c.get("question"), "property_code": c.get("property_code")}
|
| 34 |
+
for c in cases
|
| 35 |
+
]
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# ---------------------------------------------------------------------------
|
| 39 |
+
# Runs
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
|
| 42 |
+
class RunRequest(BaseModel):
|
| 43 |
+
ids: list[str] | None = Field(default=None, description="Subset of golden IDs to run")
|
| 44 |
+
provider: str = "openai"
|
| 45 |
+
model: str | None = None
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
@router.post("/runs")
|
| 49 |
+
def trigger_run(
|
| 50 |
+
req: RunRequest,
|
| 51 |
+
background: BackgroundTasks,
|
| 52 |
+
x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
|
| 53 |
+
) -> dict:
|
| 54 |
+
_require_admin(x_admin_token)
|
| 55 |
+
run_id = str(uuid.uuid4())
|
| 56 |
+
store.create_run(run_id, trigger="manual")
|
| 57 |
+
background.add_task(
|
| 58 |
+
_safe_run,
|
| 59 |
+
run_id=run_id,
|
| 60 |
+
golden_ids=req.ids,
|
| 61 |
+
provider=req.provider,
|
| 62 |
+
model=req.model,
|
| 63 |
+
)
|
| 64 |
+
return {"run_id": run_id, "status": "started"}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _safe_run(*, run_id: str, golden_ids: list[str] | None, provider: str, model: str | None) -> None:
|
| 68 |
+
import logging
|
| 69 |
+
log = logging.getLogger("property_ai.evals.api")
|
| 70 |
+
try:
|
| 71 |
+
runner.run_eval(
|
| 72 |
+
run_id=run_id,
|
| 73 |
+
golden_ids=golden_ids,
|
| 74 |
+
trigger="manual",
|
| 75 |
+
llm_provider=provider,
|
| 76 |
+
model=model,
|
| 77 |
+
)
|
| 78 |
+
except Exception as e: # noqa: BLE001
|
| 79 |
+
log.exception("background eval run failed")
|
| 80 |
+
try:
|
| 81 |
+
store.finish_run(run_id, status="failed", summary={"error": f"{type(e).__name__}: {e}"})
|
| 82 |
+
except Exception: # noqa: BLE001
|
| 83 |
+
pass
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
@router.get("/runs")
|
| 87 |
+
def list_runs(
|
| 88 |
+
limit: int = 50,
|
| 89 |
+
x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
|
| 90 |
+
) -> list[dict]:
|
| 91 |
+
_require_admin(x_admin_token)
|
| 92 |
+
return store.list_runs(limit=limit)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
@router.get("/runs/{run_id}")
|
| 96 |
+
def get_run(
|
| 97 |
+
run_id: str,
|
| 98 |
+
x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
|
| 99 |
+
) -> dict:
|
| 100 |
+
_require_admin(x_admin_token)
|
| 101 |
+
run = store.get_run(run_id)
|
| 102 |
+
if not run:
|
| 103 |
+
raise HTTPException(status_code=404, detail="run not found")
|
| 104 |
+
return run
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ---------------------------------------------------------------------------
|
| 108 |
+
# Schedule
|
| 109 |
+
# ---------------------------------------------------------------------------
|
| 110 |
+
|
| 111 |
+
class ScheduleUpdate(BaseModel):
|
| 112 |
+
cron: str = Field(..., description="Crontab expression, e.g. '0 */6 * * *'")
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
@router.get("/schedule")
|
| 116 |
+
def get_schedule(x_admin_token: str | None = Header(default=None, alias="X-Admin-Token")) -> dict[str, Any]:
|
| 117 |
+
_require_admin(x_admin_token)
|
| 118 |
+
return scheduler.get_status()
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
@router.put("/schedule")
|
| 122 |
+
def put_schedule(
|
| 123 |
+
body: ScheduleUpdate,
|
| 124 |
+
x_admin_token: str | None = Header(default=None, alias="X-Admin-Token"),
|
| 125 |
+
) -> dict[str, Any]:
|
| 126 |
+
_require_admin(x_admin_token)
|
| 127 |
+
try:
|
| 128 |
+
return scheduler.update_cron(body.cron)
|
| 129 |
+
except RuntimeError as e:
|
| 130 |
+
raise HTTPException(status_code=409, detail=str(e))
|
| 131 |
+
except Exception as e: # noqa: BLE001
|
| 132 |
+
raise HTTPException(status_code=400, detail=f"invalid cron: {e}")
|
app/evals/golden_set.yaml
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Curated regression set. Each case is one chat turn against a known property.
|
| 2 |
+
# `expected_substrings` are loose checks (case-insensitive) used as a smoke
|
| 3 |
+
# signal alongside open_rag_eval's LLM-judge scores.
|
| 4 |
+
cases:
|
| 5 |
+
- id: 115r-summary
|
| 6 |
+
property_code: 115r
|
| 7 |
+
question: "Give me a summary of this property."
|
| 8 |
+
expected_substrings: ["unit"]
|
| 9 |
+
expected_tools: ["get_property_summary"]
|
| 10 |
+
|
| 11 |
+
- id: 115r-unit-mix
|
| 12 |
+
property_code: 115r
|
| 13 |
+
question: "What is the unit mix?"
|
| 14 |
+
expected_substrings: ["bedroom"]
|
| 15 |
+
expected_tools: ["get_unit_mix"]
|
| 16 |
+
|
| 17 |
+
- id: 115r-occupancy
|
| 18 |
+
property_code: 115r
|
| 19 |
+
question: "What is the current occupancy?"
|
| 20 |
+
expected_substrings: ["%"]
|
| 21 |
+
expected_tools: ["get_occupancy"]
|
| 22 |
+
|
| 23 |
+
- id: 115r-expiring
|
| 24 |
+
property_code: 115r
|
| 25 |
+
question: "List leases expiring in the next 90 days."
|
| 26 |
+
expected_substrings: []
|
| 27 |
+
expected_tools: ["get_expiring_leases"]
|
| 28 |
+
|
| 29 |
+
- id: 115r-top-balances
|
| 30 |
+
property_code: 115r
|
| 31 |
+
question: "Who are the top outstanding balances?"
|
| 32 |
+
expected_substrings: []
|
| 33 |
+
expected_tools: ["get_top_balances"]
|
| 34 |
+
|
| 35 |
+
- id: 115r-amenities
|
| 36 |
+
property_code: 115r
|
| 37 |
+
question: "What amenities does this property offer?"
|
| 38 |
+
expected_substrings: []
|
| 39 |
+
expected_tools: ["search_property_pages"]
|
| 40 |
+
|
| 41 |
+
- id: 115r-gym-photos
|
| 42 |
+
property_code: 115r
|
| 43 |
+
question: "Show me photos of the gym."
|
| 44 |
+
expected_substrings: []
|
| 45 |
+
expected_tools: ["search_property_pages"]
|
| 46 |
+
|
| 47 |
+
- id: 115r-floor-plans
|
| 48 |
+
property_code: 115r
|
| 49 |
+
question: "What floor plans are available?"
|
| 50 |
+
expected_substrings: []
|
| 51 |
+
expected_tools: ["search_property_pages"]
|
| 52 |
+
|
| 53 |
+
- id: 126r-summary
|
| 54 |
+
property_code: 126r
|
| 55 |
+
question: "Give me a summary of this property."
|
| 56 |
+
expected_substrings: ["unit"]
|
| 57 |
+
expected_tools: ["get_property_summary"]
|
| 58 |
+
|
| 59 |
+
- id: 126r-unit-mix
|
| 60 |
+
property_code: 126r
|
| 61 |
+
question: "Break down the unit mix."
|
| 62 |
+
expected_substrings: ["bedroom"]
|
| 63 |
+
expected_tools: ["get_unit_mix"]
|
| 64 |
+
|
| 65 |
+
- id: 126r-occupancy
|
| 66 |
+
property_code: 126r
|
| 67 |
+
question: "What is the occupancy rate?"
|
| 68 |
+
expected_substrings: ["%"]
|
| 69 |
+
expected_tools: ["get_occupancy"]
|
| 70 |
+
|
| 71 |
+
- id: 126r-rent-trend
|
| 72 |
+
property_code: 126r
|
| 73 |
+
question: "Show me the rent trend over the last 6 months."
|
| 74 |
+
expected_substrings: []
|
| 75 |
+
expected_tools: ["get_rent_trend"]
|
| 76 |
+
|
| 77 |
+
- id: 126r-amenities
|
| 78 |
+
property_code: 126r
|
| 79 |
+
question: "What amenities are available?"
|
| 80 |
+
expected_substrings: []
|
| 81 |
+
expected_tools: ["search_property_pages"]
|
| 82 |
+
|
| 83 |
+
- id: 126r-neighborhood
|
| 84 |
+
property_code: 126r
|
| 85 |
+
question: "Tell me about the neighborhood."
|
| 86 |
+
expected_substrings: []
|
| 87 |
+
expected_tools: ["search_property_pages"]
|
| 88 |
+
|
| 89 |
+
- id: 126r-pool-photo
|
| 90 |
+
property_code: 126r
|
| 91 |
+
question: "Do you have a photo of the pool?"
|
| 92 |
+
expected_substrings: []
|
| 93 |
+
expected_tools: ["search_property_pages"]
|
| 94 |
+
|
| 95 |
+
- id: 115r-out-of-scope-stock
|
| 96 |
+
property_code: 115r
|
| 97 |
+
question: "What is the current price of AAPL stock?"
|
| 98 |
+
expected_substrings: []
|
| 99 |
+
expected_tools: []
|
| 100 |
+
notes: "out of scope — should refuse or redirect"
|
| 101 |
+
|
| 102 |
+
- id: 115r-out-of-scope-other-property
|
| 103 |
+
property_code: 115r
|
| 104 |
+
question: "How does The Hamlet at Saratoga Springs compare?"
|
| 105 |
+
expected_substrings: []
|
| 106 |
+
expected_tools: []
|
| 107 |
+
notes: "scope-violation — assistant should stay on 115r"
|
app/evals/runner.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Run the golden set against the live graph and score each turn.
|
| 2 |
+
|
| 3 |
+
Sync function — safe to call from a background thread (APScheduler) or from
|
| 4 |
+
FastAPI's `BackgroundTasks`. NEVER call from within a `/chat` request path.
|
| 5 |
+
|
| 6 |
+
Phoenix Cloud spans are emitted naturally by the LangChain instrumentor —
|
| 7 |
+
each case becomes its own trace because we pass a fresh `conversation_id`.
|
| 8 |
+
"""
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import logging
|
| 13 |
+
import time
|
| 14 |
+
import uuid
|
| 15 |
+
from datetime import datetime, timezone
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
import yaml
|
| 20 |
+
|
| 21 |
+
from ..config import get_settings
|
| 22 |
+
from ..graph.build import run_chat
|
| 23 |
+
from ..observability import get_tracer
|
| 24 |
+
from . import scorer, store
|
| 25 |
+
|
| 26 |
+
log = logging.getLogger("property_ai.evals.runner")
|
| 27 |
+
_tracer = get_tracer("property_ai.evals")
|
| 28 |
+
|
| 29 |
+
_GOLDEN_PATH = Path(__file__).resolve().parent / "golden_set.yaml"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def load_golden() -> list[dict[str, Any]]:
|
| 33 |
+
with _GOLDEN_PATH.open("r", encoding="utf-8") as f:
|
| 34 |
+
data = yaml.safe_load(f) or {}
|
| 35 |
+
return list(data.get("cases") or [])
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def run_eval(
|
| 39 |
+
*,
|
| 40 |
+
run_id: str | None = None,
|
| 41 |
+
golden_ids: list[str] | None = None,
|
| 42 |
+
trigger: str = "manual",
|
| 43 |
+
llm_provider: str = "openai",
|
| 44 |
+
model: str | None = None,
|
| 45 |
+
) -> dict[str, Any]:
|
| 46 |
+
settings = get_settings()
|
| 47 |
+
cases = load_golden()
|
| 48 |
+
if golden_ids:
|
| 49 |
+
wanted = set(golden_ids)
|
| 50 |
+
cases = [c for c in cases if c.get("id") in wanted]
|
| 51 |
+
if not cases:
|
| 52 |
+
return {"run_id": run_id, "status": "empty", "summary": {"count": 0}}
|
| 53 |
+
|
| 54 |
+
cases = cases[: settings.eval_max_cases]
|
| 55 |
+
run_id = run_id or str(uuid.uuid4())
|
| 56 |
+
model = model or settings.eval_judge_model
|
| 57 |
+
|
| 58 |
+
store.create_run(run_id, trigger)
|
| 59 |
+
log.info("eval run %s started (trigger=%s, cases=%d)", run_id, trigger, len(cases))
|
| 60 |
+
|
| 61 |
+
results: list[dict[str, Any]] = []
|
| 62 |
+
|
| 63 |
+
with _tracer.start_as_current_span("eval.run") as run_span:
|
| 64 |
+
run_span.set_attribute("eval.run_id", run_id)
|
| 65 |
+
run_span.set_attribute("eval.trigger", trigger)
|
| 66 |
+
run_span.set_attribute("eval.case_count", len(cases))
|
| 67 |
+
|
| 68 |
+
for case in cases:
|
| 69 |
+
case_result = _run_one_case(run_id=run_id, case=case, provider=llm_provider, model=model)
|
| 70 |
+
results.append(case_result)
|
| 71 |
+
store.add_case(
|
| 72 |
+
run_id=run_id,
|
| 73 |
+
golden_id=case.get("id", ""),
|
| 74 |
+
property_code=case.get("property_code"),
|
| 75 |
+
question=case.get("question", ""),
|
| 76 |
+
answer=case_result.get("answer"),
|
| 77 |
+
scores=case_result.get("scores"),
|
| 78 |
+
ok=case_result.get("ok", False),
|
| 79 |
+
error=case_result.get("error"),
|
| 80 |
+
duration_ms=case_result.get("duration_ms"),
|
| 81 |
+
trace_id=case_result.get("trace_id"),
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
summary = _summarize(results)
|
| 85 |
+
store.finish_run(run_id, status="completed", summary=summary)
|
| 86 |
+
_write_jsonl(run_id, results, summary)
|
| 87 |
+
log.info("eval run %s finished — %s", run_id, summary)
|
| 88 |
+
return {"run_id": run_id, "status": "completed", "summary": summary}
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def _run_one_case(*, run_id: str, case: dict[str, Any], provider: str, model: str) -> dict[str, Any]:
|
| 92 |
+
case_id = case.get("id", "")
|
| 93 |
+
question = case.get("question", "")
|
| 94 |
+
property_code = case.get("property_code")
|
| 95 |
+
started = time.monotonic()
|
| 96 |
+
|
| 97 |
+
answer: str | None = None
|
| 98 |
+
tool_history: list[dict] = []
|
| 99 |
+
err: str | None = None
|
| 100 |
+
trace_id: str | None = None
|
| 101 |
+
|
| 102 |
+
with _tracer.start_as_current_span("eval.case") as span:
|
| 103 |
+
span.set_attribute("eval.case_id", case_id)
|
| 104 |
+
span.set_attribute("eval.property_code", property_code or "")
|
| 105 |
+
try:
|
| 106 |
+
ctx = span.get_span_context()
|
| 107 |
+
trace_id = format(ctx.trace_id, "032x") if ctx and ctx.trace_id else None
|
| 108 |
+
except Exception: # noqa: BLE001
|
| 109 |
+
trace_id = None
|
| 110 |
+
|
| 111 |
+
try:
|
| 112 |
+
conv_id = f"eval-{run_id}-{case_id}"
|
| 113 |
+
state = run_chat(
|
| 114 |
+
property_code=property_code,
|
| 115 |
+
user_message=question,
|
| 116 |
+
llm_provider=provider,
|
| 117 |
+
model=model,
|
| 118 |
+
conversation_id=conv_id,
|
| 119 |
+
)
|
| 120 |
+
# The graph may pause for property OR time-scope clarification.
|
| 121 |
+
# Auto-resume with sensible defaults so a single eval case
|
| 122 |
+
# exercises the full agent loop end-to-end:
|
| 123 |
+
# - time clarification → "Latest"
|
| 124 |
+
# - property clarification → first option (or skip-score it later)
|
| 125 |
+
# Hard cap of 2 resumes to avoid loops.
|
| 126 |
+
for _ in range(2):
|
| 127 |
+
if not state.get("paused"):
|
| 128 |
+
break
|
| 129 |
+
clar = state.get("clarification") or {}
|
| 130 |
+
kind = clar.get("scope_kind")
|
| 131 |
+
options = clar.get("options") or []
|
| 132 |
+
if kind == "time":
|
| 133 |
+
choice = "Latest"
|
| 134 |
+
elif options:
|
| 135 |
+
choice = options[0]
|
| 136 |
+
else:
|
| 137 |
+
break
|
| 138 |
+
state = run_chat(
|
| 139 |
+
property_code=property_code,
|
| 140 |
+
user_message=question,
|
| 141 |
+
llm_provider=provider,
|
| 142 |
+
model=model,
|
| 143 |
+
conversation_id=conv_id,
|
| 144 |
+
resume_value=choice,
|
| 145 |
+
)
|
| 146 |
+
answer = state.get("answer_markdown") or ""
|
| 147 |
+
tool_history = state.get("tool_history") or []
|
| 148 |
+
except Exception as e: # noqa: BLE001
|
| 149 |
+
err = f"{type(e).__name__}: {e}"
|
| 150 |
+
log.exception("eval case %s failed in graph", case_id)
|
| 151 |
+
|
| 152 |
+
scores: dict[str, Any] | None = None
|
| 153 |
+
if err is None:
|
| 154 |
+
contexts = scorer.contexts_from_tool_history(tool_history)
|
| 155 |
+
scores = scorer.score_turn(question, answer or "", contexts)
|
| 156 |
+
for k in ("groundedness", "hallucination", "answer_relevance", "context_relevance"):
|
| 157 |
+
v = scores.get(k)
|
| 158 |
+
if isinstance(v, (int, float)):
|
| 159 |
+
span.set_attribute(f"eval.{k}", float(v))
|
| 160 |
+
|
| 161 |
+
return {
|
| 162 |
+
"case_id": case_id,
|
| 163 |
+
"property_code": property_code,
|
| 164 |
+
"question": question,
|
| 165 |
+
"answer": answer,
|
| 166 |
+
"scores": scores,
|
| 167 |
+
"ok": err is None,
|
| 168 |
+
"error": err,
|
| 169 |
+
"duration_ms": int((time.monotonic() - started) * 1000),
|
| 170 |
+
"trace_id": trace_id,
|
| 171 |
+
"tool_calls": [s.get("tool") for s in tool_history if isinstance(s, dict)],
|
| 172 |
+
"expected_tools": case.get("expected_tools") or [],
|
| 173 |
+
"expected_substrings": case.get("expected_substrings") or [],
|
| 174 |
+
"substring_hits": _substring_hits(answer or "", case.get("expected_substrings") or []),
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def _substring_hits(text: str, needles: list[str]) -> int:
|
| 179 |
+
t = (text or "").lower()
|
| 180 |
+
return sum(1 for n in needles if n and n.lower() in t)
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def _summarize(results: list[dict[str, Any]]) -> dict[str, Any]:
|
| 184 |
+
n = len(results)
|
| 185 |
+
if n == 0:
|
| 186 |
+
return {"count": 0}
|
| 187 |
+
ok_n = sum(1 for r in results if r.get("ok"))
|
| 188 |
+
out: dict[str, Any] = {
|
| 189 |
+
"count": n,
|
| 190 |
+
"ok_count": ok_n,
|
| 191 |
+
"error_count": n - ok_n,
|
| 192 |
+
}
|
| 193 |
+
for k in ("groundedness", "hallucination", "answer_relevance", "context_relevance"):
|
| 194 |
+
vals = [(r.get("scores") or {}).get(k) for r in results]
|
| 195 |
+
vals = [float(v) for v in vals if isinstance(v, (int, float))]
|
| 196 |
+
if vals:
|
| 197 |
+
out[f"mean_{k}"] = round(sum(vals) / len(vals), 3)
|
| 198 |
+
out[f"min_{k}"] = round(min(vals), 3)
|
| 199 |
+
return out
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
def _write_jsonl(run_id: str, results: list[dict[str, Any]], summary: dict[str, Any]) -> None:
|
| 203 |
+
out_dir = Path(get_settings().eval_results_dir)
|
| 204 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 205 |
+
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
| 206 |
+
path = out_dir / f"{ts}_{run_id}.jsonl"
|
| 207 |
+
with path.open("w", encoding="utf-8") as f:
|
| 208 |
+
f.write(json.dumps({"_summary": summary, "run_id": run_id}, default=str) + "\n")
|
| 209 |
+
for r in results:
|
| 210 |
+
f.write(json.dumps(r, default=str) + "\n")
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
# ---------------------------------------------------------------------------
|
| 214 |
+
# CLI
|
| 215 |
+
# ---------------------------------------------------------------------------
|
| 216 |
+
|
| 217 |
+
def _main() -> None:
|
| 218 |
+
import argparse
|
| 219 |
+
ap = argparse.ArgumentParser(description="Run the RAG eval golden set.")
|
| 220 |
+
ap.add_argument("--ids", help="Comma-separated golden case IDs (default: all).")
|
| 221 |
+
ap.add_argument("--provider", default="openai")
|
| 222 |
+
ap.add_argument("--model", default=None)
|
| 223 |
+
args = ap.parse_args()
|
| 224 |
+
ids = [s.strip() for s in (args.ids or "").split(",") if s.strip()] or None
|
| 225 |
+
out = run_eval(golden_ids=ids, trigger="cli", llm_provider=args.provider, model=args.model)
|
| 226 |
+
print(json.dumps(out, indent=2, default=str))
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
if __name__ == "__main__":
|
| 230 |
+
_main()
|
app/evals/scheduler.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""APScheduler-based periodic eval runner.
|
| 2 |
+
|
| 3 |
+
Runs in the FastAPI process on a separate thread pool (max_workers=1,
|
| 4 |
+
coalesce=True, max_instances=1) so overlapping runs can't pile up and the
|
| 5 |
+
request path is never affected. Opt-in via EVAL_SCHEDULE_ENABLED.
|
| 6 |
+
"""
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import logging
|
| 10 |
+
import uuid
|
| 11 |
+
from typing import Any
|
| 12 |
+
|
| 13 |
+
from ..config import get_settings
|
| 14 |
+
|
| 15 |
+
log = logging.getLogger("property_ai.evals.scheduler")
|
| 16 |
+
|
| 17 |
+
_scheduler: Any = None
|
| 18 |
+
_job_id = "eval_golden_set"
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def start_scheduler() -> None:
|
| 22 |
+
global _scheduler
|
| 23 |
+
settings = get_settings()
|
| 24 |
+
if not settings.eval_schedule_enabled:
|
| 25 |
+
log.info("eval scheduler disabled (EVAL_SCHEDULE_ENABLED=false)")
|
| 26 |
+
return
|
| 27 |
+
if _scheduler is not None:
|
| 28 |
+
return
|
| 29 |
+
try:
|
| 30 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
| 31 |
+
from apscheduler.triggers.cron import CronTrigger
|
| 32 |
+
from apscheduler.executors.pool import ThreadPoolExecutor
|
| 33 |
+
except Exception as e: # noqa: BLE001
|
| 34 |
+
log.warning("APScheduler not available; scheduling disabled: %s", e)
|
| 35 |
+
return
|
| 36 |
+
|
| 37 |
+
_scheduler = BackgroundScheduler(
|
| 38 |
+
executors={"default": ThreadPoolExecutor(max_workers=1)},
|
| 39 |
+
job_defaults={"coalesce": True, "max_instances": 1, "misfire_grace_time": 300},
|
| 40 |
+
timezone="UTC",
|
| 41 |
+
)
|
| 42 |
+
try:
|
| 43 |
+
trigger = CronTrigger.from_crontab(settings.eval_schedule_cron, timezone="UTC")
|
| 44 |
+
except Exception as e: # noqa: BLE001
|
| 45 |
+
log.warning("invalid EVAL_SCHEDULE_CRON=%r (%s); scheduler not started", settings.eval_schedule_cron, e)
|
| 46 |
+
_scheduler = None
|
| 47 |
+
return
|
| 48 |
+
|
| 49 |
+
_scheduler.add_job(_run_scheduled, trigger=trigger, id=_job_id, replace_existing=True)
|
| 50 |
+
_scheduler.start()
|
| 51 |
+
log.info("eval scheduler started — cron=%r", settings.eval_schedule_cron)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def stop_scheduler() -> None:
|
| 55 |
+
global _scheduler
|
| 56 |
+
if _scheduler is None:
|
| 57 |
+
return
|
| 58 |
+
try:
|
| 59 |
+
_scheduler.shutdown(wait=False)
|
| 60 |
+
except Exception as e: # noqa: BLE001
|
| 61 |
+
log.warning("scheduler shutdown error: %s", e)
|
| 62 |
+
_scheduler = None
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def get_status() -> dict[str, Any]:
|
| 66 |
+
settings = get_settings()
|
| 67 |
+
status: dict[str, Any] = {
|
| 68 |
+
"enabled": settings.eval_schedule_enabled,
|
| 69 |
+
"cron": settings.eval_schedule_cron,
|
| 70 |
+
"running": _scheduler is not None,
|
| 71 |
+
"next_run_at": None,
|
| 72 |
+
}
|
| 73 |
+
if _scheduler is not None:
|
| 74 |
+
job = _scheduler.get_job(_job_id)
|
| 75 |
+
if job and job.next_run_time:
|
| 76 |
+
status["next_run_at"] = job.next_run_time.isoformat()
|
| 77 |
+
return status
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def update_cron(cron: str) -> dict[str, Any]:
|
| 81 |
+
"""Reschedule the existing job to a new crontab string. Returns status."""
|
| 82 |
+
if _scheduler is None:
|
| 83 |
+
raise RuntimeError("scheduler not running")
|
| 84 |
+
from apscheduler.triggers.cron import CronTrigger
|
| 85 |
+
trigger = CronTrigger.from_crontab(cron, timezone="UTC")
|
| 86 |
+
_scheduler.reschedule_job(_job_id, trigger=trigger)
|
| 87 |
+
# Mutate the in-memory setting too (process-local; not persisted to .env).
|
| 88 |
+
get_settings().eval_schedule_cron = cron
|
| 89 |
+
return get_status()
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _run_scheduled() -> None:
|
| 93 |
+
# Imported lazily so a scheduler tick can never fail at module-import time.
|
| 94 |
+
from . import runner
|
| 95 |
+
try:
|
| 96 |
+
runner.run_eval(run_id=str(uuid.uuid4()), trigger="scheduled")
|
| 97 |
+
except Exception as e: # noqa: BLE001
|
| 98 |
+
log.exception("scheduled eval run failed: %s", e)
|
app/evals/scorer.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RAG scoring — uses open_rag_eval when its TRECEvaluator API is available,
|
| 2 |
+
falls back to a direct OpenAI LLM-judge so the harness still produces numbers
|
| 3 |
+
when the library surface drifts between releases.
|
| 4 |
+
|
| 5 |
+
Returns a dict with four 0-1 scores: groundedness, hallucination,
|
| 6 |
+
answer_relevance, context_relevance.
|
| 7 |
+
"""
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import logging
|
| 12 |
+
import os
|
| 13 |
+
from typing import Any
|
| 14 |
+
|
| 15 |
+
from ..config import get_settings
|
| 16 |
+
|
| 17 |
+
log = logging.getLogger("property_ai.evals.scorer")
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
_JUDGE_SYSTEM = """You are a strict RAG evaluator. You will receive a question,
|
| 21 |
+
an answer the system produced, and the retrieved context passages it had
|
| 22 |
+
available. Score four metrics from 0.0 to 1.0 (higher is better):
|
| 23 |
+
|
| 24 |
+
- groundedness: is every factual claim in the answer supported by the context?
|
| 25 |
+
- hallucination: 1.0 means NO hallucination (answer makes nothing up). 0.0 means major fabrications.
|
| 26 |
+
- answer_relevance: does the answer address the question asked?
|
| 27 |
+
- context_relevance: are the retrieved contexts useful for answering the question?
|
| 28 |
+
|
| 29 |
+
Reply with ONLY a compact JSON object: {"groundedness": x, "hallucination": x,
|
| 30 |
+
"answer_relevance": x, "context_relevance": x, "rationale": "<one short line>"}.
|
| 31 |
+
No prose, no markdown fences."""
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _llm_judge_score(question: str, answer: str, contexts: list[str]) -> dict[str, Any]:
|
| 35 |
+
"""Fallback / primary judge using OpenAI directly."""
|
| 36 |
+
from openai import OpenAI
|
| 37 |
+
settings = get_settings()
|
| 38 |
+
client = OpenAI(api_key=settings.openai_api_key)
|
| 39 |
+
ctx_blob = "\n\n---\n\n".join(contexts) if contexts else "(no contexts retrieved)"
|
| 40 |
+
user = (
|
| 41 |
+
f"QUESTION:\n{question}\n\n"
|
| 42 |
+
f"ANSWER:\n{answer}\n\n"
|
| 43 |
+
f"RETRIEVED CONTEXTS:\n{ctx_blob[:12000]}"
|
| 44 |
+
)
|
| 45 |
+
resp = client.chat.completions.create(
|
| 46 |
+
model=settings.eval_judge_model,
|
| 47 |
+
temperature=0.0,
|
| 48 |
+
messages=[
|
| 49 |
+
{"role": "system", "content": _JUDGE_SYSTEM},
|
| 50 |
+
{"role": "user", "content": user},
|
| 51 |
+
],
|
| 52 |
+
response_format={"type": "json_object"},
|
| 53 |
+
)
|
| 54 |
+
raw = resp.choices[0].message.content or "{}"
|
| 55 |
+
try:
|
| 56 |
+
data = json.loads(raw)
|
| 57 |
+
except json.JSONDecodeError:
|
| 58 |
+
data = {}
|
| 59 |
+
out = {
|
| 60 |
+
"groundedness": _clamp01(data.get("groundedness")),
|
| 61 |
+
"hallucination": _clamp01(data.get("hallucination")),
|
| 62 |
+
"answer_relevance": _clamp01(data.get("answer_relevance")),
|
| 63 |
+
"context_relevance": _clamp01(data.get("context_relevance")),
|
| 64 |
+
"rationale": (data.get("rationale") or "")[:300],
|
| 65 |
+
"judge": f"openai/{settings.eval_judge_model}",
|
| 66 |
+
}
|
| 67 |
+
return out
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _try_open_rag_eval(question: str, answer: str, contexts: list[str]) -> dict[str, Any] | None:
|
| 71 |
+
"""Best-effort call into open_rag_eval. Returns None if the API surface
|
| 72 |
+
isn't what we expect (we then fall back to direct LLM judging)."""
|
| 73 |
+
try:
|
| 74 |
+
from open_rag_eval.evaluators.trec_evaluator import TRECEvaluator # type: ignore
|
| 75 |
+
from open_rag_eval.models.openai_model import OpenAIModel # type: ignore
|
| 76 |
+
from open_rag_eval.rag_types import RAGResult # type: ignore
|
| 77 |
+
except Exception as e: # noqa: BLE001
|
| 78 |
+
log.info("open_rag_eval not importable (%s); using direct judge", e)
|
| 79 |
+
return None
|
| 80 |
+
|
| 81 |
+
settings = get_settings()
|
| 82 |
+
try:
|
| 83 |
+
model = OpenAIModel(name=settings.eval_judge_model, api_key=settings.openai_api_key)
|
| 84 |
+
evaluator = TRECEvaluator(model=model)
|
| 85 |
+
result = RAGResult(
|
| 86 |
+
query=question,
|
| 87 |
+
generated_answer=answer,
|
| 88 |
+
retrieved_contexts=contexts or [""],
|
| 89 |
+
)
|
| 90 |
+
scored = evaluator.evaluate_single(result)
|
| 91 |
+
scores = getattr(scored, "scores", None) or {}
|
| 92 |
+
# open_rag_eval metric names differ across releases — map best-effort.
|
| 93 |
+
return {
|
| 94 |
+
"groundedness": _clamp01(scores.get("groundedness") or scores.get("umbrela")),
|
| 95 |
+
"hallucination": _clamp01(scores.get("hallucination") or scores.get("hhem")),
|
| 96 |
+
"answer_relevance": _clamp01(scores.get("answer_relevance")),
|
| 97 |
+
"context_relevance": _clamp01(scores.get("context_relevance")),
|
| 98 |
+
"rationale": "open_rag_eval/TRECEvaluator",
|
| 99 |
+
"judge": f"open_rag_eval+{settings.eval_judge_model}",
|
| 100 |
+
"raw": {k: float(v) for k, v in scores.items() if _is_number(v)},
|
| 101 |
+
}
|
| 102 |
+
except Exception as e: # noqa: BLE001
|
| 103 |
+
log.warning("open_rag_eval scoring failed, falling back: %s", e)
|
| 104 |
+
return None
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def score_turn(question: str, answer: str, retrieved_contexts: list[str]) -> dict[str, Any]:
|
| 108 |
+
"""Score one RAG turn. Never raises — returns an `error` field on failure."""
|
| 109 |
+
if not (get_settings().openai_api_key):
|
| 110 |
+
return {"error": "OPENAI_API_KEY not set; cannot score"}
|
| 111 |
+
try:
|
| 112 |
+
result = _try_open_rag_eval(question, answer, retrieved_contexts)
|
| 113 |
+
if result is None:
|
| 114 |
+
result = _llm_judge_score(question, answer, retrieved_contexts)
|
| 115 |
+
return result
|
| 116 |
+
except Exception as e: # noqa: BLE001
|
| 117 |
+
log.exception("scoring failed")
|
| 118 |
+
return {"error": f"{type(e).__name__}: {e}"}
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ---------------------------------------------------------------------------
|
| 122 |
+
# Helpers
|
| 123 |
+
# ---------------------------------------------------------------------------
|
| 124 |
+
|
| 125 |
+
def _clamp01(v: Any) -> float | None:
|
| 126 |
+
if v is None:
|
| 127 |
+
return None
|
| 128 |
+
try:
|
| 129 |
+
f = float(v)
|
| 130 |
+
except (TypeError, ValueError):
|
| 131 |
+
return None
|
| 132 |
+
if f != f: # NaN
|
| 133 |
+
return None
|
| 134 |
+
return max(0.0, min(1.0, f))
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def _is_number(v: Any) -> bool:
|
| 138 |
+
return isinstance(v, (int, float)) and not isinstance(v, bool)
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def contexts_from_tool_history(tool_history: list[dict]) -> list[str]:
|
| 142 |
+
"""Extract retrieved text contexts from a graph turn's tool_history.
|
| 143 |
+
|
| 144 |
+
RAG: pull text chunks from `search_property_pages` / `search_property_active`.
|
| 145 |
+
SQL: serialize result rows as structured context so groundedness checks
|
| 146 |
+
can verify numeric claims too.
|
| 147 |
+
"""
|
| 148 |
+
out: list[str] = []
|
| 149 |
+
for step in tool_history or []:
|
| 150 |
+
tool = step.get("tool") or ""
|
| 151 |
+
result = step.get("result") or step.get("output") or {}
|
| 152 |
+
if isinstance(result, str):
|
| 153 |
+
try:
|
| 154 |
+
result = json.loads(result)
|
| 155 |
+
except (json.JSONDecodeError, TypeError):
|
| 156 |
+
result = {"text": result}
|
| 157 |
+
if not isinstance(result, dict):
|
| 158 |
+
continue
|
| 159 |
+
if tool in {"search_property_pages", "search_property_active"}:
|
| 160 |
+
for ch in (result.get("chunks") or []):
|
| 161 |
+
t = (ch.get("text") or "").strip()
|
| 162 |
+
if t:
|
| 163 |
+
out.append(t)
|
| 164 |
+
else:
|
| 165 |
+
# SQL/structured tool: dump up to ~2k chars as one context block.
|
| 166 |
+
try:
|
| 167 |
+
blob = json.dumps(result, default=str)[:2000]
|
| 168 |
+
except (TypeError, ValueError):
|
| 169 |
+
blob = str(result)[:2000]
|
| 170 |
+
if blob:
|
| 171 |
+
out.append(f"[{tool}] {blob}")
|
| 172 |
+
return out
|
app/evals/store.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Persistence for eval runs — backed by Supabase Postgres via SQLAlchemy.
|
| 2 |
+
|
| 3 |
+
Tables (`eval_runs`, `eval_cases`) live alongside the rent-roll schema and are
|
| 4 |
+
created automatically by `init_db()` in `app/db.py`.
|
| 5 |
+
"""
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from datetime import datetime, timezone
|
| 9 |
+
from typing import Any
|
| 10 |
+
|
| 11 |
+
from sqlalchemy import select, desc
|
| 12 |
+
|
| 13 |
+
from ..db import session_scope
|
| 14 |
+
from ..models import EvalCase, EvalRun
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def _now() -> datetime:
|
| 18 |
+
return datetime.now(timezone.utc)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def create_run(run_id: str, trigger: str) -> None:
|
| 22 |
+
with session_scope() as s:
|
| 23 |
+
# Idempotent — the API may pre-create the row before BackgroundTasks
|
| 24 |
+
# hands off to the runner.
|
| 25 |
+
existing = s.get(EvalRun, run_id)
|
| 26 |
+
if existing is not None:
|
| 27 |
+
return
|
| 28 |
+
s.add(EvalRun(
|
| 29 |
+
id=run_id,
|
| 30 |
+
started_at=_now(),
|
| 31 |
+
trigger=trigger,
|
| 32 |
+
status="running",
|
| 33 |
+
summary=None,
|
| 34 |
+
))
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def finish_run(run_id: str, status: str, summary: dict[str, Any]) -> None:
|
| 38 |
+
with session_scope() as s:
|
| 39 |
+
row = s.get(EvalRun, run_id)
|
| 40 |
+
if row is None:
|
| 41 |
+
row = EvalRun(id=run_id, started_at=_now(), trigger="unknown", status=status, summary=summary)
|
| 42 |
+
s.add(row)
|
| 43 |
+
row.finished_at = _now()
|
| 44 |
+
row.status = status
|
| 45 |
+
row.summary = summary
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def add_case(
|
| 49 |
+
*,
|
| 50 |
+
run_id: str,
|
| 51 |
+
golden_id: str,
|
| 52 |
+
property_code: str | None,
|
| 53 |
+
question: str,
|
| 54 |
+
answer: str | None,
|
| 55 |
+
scores: dict[str, Any] | None,
|
| 56 |
+
ok: bool,
|
| 57 |
+
error: str | None,
|
| 58 |
+
duration_ms: int | None,
|
| 59 |
+
trace_id: str | None,
|
| 60 |
+
) -> None:
|
| 61 |
+
with session_scope() as s:
|
| 62 |
+
s.add(EvalCase(
|
| 63 |
+
run_id=run_id,
|
| 64 |
+
golden_id=golden_id,
|
| 65 |
+
property_code=property_code,
|
| 66 |
+
question=question,
|
| 67 |
+
answer=answer,
|
| 68 |
+
scores=scores,
|
| 69 |
+
ok=bool(ok),
|
| 70 |
+
error=error,
|
| 71 |
+
duration_ms=duration_ms,
|
| 72 |
+
trace_id=trace_id,
|
| 73 |
+
))
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def list_runs(limit: int = 50) -> list[dict[str, Any]]:
|
| 77 |
+
with session_scope() as s:
|
| 78 |
+
rows = s.execute(
|
| 79 |
+
select(EvalRun).order_by(desc(EvalRun.started_at)).limit(limit)
|
| 80 |
+
).scalars().all()
|
| 81 |
+
return [_run_to_dict(r) for r in rows]
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def get_run(run_id: str) -> dict[str, Any] | None:
|
| 85 |
+
with session_scope() as s:
|
| 86 |
+
run = s.get(EvalRun, run_id)
|
| 87 |
+
if run is None:
|
| 88 |
+
return None
|
| 89 |
+
cases = s.execute(
|
| 90 |
+
select(EvalCase).where(EvalCase.run_id == run_id).order_by(EvalCase.id)
|
| 91 |
+
).scalars().all()
|
| 92 |
+
out = _run_to_dict(run)
|
| 93 |
+
out["cases"] = [_case_to_dict(c) for c in cases]
|
| 94 |
+
return out
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def _run_to_dict(r: EvalRun) -> dict[str, Any]:
|
| 98 |
+
return {
|
| 99 |
+
"id": r.id,
|
| 100 |
+
"started_at": r.started_at.isoformat() if r.started_at else None,
|
| 101 |
+
"finished_at": r.finished_at.isoformat() if r.finished_at else None,
|
| 102 |
+
"trigger": r.trigger,
|
| 103 |
+
"status": r.status,
|
| 104 |
+
"summary": r.summary,
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _case_to_dict(c: EvalCase) -> dict[str, Any]:
|
| 109 |
+
return {
|
| 110 |
+
"golden_id": c.golden_id,
|
| 111 |
+
"property_code": c.property_code,
|
| 112 |
+
"question": c.question,
|
| 113 |
+
"answer": c.answer,
|
| 114 |
+
"scores": c.scores,
|
| 115 |
+
"ok": bool(c.ok),
|
| 116 |
+
"error": c.error,
|
| 117 |
+
"duration_ms": c.duration_ms,
|
| 118 |
+
"trace_id": c.trace_id,
|
| 119 |
+
}
|
app/main.py
CHANGED
|
@@ -37,6 +37,8 @@ from .schemas import (
|
|
| 37 |
from .guardrails.scope import UnknownPropertyError, ScopeViolationError
|
| 38 |
from .llm_registry import ProviderUnavailable, list_llms, validate_model
|
| 39 |
from .graph.build import run_chat, run_chat_stream
|
|
|
|
|
|
|
| 40 |
|
| 41 |
log = logging.getLogger("property_ai")
|
| 42 |
settings = get_settings()
|
|
@@ -44,8 +46,16 @@ settings = get_settings()
|
|
| 44 |
|
| 45 |
@asynccontextmanager
|
| 46 |
async def lifespan(_app: FastAPI):
|
|
|
|
| 47 |
init_db()
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
app = FastAPI(title="Property-Specific AI Assistant", version="0.1.0", lifespan=lifespan)
|
|
@@ -64,6 +74,9 @@ app.add_middleware(
|
|
| 64 |
allow_headers=["*"],
|
| 65 |
)
|
| 66 |
|
|
|
|
|
|
|
|
|
|
| 67 |
# Image/table artifacts live in Supabase Storage (public bucket). The
|
| 68 |
# frontend loads them directly via the fully-qualified Supabase URL that
|
| 69 |
# rag_tools returns — no static mount needed here.
|
|
|
|
| 37 |
from .guardrails.scope import UnknownPropertyError, ScopeViolationError
|
| 38 |
from .llm_registry import ProviderUnavailable, list_llms, validate_model
|
| 39 |
from .graph.build import run_chat, run_chat_stream
|
| 40 |
+
from .observability import init_tracing, shutdown_tracing
|
| 41 |
+
from .evals.api import router as evals_router
|
| 42 |
|
| 43 |
log = logging.getLogger("property_ai")
|
| 44 |
settings = get_settings()
|
|
|
|
| 46 |
|
| 47 |
@asynccontextmanager
|
| 48 |
async def lifespan(_app: FastAPI):
|
| 49 |
+
init_tracing(_app)
|
| 50 |
init_db()
|
| 51 |
+
# Eval scheduler is opt-in via EVAL_SCHEDULE_ENABLED.
|
| 52 |
+
from .evals.scheduler import start_scheduler, stop_scheduler
|
| 53 |
+
start_scheduler()
|
| 54 |
+
try:
|
| 55 |
+
yield
|
| 56 |
+
finally:
|
| 57 |
+
stop_scheduler()
|
| 58 |
+
shutdown_tracing()
|
| 59 |
|
| 60 |
|
| 61 |
app = FastAPI(title="Property-Specific AI Assistant", version="0.1.0", lifespan=lifespan)
|
|
|
|
| 74 |
allow_headers=["*"],
|
| 75 |
)
|
| 76 |
|
| 77 |
+
# Eval & monitoring endpoints — admin-protected, never on /chat critical path.
|
| 78 |
+
app.include_router(evals_router)
|
| 79 |
+
|
| 80 |
# Image/table artifacts live in Supabase Storage (public bucket). The
|
| 81 |
# frontend loads them directly via the fully-qualified Supabase URL that
|
| 82 |
# rag_tools returns — no static mount needed here.
|
app/models.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
"""ORM models for the rent-roll domain."""
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
-
from datetime import date
|
| 5 |
from sqlalchemy import (
|
| 6 |
-
String, Integer, Float, Date, Boolean, ForeignKey, JSON, Index
|
| 7 |
)
|
| 8 |
from sqlalchemy.dialects.postgresql import JSONB
|
| 9 |
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
|
@@ -109,3 +109,39 @@ class RentChargeLine(Base):
|
|
| 109 |
__table_args__ = (
|
| 110 |
Index("ix_charge_code_month_unit", "property_code", "snapshot_month", "unit_number"),
|
| 111 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""ORM models for the rent-roll domain."""
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
+
from datetime import date, datetime
|
| 5 |
from sqlalchemy import (
|
| 6 |
+
String, Integer, Float, Date, Boolean, ForeignKey, JSON, Index, DateTime, Text
|
| 7 |
)
|
| 8 |
from sqlalchemy.dialects.postgresql import JSONB
|
| 9 |
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship
|
|
|
|
| 109 |
__table_args__ = (
|
| 110 |
Index("ix_charge_code_month_unit", "property_code", "snapshot_month", "unit_number"),
|
| 111 |
)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# ---------------------------------------------------------------------------
|
| 115 |
+
# Evaluation harness — persisted alongside the rent-roll schema in Supabase
|
| 116 |
+
# so the Monitoring UI can read run history without a separate datastore.
|
| 117 |
+
# ---------------------------------------------------------------------------
|
| 118 |
+
|
| 119 |
+
class EvalRun(Base):
|
| 120 |
+
__tablename__ = "eval_runs"
|
| 121 |
+
id: Mapped[str] = mapped_column(String(64), primary_key=True)
|
| 122 |
+
started_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), index=True)
|
| 123 |
+
finished_at: Mapped[datetime | None] = mapped_column(DateTime(timezone=True), nullable=True)
|
| 124 |
+
trigger: Mapped[str] = mapped_column(String(32)) # manual | scheduled | cli
|
| 125 |
+
status: Mapped[str] = mapped_column(String(16)) # running | completed | failed
|
| 126 |
+
summary: Mapped[dict | None] = mapped_column(JSON_VARIANT, nullable=True)
|
| 127 |
+
|
| 128 |
+
cases: Mapped[list["EvalCase"]] = relationship(
|
| 129 |
+
back_populates="run", cascade="all, delete-orphan",
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
class EvalCase(Base):
|
| 134 |
+
__tablename__ = "eval_cases"
|
| 135 |
+
id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
|
| 136 |
+
run_id: Mapped[str] = mapped_column(String(64), ForeignKey("eval_runs.id", ondelete="CASCADE"), index=True)
|
| 137 |
+
golden_id: Mapped[str] = mapped_column(String(128))
|
| 138 |
+
property_code: Mapped[str | None] = mapped_column(String(32), nullable=True)
|
| 139 |
+
question: Mapped[str] = mapped_column(Text)
|
| 140 |
+
answer: Mapped[str | None] = mapped_column(Text, nullable=True)
|
| 141 |
+
scores: Mapped[dict | None] = mapped_column(JSON_VARIANT, nullable=True)
|
| 142 |
+
ok: Mapped[bool] = mapped_column(Boolean, default=False)
|
| 143 |
+
error: Mapped[str | None] = mapped_column(Text, nullable=True)
|
| 144 |
+
duration_ms: Mapped[int | None] = mapped_column(Integer, nullable=True)
|
| 145 |
+
trace_id: Mapped[str | None] = mapped_column(String(64), nullable=True)
|
| 146 |
+
|
| 147 |
+
run: Mapped["EvalRun"] = relationship(back_populates="cases")
|
app/observability.py
ADDED
|
@@ -0,0 +1,124 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""OpenTelemetry / Phoenix Cloud tracing bootstrap.
|
| 2 |
+
|
| 3 |
+
`init_tracing()` is called once from `main.py:lifespan`. It is **fail-open** —
|
| 4 |
+
any error (missing key, import problem, network issue) is logged and swallowed
|
| 5 |
+
so the API keeps serving traffic.
|
| 6 |
+
|
| 7 |
+
Span export uses OTel's `BatchSpanProcessor`, which queues spans in memory and
|
| 8 |
+
ships them on a background thread, so `/chat` request latency is unaffected by
|
| 9 |
+
the network or Phoenix availability.
|
| 10 |
+
"""
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import logging
|
| 14 |
+
import os
|
| 15 |
+
from typing import TYPE_CHECKING
|
| 16 |
+
|
| 17 |
+
if TYPE_CHECKING:
|
| 18 |
+
from fastapi import FastAPI
|
| 19 |
+
|
| 20 |
+
log = logging.getLogger("property_ai.observability")
|
| 21 |
+
|
| 22 |
+
_initialized = False
|
| 23 |
+
_tracer_provider = None
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def init_tracing(app: "FastAPI | None" = None) -> None:
|
| 27 |
+
"""Idempotent. Wire Phoenix Cloud + OpenInference + FastAPI instrumentation.
|
| 28 |
+
|
| 29 |
+
Defaults to no-op when `PHOENIX_ENABLED` is false or the API key is missing.
|
| 30 |
+
"""
|
| 31 |
+
global _initialized, _tracer_provider
|
| 32 |
+
if _initialized:
|
| 33 |
+
return
|
| 34 |
+
|
| 35 |
+
from .config import get_settings
|
| 36 |
+
settings = get_settings()
|
| 37 |
+
|
| 38 |
+
if not settings.phoenix_enabled:
|
| 39 |
+
log.info("phoenix tracing disabled (PHOENIX_ENABLED=false)")
|
| 40 |
+
_initialized = True
|
| 41 |
+
return
|
| 42 |
+
|
| 43 |
+
if not settings.phoenix_api_key:
|
| 44 |
+
log.warning("PHOENIX_ENABLED=true but PHOENIX_API_KEY is unset — tracing disabled")
|
| 45 |
+
_initialized = True
|
| 46 |
+
return
|
| 47 |
+
|
| 48 |
+
# Suppress Authorization / X-Admin-Token in FastAPI request-header spans.
|
| 49 |
+
os.environ.setdefault(
|
| 50 |
+
"OTEL_INSTRUMENTATION_HTTP_CAPTURE_HEADERS_SANITIZE_FIELDS",
|
| 51 |
+
"authorization,x-admin-token,cookie,set-cookie",
|
| 52 |
+
)
|
| 53 |
+
|
| 54 |
+
try:
|
| 55 |
+
from phoenix.otel import register
|
| 56 |
+
|
| 57 |
+
# Phoenix Cloud spaces require Authorization: Bearer <key>. We set it
|
| 58 |
+
# both via the register() kwarg and as OTEL_EXPORTER_OTLP_HEADERS so
|
| 59 |
+
# whichever transport phoenix.otel picks (gRPC or HTTP) authenticates.
|
| 60 |
+
bearer = f"Bearer {settings.phoenix_api_key}"
|
| 61 |
+
os.environ.setdefault("OTEL_EXPORTER_OTLP_HEADERS", f"Authorization={bearer}")
|
| 62 |
+
os.environ.setdefault("PHOENIX_API_KEY", settings.phoenix_api_key)
|
| 63 |
+
|
| 64 |
+
_tracer_provider = register(
|
| 65 |
+
project_name=settings.phoenix_project_name,
|
| 66 |
+
endpoint=settings.phoenix_endpoint,
|
| 67 |
+
headers={"Authorization": bearer},
|
| 68 |
+
batch=True,
|
| 69 |
+
auto_instrument=False,
|
| 70 |
+
set_global_tracer_provider=True,
|
| 71 |
+
)
|
| 72 |
+
except Exception as e: # noqa: BLE001
|
| 73 |
+
log.warning("phoenix.otel.register failed — tracing disabled: %s", e)
|
| 74 |
+
_initialized = True
|
| 75 |
+
return
|
| 76 |
+
|
| 77 |
+
# Instrument LangChain (covers LangGraph nodes), LLM SDKs, and FastAPI.
|
| 78 |
+
# Each is wrapped independently so a missing dep doesn't kill the others.
|
| 79 |
+
_safe_instrument("openinference.instrumentation.langchain", "LangChainInstrumentor")
|
| 80 |
+
_safe_instrument("openinference.instrumentation.openai", "OpenAIInstrumentor")
|
| 81 |
+
_safe_instrument("openinference.instrumentation.anthropic", "AnthropicInstrumentor")
|
| 82 |
+
_safe_instrument("openinference.instrumentation.google_genai", "GoogleGenAIInstrumentor")
|
| 83 |
+
|
| 84 |
+
if app is not None:
|
| 85 |
+
try:
|
| 86 |
+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
| 87 |
+
FastAPIInstrumentor.instrument_app(app)
|
| 88 |
+
except Exception as e: # noqa: BLE001
|
| 89 |
+
log.warning("FastAPI instrumentation failed: %s", e)
|
| 90 |
+
|
| 91 |
+
log.info(
|
| 92 |
+
"phoenix tracing enabled — project=%s endpoint=%s",
|
| 93 |
+
settings.phoenix_project_name,
|
| 94 |
+
settings.phoenix_endpoint,
|
| 95 |
+
)
|
| 96 |
+
_initialized = True
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _safe_instrument(module: str, class_name: str) -> None:
|
| 100 |
+
try:
|
| 101 |
+
mod = __import__(module, fromlist=[class_name])
|
| 102 |
+
cls = getattr(mod, class_name)
|
| 103 |
+
cls().instrument()
|
| 104 |
+
except Exception as e: # noqa: BLE001
|
| 105 |
+
log.warning("instrumentor %s.%s skipped: %s", module, class_name, e)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def shutdown_tracing() -> None:
|
| 109 |
+
"""Flush in-flight spans on app shutdown."""
|
| 110 |
+
global _tracer_provider
|
| 111 |
+
if _tracer_provider is None:
|
| 112 |
+
return
|
| 113 |
+
try:
|
| 114 |
+
_tracer_provider.shutdown()
|
| 115 |
+
except Exception as e: # noqa: BLE001
|
| 116 |
+
log.warning("tracer provider shutdown failed: %s", e)
|
| 117 |
+
_tracer_provider = None
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def get_tracer(name: str = "property_ai"):
|
| 121 |
+
"""Return an OTel tracer. Safe to call even when tracing is disabled — the
|
| 122 |
+
no-op TracerProvider returns no-op spans."""
|
| 123 |
+
from opentelemetry import trace
|
| 124 |
+
return trace.get_tracer(name)
|
app/tools/rag_tools.py
CHANGED
|
@@ -17,6 +17,9 @@ from __future__ import annotations
|
|
| 17 |
from typing import Any
|
| 18 |
|
| 19 |
from ..guardrails.scope import require_scope
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
# Pinecone returns cosine SIMILARITY in `score` (1 = identical, -1 = opposite).
|
|
@@ -165,6 +168,13 @@ def search_property_v2(
|
|
| 165 |
max_images = max(1, min(int(max_images or _DEFAULT_MAX_IMAGES), _HARD_CAP_MAX_IMAGES))
|
| 166 |
image_query_k = min(max_images * _IMAGE_QUERY_K_MULT, 60)
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
from ..ingestion.v2.embedder import JinaV4Embedder
|
| 169 |
from ..ingestion.v2.pipeline import get_index_v2
|
| 170 |
|
|
@@ -285,6 +295,20 @@ def search_property_v2(
|
|
| 285 |
label = label.split("|", 1)[0].strip()
|
| 286 |
sources.append({"label": label, "url": u})
|
| 287 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
return {
|
| 289 |
"property_code": code,
|
| 290 |
"query": query,
|
|
|
|
| 17 |
from typing import Any
|
| 18 |
|
| 19 |
from ..guardrails.scope import require_scope
|
| 20 |
+
from ..observability import get_tracer
|
| 21 |
+
|
| 22 |
+
_tracer = get_tracer("property_ai.rag")
|
| 23 |
|
| 24 |
|
| 25 |
# Pinecone returns cosine SIMILARITY in `score` (1 = identical, -1 = opposite).
|
|
|
|
| 168 |
max_images = max(1, min(int(max_images or _DEFAULT_MAX_IMAGES), _HARD_CAP_MAX_IMAGES))
|
| 169 |
image_query_k = min(max_images * _IMAGE_QUERY_K_MULT, 60)
|
| 170 |
|
| 171 |
+
# OpenInference RETRIEVER span — Phoenix renders chunks under "Retrieval".
|
| 172 |
+
span = _tracer.start_span("pinecone.query")
|
| 173 |
+
span.set_attribute("openinference.span.kind", "RETRIEVER")
|
| 174 |
+
span.set_attribute("retrieval.namespace", code)
|
| 175 |
+
span.set_attribute("retrieval.top_k", k)
|
| 176 |
+
span.set_attribute("input.value", query)
|
| 177 |
+
|
| 178 |
from ..ingestion.v2.embedder import JinaV4Embedder
|
| 179 |
from ..ingestion.v2.pipeline import get_index_v2
|
| 180 |
|
|
|
|
| 295 |
label = label.split("|", 1)[0].strip()
|
| 296 |
sources.append({"label": label, "url": u})
|
| 297 |
|
| 298 |
+
try:
|
| 299 |
+
for i, ch in enumerate(chunks[:10]):
|
| 300 |
+
text = ch.get("text") or ""
|
| 301 |
+
span.set_attribute(f"retrieval.documents.{i}.document.id", str(ch.get("chunk_index") or i))
|
| 302 |
+
span.set_attribute(f"retrieval.documents.{i}.document.score", float(1.0 - (ch.get("distance") or 0.0)))
|
| 303 |
+
span.set_attribute(f"retrieval.documents.{i}.document.content", text[:500])
|
| 304 |
+
url = ch.get("url")
|
| 305 |
+
if url:
|
| 306 |
+
span.set_attribute(f"retrieval.documents.{i}.document.metadata.url", url)
|
| 307 |
+
span.set_attribute("retrieval.chunk_count", len(chunks))
|
| 308 |
+
span.set_attribute("retrieval.image_count", len(images))
|
| 309 |
+
finally:
|
| 310 |
+
span.end()
|
| 311 |
+
|
| 312 |
return {
|
| 313 |
"property_code": code,
|
| 314 |
"query": query,
|
requirements.txt
CHANGED
|
@@ -46,3 +46,22 @@ openai>=1.55,<3
|
|
| 46 |
sqlglot>=25
|
| 47 |
langchain-anthropic>=1.0,<2
|
| 48 |
langchain-google-genai>=2,<3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
sqlglot>=25
|
| 47 |
langchain-anthropic>=1.0,<2
|
| 48 |
langchain-google-genai>=2,<3
|
| 49 |
+
|
| 50 |
+
# Observability & evaluation — latest production-grade releases (verified on PyPI 2026-05-24)
|
| 51 |
+
# NOTE: we don't install the full `arize-phoenix` server here (it pulls FastAPI>=0.135
|
| 52 |
+
# and conflicts with our pin). Phoenix Cloud is the hosted backend, so the OTel
|
| 53 |
+
# exporter package alone is enough.
|
| 54 |
+
arize-phoenix-otel>=0.16.1
|
| 55 |
+
openinference-instrumentation-langchain>=0.1.66
|
| 56 |
+
openinference-instrumentation-openai>=0.1.49
|
| 57 |
+
openinference-instrumentation-anthropic>=1.0.5
|
| 58 |
+
openinference-instrumentation-google-genai>=1.0.2
|
| 59 |
+
opentelemetry-instrumentation-fastapi>=0.63b1
|
| 60 |
+
# NOTE: `open-rag-eval` is NOT installed — its 0.3.0 dependency graph
|
| 61 |
+
# (torch 2.7, llama_index, streamlit, transformers 4.50, openai~=2.7,
|
| 62 |
+
# anthropic~=0.72) hard-conflicts with this backend's pins. `app/evals/scorer.py`
|
| 63 |
+
# uses the same TREC-style LLM-judge prompt with the same metrics
|
| 64 |
+
# (groundedness / hallucination / answer_relevance / context_relevance) via
|
| 65 |
+
# the OpenAI SDK directly — identical signal, no dep collision.
|
| 66 |
+
apscheduler>=3.11.0
|
| 67 |
+
PyYAML>=6.0
|