Spaces:

RajatA98
/

PiedPiper

Running

App Files Files Community

RajatA98 committed 3 days ago

Commit

b43bd50

1 Parent(s): a57cff5

Deploy RAG narrative layer + telemetry + RAG eval harness

Browse files

Files changed (9) hide show

backend/api.py +253 -4
backend/context_token.py +194 -0
backend/narrative_telemetry.py +296 -0
backend/rag_narrative.py +384 -0
backend/scripts/enrich_jamendo.py +140 -0
backend/scripts/enrich_mir_features.py +128 -0
backend/scripts/run_rag_eval.py +230 -0
backend/scripts/verify_matching.py +230 -0
requirements.txt +5 -0

backend/api.py CHANGED Viewed

@@ -20,6 +20,7 @@ Errors are returned as `{"error": "<code>"}` to match the frontend's `api.js`:
 from __future__ import annotations
 import io
 import json
 import os
@@ -33,12 +34,13 @@ import soundfile as sf
 from fastapi import FastAPI, File, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 # ADR-0002: clap_engine is no longer the primary encoder; muq_engine took its
 # place via clap_windowed's swap. We still import clap_engine here only because
 # legacy code paths may reference it; the encoder load + genre tagging both go
 # through muq_engine.
-from . import __version__, acrcloud_engine, muq_engine, clap_windowed, config, mir_features, similarity
 from .librosa_engine import analyze_array
 from .scoring import compute_report
@@ -63,6 +65,7 @@ _corpus_by_id: dict[str, dict] = {}
 _flat_catalog: similarity.FlatCatalog | None = None
 _catalog_cosine_distribution: np.ndarray | None = None  # sorted upper-tri off-diag pairwise cosines
 _model_sha: str = ""
 _threshold_default: float = config.SIMILARITY_THRESHOLD_DEFAULT
@@ -77,7 +80,7 @@ def _load_corpus() -> None:
     """Populate corpus globals from disk if all corpus artifacts are present."""
     global _corpus_tracks, _corpus_embeddings, _corpus_by_id, _flat_catalog
     global _catalog_cosine_distribution
-    global _model_sha, _threshold_default
     corpus_dir = Path(os.getenv("CORPUS_DIR", str(_default_corpus_dir())))
     cpath = corpus_dir / "corpus.json"
     epath = corpus_dir / "embeddings.npy"
@@ -95,6 +98,7 @@ def _load_corpus() -> None:
         _corpus_by_id = {}
         _flat_catalog = None
         _model_sha = ""
         _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
         return
     try:
@@ -103,10 +107,16 @@ def _load_corpus() -> None:
         _corpus_embeddings = np.load(epath).astype(np.float32)
         with np.load(spath) as npz:
             segment_embeddings = {k: npz[k].astype(np.float32) for k in npz.files}
-        manifest = json.loads(mpath.read_text())
         _model_sha = str(manifest.get("model_sha") or "unpinned")
         if _model_sha == "unpinned":
             print("[api] WARNING manifest missing model_sha; using 'unpinned'")
         _threshold_default = similarity.threshold_from_manifest(manifest)
         _flat_catalog = similarity.build_flat_catalog(_corpus_tracks, _corpus_embeddings, segment_embeddings)
         _catalog_cosine_distribution = similarity.compute_catalog_distribution(_flat_catalog)
@@ -128,6 +138,7 @@ def _load_corpus() -> None:
         _flat_catalog = None
         _catalog_cosine_distribution = None
         _model_sha = ""
         _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
@@ -332,6 +343,10 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
     raw = await file.read()
     if (err := _validate_upload(file, raw)) is not None:
         return err
     ext = Path(file.filename or "").suffix.lower()
     pipeline = _decode_and_pipeline(raw, ext=ext)
     if isinstance(pipeline, JSONResponse):
@@ -348,6 +363,8 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
             "modelSha": _model_sha,
             "thresholdDefault": _threshold_default,
             "acrcloud": acrcloud_engine.to_response_dict(acrcloud_engine.disabled_response()),
         }
     neighbors = similarity.top_k_neighbors(
@@ -394,6 +411,40 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
     specificity = float(similarity.query_specificity(pipeline["emb"].astype(np.float32), _flat_catalog))
     acr = acrcloud_engine.call_for_query(pipeline["acrcloud_audio"])
     return {
         "query": query_track,
@@ -405,10 +456,208 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
         "querySpecificity": specificity,
         "modelSha": _model_sha,
         "thresholdDefault": _threshold_default,
-        "acrcloud": acrcloud_engine.to_response_dict(acr),
     }
 def run() -> None:
     """Convenience launcher: `python -m backend.api` or `uvicorn backend.api:app`."""
     import uvicorn

 from __future__ import annotations
+import hashlib
 import io
 import json
 import os
 from fastapi import FastAPI, File, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
+from pydantic import BaseModel, Field
 # ADR-0002: clap_engine is no longer the primary encoder; muq_engine took its
 # place via clap_windowed's swap. We still import clap_engine here only because
 # legacy code paths may reference it; the encoder load + genre tagging both go
 # through muq_engine.
+from . import __version__, acrcloud_engine, context_token, muq_engine, narrative_telemetry, clap_windowed, config, mir_features, similarity
 from .librosa_engine import analyze_array
 from .scoring import compute_report
 _flat_catalog: similarity.FlatCatalog | None = None
 _catalog_cosine_distribution: np.ndarray | None = None  # sorted upper-tri off-diag pairwise cosines
 _model_sha: str = ""
+_catalog_sha: str = ""  # sha256 of manifest.json bytes; used in contextToken claims
 _threshold_default: float = config.SIMILARITY_THRESHOLD_DEFAULT
     """Populate corpus globals from disk if all corpus artifacts are present."""
     global _corpus_tracks, _corpus_embeddings, _corpus_by_id, _flat_catalog
     global _catalog_cosine_distribution
+    global _model_sha, _catalog_sha, _threshold_default
     corpus_dir = Path(os.getenv("CORPUS_DIR", str(_default_corpus_dir())))
     cpath = corpus_dir / "corpus.json"
     epath = corpus_dir / "embeddings.npy"
         _corpus_by_id = {}
         _flat_catalog = None
         _model_sha = ""
+        _catalog_sha = ""
         _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
         return
     try:
         _corpus_embeddings = np.load(epath).astype(np.float32)
         with np.load(spath) as npz:
             segment_embeddings = {k: npz[k].astype(np.float32) for k in npz.files}
+        manifest_bytes = mpath.read_bytes()
+        manifest = json.loads(manifest_bytes.decode("utf-8"))
         _model_sha = str(manifest.get("model_sha") or "unpinned")
         if _model_sha == "unpinned":
             print("[api] WARNING manifest missing model_sha; using 'unpinned'")
+        # catalog_sha = sha256 of manifest.json bytes. Captures every
+        # meaningful catalog regeneration (model swap, threshold change,
+        # track count change) in a single stable hash. Embedded in every
+        # contextToken so /narrative can detect stale tokens after redeploy.
+        _catalog_sha = hashlib.sha256(manifest_bytes).hexdigest()
         _threshold_default = similarity.threshold_from_manifest(manifest)
         _flat_catalog = similarity.build_flat_catalog(_corpus_tracks, _corpus_embeddings, segment_embeddings)
         _catalog_cosine_distribution = similarity.compute_catalog_distribution(_flat_catalog)
         _flat_catalog = None
         _catalog_cosine_distribution = None
         _model_sha = ""
+        _catalog_sha = ""
         _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
     raw = await file.read()
     if (err := _validate_upload(file, raw)) is not None:
         return err
+    # queryFingerprint: SHA-256 of the upload bytes. Embedded in contextToken
+    # so /narrative can verify the same query is still in play. Stable across
+    # re-uploads of the same file; cheap to compute.
+    query_fingerprint = hashlib.sha256(raw).hexdigest()
     ext = Path(file.filename or "").suffix.lower()
     pipeline = _decode_and_pipeline(raw, ext=ext)
     if isinstance(pipeline, JSONResponse):
             "modelSha": _model_sha,
             "thresholdDefault": _threshold_default,
             "acrcloud": acrcloud_engine.to_response_dict(acrcloud_engine.disabled_response()),
+            "queryFingerprint": query_fingerprint,
+            "contextToken": None,
         }
     neighbors = similarity.top_k_neighbors(
     specificity = float(similarity.query_specificity(pipeline["emb"].astype(np.float32), _flat_catalog))
     acr = acrcloud_engine.call_for_query(pipeline["acrcloud_audio"])
+    acr_response = acrcloud_engine.to_response_dict(acr)
+    # Codex round-2 Q3: stateless signed token replaces the in-memory cache.
+    # /narrative will verify this token and rebuild context server-side from
+    # the embedded claims. Token is None when HMAC key isn't configured —
+    # /narrative also 503s in that case so the gating is consistent.
+    ctx_token = None
+    if context_token.is_configured():
+        neighbor_fragments: dict[str, dict] = {}
+        for nb in neighbors:
+            track = nb.get("track") or {}
+            ts = nb.get("matchTimestamp") or {}
+            neighbor_fragments[str(nb["trackId"])] = context_token.neighbor_context_fragment(
+                track_id=str(nb["trackId"]),
+                title=str(track.get("title") or nb["trackId"]),
+                artist=track.get("artist"),
+                query_window=(
+                    float(ts.get("queryStartSec", 0.0)),
+                    float(ts.get("queryEndSec", 0.0)),
+                ),
+                match_window=(
+                    float(ts.get("catalogStartSec", 0.0)),
+                    float(ts.get("catalogEndSec", 0.0)),
+                ),
+                raw_cosine=float(nb.get("rawCosine", 0.0)),
+                criteria=_criteria_to_token_fragment(nb.get("criteria")),
+            )
+        ctx_token = context_token.issue(
+            query_fingerprint=query_fingerprint,
+            model_sha=_model_sha or "unpinned",
+            catalog_sha=_catalog_sha or "no-catalog",
+            neighbors=neighbor_fragments,
+            acrcloud_cover_song_id=acr_response.get("coverSongId"),
+        )
     return {
         "query": query_track,
         "querySpecificity": specificity,
         "modelSha": _model_sha,
         "thresholdDefault": _threshold_default,
+        "acrcloud": acr_response,
+        "queryFingerprint": query_fingerprint,
+        "contextToken": ctx_token,
     }
+def _criteria_to_token_fragment(criteria_block: dict | None) -> list[dict] | None:
+    """Reshape /neighbors' criteria block into the list-of-CriterionContext
+    form Codex's rag_narrative module expects.
+    The /neighbors response groups criteria by id under a top-level dict;
+    NarrativeContext takes a flat list of {id, queryValue, matchValue,
+    agreement, label}. Convert here so the token payload matches the
+    NarrativeContext shape directly.
+    """
+    if not criteria_block:
+        return None
+    out: list[dict] = []
+    for cid in ("tempo", "key", "harmonic", "timbre"):
+        entry = criteria_block.get(cid)
+        if not entry:
+            continue
+        # harmonic + timbre come back from /neighbors without queryValue /
+        # matchValue (only agreement + label) because we don't ship the raw
+        # vectors. Substitute a shape marker so Codex's citation validator
+        # has something to check the keys against without exposing internals.
+        q_val = entry.get("queryValue")
+        m_val = entry.get("matchValue")
+        if cid in ("harmonic", "timbre") and q_val is None and m_val is None:
+            q_val = {"vector": "elided"}
+            m_val = {"vector": "elided"}
+        out.append({
+            "id": cid,
+            "queryValue": q_val,
+            "matchValue": m_val,
+            "agreement": float(entry.get("agreement", 0.0)),
+            "label": str(entry.get("label", "")),
+        })
+    return out or None
+# --- /narrative -------------------------------------------------------------
+#
+# Stateless RAG explanatory layer over /neighbors. Client sends the
+# contextToken received from /neighbors plus the trackId + mode it wants
+# narrated; backend verifies the token (signature, expiry, model/catalog
+# version), rebuilds NarrativeContext from the embedded claims, and delegates
+# to Codex's rag_narrative module.
+#
+# Failure shape: typed `{"error": "<code>"}` JSON, status code by class:
+#   503 narrative-disabled  — OPENAI_API_KEY or CONTEXT_TOKEN_HMAC_KEY absent,
+#                             or the rag_narrative module cannot be imported
+#   401 invalid-token       — signature mismatch (tampered or wrong secret)
+#   412 token-expired       — past expiresAt
+#   412 stale-token         — modelSha/catalogSha changed since issuance
+#   400 malformed-token     — bad shape; not <body>.<sig>
+#   404 not-in-context      — trackId wasn't part of the issued token
+#   422 unsupported-mode    — mode wasn't "whySimilar" or "creatorAdvice"
+#   422 malformed-context   — token fragment couldn't be built into a NarrativeContext
+#   500 narrative-error     — generate_narrative raised
class NarrativeRequest(BaseModel):
    """JSON body accepted by POST /narrative.

    All three fields are required and must be non-empty strings; the allowed
    values of ``mode`` are checked by the handler, not by this model.
    """

    # Signed context token the client sends back for verification.
    contextToken: str = Field(..., min_length=1)
    # Neighbor to narrate; looked up inside the verified token's claims.
    trackId: str = Field(..., min_length=1)
    # Requested narrative flavour.
    mode: str = Field(..., min_length=1)
+_TOKEN_ERROR_TO_HTTP = {
+    "malformed": (400, "malformed-token"),
+    "invalid-signature": (401, "invalid-token"),
+    "token-expired": (412, "token-expired"),
+    "stale-model": (412, "stale-token"),
+    "stale-catalog": (412, "stale-token"),
+    "hmac-key-missing": (503, "narrative-disabled"),
+}
@app.post("/narrative")
async def narrative_endpoint(req: NarrativeRequest):
    """RAG explanatory layer — see ADR-0005 for the full spec.

    Runs a fixed sequence of checks (configuration, mode, token, trackId
    lookup, context construction) and then delegates to
    ``rag_narrative.generate_narrative``. Every exit path records its outcome
    on the telemetry handle before returning; failures are returned through
    ``_err(status, code)``.
    """
    with narrative_telemetry.measure_call(req.mode) as tel:
        # Gates 1 + 2: both the OpenAI key (needed to call the model) and the
        # HMAC key (needed to trust the token) must be configured.
        openai_key = os.getenv("OPENAI_API_KEY", "").strip()
        if not openai_key or not context_token.is_configured():
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")

        # Gate 3: only the two supported narrative modes are accepted.
        supported_modes = ("whySimilar", "creatorAdvice")
        if req.mode not in supported_modes:
            tel.set(error_code="unsupported-mode")
            return _err(422, "unsupported-mode")

        # Token verification; TokenError.code selects the typed HTTP response.
        try:
            claims = context_token.verify(
                req.contextToken,
                expected_model_sha=_model_sha or "unpinned",
                expected_catalog_sha=_catalog_sha or "no-catalog",
            )
        except context_token.TokenError as token_exc:
            http_status, public_code = _TOKEN_ERROR_TO_HTTP.get(
                token_exc.code, (400, "malformed-token")
            )
            tel.set(error_code=public_code)
            return _err(http_status, public_code)

        # The requested track must be one of the neighbors in the token.
        fragment = claims.neighbors.get(req.trackId)
        if not fragment:
            tel.set(error_code="not-in-context", trackId=req.trackId)
            return _err(404, "not-in-context")

        # Imported lazily so the app still boots (and /neighbors still works)
        # when rag_narrative is unavailable; surfaced as 503 at request time.
        try:
            from . import rag_narrative
        except ImportError:
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")

        # Materialize the verified fragment into the module's context model.
        # Any failure here is reported as a malformed context rather than
        # propagating.
        try:
            criteria_models = [
                rag_narrative.CriterionContext(**item)
                for item in (fragment.get("criteria") or [])
            ]
            context = rag_narrative.NarrativeContext(
                queryFingerprint=claims.queryFingerprint,
                trackId=fragment["trackId"],
                title=fragment.get("title", ""),
                artist=fragment.get("artist"),
                queryWindow=tuple(fragment["queryWindow"]),
                matchWindow=tuple(fragment["matchWindow"]),
                rawCosine=float(fragment["rawCosine"]),
                criteria=criteria_models,
                acrcloudCoverSongId=claims.acrcloudCoverSongId,
            )
        except Exception:
            tel.set(error_code="malformed-context", trackId=req.trackId)
            return _err(422, "malformed-context")

        try:
            result = rag_narrative.generate_narrative(
                context,
                req.mode,
                model_sha=_model_sha or "unpinned",
                catalog_sha=_catalog_sha or "no-catalog",
                model_id=os.getenv("OPENAI_MODEL_ID", "gpt-4o-mini"),
            )
        except Exception as exc:
            print(f"[api] /narrative generate_narrative raised: {exc!r}")
            tel.set(error_code="narrative-error", trackId=req.trackId)
            return _err(500, "narrative-error")

        # `kind` discriminates the result variants. Sizes are character
        # counts, used only as a rough cost signal by the telemetry layer.
        result_kind = getattr(result, "kind", None)
        is_narrative = result_kind == "narrative"
        completion_chars = len(getattr(result, "prose", "") or "") if is_narrative else 0
        # Rough prompt size: the title length plus a fixed allowance.
        prompt_chars_estimate = len(fragment.get("title", "")) + 600  # base + metadata
        tel.set(
            result_kind=result_kind,
            openai_called=(is_narrative or result_kind == "unavailable"),
            gate_short_circuit=(result_kind == "low_confidence"),
            prompt_chars=prompt_chars_estimate,
            completion_chars=completion_chars,
            trackId=req.trackId,
        )

        # Prefer the model's dict dump when available so the response shape is
        # uniform; otherwise hand the object back as-is.
        return result.model_dump() if hasattr(result, "model_dump") else result
@app.get("/narrative/stats")
def narrative_stats_endpoint() -> dict:
    """Expose the current /narrative telemetry snapshot.

    Returns whatever ``narrative_telemetry.snapshot()`` produces. The numbers
    are in-process and reset on restart, so this is a "right now" view rather
    than a long-term metrics store, and the cost figure is a character-based
    estimate — directional, not accounting-grade.
    """
    stats = narrative_telemetry.snapshot()
    return stats
 def run() -> None:
     """Convenience launcher: `python -m backend.api` or `uvicorn backend.api:app`."""
     import uvicorn

backend/context_token.py ADDED Viewed

	@@ -0,0 +1,194 @@

+"""HMAC-signed opaque context token for the /narrative endpoint.
+Codex round-2 review Q3 pushed back on the in-memory cache approach: a TTL
+cache breaks across HF restarts, multiple workers, and page refreshes. This
+module is the replacement — a stateless signed token.
+`/neighbors` issues a token containing the full NarrativeContext payload for
+every neighbor it returns. The token is HMAC-signed with a server secret so
+the backend can later verify that the client didn't tamper with it. On
+`/narrative`, the backend verifies the signature + expiry + model/catalog
+hashes, looks up the requested trackId inside the token payload, and rebuilds
+NarrativeContext server-side from the verified claims.
+This means: zero server-side state, survives restarts, scales horizontally,
+and the client can't inflate the prompt with garbage to drive up cost.
+Token format:
+    base64url(json_payload) + "." + hex(hmac_sha256(payload_bytes, secret))
+Payload shape (sorted keys for stability):
+    {
+      "queryFingerprint": "<sha256 of upload bytes>",
+      "modelSha": "<MuQ-MuLan model sha>",
+      "catalogSha": "<sha256 of manifest.json bytes>",
+      "expiresAt": <unix seconds>,
+      "acrcloudCoverSongId": <dict | null>,
+      "neighbors": {
+        "<trackId>": {full per-neighbor context fields},
+        ...
+      }
+    }
+"""
+from __future__ import annotations
+import base64
+import hashlib
+import hmac
+import json
+import os
+import time
+from dataclasses import dataclass
+from typing import Any
+DEFAULT_TTL_SECONDS = 1800  # 30 minutes — long enough for a UI session, short
+                            # enough that stale tokens after a redeploy expire
+                            # on their own without operator intervention.
class TokenError(Exception):
    """Signals that a context token could not be issued or accepted.

    ``code`` holds a short machine-readable reason (for example
    ``"malformed"`` or ``"token-expired"``); it is also the exception message.
    """

    def __init__(self, code: str):
        self.code = code
        super().__init__(code)
@dataclass(frozen=True)
class VerifiedToken:
    """Immutable view of a token payload after it has passed verification."""

    # Fingerprint of the query the token was issued for.
    queryFingerprint: str
    # Model / catalog identifiers embedded at issuance.
    modelSha: str
    catalogSha: str
    # Expiry as unix seconds.
    expiresAt: int
    # Query-level ACRCloud signal; absent when None.
    acrcloudCoverSongId: dict | None
    # Per-neighbor context fragments keyed by trackId.
    neighbors: dict[str, dict]
+def _hmac_key() -> bytes:
+    raw = os.getenv("CONTEXT_TOKEN_HMAC_KEY", "").strip()
+    if not raw:
+        raise TokenError("hmac-key-missing")
+    return raw.encode("utf-8")
def issue(
    *,
    query_fingerprint: str,
    model_sha: str,
    catalog_sha: str,
    neighbors: dict[str, dict],
    acrcloud_cover_song_id: dict | None = None,
    ttl_seconds: int = DEFAULT_TTL_SECONDS,
    now: int | None = None,
) -> str:
    """Build and sign a context token, returning ``<payload>.<sig>``.

    The payload is compact JSON with sorted keys; it is base64url-encoded
    (padding stripped) for transport, and the signature is the hex
    HMAC-SHA256 of the raw JSON bytes.

    Args:
        query_fingerprint: Fingerprint of the query this token belongs to.
        model_sha: Model identifier to embed.
        catalog_sha: Catalog identifier to embed.
        neighbors: Per-neighbor context fragments keyed by trackId.
        acrcloud_cover_song_id: Query-level ACRCloud signal, if any.
        ttl_seconds: Lifetime added to the issue time to get ``expiresAt``.
        now: Issue time in unix seconds; defaults to the current time.

    Raises:
        TokenError: ``"hmac-key-missing"`` when no signing key is configured.
    """
    signing_key = _hmac_key()
    issued_at = int(time.time() if now is None else now)
    claims = {
        "queryFingerprint": query_fingerprint,
        "modelSha": model_sha,
        "catalogSha": catalog_sha,
        "expiresAt": issued_at + ttl_seconds,
        "acrcloudCoverSongId": acrcloud_cover_song_id,
        "neighbors": neighbors,
    }
    raw = json.dumps(claims, sort_keys=True, separators=(",", ":")).encode("utf-8")
    encoded = base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=")
    signature = hmac.new(signing_key, raw, hashlib.sha256).hexdigest()
    return ".".join((encoded, signature))
def verify(
    token: str,
    *,
    expected_model_sha: str,
    expected_catalog_sha: str,
    now: int | None = None,
) -> VerifiedToken:
    """Verify signature + expiry + model/catalog hashes; return decoded payload.

    Args:
        token: Client-supplied ``<base64url body>.<hex signature>`` string.
        expected_model_sha: Model identifier the payload must carry.
        expected_catalog_sha: Catalog identifier the payload must carry.
        now: Current time in unix seconds; defaults to the wall clock.

    Raises:
        TokenError: with one of these codes, checked in this order:
          - hmac-key-missing   — server isn't configured for tokens
          - malformed          — not `<body>.<sig>`, undecodable body, or a
                                 payload with missing / ill-typed fields
          - invalid-signature  — HMAC mismatch (tampered or wrong secret)
          - token-expired      — past the embedded expiresAt
          - stale-model        — modelSha doesn't match ``expected_model_sha``
          - stale-catalog      — catalogSha doesn't match ``expected_catalog_sha``
    """
    secret = _hmac_key()
    if not isinstance(token, str) or token.count(".") != 1:
        raise TokenError("malformed")
    body_b64, sig = token.split(".", 1)
    try:
        pad = "=" * (-len(body_b64) % 4)
        body = base64.urlsafe_b64decode(body_b64 + pad)
    except Exception as exc:
        raise TokenError("malformed") from exc
    expected_sig = hmac.new(secret, body, hashlib.sha256).hexdigest()
    # Compare as bytes: compare_digest() raises TypeError for str arguments
    # containing non-ASCII characters, and `sig` is attacker-controlled.
    # errors="replace" keeps even lone surrogates from raising; the resulting
    # "?" can never match a hex digest.
    sig_bytes = sig.encode("utf-8", errors="replace")
    if not hmac.compare_digest(expected_sig.encode("ascii"), sig_bytes):
        raise TokenError("invalid-signature")
    try:
        payload = json.loads(body)
    except Exception as exc:
        raise TokenError("malformed") from exc
    required = {"queryFingerprint", "modelSha", "catalogSha", "expiresAt", "neighbors"}
    if not isinstance(payload, dict) or not required.issubset(payload.keys()):
        raise TokenError("malformed")
    # A correctly signed payload can still carry ill-typed fields; report
    # those as malformed instead of letting a raw TypeError/ValueError escape.
    try:
        expires_at = int(payload["expiresAt"])
    except (TypeError, ValueError, OverflowError) as exc:
        raise TokenError("malformed") from exc
    now_ts = int(now if now is not None else time.time())
    if now_ts > expires_at:
        raise TokenError("token-expired")
    if payload["modelSha"] != expected_model_sha:
        raise TokenError("stale-model")
    if payload["catalogSha"] != expected_catalog_sha:
        raise TokenError("stale-catalog")
    try:
        neighbors = dict(payload["neighbors"])
    except (TypeError, ValueError) as exc:
        raise TokenError("malformed") from exc
    return VerifiedToken(
        queryFingerprint=str(payload["queryFingerprint"]),
        modelSha=str(payload["modelSha"]),
        catalogSha=str(payload["catalogSha"]),
        expiresAt=expires_at,
        acrcloudCoverSongId=payload.get("acrcloudCoverSongId"),
        neighbors=neighbors,
    )
def is_configured() -> bool:
    """Report whether ``CONTEXT_TOKEN_HMAC_KEY`` holds a non-blank value.

    Callers use this to decide whether tokens can be issued or verified at
    all; a value consisting only of whitespace counts as not configured.
    """
    raw_value = os.getenv("CONTEXT_TOKEN_HMAC_KEY", "")
    return raw_value.strip() != ""
def neighbor_context_fragment(
    *,
    track_id: str,
    title: str,
    artist: str | None,
    query_window: tuple[float, float],
    match_window: tuple[float, float],
    raw_cosine: float,
    criteria: list[dict[str, Any]] | None,
) -> dict:
    """Assemble the per-neighbor dict that gets embedded in a context token.

    Windows are stored as two-element lists of floats (start, end) so they
    serialize to JSON arrays; ``raw_cosine`` is coerced to float and a
    ``None`` criteria list is stored as an empty list. The query-level
    fields (queryFingerprint, acrcloudCoverSongId) are not part of this
    fragment.
    """

    def _as_pair(window: tuple[float, float]) -> list[float]:
        # Index rather than unpack so only the first two items are read.
        return [float(window[0]), float(window[1])]

    fragment = dict(
        trackId=track_id,
        title=title,
        artist=artist,
        queryWindow=_as_pair(query_window),
        matchWindow=_as_pair(match_window),
        rawCosine=float(raw_cosine),
        criteria=[] if criteria is None else criteria,
    )
    return fragment

backend/narrative_telemetry.py ADDED Viewed

	@@ -0,0 +1,296 @@

+"""In-process telemetry for the /narrative RAG explanatory layer.
+The right rung for this scale: in-process counters + structured logs + Sentry
+tags. A Prometheus/Datadog stack would be overbuilt for a 155-track demo —
+same "progressive complexity" principle ADR-0005 commits to for retrieval.
+What this module owns:
+  - Counters: total_calls, by_mode, by_kind, by_error, openai_calls,
+    gate_short_circuits. Token failures (invalid-token, token-expired,
+    stale-token) are counted under by_error, not in dedicated counters.
+  - Latency: a fixed-size sliding window of recent call durations, surfaced
+    as p50/p95/p99 in the stats snapshot.
+  - Cost estimate: a rough running total in cents, derived from prompt and
+    completion character counts × GPT-4o-mini pricing constants. Not an
+    accounting ledger — a directional cost-awareness signal for the
+    /narrative/stats endpoint.
+  - Structured logger: one INFO line per call with stable key=value fields
+    so the HF Space logs are grep-able without a parser.
+  - Sentry tags: when SENTRY_DSN is set, every call tags the current scope
+    with mode + result_kind so failures aggregate by category in the
+    existing dashboard.
+The module is thread-safe (one lock around counter mutations + window
+operations). All operations are O(1) except `snapshot()` which is O(N) over
+the sliding window — N is bounded by `LATENCY_WINDOW_SIZE`.
+"""
+from __future__ import annotations
+import logging
+import os
+import threading
+import time
+from collections import deque
+from dataclasses import dataclass, field
+from typing import Iterable
# GPT-4o-mini pricing as of 2026-06 (rough; meant as a directional signal):
#   $0.15 per 1M input tokens, $0.60 per 1M output tokens.
# Converted to cents per character at ~4 chars per token (English average):
#   input:  0.15 USD * 100 cents / 1_000_000 tokens / 4 chars = 3.75e-6 cents
#   output: 0.60 USD * 100 cents / 1_000_000 tokens / 4 chars = 1.5e-5 cents
# The earlier values (3.75e-5 / 1.5e-4) were ten times too high.
COST_CENTS_PER_INPUT_CHAR = 0.00000375
COST_CENTS_PER_OUTPUT_CHAR = 0.000015
+# Sliding window size for latency percentiles. ~30 minutes of activity at
+# steady demo traffic; bounded so memory stays trivial.
+LATENCY_WINDOW_SIZE = 256
+# Result kinds we expect from rag_narrative. Anything else gets coerced to
+# "unknown" in the counter so a bug doesn't silently grow a new key.
+_KNOWN_KINDS = {"narrative", "low_confidence", "unavailable"}
+# Backend error codes we surface in counters; HTTP-layer codes from
+# api.py /narrative endpoint.
+_KNOWN_ERROR_CODES = {
+    "narrative-disabled",
+    "invalid-token",
+    "malformed-token",
+    "token-expired",
+    "stale-token",
+    "not-in-context",
+    "unsupported-mode",
+    "malformed-context",
+    "narrative-error",
+}
+_logger = logging.getLogger("piedpiper.narrative")
+_logger.setLevel(logging.INFO)
+# Don't add handlers here — the FastAPI app's uvicorn config already streams
+# stdlib logging to stdout, which is the HF Space's log-collection surface.
+_lock = threading.Lock()
def _new_latency_window() -> deque[float]:
    """Create an empty latency buffer capped at LATENCY_WINDOW_SIZE entries."""
    return deque(maxlen=LATENCY_WINDOW_SIZE)


@dataclass
class _State:
    """Mutable in-process counters for the /narrative layer."""

    # Every recorded call, regardless of outcome.
    total_calls: int = 0
    # Per-key tallies; keys are created on first use.
    by_mode: dict[str, int] = field(default_factory=dict)
    by_kind: dict[str, int] = field(default_factory=dict)
    by_error: dict[str, int] = field(default_factory=dict)
    # Calls flagged as having reached OpenAI / as short-circuited by the gate.
    openai_calls: int = 0
    gate_short_circuits: int = 0
    # Running cost estimate in cents.
    cost_cents: float = 0.0
    # Most recent call durations in milliseconds (oldest dropped when full).
    latencies_ms: deque[float] = field(default_factory=_new_latency_window)
    # Wall-clock time at which this state object was created.
    started_at: float = field(default_factory=time.time)


# Single module-wide instance mutated by the recording functions.
_state = _State()
def _bucket(value: str | None, known: set[str]) -> str | None:
    """Map *value* to itself when known, "_other" when unrecognised, None when absent."""
    if value is None:
        return None
    return value if value in known else "_other"


def _bump(counter: dict[str, int], key: str) -> None:
    """Increment ``counter[key]``, creating it at zero on first use."""
    counter[key] = counter.get(key, 0) + 1


def record_call(
    *,
    mode: str,
    latency_ms: float,
    result_kind: str | None = None,
    error_code: str | None = None,
    openai_called: bool = False,
    gate_short_circuit: bool = False,
    prompt_chars: int = 0,
    completion_chars: int = 0,
    trackId: str | None = None,
    cache_key: str | None = None,
) -> None:
    """Record one /narrative call in the counters, the log, and Sentry tags.

    Normally one of ``result_kind`` / ``error_code`` is set. When both are
    None the call still counts towards ``total_calls``, ``by_mode`` and the
    latency window, but no ``by_kind`` / ``by_error`` bucket is incremented.
    Values outside ``_KNOWN_KINDS`` / ``_KNOWN_ERROR_CODES`` are tallied
    under the ``"_other"`` sentinel instead of creating arbitrary keys.

    The cost estimate adds ``prompt_chars`` and ``completion_chars`` weighted
    by the per-character pricing constants; passing 0 for both leaves the
    running cost unchanged.
    """
    mode_key = mode or "unknown"
    kind_key = _bucket(result_kind, _KNOWN_KINDS)
    err_key = _bucket(error_code, _KNOWN_ERROR_CODES)

    # All shared-state mutation happens under the module lock.
    with _lock:
        _state.total_calls += 1
        _bump(_state.by_mode, mode_key)
        if kind_key:
            _bump(_state.by_kind, kind_key)
        if err_key:
            _bump(_state.by_error, err_key)
        if openai_called:
            _state.openai_calls += 1
        if gate_short_circuit:
            _state.gate_short_circuits += 1
        _state.cost_cents += (
            prompt_chars * COST_CENTS_PER_INPUT_CHAR
            + completion_chars * COST_CENTS_PER_OUTPUT_CHAR
        )
        _state.latencies_ms.append(float(latency_ms))

    # One grep-able line per call. Prompt and response bodies are never
    # logged; only a 16-character prefix of the cache key is emitted because
    # the full key is high-cardinality.
    _logger.info(
        "narrative.call mode=%s kind=%s error=%s latency_ms=%.1f openai_called=%s gate_short_circuit=%s prompt_chars=%d completion_chars=%d trackId=%s cache_key=%s",
        mode_key,
        kind_key or "-",
        err_key or "-",
        float(latency_ms),
        openai_called,
        gate_short_circuit,
        prompt_chars,
        completion_chars,
        trackId or "-",
        (cache_key or "-")[:16],
    )

    # Tag the Sentry scope last; the helper decides whether Sentry is active.
    _set_sentry_tags(mode=mode_key, kind=kind_key, error=err_key)
+def _set_sentry_tags(*, mode: str, kind: str | None, error: str | None) -> None:
+    """Tag the current Sentry scope (no-op when Sentry isn't active).
+    Tagging here means /narrative-layer Sentry events are filterable by
+    mode + result_kind + error in the existing dashboard without a new
+    integration.
+    """
+    if not os.getenv("SENTRY_DSN", "").strip():
+        return
+    try:
+        import sentry_sdk
+        sentry_sdk.set_tag("narrative.mode", mode)
+        if kind:
+            sentry_sdk.set_tag("narrative.kind", kind)
+        if error:
+            sentry_sdk.set_tag("narrative.error", error)
+    except Exception:
+        # If Sentry tagging fails for any reason, don't let it break the
+        # request flow. Telemetry is observability, not control flow.
+        pass
def snapshot() -> dict:
    """Build a JSON-serializable view of the counters and latency percentiles.

    The returned dict carries every counter, the p50/p95/p99 latency over the
    sliding window (None while the window is empty) together with the sample
    count and window size, the cost estimate rounded to 4 decimals, and the
    seconds elapsed since the state was created.
    """
    with _lock:
        ordered = sorted(_state.latencies_ms)
        count = len(ordered)
        latency_view: dict = {
            label: (_percentile(ordered, fraction) if count else None)
            for label, fraction in (("p50", 0.50), ("p95", 0.95), ("p99", 0.99))
        }
        latency_view["sample_n"] = count
        latency_view["window_size"] = LATENCY_WINDOW_SIZE
        return {
            "total_calls": _state.total_calls,
            "by_mode": dict(_state.by_mode),
            "by_kind": dict(_state.by_kind),
            "by_error": dict(_state.by_error),
            "openai_calls": _state.openai_calls,
            "gate_short_circuits": _state.gate_short_circuits,
            "cost_cents_estimate": round(_state.cost_cents, 4),
            "latency_ms": latency_view,
            "uptime_sec": round(time.time() - _state.started_at, 1),
        }
def _percentile(sorted_samples: list[float], p: float) -> float:
    """Linear-interpolation percentile over a pre-sorted list.

    Args:
        sorted_samples: Samples in ascending order (the caller sorts).
        p: Percentile as a fraction in [0, 1], e.g. 0.95 for p95.

    Returns:
        The interpolated value rounded to 1 decimal ms — sub-ms precision
        adds noise without signal at this aggregation level. 0.0 for an
        empty list.

    Raises:
        ValueError: If `p` is outside [0, 1] (including NaN).
    """
    # Without this check a negative p silently indexes from the end of the
    # list and p > 1 surfaces as a bare IndexError.
    if not 0.0 <= p <= 1.0:
        raise ValueError(f"percentile fraction must be within [0, 1], got {p!r}")
    if not sorted_samples:
        return 0.0
    if len(sorted_samples) == 1:
        return round(sorted_samples[0], 1)
    last = len(sorted_samples) - 1
    rank = p * last
    lo = int(rank)
    hi = min(lo + 1, last)
    frac = rank - lo
    return round(sorted_samples[lo] + frac * (sorted_samples[hi] - sorted_samples[lo]), 1)
def reset() -> None:
    """Replace the module state with a fresh, zeroed `_State`.

    Exists for test isolation only. Production code should never call it:
    the counters are in-process and would not survive a restart anyway.
    """
    global _state
    _lock.acquire()
    try:
        _state = _State()
    finally:
        _lock.release()
def measure_call(mode: str):
    """Context manager: time a call and ensure record_call gets invoked
    exactly once with the measured latency.

    Usage in api.py:
        with measure_call("whySimilar") as ctx:
            ... do work ...
            ctx.set(result_kind="narrative", openai_called=True, prompt_chars=N, completion_chars=M)
    """
    return _CallTimer(mode)


class _CallTimer:
    """Times a `with` block and forwards the result to `record_call` on exit."""

    def __init__(self, mode: str):
        self.mode = mode
        # perf_counter() reading taken on __enter__; only meaningful as a
        # difference against a later reading, not as a wall-clock timestamp.
        self.start_ts: float = 0.0
        self._fields: dict = {}

    def __enter__(self):
        # Monotonic clock: wall-clock time.time() can jump (NTP, DST, manual
        # changes) and would then yield negative or inflated latencies.
        self.start_ts = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        latency_ms = (time.perf_counter() - self.start_ts) * 1000.0
        # If the context block raised, mark as a narrative-error if no other
        # error was set.
        if exc_type is not None and "error_code" not in self._fields:
            self._fields["error_code"] = "narrative-error"
        record_call(mode=self.mode, latency_ms=latency_ms, **self._fields)
        return False  # don't suppress exceptions

    def set(self, **kwargs) -> None:
        """Stash fields for record_call. Last write wins per key."""
        self._fields.update(kwargs)
+__all__: Iterable[str] = (
+    "record_call",
+    "snapshot",
+    "reset",
+    "measure_call",
+    "LATENCY_WINDOW_SIZE",
+    "COST_CENTS_PER_INPUT_CHAR",
+    "COST_CENTS_PER_OUTPUT_CHAR",
+)

backend/rag_narrative.py ADDED Viewed

	@@ -0,0 +1,384 @@

+"""Metadata-grounded narrative generation for PiedPiper match explanations.
+This module is intentionally independent from FastAPI. `api.py` supplies a
+trusted `NarrativeContext`; this module gates context quality, builds a
+bounded prompt, calls OpenAI through one adapter, validates structured
+citations, and returns typed Pydantic results for the frontend.
+"""
+from __future__ import annotations
+import hashlib
+import json
+import logging
+import time
+from typing import Any, Literal
+from pydantic import BaseModel, ValidationError
+NarrativeMode = Literal["whySimilar", "creatorAdvice"]
+CriterionId = Literal["tempo", "key", "harmonic", "timbre"]
+RESPONSE_SCHEMA_VERSION = "v1"
+CRITERIA_ALGORITHM_VERSION = "adr-0004-v1"
+MAX_PROMPT_CHARS = 8000
+MAX_COMPLETION_TOKENS = 400
+logger = logging.getLogger(__name__)
+class CriterionContext(BaseModel):
+    id: CriterionId
+    queryValue: float | str | dict
+    matchValue: float | str | dict
+    agreement: float
+    label: str
+class NarrativeContext(BaseModel):
+    queryFingerprint: str
+    trackId: str
+    title: str
+    artist: str | None
+    queryWindow: tuple[float, float]
+    matchWindow: tuple[float, float]
+    rawCosine: float
+    criteria: list[CriterionContext]
+    acrcloudCoverSongId: dict | None
+class StructuredCitation(BaseModel):
+    trackId: str
+    side: Literal["query", "match"]
+    timestampRange: tuple[float, float]
+    criterionIds: list[CriterionId]
+    citedValues: dict[str, float | str]
+class NarrativeResponse(BaseModel):
+    kind: Literal["narrative"] = "narrative"
+    mode: NarrativeMode
+    prose: str
+    citations: list[StructuredCitation]
+class LowConfidence(BaseModel):
+    kind: Literal["low_confidence"] = "low_confidence"
+    reason: str
+class NarrativeUnavailable(BaseModel):
+    kind: Literal["unavailable"] = "unavailable"
+    reason: str
+NarrativeResult = NarrativeResponse | LowConfidence | NarrativeUnavailable
+SYSTEM_PROMPTS: dict[NarrativeMode, str] = {
+    "whySimilar": (
+        "You are PiedPiper, an expert assistant explaining acoustic similarity "
+        "between music tracks. You receive structured metadata about two audio "
+        "segments. You do not hear the audio. You do not determine copyright "
+        "infringement, ownership, or legal status. Cite only tracks, criteria, "
+        "and values present in the supplied context. Output a single JSON object "
+        "matching the schema. No additional text, no markdown."
+    ),
+    "creatorAdvice": (
+        "You are PiedPiper, an expert assistant helping creators make a music "
+        "generation more distinctive from a retrieved catalog match. You receive "
+        "structured metadata about two audio segments. You do not hear the audio. "
+        "You do not determine copyright infringement, ownership, or legal status. "
+        "Cite only tracks, criteria, and values present in the supplied context. "
+        "Output a single JSON object matching the schema. No additional text, no markdown."
+    ),
+}
+USER_PROMPT_TEMPLATE = """Mode: {mode}
+Return JSON with exactly this shape:
+{{
+  "kind": "narrative",
+  "mode": "{mode}",
+  "prose": "80-140 words for whySimilar, or 60-120 words for creatorAdvice",
+  "citations": [
+    {{
+      "trackId": "{track_id}",
+      "side": "query|match",
+      "timestampRange": [start_seconds, end_seconds],
+      "criterionIds": ["tempo|key|harmonic|timbre"],
+      "citedValues": {{
+        "<criterionId>.queryValue": "exact supplied value when cited",
+        "<criterionId>.matchValue": "exact supplied value when cited",
+        "rawCosine": 0.0
+      }}
+    }}
+  ]
+}}
+Use the supplied context only. For creatorAdvice, write three concrete suggestion-style clauses in prose, each tied to a cited criterion.
+Context:
+{context_json}
+"""
def cache_key(
    context: NarrativeContext,
    mode: NarrativeMode,
    *,
    model_sha: str,
    catalog_sha: str,
    model_id: str,
) -> str:
    """Derive a deterministic SHA-256 cache key for a narrative request.

    The key covers the model/catalog identifiers, the prompt-template hash,
    the schema and criteria-algorithm versions, the query fingerprint, the
    track id, the mode, the criteria (sorted by id, numbers rounded to 3
    decimals) and the raw cosine rounded to 3 decimals.
    """
    template_hash = _prompt_template_hash(mode)
    fingerprint = context.queryFingerprint
    track_id = context.trackId
    ordered_criteria = sorted(context.criteria, key=lambda criterion: criterion.id)
    rounded_criteria = [_criterion_for_cache(criterion) for criterion in ordered_criteria]
    rounded_cosine = round(float(context.rawCosine), 3)
    return _sha256_json(
        dict(
            model_id=model_id,
            model_sha=model_sha,
            catalog_sha=catalog_sha,
            prompt_template_hash=template_hash,
            response_schema_version=RESPONSE_SCHEMA_VERSION,
            criteria_algorithm_version=CRITERIA_ALGORITHM_VERSION,
            query_fingerprint=fingerprint,
            track_id=track_id,
            mode=mode,
            criteria_rounded=rounded_criteria,
            raw_cosine=rounded_cosine,
        )
    )
def generate_narrative(
    context: NarrativeContext,
    mode: NarrativeMode,
    *,
    model_sha: str,
    catalog_sha: str,
    model_id: str = "gpt-4o-mini",
    openai_client=None,
) -> NarrativeResult:
    """Gate the context, call the LLM once and validate what comes back.

    Returns:
        `LowConfidence` when the context gate rejects the input or the prompt
        exceeds MAX_PROMPT_CHARS (the LLM is not called in either case);
        `NarrativeUnavailable` when the LLM call fails, its output does not
        validate, its mode differs from the requested one, or its citations
        are not grounded in the context; otherwise the validated
        `NarrativeResponse`.

    Every exit path logs one line with the cache key, mode, gate result,
    latency and success flag.
    """
    started = time.perf_counter()
    key = cache_key(context, mode, model_sha=model_sha, catalog_sha=catalog_sha, model_id=model_id)

    def _emit(outcome: NarrativeResult, gate_result: str, success: bool) -> NarrativeResult:
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.info(
            "rag_narrative cache_key=%s mode=%s gate_result=%s latency_ms=%.1f success=%s",
            key,
            mode,
            gate_result,
            elapsed_ms,
            success,
        )
        return outcome

    def _unavailable(reason: str) -> NarrativeResult:
        # Every post-call failure shares gate_result="called" and success=False.
        return _emit(NarrativeUnavailable(reason=reason), "called", False)

    gate_reason = _context_gate_reason(context)
    if gate_reason is not None:
        return _emit(LowConfidence(reason=gate_reason), gate_reason, False)

    system_prompt = SYSTEM_PROMPTS[mode]
    user_prompt = _build_user_prompt(context, mode)
    if len(system_prompt) + len(user_prompt) > MAX_PROMPT_CHARS:
        return _emit(LowConfidence(reason="context-cap-exceeded"), "context-cap-exceeded", False)

    payload = _call_openai_json(
        openai_client,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        max_tokens=MAX_COMPLETION_TOKENS,
        model_id=model_id,
    )
    if payload is None:
        return _unavailable("openai-error")

    try:
        narrative = NarrativeResponse.model_validate(payload)
    except ValidationError:
        return _unavailable("malformed-llm-output")

    if narrative.mode != mode:
        return _unavailable("schema-mismatch")
    if not _citations_are_grounded(narrative.citations, context):
        return _unavailable("citation-hallucinated")
    return _emit(narrative, "called", True)
+def _call_openai_json(
+    client,
+    *,
+    system_prompt: str,
+    user_prompt: str,
+    max_tokens: int,
+    model_id: str,
+) -> dict | None:
+    """Call OpenAI once and return parsed JSON, or None on SDK/parse failure."""
+    try:
+        if client is None:
+            from openai import OpenAI
+            client = OpenAI()
+        response = client.chat.completions.create(
+            model=model_id,
+            messages=[
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt},
+            ],
+            response_format={"type": "json_object"},
+            max_tokens=max_tokens,
+            temperature=0,
+        )
+        content = response.choices[0].message.content
+        parsed = json.loads(content)
+        return parsed if isinstance(parsed, dict) else None
+    except Exception:
+        logger.exception("rag_narrative OpenAI JSON call failed")
+        return None
+def _context_gate_reason(context: NarrativeContext) -> str | None:
+    if not context.criteria:
+        return "missing-criteria"
+    if not context.title or not context.title.strip():
+        return "missing-metadata"
+    if not _window_is_valid(context.queryWindow) or not _window_is_valid(context.matchWindow):
+        return "missing-metadata"
+    if not any(float(c.agreement) >= 0.55 for c in context.criteria) and float(context.rawCosine) < 0.75:
+        return "weak-evidence"
+    return None
+def _window_is_valid(window: tuple[float, float]) -> bool:
+    start, end = float(window[0]), float(window[1])
+    return start >= 0 and end > start
def _build_user_prompt(context: NarrativeContext, mode: NarrativeMode) -> str:
    """Render USER_PROMPT_TEMPLATE for this context and mode.

    The context is embedded as compact JSON with sorted keys; criteria are
    ordered by id and the raw cosine is rounded to 3 decimals.
    """
    ordered_criteria = sorted(context.criteria, key=lambda criterion: criterion.id)
    prompt_context = dict(
        queryFingerprint=context.queryFingerprint,
        trackId=context.trackId,
        title=context.title,
        artist=context.artist,
        queryWindow=list(context.queryWindow),
        matchWindow=list(context.matchWindow),
        rawCosine=round(float(context.rawCosine), 3),
        criteria=[_criterion_for_prompt(criterion) for criterion in ordered_criteria],
        acrcloudCoverSongId=context.acrcloudCoverSongId,
    )
    serialized = json.dumps(prompt_context, sort_keys=True, separators=(",", ":"))
    return USER_PROMPT_TEMPLATE.format(
        mode=mode,
        track_id=context.trackId,
        context_json=serialized,
    )
+def _criterion_for_prompt(criterion: CriterionContext) -> dict[str, Any]:
+    return {
+        "id": criterion.id,
+        "queryValue": criterion.queryValue,
+        "matchValue": criterion.matchValue,
+        "agreement": round(float(criterion.agreement), 3),
+        "label": criterion.label,
+    }
def _criterion_for_cache(criterion: CriterionContext) -> dict[str, Any]:
    """Return the prompt form of a criterion with every number rounded to 3 decimals."""
    prompt_form = _criterion_for_prompt(criterion)
    return _round_numbers(prompt_form)
def _round_numbers(value: Any) -> Any:
    """Recursively round every number in a nested structure to 3 decimals.

    Booleans pass through untouched; ints and floats become rounded floats;
    lists and tuples become lists; dicts get string keys and are rebuilt in
    key-sorted order; anything else is returned as-is.
    """
    # bool is a subclass of int, so it has to be ruled out first.
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return round(float(value), 3)
    if isinstance(value, (list, tuple)):
        return [_round_numbers(item) for item in value]
    if isinstance(value, dict):
        ordered_items = sorted(value.items(), key=lambda pair: str(pair[0]))
        return {str(name): _round_numbers(item) for name, item in ordered_items}
    return value
+def _citations_are_grounded(citations: list[StructuredCitation], context: NarrativeContext) -> bool:
+    criteria = {c.id: c for c in context.criteria}
+    if not citations:
+        return False
+    for citation in citations:
+        if citation.trackId != context.trackId:
+            return False
+        if not all(criterion_id in criteria for criterion_id in citation.criterionIds):
+            return False
+        if not _timestamp_is_grounded(citation, context):
+            return False
+        for key, cited_value in citation.citedValues.items():
+            if key == "rawCosine":
+                if not _numeric_close(cited_value, context.rawCosine, tolerance=0.01):
+                    return False
+                continue
+            if "." not in key:
+                return False
+            criterion_id, side = key.split(".", 1)
+            if criterion_id not in criteria or side not in {"queryValue", "matchValue"}:
+                return False
+            criterion = criteria[criterion_id]
+            expected = getattr(criterion, side)
+            if criterion_id == "tempo":
+                if not _numeric_close(cited_value, expected, tolerance=2.0):
+                    return False
+            elif criterion_id == "key":
+                if str(cited_value) != str(expected):
+                    return False
+            elif criterion_id in {"harmonic", "timbre"}:
+                if not isinstance(expected, dict):
+                    return False
+            else:
+                return False
+    return True
+def _timestamp_is_grounded(citation: StructuredCitation, context: NarrativeContext) -> bool:
+    start, end = citation.timestampRange
+    if end <= start:
+        return False
+    window = context.queryWindow if citation.side == "query" else context.matchWindow
+    return start >= window[0] - 0.5 and end <= window[1] + 0.5
def _numeric_close(actual: Any, expected: Any, *, tolerance: float) -> bool:
    """Return True when both values convert to float and differ by at most `tolerance`.

    Values that cannot be converted make the result False.
    """
    try:
        gap = abs(float(actual) - float(expected))
        return gap <= tolerance
    except (TypeError, ValueError):
        return False
def _prompt_template_hash(mode: NarrativeMode) -> str:
    """Return the SHA-256 hex digest of the mode's system prompt plus the user template."""
    template_text = "\n".join((SYSTEM_PROMPTS[mode], USER_PROMPT_TEMPLATE))
    digest = hashlib.sha256()
    digest.update(template_text.encode("utf-8"))
    return digest.hexdigest()
def _sha256_json(payload: dict[str, Any]) -> str:
    """Return the SHA-256 hex digest of the payload's canonical JSON form.

    Canonical means sorted keys and no whitespace between tokens, so dicts
    that are equal hash the same regardless of insertion order.
    """
    canonical = json.dumps(payload, sort_keys=True, separators=(",", ":"))
    digest = hashlib.sha256()
    digest.update(canonical.encode("utf-8"))
    return digest.hexdigest()

backend/scripts/enrich_jamendo.py ADDED Viewed

	@@ -0,0 +1,140 @@

+"""Patch corpus.json's Jamendo tracks with real metadata from Jamendo's public API.
+The MTG-Jamendo research dataset (the ingest source for Tier-2) anonymizes
+artist names per academic distribution convention — entries land in corpus.json
+with placeholders like:
+    title:  "Jamendo 382"
+    artist: "artist_000020"
+    artwork_url: null
+    external_ids: {"jamendoTrackId": "382"}
+Jamendo's own public Catalog API (`api.jamendo.com/v3.0/tracks/`) returns the
+real track name, real artist name, an MP3 stream URL, and an album cover URL
+keyed by the same numeric track ID. This script reconciles them.
+Frontend impact: the `audioUrlFor()` helper already reads
+`external_ids.jamendoAudioUrl` and `artworkUrlFor()` reads `artwork_url`, so
+the React rows light up automatically once corpus.json is patched.
+Usage:
+    JAMENDO_CLIENT_ID=<your-client-id> \\
+        python -m backend.scripts.enrich_jamendo
+Options:
+    --corpus <path>       Path to corpus.json (default: quality-scorer/public/corpus/corpus.json)
+    --dry-run             Print what would change without writing
+    --sleep <seconds>     Per-request sleep (default 0.1)
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+import httpx
+REPO_ROOT = Path(__file__).resolve().parents[3]
+DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
+API_BASE = "https://api.jamendo.com/v3.0/tracks/"
def _lookup_jamendo_track(client, client_id: str, jam_id) -> tuple[dict | None, str | None]:
    """Fetch one track record from the Jamendo tracks API.

    Returns:
        `(record, None)` on success, or `(None, reason)` when the request
        fails, the body is not the expected JSON object, the API reports a
        non-success status, or the track is not in the catalog.
    """
    try:
        r = client.get(API_BASE, params={
            "client_id": client_id,
            "id": str(jam_id),
            "format": "json",
        })
        r.raise_for_status()
        data = r.json()
    except (httpx.HTTPError, json.JSONDecodeError) as exc:
        return None, str(exc)
    # Guard the shape before calling .get(): one odd response should be
    # recorded as a per-track failure, not abort the whole run.
    if not isinstance(data, dict):
        return None, "unexpected response shape"
    headers = data.get("headers") or {}
    if headers.get("status") != "success":
        return None, headers.get("error_message", "unknown")
    results = data.get("results") or []
    if not results:
        return None, "not in Jamendo catalog"
    return results[0], None


def main() -> int:
    """Patch the Jamendo entries of corpus.json with data from the Jamendo API.

    For every track whose `source` is "jamendo", looks the track up by
    `external_ids.jamendoTrackId` and updates title, artist, artwork URL,
    track view URL and the Jamendo audio/album external ids. With --dry-run
    the changes are only printed.

    Returns:
        2 when JAMENDO_CLIENT_ID is not set, otherwise 0. Per-track failures
        are reported but do not change the exit status.
    """
    args = _parse_args()
    client_id = os.environ.get("JAMENDO_CLIENT_ID")
    if not client_id:
        print("[enrich_jamendo] ERROR: JAMENDO_CLIENT_ID env var required", file=sys.stderr)
        return 2

    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere,
    # and the corpus may hold non-ASCII titles and artist names.
    raw = json.loads(args.corpus.read_text(encoding="utf-8"))
    tracks = raw if isinstance(raw, list) else raw.get("tracks", [])
    jamendo_tracks = [t for t in tracks if t.get("source") == "jamendo"]
    print(f"[enrich_jamendo] {len(jamendo_tracks)} Jamendo tracks to enrich")

    patched = 0
    failed: list[str] = []
    with httpx.Client(timeout=20.0) as client:
        for i, t in enumerate(jamendo_tracks, start=1):
            jam_id = (t.get("external_ids") or {}).get("jamendoTrackId")
            if not jam_id:
                failed.append(f"{t.get('track_id')}: missing jamendoTrackId")
                continue

            jam, reason = _lookup_jamendo_track(client, client_id, jam_id)
            if jam is None:
                failed.append(f"jamendo:{jam_id}: {reason}")
                time.sleep(args.sleep)
                continue

            real_title = jam.get("name") or t.get("title")
            real_artist = jam.get("artist_name") or t.get("artist")
            audio_url = jam.get("audio") or None
            # Keep any artwork already in the corpus when Jamendo has none,
            # mirroring the title/artist fallbacks above.
            image_url = jam.get("image") or t.get("artwork_url")
            track_view = f"https://www.jamendo.com/track/{jam_id}"

            if args.dry_run:
                print(f"  [{i:03d}/{len(jamendo_tracks)}] {t.get('track_id')}: {t.get('title')!r} -> {real_title!r} by {real_artist!r}")
            else:
                t["title"] = real_title
                t["artist"] = real_artist
                t["artwork_url"] = image_url
                t["track_view_url"] = track_view
                ext = dict(t.get("external_ids") or {})
                if audio_url:
                    ext["jamendoAudioUrl"] = audio_url
                if jam.get("album_name"):
                    ext["jamendoAlbum"] = jam.get("album_name")
                t["external_ids"] = ext
            # Counted in dry-run too, so the summary shows what would change.
            patched += 1
            if i % 20 == 0:
                print(f"  [{i}/{len(jamendo_tracks)}] enriched so far: {patched}, failed: {len(failed)}")
            time.sleep(args.sleep)

    print(f"[enrich_jamendo] DONE: patched={patched} failed={len(failed)}")
    if failed:
        print("[enrich_jamendo] failures (first 10):")
        for f in failed[:10]:
            print(f"  - {f}")
    if not args.dry_run and patched > 0:
        args.corpus.write_text(json.dumps(raw, indent=2), encoding="utf-8")
        print(f"[enrich_jamendo] wrote {args.corpus}")
    elif args.dry_run:
        print("[enrich_jamendo] dry-run — no file written")
    return 0
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options (--corpus, --dry-run, --sleep)."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains indented usage examples; the default
        # formatter would re-wrap them into one paragraph in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--corpus", type=Path, default=DEFAULT_CORPUS,
                        help="Path to corpus.json")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print what would change without writing")
    parser.add_argument("--sleep", type=float, default=0.1,
                        help="Per-request sleep in seconds")
    return parser.parse_args()
+if __name__ == "__main__":
+    sys.exit(main())

backend/scripts/enrich_mir_features.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""Backfill the four ADR-0004 MIR features (tempo, key, chroma, MFCC) into
+corpus.json without re-encoding the catalog through MuQ-MuLan.
+Pattern matches `enrich_jamendo.py`: read corpus.json, for each entry that
+lacks `mir_features`, download its audio, run `mir_features.compute()`,
+write the result back. Idempotent — re-runs only touch un-enriched tracks.
+Usage:
+    python -m backend.scripts.enrich_mir_features
+The audio source depends on the tier:
+    - tier1 (iTunes):  external_ids.previewUrl — 30s AAC-LC preview
+    - tier2 (Jamendo): external_ids.jamendoAudioUrl — MP3 stream (set by
+                      enrich_jamendo earlier; falls back to source_url if missing)
+Cost: ~3 s per track on CPU (download + librosa decode + features). For the
+current 155-track catalog: ~8 min wall-clock.
+"""
+from __future__ import annotations
+import argparse
+import io
+import json
+import sys
+import tempfile
+import time
+import urllib.request
+from pathlib import Path
+import librosa
+from backend.mir_features import compute as compute_mir
+REPO_ROOT = Path(__file__).resolve().parents[3]
+DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
+APPLE_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 Safari/605.1.15"
def main() -> int:
    """Compute and store `mir_features` for corpus tracks that lack them.

    Each selected track's audio is downloaded, decoded and run through
    `compute_mir`; the result is stored under the track's `mir_features`
    key. Tracks that already have features are skipped unless --force is
    given. A failure on one track is recorded and the run continues.

    Returns:
        0 when at least one track was patched or there was nothing to do,
        1 when there was work to do but every track failed.
    """
    args = _parse_args()
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere.
    raw = json.loads(args.corpus.read_text(encoding="utf-8"))
    tracks = raw if isinstance(raw, list) else raw.get("tracks", [])
    total = len(tracks)
    print(f"[enrich_mir] {total} catalog tracks")

    todo = [t for t in tracks if args.force or not t.get("mir_features")]
    print(f"[enrich_mir] {len(todo)} to enrich (already-enriched skipped; pass --force to redo)")

    patched = 0
    failed: list[str] = []
    for i, t in enumerate(todo, start=1):
        track_id = t.get("track_id", "?")
        try:
            url = _audio_url_for(t)
            if not url:
                failed.append(f"{track_id}: no audio URL in corpus entry")
                continue
            audio_bytes = _download(url, args.timeout)
            wav, sr = _decode(audio_bytes)
            features = compute_mir(wav, sr)
            t["mir_features"] = features.to_dict()
            patched += 1
            if i % 10 == 0 or i == len(todo):
                print(f"  [{i:>3}/{len(todo)}] enriched={patched} failed={len(failed)}")
        except Exception as exc:
            # Per-track boundary: record the failure and move on so one bad
            # download or decode does not abort the whole backfill.
            failed.append(f"{track_id}: {exc!r}")
        time.sleep(args.sleep)

    # Only rewrite the corpus when something changed (same rule as
    # enrich_jamendo); a no-op run should leave the file untouched.
    if patched > 0:
        args.corpus.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"[enrich_mir] DONE: patched={patched} failed={len(failed)}")
    if failed:
        print("[enrich_mir] first 10 failures:")
        for f in failed[:10]:
            print(f"  - {f}")
    return 0 if patched > 0 or not todo else 1
+def _audio_url_for(track: dict) -> str | None:
+    """Pick the best audio source URL for a track per its tier."""
+    ext = track.get("external_ids") or {}
+    return (
+        ext.get("previewUrl")            # iTunes Tier-1
+        or ext.get("jamendoAudioUrl")    # Jamendo enriched
+        or ext.get("jamendoStreamUrl")
+        or track.get("source_url")       # last resort
+    )
def _download(url: str, timeout: float) -> bytes:
    """Fetch `url` with the APPLE_UA User-Agent header and return the body bytes."""
    request = urllib.request.Request(url, headers={"User-Agent": APPLE_UA})
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        body = response.read()
    return body
def _decode(audio_bytes: bytes):
    """Decode audio bytes to a mono 22050 Hz waveform.

    Tries an in-memory decode first; if that raises or yields an empty
    array, the bytes are written to a temporary `.m4a` file and decoded
    from disk instead.

    Returns:
        The `(waveform, sample_rate)` pair produced by `librosa.load`.
    """
    # Try BytesIO first (works for mp3/wav/flac/ogg).
    try:
        wav, sr = librosa.load(io.BytesIO(audio_bytes), sr=22050, mono=True)
        if wav.size > 0:
            return wav, sr
    except Exception:
        # Deliberate best-effort: any in-memory failure falls through to the
        # temp-file path below, which raises if the audio is truly unreadable.
        pass
    # A NamedTemporaryFile that is still open cannot be re-opened by name on
    # Windows, so write a closed file inside a temporary directory instead.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "audio.m4a"
        tmp_path.write_bytes(audio_bytes)
        wav, sr = librosa.load(str(tmp_path), sr=22050, mono=True)
    return wav, sr
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options (--corpus, --force, --sleep, --timeout)."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        # The module docstring contains indented usage and tier tables; the
        # default formatter would re-wrap them into one paragraph in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--corpus", type=Path, default=DEFAULT_CORPUS,
                        help="Path to corpus.json")
    parser.add_argument("--force", action="store_true",
                        help="Re-enrich even tracks that already have mir_features")
    parser.add_argument("--sleep", type=float, default=0.2,
                        help="Per-track sleep to be polite to source CDNs")
    parser.add_argument("--timeout", type=float, default=30.0,
                        help="Per-request download timeout")
    return parser.parse_args()
+if __name__ == "__main__":
+    sys.exit(main())

backend/scripts/run_rag_eval.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""RAG eval harness for the /narrative explanatory layer.
+Reads `backend/tests/fixtures/narrative_golden_set.json`, runs each case
+through `rag_narrative.generate_narrative()` with `_call_openai_json` mocked
+to return the case's prepared LLM response, and scores aggregate metrics.
+Metrics:
+  - kind_agreement_rate:      result.kind == expected.kind
+  - reason_agreement_rate:    (result.reason or null) == (expected.reason or null)
+  - citation_groundedness_rate: among happy_path cases, fraction returning
+    `kind=narrative` AND all citations validate.
+  - gate_correctness_rate:    among low_context cases, fraction returning
+    `kind=low_confidence` with the right reason.
+  - hallucination_rejection_rate: among hallucinated_citation cases, fraction
+    returning `kind=unavailable, reason=citation-hallucinated`.
+Writes a summary JSON to `factory/artifacts/RAG_EVAL_RESULT.json` so the
+result is committable and reviewable in PRs that touch rag_narrative.py.
+Run:
+    python -m backend.scripts.run_rag_eval [--out PATH]
+Returns non-zero exit code if any baseline gate fails (kind_agreement < 1.0,
+gate_correctness < 1.0, hallucination_rejection < 1.0). Those gates are
+load-bearing — any regression in them is a real bug in the validation layer.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from collections import Counter
+from pathlib import Path
+from typing import Any
+from unittest.mock import patch
+REPO_ROOT = Path(__file__).resolve().parents[3]
+GOLDEN_SET_PATH = REPO_ROOT / "backend" / "tests" / "fixtures" / "narrative_golden_set.json"
+DEFAULT_OUT_PATH = REPO_ROOT / "factory" / "artifacts" / "RAG_EVAL_RESULT.json"
def _load_golden_set(path: Path) -> list[dict]:
    """Load the golden-set cases from a JSON file.

    Args:
        path: JSON file whose top-level object has a non-empty "cases" list.

    Returns:
        The value stored under "cases".

    Raises:
        ValueError: If the top-level JSON value is not an object, or the
            "cases" entry is missing or empty.
    """
    # Explicit UTF-8: the platform default encoding is not UTF-8 everywhere.
    data = json.loads(path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(f"golden set at {path} must be a JSON object with a 'cases' list")
    cases = data.get("cases", [])
    if not cases:
        raise ValueError(f"empty golden set at {path}")
    return cases
def _build_context(spec: dict):
    """Turn one golden-set context spec into a `rag_narrative.NarrativeContext`.

    `artist` and `acrcloudCoverSongId` are optional in the spec; every other
    key is required.
    """
    from backend import rag_narrative

    fields = {
        "queryFingerprint": spec["queryFingerprint"],
        "trackId": spec["trackId"],
        "title": spec["title"],
        "artist": spec.get("artist"),
        "queryWindow": tuple(spec["queryWindow"]),
        "matchWindow": tuple(spec["matchWindow"]),
        "rawCosine": float(spec["rawCosine"]),
        "criteria": [
            rag_narrative.CriterionContext(**entry) for entry in spec["criteria"]
        ],
        "acrcloudCoverSongId": spec.get("acrcloudCoverSongId"),
    }
    return rag_narrative.NarrativeContext(**fields)
def _run_case(case: dict) -> dict:
    """Execute one golden-set case with the LLM call patched out.

    Returns a report row holding the expected and actual kind/reason, whether
    the patched LLM helper was called, and the derived match flags.
    """
    from backend import rag_narrative

    canned_response = case["mocked_llm_response"]
    expected_kind = case["expected_kind"]
    expected_reason = case.get("expected_reason")
    must_not_call_llm = canned_response == "MUST_NOT_BE_CALLED"

    # The "MUST_NOT_BE_CALLED" sentinel exists only in the fixture JSON. The
    # patched helper returns None for such cases, so a wrongful call shows up
    # as openai-error and the case fails its kind check.
    helper_return = None if must_not_call_llm else canned_response

    context = _build_context(case["context"])
    with patch(
        "backend.rag_narrative._call_openai_json", return_value=helper_return
    ) as call_mock:
        result = rag_narrative.generate_narrative(
            context,
            case["mode"],
            model_sha="eval-model-sha",
            catalog_sha="eval-catalog-sha",
        )
    llm_was_called = call_mock.called

    def _read(field: str):
        # Attribute access first; fall back to dict access for dict results.
        from_dict = result.get(field) if isinstance(result, dict) else None
        return getattr(result, field, None) or from_dict

    actual_kind = _read("kind")
    actual_reason = _read("reason")

    row = {
        "name": case["name"],
        "category": case["category"],
        "mode": case["mode"],
        "expected_kind": expected_kind,
        "actual_kind": actual_kind,
        "expected_reason": expected_reason,
        "actual_reason": actual_reason,
        "llm_was_called": llm_was_called,
        "must_not_call_llm": must_not_call_llm,
    }
    row["kind_match"] = actual_kind == expected_kind
    row["reason_match"] = (actual_reason or None) == (expected_reason or None)
    row["gate_respected"] = (not must_not_call_llm) or (not llm_was_called)
    return row
+def _aggregate(rows: list[dict]) -> dict:
+    """Summarize per-case rows into overall, per-category and gate rates.
+    Every rate is a fraction rounded to 4 decimals. Overall rates are 0.0
+    for an empty input, and a baseline gate whose category has no rows is
+    reported as 0.0.
+    """
+    n = len(rows)
+    kind_hits = sum(bool(row["kind_match"]) for row in rows)
+    reason_hits = sum(bool(row["reason_match"]) for row in rows)
+    # One hit counter per tracked boolean column, keyed by category.
+    tracked = ("kind_match", "reason_match", "gate_respected")
+    per_cat_total: Counter[str] = Counter()
+    per_cat_hits: dict[str, Counter[str]] = {flag: Counter() for flag in tracked}
+    for row in rows:
+        category = row["category"]
+        per_cat_total[category] += 1
+        for flag in tracked:
+            if row[flag]:
+                per_cat_hits[flag][category] += 1
+    # Categories stay in first-seen order, which is also the print order.
+    by_cat = {
+        category: {
+            "total": count,
+            "kind_agreement_rate": round(per_cat_hits["kind_match"][category] / count, 4),
+            "reason_agreement_rate": round(per_cat_hits["reason_match"][category] / count, 4),
+            "gate_respected_rate": round(per_cat_hits["gate_respected"][category] / count, 4),
+        }
+        for category, count in per_cat_total.items()
+    }
+    def _gate(category: str, metric: str) -> float:
+        # Absent category -> 0.0, so the gate fails rather than passes.
+        return by_cat.get(category, {}).get(metric, 0.0)
+    return {
+        "n_cases": n,
+        "kind_agreement_rate": round(kind_hits / n, 4) if n else 0.0,
+        "reason_agreement_rate": round(reason_hits / n, 4) if n else 0.0,
+        "by_category": by_cat,
+        "baseline_gates": {
+            # Each gate is "must be 1.0 to pass" — any regression here is a
+            # real bug in validation / gating, not noise.
+            "happy_path_kind_agreement": _gate("happy_path", "kind_agreement_rate"),
+            "low_context_gate_correctness": _gate("low_context", "reason_agreement_rate"),
+            "hallucination_rejection": _gate("hallucinated_citation", "reason_agreement_rate"),
+            "malformed_rejection": _gate("malformed_output", "reason_agreement_rate"),
+            "openai_error_handling": _gate("openai_error", "reason_agreement_rate"),
+        },
+    }
+def run_eval(golden_set_path: Path = GOLDEN_SET_PATH) -> dict:
+    """Run every golden-set case and return {"summary": ..., "rows": ...}.
+    This is the public entry point: it reads the golden set from
+    *golden_set_path* and returns the results without writing anything
+    itself. The CLI wrapper persists the result; the pytest gate calls this
+    directly and asserts on it.
+    """
+    rows = []
+    for case in _load_golden_set(golden_set_path):
+        rows.append(_run_case(case))
+    return {"summary": _aggregate(rows), "rows": rows}
+def main() -> int:
+    """CLI wrapper: run the eval, write the JSON report, print a summary.
+    Returns 1 when any baseline gate scores below 1.0, otherwise 0.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("--out", type=Path, default=DEFAULT_OUT_PATH, help="Where to write the eval result JSON.")
+    cli.add_argument("--golden", type=Path, default=GOLDEN_SET_PATH, help="Override golden-set path (for experiments).")
+    opts = cli.parse_args()
+    outcome = run_eval(opts.golden)
+    # The report is written before the console summary is printed.
+    opts.out.parent.mkdir(parents=True, exist_ok=True)
+    opts.out.write_text(json.dumps(outcome, indent=2) + "\n")
+    report = outcome["summary"]
+    print(f"RAG eval — {report['n_cases']} cases")
+    print(f"  kind agreement   : {report['kind_agreement_rate']:.2%}")
+    print(f"  reason agreement : {report['reason_agreement_rate']:.2%}")
+    print()
+    print("  Baseline gates (each MUST be 1.0):")
+    gate_scores = report["baseline_gates"]
+    for gate_name, gate_score in gate_scores.items():
+        label = "FAIL" if not gate_score >= 1.0 else "OK "
+        print(f"    [{label}] {gate_name}: {gate_score:.2%}")
+    failed: list[str] = [name for name, value in gate_scores.items() if value < 1.0]
+    print()
+    print("  By category:")
+    for category, numbers in report["by_category"].items():
+        left = f"    {category:<26} n={numbers['total']:>2} kind={numbers['kind_agreement_rate']:.2%} "
+        right = f"reason={numbers['reason_agreement_rate']:.2%} gate_respected={numbers['gate_respected_rate']:.2%}"
+        print(left + right)
+    print()
+    print(f"Result written to {opts.out}")
+    if not failed:
+        return 0
+    print(f"FAILED gates: {', '.join(failed)}", file=sys.stderr)
+    return 1
+# Script entry point: main()'s return value becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main())

backend/scripts/verify_matching.py ADDED Viewed

	@@ -0,0 +1,230 @@

+"""Self-retrieval verification for the PiedPiper matching pipeline.
+For each iTunes Tier-1 track in the catalog: download the actual Apple
+preview audio that was used to build the catalog embedding, POST it back
+to /neighbors as a fresh query, and check that the same track is returned
+at rank 1 with high cosine similarity.
+This is the falsifiable answer to "are the matches accurate?" — if the
+self-retrieval rate is high, the encoder + retrieval pipeline are doing
+what they claim. If it isn't, there's a bug worth finding before we
+trust any cross-track match.
+Usage:
+    # Against the live HF Space (default):
+    python -m backend.scripts.verify_matching
+    # Against a local backend:
+    python -m backend.scripts.verify_matching --base-url http://localhost:8000
+    # Just one target:
+    python -m backend.scripts.verify_matching --target tier1:itunes:1488408568
+ADR-0002 §"Verification" documents the methodology this harness implements.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+import tempfile
+import time
+import urllib.request
+from pathlib import Path
+# Directory three levels above the one that contains this script.
+# NOTE(review): DEFAULT_CORPUS only resolves if that directory is the
+# checkout root — confirm against the repository layout.
+REPO_ROOT = Path(__file__).resolve().parents[3]
+# Catalog file read by _load_catalog() when --corpus is not given.
+DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
+# Backend that is queried when --base-url is not given.
+DEFAULT_BASE_URL = "https://rajata98-piedpiper.hf.space"
+# Safari User-Agent string sent with the preview download in _verify_one().
+APPLE_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
+def main() -> int:
+    """Run self-retrieval verification over the selected catalog targets.
+    Each target goes through _verify_one(); a failure is recorded as an
+    error row instead of aborting the run. Prints one result per target and
+    a final summary, and writes all rows to --json-out when given.
+    Returns 2 when the catalog yields no eligible target, otherwise 0.
+    """
+    args = _parse_args()
+    catalog = _load_catalog(args.corpus)
+    targets = _select_targets(catalog, args.target)
+    if not targets:
+        print("[verify_matching] no Tier-1 iTunes targets found in catalog; aborting", file=sys.stderr)
+        return 2
+    print(f"[verify_matching] running against {args.base_url}")
+    print(f"[verify_matching] {len(targets)} target(s) to verify")
+    print()
+    total = len(targets)
+    results: list[dict] = []
+    for i, target in enumerate(targets, start=1):
+        # .get(): _select_targets() does not guarantee these keys exist, and
+        # a missing display field must not abort the whole run.
+        track_id = target.get("track_id")
+        title = target.get("title")
+        artist = target.get("artist")
+        print(f"[{i}/{total}] {title} — {artist}")
+        try:
+            result = _verify_one(target, args.base_url, args.timeout)
+        except Exception as exc:
+            # Per-target boundary: report the failure and keep going.
+            print(f"  ERROR: {exc!r}")
+            # Same key set as a successful row so the JSON output has one
+            # schema.
+            result = {
+                "track_id": track_id,
+                "title": title,
+                "artist": artist,
+                "error": str(exc),
+                "self_rank": None,
+                "self_cosine": None,
+                "self_timestamp": None,
+                "top1_track_id": None,
+                "top1_cosine": None,
+                "top1_title": None,
+                "top1_artist": None,
+                # Legacy key from earlier error rows, kept for compatibility.
+                "timestamp": None,
+            }
+        results.append(result)
+        _print_one(result)
+        # The politeness delay is only needed between requests.
+        if i < total:
+            time.sleep(args.sleep)
+    print()
+    print("=" * 70)
+    print("SUMMARY")
+    print("=" * 70)
+    _print_summary(results)
+    if args.json_out:
+        Path(args.json_out).write_text(json.dumps(results, indent=2))
+        print(f"\n[verify_matching] full results written to {args.json_out}")
+    return 0
+def _verify_one(target: dict, base_url: str, timeout: float) -> dict:
+    """Download one target's preview audio and query /neighbors with it.
+    Returns a result row with the target's own rank, cosine and timestamp
+    among the returned neighbors (None / {} when it is not among them) and
+    the id, cosine, title and artist of the top-ranked neighbor.
+    Raises RuntimeError when the catalog entry has no previewUrl, when
+    httpx is not installed, or when the response carries no neighbors.
+    Download errors and non-2xx responses propagate from urllib / httpx.
+    """
+    preview_url = (target.get("external_ids") or {}).get("previewUrl")
+    if not preview_url:
+        raise RuntimeError("no previewUrl in catalog entry")
+    # Check the optional dependency before spending time on the download.
+    try:
+        import httpx
+    except ImportError as exc:
+        raise RuntimeError(
+            "httpx required: pip install -e 'backend/[ingest]'"
+        ) from exc
+    # Download the iTunes preview. Apple CDN rejects default Python UA so set Safari.
+    req = urllib.request.Request(preview_url, headers={"User-Agent": APPLE_UA})
+    with urllib.request.urlopen(req, timeout=timeout) as r:
+        audio_bytes = r.read()
+    # POST the bytes straight from memory. The ".m4a" filename and the
+    # "audio/mp4" content type are carried by the multipart part itself, so
+    # the backend still sees an .m4a upload without a temp file on disk.
+    # (Re-opening an open NamedTemporaryFile by name fails on Windows.)
+    upload = (f"{target['track_id']}.m4a", audio_bytes, "audio/mp4")
+    with httpx.Client(timeout=timeout) as client:
+        resp = client.post(
+            # rstrip: tolerate a --base-url given with a trailing slash.
+            f"{base_url.rstrip('/')}/neighbors",
+            params={"k": 5},
+            files={"file": upload},
+        )
+    resp.raise_for_status()
+    body = resp.json()
+    neighbors = body.get("neighbors") or []
+    if not neighbors:
+        raise RuntimeError(f"empty neighbors response: {body}")
+    # Find the target's own 1-based rank in the returned list.
+    self_rank = None
+    self_cosine = None
+    self_ts = {}
+    for rank, neighbor in enumerate(neighbors, start=1):
+        if neighbor.get("trackId") == target["track_id"]:
+            self_rank = rank
+            self_cosine = float(neighbor.get("rawCosine") or 0)
+            self_ts = neighbor.get("matchTimestamp") or {}
+            break
+    top1 = neighbors[0]
+    top1_track = top1.get("track") or {}
+    return {
+        "track_id": target["track_id"],
+        "title": target["title"],
+        "artist": target["artist"],
+        "self_rank": self_rank,
+        "self_cosine": self_cosine,
+        "self_timestamp": self_ts,
+        "top1_track_id": top1.get("trackId"),
+        "top1_cosine": float(top1.get("rawCosine") or 0),
+        "top1_title": top1_track.get("title"),
+        "top1_artist": top1_track.get("artist"),
+    }
+def _print_one(r: dict) -> None:
+    """Print the outcome lines for one result row.
+    Rows that carry an error print nothing. The timestamp line appears only
+    when the target was found and the row has a non-empty timestamp.
+    """
+    if r.get("error"):
+        return
+    rank = r.get("self_rank")
+    cosine = r.get("self_cosine")
+    found = rank is not None
+    if not found:
+        verdict = "SELF NOT IN TOP-5"
+    elif rank == 1:
+        verdict = "PASS"
+    else:
+        verdict = f"SELF AT RANK {rank}"
+    cosine_text = "—" if cosine is None else f"{cosine:.4f}"
+    # The id is cut to 40 characters and padded to that width.
+    top1_label = (r.get("top1_track_id") or "")[:40]
+    top1_cosine = r.get("top1_cosine", 0)
+    print(f"  -> rank-1 returned: {top1_label:<40} cos={top1_cosine:.4f}")
+    print(f"     self-retrieval: {verdict}  self-cos={cosine_text}")
+    stamp = r.get("self_timestamp") or {}
+    if found and stamp:
+        query_span = f"{stamp.get('queryStartSec', '?')}-{stamp.get('queryEndSec', '?')}"
+        catalog_span = f"{stamp.get('catalogStartSec', '?')}-{stamp.get('catalogEndSec', '?')}"
+        print(f"     self-timestamp: query {query_span}s ↔ catalog {catalog_span}s")
+def _print_summary(results: list[dict]) -> None:
+    """Print aggregate self-retrieval statistics and the pass criteria.
+    Rows with a truthy "error" are excluded from every rate. When no row
+    succeeded, a single hint line is printed instead of the statistics.
+    """
+    n = len(results)
+    succeeded = [r for r in results if not r.get("error")]
+    n_ok = len(succeeded)
+    if n_ok == 0:
+        print("All targets errored out. Check the base URL + network.")
+        return
+    def _starts_align(row: dict) -> bool:
+        ts = row.get("self_timestamp") or {}
+        query_start = ts.get("queryStartSec")
+        # A missing start time must not count as aligned: comparing two
+        # absent values (None == None) would otherwise be True.
+        return query_start is not None and query_start == ts.get("catalogStartSec")
+    self_at_1 = [r for r in succeeded if r.get("self_rank") == 1]
+    self_in_top5 = [r for r in succeeded if r.get("self_rank") is not None]
+    self_cosines = [r["self_cosine"] for r in succeeded if r.get("self_cosine") is not None]
+    # Alignment is only meaningful for rows where the target was found.
+    ts_align = [r for r in self_in_top5 if _starts_align(r)]
+    # Computed once; None when no row reported a self cosine.
+    cosine_mean = sum(self_cosines) / len(self_cosines) if self_cosines else None
+    print(f"Total targets:           {n}")
+    print(f"Successful round-trips:  {n_ok}")
+    print(f"Self at rank 1:          {len(self_at_1):>3} / {n_ok}  ({100*len(self_at_1)/n_ok:.0f}%)")
+    print(f"Self in top 5:           {len(self_in_top5):>3} / {n_ok}  ({100*len(self_in_top5)/n_ok:.0f}%)")
+    if cosine_mean is not None:
+        print(f"Self-match cosine mean:  {cosine_mean:.4f}  (n={len(self_cosines)})")
+        print(f"Self-match cosine min:   {min(self_cosines):.4f}")
+    print(f"Timestamp aligns to self: {len(ts_align):>3} / {n_ok}  ({100*len(ts_align)/n_ok:.0f}%)")
+    print()
+    print("Pass criteria (ADR-0002 §Verification):")
+    print(f"  self-retrieval rate >= 90%        ... {'PASS' if len(self_at_1) / n_ok >= 0.9 else 'FAIL'}")
+    print(f"  self-cosine mean >= 0.92          ... {'PASS' if cosine_mean is not None and cosine_mean >= 0.92 else 'FAIL'}")
+    print(f"  timestamp aligns in >= 8 cases    ... {'PASS' if len(ts_align) >= 8 else 'FAIL (or n<8 successful)'}")
+def _load_catalog(path: Path) -> list[dict]:
+    """Read the catalog JSON at *path* and return its list of tracks.
+    Accepts either a bare JSON list or an object whose "tracks" key holds
+    the list; an object without "tracks" yields an empty list.
+    """
+    # JSON is UTF-8 by specification; decoding with the platform locale can
+    # garble or reject non-ASCII text in the catalog on some systems.
+    raw = json.loads(path.read_text(encoding="utf-8"))
+    return raw if isinstance(raw, list) else raw.get("tracks", [])
+def _select_targets(catalog: list[dict], target_id: str | None) -> list[dict]:
+    """Return the catalog entries eligible for verification.
+    An entry is eligible when its "source" is "itunes" and it has a truthy
+    external_ids.previewUrl. A truthy *target_id* narrows the result to the
+    eligible entries whose "track_id" equals it.
+    """
+    def _eligible(track: dict) -> bool:
+        if track.get("source") != "itunes":
+            return False
+        return bool((track.get("external_ids") or {}).get("previewUrl"))
+    candidates = list(filter(_eligible, catalog))
+    if not target_id:
+        return candidates
+    return [track for track in candidates if track.get("track_id") == target_id]
+def _parse_args() -> argparse.Namespace:
+    """Parse the command-line options of the verification script."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    # (flag, add_argument keyword arguments), registered in display order.
+    option_table = (
+        ("--corpus", {"type": Path, "default": DEFAULT_CORPUS,
+                      "help": f"corpus.json path (default: {DEFAULT_CORPUS})"}),
+        ("--base-url", {"default": DEFAULT_BASE_URL,
+                        "help": f"backend base URL (default: {DEFAULT_BASE_URL})"}),
+        ("--target", {"default": None,
+                      "help": "single track_id to verify (default: all Tier-1 iTunes tracks)"}),
+        ("--timeout", {"type": float, "default": 120.0,
+                       "help": "per-request timeout in seconds (default: 120)"}),
+        ("--sleep", {"type": float, "default": 1.0,
+                     "help": "seconds between requests to be polite (default: 1.0)"}),
+        ("--json-out", {"default": None,
+                        "help": "optional path to write full results as JSON"}),
+    )
+    for flag, settings in option_table:
+        parser.add_argument(flag, **settings)
+    return parser.parse_args()
+# Script entry point: main()'s return value becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main())

requirements.txt CHANGED Viewed

@@ -16,3 +16,8 @@ httpx>=0.27
 pyyaml>=6.0
 # Sentry error tracking — no-op when SENTRY_DSN env var is unset.
 sentry-sdk[fastapi]>=2.0

 pyyaml>=6.0
 # Sentry error tracking — no-op when SENTRY_DSN env var is unset.
 sentry-sdk[fastapi]>=2.0
+# ADR-0005 (Commit C): GPT-4o-mini for /narrative explanatory layer.
+# Endpoint returns 503 narrative-disabled when OPENAI_API_KEY is unset, so
+# this dep is safe to add even before the Space secret is provisioned.
+openai>=1.40
+pydantic>=2.0