Deploy RAG narrative layer + telemetry + RAG eval harness
Browse files- backend/api.py +253 -4
- backend/context_token.py +194 -0
- backend/narrative_telemetry.py +296 -0
- backend/rag_narrative.py +384 -0
- backend/scripts/enrich_jamendo.py +140 -0
- backend/scripts/enrich_mir_features.py +128 -0
- backend/scripts/run_rag_eval.py +230 -0
- backend/scripts/verify_matching.py +230 -0
- requirements.txt +5 -0
backend/api.py
CHANGED
|
@@ -20,6 +20,7 @@ Errors are returned as `{"error": "<code>"}` to match the frontend's `api.js`:
|
|
| 20 |
|
| 21 |
from __future__ import annotations
|
| 22 |
|
|
|
|
| 23 |
import io
|
| 24 |
import json
|
| 25 |
import os
|
|
@@ -33,12 +34,13 @@ import soundfile as sf
|
|
| 33 |
from fastapi import FastAPI, File, UploadFile
|
| 34 |
from fastapi.middleware.cors import CORSMiddleware
|
| 35 |
from fastapi.responses import JSONResponse
|
|
|
|
| 36 |
|
| 37 |
# ADR-0002: clap_engine is no longer the primary encoder; muq_engine took its
|
| 38 |
# place via clap_windowed's swap. We still import clap_engine here only because
|
| 39 |
# legacy code paths may reference it; the encoder load + genre tagging both go
|
| 40 |
# through muq_engine.
|
| 41 |
-
from . import __version__, acrcloud_engine, muq_engine, clap_windowed, config, mir_features, similarity
|
| 42 |
from .librosa_engine import analyze_array
|
| 43 |
from .scoring import compute_report
|
| 44 |
|
|
@@ -63,6 +65,7 @@ _corpus_by_id: dict[str, dict] = {}
|
|
| 63 |
_flat_catalog: similarity.FlatCatalog | None = None
|
| 64 |
_catalog_cosine_distribution: np.ndarray | None = None # sorted upper-tri off-diag pairwise cosines
|
| 65 |
_model_sha: str = ""
|
|
|
|
| 66 |
_threshold_default: float = config.SIMILARITY_THRESHOLD_DEFAULT
|
| 67 |
|
| 68 |
|
|
@@ -77,7 +80,7 @@ def _load_corpus() -> None:
|
|
| 77 |
"""Populate corpus globals from disk if all corpus artifacts are present."""
|
| 78 |
global _corpus_tracks, _corpus_embeddings, _corpus_by_id, _flat_catalog
|
| 79 |
global _catalog_cosine_distribution
|
| 80 |
-
global _model_sha, _threshold_default
|
| 81 |
corpus_dir = Path(os.getenv("CORPUS_DIR", str(_default_corpus_dir())))
|
| 82 |
cpath = corpus_dir / "corpus.json"
|
| 83 |
epath = corpus_dir / "embeddings.npy"
|
|
@@ -95,6 +98,7 @@ def _load_corpus() -> None:
|
|
| 95 |
_corpus_by_id = {}
|
| 96 |
_flat_catalog = None
|
| 97 |
_model_sha = ""
|
|
|
|
| 98 |
_threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
|
| 99 |
return
|
| 100 |
try:
|
|
@@ -103,10 +107,16 @@ def _load_corpus() -> None:
|
|
| 103 |
_corpus_embeddings = np.load(epath).astype(np.float32)
|
| 104 |
with np.load(spath) as npz:
|
| 105 |
segment_embeddings = {k: npz[k].astype(np.float32) for k in npz.files}
|
| 106 |
-
|
|
|
|
| 107 |
_model_sha = str(manifest.get("model_sha") or "unpinned")
|
| 108 |
if _model_sha == "unpinned":
|
| 109 |
print("[api] WARNING manifest missing model_sha; using 'unpinned'")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
_threshold_default = similarity.threshold_from_manifest(manifest)
|
| 111 |
_flat_catalog = similarity.build_flat_catalog(_corpus_tracks, _corpus_embeddings, segment_embeddings)
|
| 112 |
_catalog_cosine_distribution = similarity.compute_catalog_distribution(_flat_catalog)
|
|
@@ -128,6 +138,7 @@ def _load_corpus() -> None:
|
|
| 128 |
_flat_catalog = None
|
| 129 |
_catalog_cosine_distribution = None
|
| 130 |
_model_sha = ""
|
|
|
|
| 131 |
_threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
|
| 132 |
|
| 133 |
|
|
@@ -332,6 +343,10 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
|
|
| 332 |
raw = await file.read()
|
| 333 |
if (err := _validate_upload(file, raw)) is not None:
|
| 334 |
return err
|
|
|
|
|
|
|
|
|
|
|
|
|
| 335 |
ext = Path(file.filename or "").suffix.lower()
|
| 336 |
pipeline = _decode_and_pipeline(raw, ext=ext)
|
| 337 |
if isinstance(pipeline, JSONResponse):
|
|
@@ -348,6 +363,8 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
|
|
| 348 |
"modelSha": _model_sha,
|
| 349 |
"thresholdDefault": _threshold_default,
|
| 350 |
"acrcloud": acrcloud_engine.to_response_dict(acrcloud_engine.disabled_response()),
|
|
|
|
|
|
|
| 351 |
}
|
| 352 |
|
| 353 |
neighbors = similarity.top_k_neighbors(
|
|
@@ -394,6 +411,40 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
|
|
| 394 |
|
| 395 |
specificity = float(similarity.query_specificity(pipeline["emb"].astype(np.float32), _flat_catalog))
|
| 396 |
acr = acrcloud_engine.call_for_query(pipeline["acrcloud_audio"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
|
| 398 |
return {
|
| 399 |
"query": query_track,
|
|
@@ -405,10 +456,208 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
|
|
| 405 |
"querySpecificity": specificity,
|
| 406 |
"modelSha": _model_sha,
|
| 407 |
"thresholdDefault": _threshold_default,
|
| 408 |
-
"acrcloud":
|
|
|
|
|
|
|
| 409 |
}
|
| 410 |
|
| 411 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
def run() -> None:
|
| 413 |
"""Convenience launcher: `python -m backend.api` or `uvicorn backend.api:app`."""
|
| 414 |
import uvicorn
|
|
|
|
| 20 |
|
| 21 |
from __future__ import annotations
|
| 22 |
|
| 23 |
+
import hashlib
|
| 24 |
import io
|
| 25 |
import json
|
| 26 |
import os
|
|
|
|
| 34 |
from fastapi import FastAPI, File, UploadFile
|
| 35 |
from fastapi.middleware.cors import CORSMiddleware
|
| 36 |
from fastapi.responses import JSONResponse
|
| 37 |
+
from pydantic import BaseModel, Field
|
| 38 |
|
| 39 |
# ADR-0002: clap_engine is no longer the primary encoder; muq_engine took its
|
| 40 |
# place via clap_windowed's swap. We still import clap_engine here only because
|
| 41 |
# legacy code paths may reference it; the encoder load + genre tagging both go
|
| 42 |
# through muq_engine.
|
| 43 |
+
from . import __version__, acrcloud_engine, context_token, muq_engine, narrative_telemetry, clap_windowed, config, mir_features, similarity
|
| 44 |
from .librosa_engine import analyze_array
|
| 45 |
from .scoring import compute_report
|
| 46 |
|
|
|
|
| 65 |
_flat_catalog: similarity.FlatCatalog | None = None
|
| 66 |
_catalog_cosine_distribution: np.ndarray | None = None # sorted upper-tri off-diag pairwise cosines
|
| 67 |
_model_sha: str = ""
|
| 68 |
+
_catalog_sha: str = "" # sha256 of manifest.json bytes; used in contextToken claims
|
| 69 |
_threshold_default: float = config.SIMILARITY_THRESHOLD_DEFAULT
|
| 70 |
|
| 71 |
|
|
|
|
| 80 |
"""Populate corpus globals from disk if all corpus artifacts are present."""
|
| 81 |
global _corpus_tracks, _corpus_embeddings, _corpus_by_id, _flat_catalog
|
| 82 |
global _catalog_cosine_distribution
|
| 83 |
+
global _model_sha, _catalog_sha, _threshold_default
|
| 84 |
corpus_dir = Path(os.getenv("CORPUS_DIR", str(_default_corpus_dir())))
|
| 85 |
cpath = corpus_dir / "corpus.json"
|
| 86 |
epath = corpus_dir / "embeddings.npy"
|
|
|
|
| 98 |
_corpus_by_id = {}
|
| 99 |
_flat_catalog = None
|
| 100 |
_model_sha = ""
|
| 101 |
+
_catalog_sha = ""
|
| 102 |
_threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
|
| 103 |
return
|
| 104 |
try:
|
|
|
|
| 107 |
_corpus_embeddings = np.load(epath).astype(np.float32)
|
| 108 |
with np.load(spath) as npz:
|
| 109 |
segment_embeddings = {k: npz[k].astype(np.float32) for k in npz.files}
|
| 110 |
+
manifest_bytes = mpath.read_bytes()
|
| 111 |
+
manifest = json.loads(manifest_bytes.decode("utf-8"))
|
| 112 |
_model_sha = str(manifest.get("model_sha") or "unpinned")
|
| 113 |
if _model_sha == "unpinned":
|
| 114 |
print("[api] WARNING manifest missing model_sha; using 'unpinned'")
|
| 115 |
+
# catalog_sha = sha256 of manifest.json bytes. Captures every
|
| 116 |
+
# meaningful catalog regeneration (model swap, threshold change,
|
| 117 |
+
# track count change) in a single stable hash. Embedded in every
|
| 118 |
+
# contextToken so /narrative can detect stale tokens after redeploy.
|
| 119 |
+
_catalog_sha = hashlib.sha256(manifest_bytes).hexdigest()
|
| 120 |
_threshold_default = similarity.threshold_from_manifest(manifest)
|
| 121 |
_flat_catalog = similarity.build_flat_catalog(_corpus_tracks, _corpus_embeddings, segment_embeddings)
|
| 122 |
_catalog_cosine_distribution = similarity.compute_catalog_distribution(_flat_catalog)
|
|
|
|
| 138 |
_flat_catalog = None
|
| 139 |
_catalog_cosine_distribution = None
|
| 140 |
_model_sha = ""
|
| 141 |
+
_catalog_sha = ""
|
| 142 |
_threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
|
| 143 |
|
| 144 |
|
|
|
|
| 343 |
raw = await file.read()
|
| 344 |
if (err := _validate_upload(file, raw)) is not None:
|
| 345 |
return err
|
| 346 |
+
# queryFingerprint: SHA-256 of the upload bytes. Embedded in contextToken
|
| 347 |
+
# so /narrative can verify the same query is still in play. Stable across
|
| 348 |
+
# re-uploads of the same file; cheap to compute.
|
| 349 |
+
query_fingerprint = hashlib.sha256(raw).hexdigest()
|
| 350 |
ext = Path(file.filename or "").suffix.lower()
|
| 351 |
pipeline = _decode_and_pipeline(raw, ext=ext)
|
| 352 |
if isinstance(pipeline, JSONResponse):
|
|
|
|
| 363 |
"modelSha": _model_sha,
|
| 364 |
"thresholdDefault": _threshold_default,
|
| 365 |
"acrcloud": acrcloud_engine.to_response_dict(acrcloud_engine.disabled_response()),
|
| 366 |
+
"queryFingerprint": query_fingerprint,
|
| 367 |
+
"contextToken": None,
|
| 368 |
}
|
| 369 |
|
| 370 |
neighbors = similarity.top_k_neighbors(
|
|
|
|
| 411 |
|
| 412 |
specificity = float(similarity.query_specificity(pipeline["emb"].astype(np.float32), _flat_catalog))
|
| 413 |
acr = acrcloud_engine.call_for_query(pipeline["acrcloud_audio"])
|
| 414 |
+
acr_response = acrcloud_engine.to_response_dict(acr)
|
| 415 |
+
|
| 416 |
+
# Codex round-2 Q3: stateless signed token replaces the in-memory cache.
|
| 417 |
+
# /narrative will verify this token and rebuild context server-side from
|
| 418 |
+
# the embedded claims. Token is None when HMAC key isn't configured —
|
| 419 |
+
# /narrative also 503s in that case so the gating is consistent.
|
| 420 |
+
ctx_token = None
|
| 421 |
+
if context_token.is_configured():
|
| 422 |
+
neighbor_fragments: dict[str, dict] = {}
|
| 423 |
+
for nb in neighbors:
|
| 424 |
+
track = nb.get("track") or {}
|
| 425 |
+
ts = nb.get("matchTimestamp") or {}
|
| 426 |
+
neighbor_fragments[str(nb["trackId"])] = context_token.neighbor_context_fragment(
|
| 427 |
+
track_id=str(nb["trackId"]),
|
| 428 |
+
title=str(track.get("title") or nb["trackId"]),
|
| 429 |
+
artist=track.get("artist"),
|
| 430 |
+
query_window=(
|
| 431 |
+
float(ts.get("queryStartSec", 0.0)),
|
| 432 |
+
float(ts.get("queryEndSec", 0.0)),
|
| 433 |
+
),
|
| 434 |
+
match_window=(
|
| 435 |
+
float(ts.get("catalogStartSec", 0.0)),
|
| 436 |
+
float(ts.get("catalogEndSec", 0.0)),
|
| 437 |
+
),
|
| 438 |
+
raw_cosine=float(nb.get("rawCosine", 0.0)),
|
| 439 |
+
criteria=_criteria_to_token_fragment(nb.get("criteria")),
|
| 440 |
+
)
|
| 441 |
+
ctx_token = context_token.issue(
|
| 442 |
+
query_fingerprint=query_fingerprint,
|
| 443 |
+
model_sha=_model_sha or "unpinned",
|
| 444 |
+
catalog_sha=_catalog_sha or "no-catalog",
|
| 445 |
+
neighbors=neighbor_fragments,
|
| 446 |
+
acrcloud_cover_song_id=acr_response.get("coverSongId"),
|
| 447 |
+
)
|
| 448 |
|
| 449 |
return {
|
| 450 |
"query": query_track,
|
|
|
|
| 456 |
"querySpecificity": specificity,
|
| 457 |
"modelSha": _model_sha,
|
| 458 |
"thresholdDefault": _threshold_default,
|
| 459 |
+
"acrcloud": acr_response,
|
| 460 |
+
"queryFingerprint": query_fingerprint,
|
| 461 |
+
"contextToken": ctx_token,
|
| 462 |
}
|
| 463 |
|
| 464 |
|
| 465 |
+
def _criteria_to_token_fragment(criteria_block: dict | None) -> list[dict] | None:
    """Flatten a /neighbors criteria block into a list of criterion dicts.

    The /neighbors response groups criteria by id under one dict; the token
    payload instead carries a flat list of ``{id, queryValue, matchValue,
    agreement, label}`` entries so it matches the NarrativeContext shape
    directly. Entries are emitted in the fixed order tempo, key, harmonic,
    timbre; ids outside that set are ignored.

    Args:
        criteria_block: Mapping of criterion id to its entry dict, or None.

    Returns:
        The flattened list, or None when the block is missing/empty or none
        of the known criterion ids has a usable entry.
    """
    if not criteria_block:
        return None
    fragments: list[dict] = []
    for cid in ("tempo", "key", "harmonic", "timbre"):
        entry = criteria_block.get(cid)
        # Skip absent/empty entries, and anything that is not a dict: a
        # malformed criterion must not take down the whole /neighbors
        # response just because the optional token could not be built.
        if not entry or not isinstance(entry, dict):
            continue
        q_val = entry.get("queryValue")
        m_val = entry.get("matchValue")
        # harmonic + timbre come back from /neighbors without queryValue /
        # matchValue (only agreement + label) because the raw vectors are
        # not shipped. Substitute a shape marker so the citation validator
        # has something to check the keys against without exposing internals.
        if cid in ("harmonic", "timbre") and q_val is None and m_val is None:
            q_val = {"vector": "elided"}
            m_val = {"vector": "elided"}
        # A key that is present but None would crash float() and would be
        # stringified as the literal "None"; treat it like a missing key.
        agreement = entry.get("agreement")
        label = entry.get("label")
        fragments.append({
            "id": cid,
            "queryValue": q_val,
            "matchValue": m_val,
            "agreement": 0.0 if agreement is None else float(agreement),
            "label": "" if label is None else str(label),
        })
    return fragments or None
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
# --- /narrative -------------------------------------------------------------
|
| 501 |
+
#
|
| 502 |
+
# Stateless RAG explanatory layer over /neighbors. Client sends the
|
| 503 |
+
# contextToken received from /neighbors plus the trackId + mode it wants
|
| 504 |
+
# narrated; backend verifies the token (signature, expiry, model/catalog
|
| 505 |
+
# version), rebuilds NarrativeContext from the embedded claims, and delegates
|
| 506 |
+
# to Codex's rag_narrative module.
|
| 507 |
+
#
|
| 508 |
+
# Failure shape: typed `{"error": "<code>"}` JSON, status code by class:
|
| 509 |
+
# 503 narrative-disabled — OPENAI_API_KEY or CONTEXT_TOKEN_HMAC_KEY absent
|
| 510 |
+
# 401 invalid-token — signature mismatch (tampered or wrong secret)
|
| 511 |
+
# 412 token-expired — past expiresAt
|
| 512 |
+
# 412 stale-token — modelSha/catalogSha changed since issuance
|
| 513 |
+
# 400 malformed-token — bad shape; not <body>.<sig>
|
| 514 |
+
# 404 not-in-context — trackId wasn't part of the issued token
|
| 515 |
+
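# 422 unsupported-mode — mode wasn't "whySimilar" or "creatorAdvice"
# 422 malformed-context — verified token claims didn't fit NarrativeContext
# 500 narrative-error — rag_narrative.generate_narrative raised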
|
| 516 |
+
|
| 517 |
+
|
| 518 |
+
class NarrativeRequest(BaseModel):
    """Request body for POST /narrative.

    All three fields are required and must be non-empty strings
    (``min_length=1``).
    """

    # Signed token previously returned by /neighbors as ``contextToken``.
    contextToken: str = Field(..., min_length=1)
    # Id of the neighbor to narrate; looked up inside the verified token.
    trackId: str = Field(..., min_length=1)
    # Narrative mode; the handler accepts "whySimilar" or "creatorAdvice".
    mode: str = Field(..., min_length=1)
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
# Maps TokenError.code (as raised by context_token.verify) to the
# (HTTP status, public error code) pair returned by /narrative.
_TOKEN_ERROR_TO_HTTP = {
    internal_code: (http_status, public_code)
    for internal_code, http_status, public_code in (
        ("malformed", 400, "malformed-token"),
        ("invalid-signature", 401, "invalid-token"),
        ("token-expired", 412, "token-expired"),
        ("stale-model", 412, "stale-token"),
        ("stale-catalog", 412, "stale-token"),
        ("hmac-key-missing", 503, "narrative-disabled"),
    )
}
|
| 532 |
+
|
| 533 |
+
|
| 534 |
+
@app.post("/narrative")
async def narrative_endpoint(req: NarrativeRequest):
    """RAG explanatory layer over /neighbors — see ADR-0005 for the full spec.

    Flow: check the configuration gates, validate ``mode``, verify the
    contextToken against the currently loaded model/catalog hashes, look up
    ``trackId`` in the verified claims, rebuild ``NarrativeContext`` and
    delegate to ``rag_narrative.generate_narrative``. Every outcome is
    recorded on the telemetry handle from ``narrative_telemetry.measure_call``.

    Returns:
        ``result.model_dump()`` (or the raw result when it has no such
        method) on success; otherwise the ``_err(status, code)`` response:
        503 narrative-disabled, 422 unsupported-mode, 400 malformed-token,
        401 invalid-token, 412 token-expired / stale-token,
        404 not-in-context, 422 malformed-context, 500 narrative-error.
    """
    with narrative_telemetry.measure_call(req.mode) as tel:
        # Gate 1: OpenAI key present. Without it we can't call GPT-4o-mini.
        if not os.getenv("OPENAI_API_KEY", "").strip():
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")
        # Gate 2: HMAC key present. Without it we can't trust the token.
        if not context_token.is_configured():
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")
        # Gate 3: mode is one of the supported values.
        if req.mode not in ("whySimilar", "creatorAdvice"):
            tel.set(error_code="unsupported-mode")
            return _err(422, "unsupported-mode")

        # Verify the token. TokenError.code maps directly to a typed HTTP
        # response; unknown codes fall back to 400 malformed-token.
        try:
            verified = context_token.verify(
                req.contextToken,
                expected_model_sha=_model_sha or "unpinned",
                expected_catalog_sha=_catalog_sha or "no-catalog",
            )
        except context_token.TokenError as exc:
            status, code = _TOKEN_ERROR_TO_HTTP.get(exc.code, (400, "malformed-token"))
            tel.set(error_code=code)
            return _err(status, code)
        except (TypeError, ValueError):
            # The token string is client-controlled. hmac.compare_digest
            # raises TypeError for a non-ASCII str signature, and int()
            # conversion of claims can raise ValueError; keep the typed
            # {"error": ...} contract instead of leaking an untyped 500.
            tel.set(error_code="malformed-token")
            return _err(400, "malformed-token")

        # Look up the requested trackId inside the verified token claims.
        fragment = verified.neighbors.get(req.trackId)
        if not fragment:
            tel.set(error_code="not-in-context", trackId=req.trackId)
            return _err(404, "not-in-context")

        # Lazy import: keeping this inside the handler means the app boots
        # and /neighbors keeps working even if rag_narrative is missing. If
        # the import fails at request time, surface 503 narrative-disabled
        # so the frontend's no-key fallback path handles it.
        try:
            from . import rag_narrative
        except ImportError:
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")

        # Build NarrativeContext from the verified fragment; instantiating
        # the model also validates the shape of the claims.
        try:
            context = rag_narrative.NarrativeContext(
                queryFingerprint=verified.queryFingerprint,
                trackId=fragment["trackId"],
                title=fragment.get("title", ""),
                artist=fragment.get("artist"),
                queryWindow=tuple(fragment["queryWindow"]),
                matchWindow=tuple(fragment["matchWindow"]),
                rawCosine=float(fragment["rawCosine"]),
                criteria=[
                    rag_narrative.CriterionContext(**c)
                    for c in (fragment.get("criteria") or [])
                ],
                acrcloudCoverSongId=verified.acrcloudCoverSongId,
            )
        except Exception as exc:
            # Boundary handler: a fragment that fails to materialize is
            # reported as malformed rather than blowing up internally. Log
            # the cause so a signed-but-unusable token can be diagnosed.
            print(f"[api] /narrative context build failed: {exc!r}")
            tel.set(error_code="malformed-context", trackId=req.trackId)
            return _err(422, "malformed-context")

        model_id = os.getenv("OPENAI_MODEL_ID", "gpt-4o-mini")
        # NOTE(review): this is a synchronous call inside an ``async def``
        # handler; if generate_narrative performs blocking network I/O it
        # stalls the event loop for its duration. Consider running it in a
        # threadpool once rag_narrative's threading assumptions are confirmed.
        try:
            result = rag_narrative.generate_narrative(
                context,
                req.mode,
                model_sha=_model_sha or "unpinned",
                catalog_sha=_catalog_sha or "no-catalog",
                model_id=model_id,
            )
        except Exception as exc:
            print(f"[api] /narrative generate_narrative raised: {exc!r}")
            tel.set(error_code="narrative-error", trackId=req.trackId)
            return _err(500, "narrative-error")

        # result.kind discriminates the result variants. Completion size is
        # measured in characters of prose, only for the "narrative" kind.
        result_kind = getattr(result, "kind", None)
        completion_chars = 0
        if result_kind == "narrative":
            completion_chars = len(getattr(result, "prose", "") or "")
        # Rough prompt size estimate in characters: title length plus a flat
        # 600 for base prompt + metadata. ``or ""`` guards a None title so
        # telemetry can never fail a request that already produced a result.
        prompt_chars_estimate = len(str(fragment.get("title") or "")) + 600
        tel.set(
            result_kind=result_kind,
            openai_called=result_kind in ("narrative", "unavailable"),
            gate_short_circuit=(result_kind == "low_confidence"),
            prompt_chars=prompt_chars_estimate,
            completion_chars=completion_chars,
            trackId=req.trackId,
        )

        # Pydantic v2 .model_dump() gives a uniform dict regardless of which
        # result variant came back; the `kind` discriminator lets the
        # frontend route rendering.
        if hasattr(result, "model_dump"):
            return result.model_dump()
        return result
|
| 644 |
+
|
| 645 |
+
|
| 646 |
+
@app.get("/narrative/stats")
def narrative_stats_endpoint() -> dict:
    """Expose the current in-process /narrative counters.

    Gives a "right now" view of the narrative layer: call counts, latency
    percentiles, mode distribution, error distribution and a rough cost
    estimate. Counters reset on restart, so this is not a long-term
    metrics store.

    The cost estimate is char-based × GPT-4o-mini pricing — directional, not
    accounting-grade (see ADR-0005).
    """
    counters = narrative_telemetry.snapshot()
    return counters
|
| 659 |
+
|
| 660 |
+
|
| 661 |
def run() -> None:
|
| 662 |
"""Convenience launcher: `python -m backend.api` or `uvicorn backend.api:app`."""
|
| 663 |
import uvicorn
|
backend/context_token.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""HMAC-signed opaque context token for the /narrative endpoint.
|
| 2 |
+
|
| 3 |
+
Codex round-2 review Q3 pushed back on the in-memory cache approach: a TTL
|
| 4 |
+
cache breaks across HF restarts, multiple workers, and page refreshes. This
|
| 5 |
+
module is the replacement — a stateless signed token.
|
| 6 |
+
|
| 7 |
+
`/neighbors` issues a token containing the full NarrativeContext payload for
|
| 8 |
+
every neighbor it returns. The token is HMAC-signed with a server secret so
|
| 9 |
+
the backend can later verify that the client didn't tamper with it. On
|
| 10 |
+
`/narrative`, the backend verifies the signature + expiry + model/catalog
|
| 11 |
+
hashes, looks up the requested trackId inside the token payload, and rebuilds
|
| 12 |
+
NarrativeContext server-side from the verified claims.
|
| 13 |
+
|
| 14 |
+
This means: zero server-side state, survives restarts, scales horizontally,
|
| 15 |
+
and the client can't inflate the prompt with garbage to drive up cost.
|
| 16 |
+
|
| 17 |
+
Token format:
|
| 18 |
+
base64url(json_payload) + "." + hex(hmac_sha256(payload_bytes, secret))
|
| 19 |
+
|
| 20 |
+
Payload shape (sorted keys for stability):
|
| 21 |
+
{
|
| 22 |
+
"queryFingerprint": "<sha256 of upload bytes>",
|
| 23 |
+
"modelSha": "<MuQ-MuLan model sha>",
|
| 24 |
+
"catalogSha": "<sha256 of manifest.json bytes>",
|
| 25 |
+
"expiresAt": <unix seconds>,
|
| 26 |
+
"acrcloudCoverSongId": <dict | null>,
|
| 27 |
+
"neighbors": {
|
| 28 |
+
"<trackId>": {full per-neighbor context fields},
|
| 29 |
+
...
|
| 30 |
+
}
|
| 31 |
+
}
|
| 32 |
+
"""
|
| 33 |
+
|
| 34 |
+
from __future__ import annotations
|
| 35 |
+
|
| 36 |
+
import base64
|
| 37 |
+
import hashlib
|
| 38 |
+
import hmac
|
| 39 |
+
import json
|
| 40 |
+
import os
|
| 41 |
+
import time
|
| 42 |
+
from dataclasses import dataclass
|
| 43 |
+
from typing import Any
|
| 44 |
+
|
| 45 |
+
# Token lifetime: 30 minutes. Long enough for a UI session, short enough
# that stale tokens after a redeploy expire on their own without operator
# intervention.
DEFAULT_TTL_SECONDS = 30 * 60
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class TokenError(Exception):
    """Signals a token that is malformed, tampered, expired, or stale.

    The machine-readable reason is kept on ``code`` and is also the
    exception message.
    """

    def __init__(self, code: str):
        self.code = code
        super().__init__(code)
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass(frozen=True)
class VerifiedToken:
    """Immutable view of a token payload, as returned by ``verify``."""

    # Fingerprint of the query the token was issued for.
    queryFingerprint: str
    # Model hash embedded at issuance; verify() checks it against the caller's expected value.
    modelSha: str
    # Catalog hash embedded at issuance; verify() checks it against the caller's expected value.
    catalogSha: str
    # Expiry as unix seconds (issue time + ttl).
    expiresAt: int
    # Query-level ACRCloud cover-song signal; None when absent.
    acrcloudCoverSongId: dict | None
    # Per-neighbor context fragments, keyed by trackId.
    neighbors: dict[str, dict]
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def _hmac_key() -> bytes:
    """Return the signing secret from CONTEXT_TOKEN_HMAC_KEY as UTF-8 bytes.

    Raises:
        TokenError: with code ``hmac-key-missing`` when the variable is
            unset or blank after stripping whitespace.
    """
    configured = os.getenv("CONTEXT_TOKEN_HMAC_KEY", "").strip()
    if configured:
        return configured.encode("utf-8")
    raise TokenError("hmac-key-missing")
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def issue(
    *,
    query_fingerprint: str,
    model_sha: str,
    catalog_sha: str,
    neighbors: dict[str, dict],
    acrcloud_cover_song_id: dict | None = None,
    ttl_seconds: int = DEFAULT_TTL_SECONDS,
    now: int | None = None,
) -> str:
    """Build and sign a context token, returning ``<payload>.<sig>``.

    ``neighbors`` maps trackId to that neighbor's NarrativeContext fragment
    (title, artist, queryWindow, matchWindow, rawCosine, criteria). The
    acrcloud signal is stored once at the top level because it is global
    per query.

    The payload is JSON with sorted keys and compact separators; the
    signature is the hex HMAC-SHA256 of those JSON bytes, and the payload
    part of the token is their unpadded URL-safe base64 encoding.

    Raises:
        TokenError: ``hmac-key-missing`` when no signing key is configured.
    """
    signing_key = _hmac_key()
    issued_at = int(time.time() if now is None else now)
    claims = {
        "queryFingerprint": query_fingerprint,
        "modelSha": model_sha,
        "catalogSha": catalog_sha,
        "expiresAt": issued_at + ttl_seconds,
        "acrcloudCoverSongId": acrcloud_cover_song_id,
        "neighbors": neighbors,
    }
    serialized = json.dumps(claims, sort_keys=True, separators=(",", ":")).encode("utf-8")
    encoded = base64.urlsafe_b64encode(serialized).rstrip(b"=").decode("ascii")
    signature = hmac.new(signing_key, serialized, hashlib.sha256).hexdigest()
    return f"{encoded}.{signature}"
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def verify(
    token: str,
    *,
    expected_model_sha: str,
    expected_catalog_sha: str,
    now: int | None = None,
) -> VerifiedToken:
    """Verify signature + expiry + model/catalog hashes; return the claims.

    Args:
        token: The ``<base64url body>.<hex signature>`` string from the client.
        expected_model_sha: Model hash the token's ``modelSha`` must equal.
        expected_catalog_sha: Catalog hash the token's ``catalogSha`` must equal.
        now: Override for the current unix time (seconds); defaults to
            ``time.time()``.

    Returns:
        A ``VerifiedToken`` built from the signed payload.

    Raises:
        TokenError: always the only exception type, with one of these codes:
            - hmac-key-missing  — server isn't configured for tokens
            - malformed         — not ``<body>.<sig>``, undecodable body,
                                  missing claims, or wrongly typed claims
            - invalid-signature — HMAC mismatch (tampered or wrong secret)
            - token-expired     — past the embedded expiresAt
            - stale-model       — modelSha doesn't match expected_model_sha
            - stale-catalog     — catalogSha doesn't match expected_catalog_sha
    """
    secret = _hmac_key()

    if not isinstance(token, str) or token.count(".") != 1:
        raise TokenError("malformed")
    body_b64, sig = token.split(".", 1)
    try:
        # issue() strips base64 padding; restore it before decoding.
        pad = "=" * (-len(body_b64) % 4)
        body = base64.urlsafe_b64decode(body_b64 + pad)
    except ValueError as exc:  # binascii.Error and non-ASCII input both land here
        raise TokenError("malformed") from exc

    # hmac.compare_digest only accepts ASCII-only str arguments and raises
    # TypeError otherwise. The signature is client-controlled, and a real
    # hex digest is always ASCII, so reject anything else up front.
    if not sig.isascii():
        raise TokenError("invalid-signature")
    expected_sig = hmac.new(secret, body, hashlib.sha256).hexdigest()
    if not hmac.compare_digest(expected_sig, sig):
        raise TokenError("invalid-signature")

    try:
        payload = json.loads(body)
    except ValueError as exc:  # JSONDecodeError / UnicodeDecodeError
        raise TokenError("malformed") from exc

    required = {"queryFingerprint", "modelSha", "catalogSha", "expiresAt", "neighbors"}
    if not isinstance(payload, dict) or not required.issubset(payload.keys()):
        raise TokenError("malformed")

    # Type-check the claims we convert so a signed-but-odd payload yields a
    # TokenError instead of an arbitrary exception from int()/dict().
    try:
        expires_at = int(payload["expiresAt"])
    except (TypeError, ValueError, OverflowError) as exc:
        raise TokenError("malformed") from exc
    neighbors = payload["neighbors"]
    if not isinstance(neighbors, dict):
        raise TokenError("malformed")

    now_ts = int(now if now is not None else time.time())
    if now_ts > expires_at:
        raise TokenError("token-expired")

    if payload["modelSha"] != expected_model_sha:
        raise TokenError("stale-model")
    if payload["catalogSha"] != expected_catalog_sha:
        raise TokenError("stale-catalog")

    return VerifiedToken(
        queryFingerprint=str(payload["queryFingerprint"]),
        modelSha=str(payload["modelSha"]),
        catalogSha=str(payload["catalogSha"]),
        expiresAt=expires_at,
        acrcloudCoverSongId=payload.get("acrcloudCoverSongId"),
        neighbors=dict(neighbors),
    )
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def is_configured() -> bool:
    """Report whether a non-blank CONTEXT_TOKEN_HMAC_KEY is set.

    /neighbors uses this to decide whether to attach a token at all.
    """
    key = os.getenv("CONTEXT_TOKEN_HMAC_KEY", "")
    return key.strip() != ""
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def neighbor_context_fragment(
    *,
    track_id: str,
    title: str,
    artist: str | None,
    query_window: tuple[float, float],
    match_window: tuple[float, float],
    raw_cosine: float,
    criteria: list[dict[str, Any]] | None,
) -> dict:
    """Assemble the per-neighbor dict that gets embedded in a token.

    The result follows the NarrativeContext shape minus queryFingerprint
    and acrcloudCoverSongId, which are stored once at the token's top level.
    Window bounds and the cosine are coerced to float, windows are stored as
    two-element lists, and a ``None`` criteria becomes an empty list.
    """
    query_span = [float(query_window[0]), float(query_window[1])]
    match_span = [float(match_window[0]), float(match_window[1])]
    cosine = float(raw_cosine)
    criteria_list = [] if criteria is None else criteria
    return dict(
        trackId=track_id,
        title=title,
        artist=artist,
        queryWindow=query_span,
        matchWindow=match_span,
        rawCosine=cosine,
        criteria=criteria_list,
    )
|
backend/narrative_telemetry.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""In-process telemetry for the /narrative RAG explanatory layer.
|
| 2 |
+
|
| 3 |
+
The right rung for this scale: in-process counters + structured logs + Sentry
|
| 4 |
+
tags. A Prometheus/Datadog stack would be overbuilt for a 155-track demo —
|
| 5 |
+
same "progressive complexity" principle ADR-0005 commits to for retrieval.
|
| 6 |
+
|
| 7 |
+
What this module owns:
|
| 8 |
+
- Counters: total_calls, by_mode, by_kind, by_error, openai_calls,
|
| 9 |
+
gate_short_circuits. Token failures (invalid / expired / stale) are tallied under by_error rather than as separate counters.
|
| 10 |
+
- Latency: a fixed-size sliding window of recent call durations, surfaced
|
| 11 |
+
as p50/p95/p99 in the stats snapshot.
|
| 12 |
+
- Cost estimate: a rough running total in cents, derived from prompt and
|
| 13 |
+
completion character counts × GPT-4o-mini pricing constants. Not an
|
| 14 |
+
accounting ledger — a directional cost-awareness signal for the
|
| 15 |
+
/narrative/stats endpoint.
|
| 16 |
+
- Structured logger: one INFO line per call with stable key=value fields
|
| 17 |
+
so the HF Space logs are grep-able without a parser.
|
| 18 |
+
- Sentry tags: when SENTRY_DSN is set, every call tags the current scope
|
| 19 |
+
with mode + result_kind so failures aggregate by category in the
|
| 20 |
+
existing dashboard.
|
| 21 |
+
|
| 22 |
+
The module is thread-safe (one lock around counter mutations + window
|
| 23 |
+
operations). All operations are O(1) except `snapshot()` which is O(N) over
|
| 24 |
+
the sliding window — N is bounded by `LATENCY_WINDOW_SIZE`.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import logging
|
| 30 |
+
import os
|
| 31 |
+
import threading
|
| 32 |
+
import time
|
| 33 |
+
from collections import deque
|
| 34 |
+
from dataclasses import dataclass, field
|
| 35 |
+
from typing import Iterable
|
| 36 |
+
|
| 37 |
+
# GPT-4o-mini pricing as of 2026-06 (rough; meant as a directional signal):
# $0.15 per 1M input tokens, $0.60 per 1M output tokens.
# At ~4 chars per token (English average) that is $0.0375 per 1M input chars
# and $0.15 per 1M output chars. Converted to cents per single character:
#   input : 0.15 * 100 / 1_000_000 / 4 = 0.00000375 cents
#   output: 0.60 * 100 / 1_000_000 / 4 = 0.000015   cents
# (The previous values were ten times larger than this arithmetic gives.)
COST_CENTS_PER_INPUT_CHAR = 0.00000375
COST_CENTS_PER_OUTPUT_CHAR = 0.000015

# Sliding window size for latency percentiles. ~30 minutes of activity at
# steady demo traffic; bounded so memory stays trivial.
LATENCY_WINDOW_SIZE = 256

# Result kinds we expect from rag_narrative. Anything else is counted under
# the "_other" sentinel so a bug doesn't silently grow a new key.
_KNOWN_KINDS = {"narrative", "low_confidence", "unavailable"}

# Backend error codes we surface in counters; HTTP-layer codes from
# api.py /narrative endpoint. Unrecognized codes are also counted as "_other".
_KNOWN_ERROR_CODES = {
    "narrative-disabled",
    "invalid-token",
    "malformed-token",
    "token-expired",
    "stale-token",
    "not-in-context",
    "unsupported-mode",
    "malformed-context",
    "narrative-error",
}

_logger = logging.getLogger("piedpiper.narrative")
_logger.setLevel(logging.INFO)
# Don't add handlers here — the FastAPI app's uvicorn config already streams
# stdlib logging to stdout, which is the HF Space's log-collection surface.

# Single lock guarding counter mutations and latency-window operations.
_lock = threading.Lock()
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _new_latency_window() -> deque:
    """Create the bounded buffer that holds the most recent call latencies."""
    return deque(maxlen=LATENCY_WINDOW_SIZE)


@dataclass
class _State:
    # Mutable telemetry state for the current process; every field starts
    # empty/zero and the whole object is replaced by reset().

    # Plain tallies.
    total_calls: int = 0
    by_mode: dict[str, int] = field(default_factory=dict)
    by_kind: dict[str, int] = field(default_factory=dict)
    by_error: dict[str, int] = field(default_factory=dict)
    openai_calls: int = 0
    gate_short_circuits: int = 0
    # Running cost estimate in cents.
    cost_cents: float = 0.0
    # Most recent latencies in ms; old samples fall off once the deque is full.
    latencies_ms: deque[float] = field(default_factory=_new_latency_window)
    # Wall-clock creation time, used for uptime reporting.
    started_at: float = field(default_factory=time.time)


_state = _State()
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def record_call(
    *,
    mode: str,
    latency_ms: float,
    result_kind: str | None = None,
    error_code: str | None = None,
    openai_called: bool = False,
    gate_short_circuit: bool = False,
    prompt_chars: int = 0,
    completion_chars: int = 0,
    trackId: str | None = None,
    cache_key: str | None = None,
) -> None:
    """Record a single /narrative call in the counters, the log, and Sentry tags.

    Exactly one of `result_kind` and `error_code` should be set. When both are
    None (e.g. a 503 returned before any LLM/gate work) the call is counted as
    `by_error["narrative-disabled"]` so the error counters still reflect the
    dropped traffic.

    Cost is estimated from prompt_chars + completion_chars × GPT-4o-mini
    pricing. Tests can pass 0 for both to skip the cost increment.

    Args:
        mode: Narrative mode label; an empty value is counted as "unknown".
        latency_ms: Call duration, appended to the latency window.
        result_kind: Result kind; values outside `_KNOWN_KINDS` are counted
            as "_other".
        error_code: Error code; values outside `_KNOWN_ERROR_CODES` are
            counted as "_other".
        openai_called: Increments `openai_calls` when true.
        gate_short_circuit: Increments `gate_short_circuits` when true.
        prompt_chars: Prompt size in characters (cost estimate + log line).
        completion_chars: Completion size in characters (cost estimate + log line).
        trackId: Only written to the log line.
        cache_key: Only written to the log line, truncated to 16 characters.
    """
    mode_key = mode if mode else "unknown"
    # Unknown result_kind / error_code values get bucketed under the
    # "_other" sentinel rather than being either dropped silently or growing
    # arbitrary counter keys. Operators see "_other > 0" → time to update
    # _KNOWN_KINDS / _KNOWN_ERROR_CODES.
    if result_kind is None:
        kind_key = None
    elif result_kind in _KNOWN_KINDS:
        kind_key = result_kind
    else:
        kind_key = "_other"

    if error_code is None:
        # Neither a result nor an error was reported: apply the documented
        # fallback so the call still shows up in by_error.
        err_key = "narrative-disabled" if result_kind is None else None
    elif error_code in _KNOWN_ERROR_CODES:
        err_key = error_code
    else:
        err_key = "_other"

    with _lock:
        _state.total_calls += 1
        _state.by_mode[mode_key] = _state.by_mode.get(mode_key, 0) + 1
        if kind_key:
            _state.by_kind[kind_key] = _state.by_kind.get(kind_key, 0) + 1
        if err_key:
            _state.by_error[err_key] = _state.by_error.get(err_key, 0) + 1
        if openai_called:
            _state.openai_calls += 1
        if gate_short_circuit:
            _state.gate_short_circuits += 1
        _state.cost_cents += (
            prompt_chars * COST_CENTS_PER_INPUT_CHAR
            + completion_chars * COST_CENTS_PER_OUTPUT_CHAR
        )
        _state.latencies_ms.append(float(latency_ms))

    # Structured log — one line, grep-able. Don't emit prompt or response
    # bodies (would leak content + bloat logs). Cache key is logged for
    # de-dup correlation across same-payload calls.
    _logger.info(
        "narrative.call mode=%s kind=%s error=%s latency_ms=%.1f openai_called=%s gate_short_circuit=%s prompt_chars=%d completion_chars=%d trackId=%s cache_key=%s",
        mode_key,
        kind_key or "-",
        err_key or "-",
        float(latency_ms),
        openai_called,
        gate_short_circuit,
        prompt_chars,
        completion_chars,
        trackId or "-",
        (cache_key or "-")[:16],  # prefix only; full key is high-cardinality
    )

    # Sentry tag scope. No-op when sentry_sdk isn't installed or SENTRY_DSN
    # isn't set — both paths defer to the existing api.py wiring.
    _set_sentry_tags(mode=mode_key, kind=kind_key, error=err_key)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _set_sentry_tags(*, mode: str, kind: str | None, error: str | None) -> None:
    """Attach narrative tags via ``sentry_sdk.set_tag`` when SENTRY_DSN is set.

    ``narrative.mode`` is always tagged; ``narrative.kind`` and
    ``narrative.error`` are tagged only when a non-empty value is given.
    Returns without doing anything when SENTRY_DSN is unset or blank, and
    swallows any failure (including sentry_sdk not being importable).
    """
    dsn = os.getenv("SENTRY_DSN", "")
    if dsn.strip() == "":
        return

    try:
        import sentry_sdk

        pending = [("narrative.mode", mode)]
        if kind:
            pending.append(("narrative.kind", kind))
        if error:
            pending.append(("narrative.error", error))
        for tag_name, tag_value in pending:
            sentry_sdk.set_tag(tag_name, tag_value)
    except Exception:
        # If Sentry tagging fails for any reason, don't let it break the
        # request flow. Telemetry is observability, not control flow.
        pass
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def snapshot() -> dict:
    """Build the JSON-serializable stats payload served by `GET /narrative/stats`.

    Contents:
    - every counter (dict counters are copied)
    - latency p50 / p95 / p99 over the sliding window, or None when it is empty
    - cost_cents rounded to 4 decimal places
    - uptime_sec measured from the state's creation time

    Everything is read while holding the module lock.
    """
    with _lock:
        ordered = sorted(_state.latencies_ms)
        sample_count = len(ordered)

        def pct(fraction: float):
            # Percentiles are undefined for an empty window -> None.
            return _percentile(ordered, fraction) if sample_count else None

        latency_summary = {
            "p50": pct(0.50),
            "p95": pct(0.95),
            "p99": pct(0.99),
            "sample_n": sample_count,
            "window_size": LATENCY_WINDOW_SIZE,
        }
        uptime = round(time.time() - _state.started_at, 1)
        return {
            "total_calls": _state.total_calls,
            "by_mode": dict(_state.by_mode),
            "by_kind": dict(_state.by_kind),
            "by_error": dict(_state.by_error),
            "openai_calls": _state.openai_calls,
            "gate_short_circuits": _state.gate_short_circuits,
            "cost_cents_estimate": round(_state.cost_cents, 4),
            "latency_ms": latency_summary,
            "uptime_sec": uptime,
        }
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def _percentile(sorted_samples: list[float], p: float) -> float:
    """Return the p-th percentile (p in 0..1) of an already-sorted sample list.

    Uses linear interpolation between the two neighbouring samples. The
    result is rounded to 1 decimal ms. An empty list yields 0.0 and a single
    sample is returned (rounded) regardless of ``p``.
    """
    count = len(sorted_samples)
    if count == 0:
        return 0.0
    if count == 1:
        return round(sorted_samples[0], 1)

    last_index = count - 1
    position = p * last_index  # fractional index into the sorted list
    lower_index = int(position)
    upper_index = min(lower_index + 1, last_index)
    weight = position - lower_index
    lower_value = sorted_samples[lower_index]
    upper_value = sorted_samples[upper_index]
    return round(lower_value + weight * (upper_value - lower_value), 1)
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
def reset() -> None:
    """Replace the module state with a fresh, empty one.

    Exists for test isolation of per-test counter assertions. Production code
    should NEVER call this — the snapshot wouldn't survive a restart anyway,
    so there's no use case beyond tests.
    """
    global _state
    _lock.acquire()
    try:
        _state = _State()
    finally:
        _lock.release()
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
def measure_call(mode: str):
    """Return a context manager that times a block and records it on exit.

    On leaving the ``with`` block, record_call is invoked once with the
    measured latency plus whatever fields were stashed via ``ctx.set(...)``.

    Usage in api.py:
        with measure_call("whySimilar") as ctx:
            ... do work ...
            ctx.set(result_kind="narrative", openai_called=True, prompt_chars=N, completion_chars=M)
    """
    timer = _CallTimer(mode)
    return timer
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
class _CallTimer:
    """Context manager that times a block and reports it through record_call.

    Fields stashed with :meth:`set` are forwarded to record_call as keyword
    arguments when the block exits. Exceptions raised inside the block are
    never suppressed.
    """

    def __init__(self, mode: str):
        self.mode = mode
        # time.perf_counter() reading taken in __enter__ (not an epoch timestamp).
        self.start_ts: float = 0.0
        self._fields: dict = {}

    def __enter__(self):
        # perf_counter is monotonic, so the measured latency cannot go negative
        # or jump when the wall clock is adjusted mid-request.
        self.start_ts = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc, tb):
        latency_ms = (time.perf_counter() - self.start_ts) * 1000.0
        # If the context block raised, mark as a narrative-error if no other
        # error was set.
        if exc_type is not None and "error_code" not in self._fields:
            self._fields["error_code"] = "narrative-error"
        record_call(mode=self.mode, latency_ms=latency_ms, **self._fields)
        return False  # don't suppress exceptions

    def set(self, **kwargs) -> None:
        """Stash fields for record_call. Last write wins per key."""
        self._fields.update(kwargs)
|
| 286 |
+
|
| 287 |
+
|
| 288 |
+
# Public surface of this module: the recording / reading entry points first,
# then the tunable constants.
__all__: Iterable[str] = (
    # entry points
    "record_call",
    "snapshot",
    "reset",
    "measure_call",
    # constants
    "LATENCY_WINDOW_SIZE",
    "COST_CENTS_PER_INPUT_CHAR",
    "COST_CENTS_PER_OUTPUT_CHAR",
)
|
backend/rag_narrative.py
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Metadata-grounded narrative generation for PiedPiper match explanations.
|
| 2 |
+
|
| 3 |
+
This module is intentionally independent from FastAPI. `api.py` supplies a
|
| 4 |
+
trusted `NarrativeContext`; this module gates context quality, builds a
|
| 5 |
+
bounded prompt, calls OpenAI through one adapter, validates structured
|
| 6 |
+
citations, and returns typed Pydantic results for the frontend.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import hashlib
|
| 12 |
+
import json
|
| 13 |
+
import logging
|
| 14 |
+
import time
|
| 15 |
+
from typing import Any, Literal
|
| 16 |
+
|
| 17 |
+
from pydantic import BaseModel, ValidationError
|
| 18 |
+
|
| 19 |
+
# Closed sets of string tags used throughout the narrative layer.
NarrativeMode = Literal["whySimilar", "creatorAdvice"]
CriterionId = Literal["tempo", "key", "harmonic", "timbre"]

# Version tags that are folded into the cache key.
RESPONSE_SCHEMA_VERSION = "v1"
CRITERIA_ALGORITHM_VERSION = "adr-0004-v1"

# Hard caps: combined system + user prompt length in characters, and the
# completion token budget passed to the LLM call.
MAX_PROMPT_CHARS = 8_000
MAX_COMPLETION_TOKENS = 400

logger = logging.getLogger(__name__)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
class CriterionContext(BaseModel):
    # One similarity criterion as supplied to the narrative layer.

    # Which of the four supported criteria this entry describes.
    id: CriterionId
    # Query-side and match-side values; a number, a string, or a dict.
    queryValue: float | str | dict
    matchValue: float | str | dict
    # Agreement score; the context gate compares it against 0.55.
    agreement: float
    label: str
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
class NarrativeContext(BaseModel):
    # Everything the narrative layer is given about one query/match pair.

    queryFingerprint: str
    # Identity and display metadata of the matched track.
    trackId: str
    title: str
    artist: str | None
    # (start, end) pairs; the gate requires start >= 0 and end > start.
    queryWindow: tuple[float, float]
    matchWindow: tuple[float, float]
    rawCosine: float
    criteria: list[CriterionContext]
    # Passed through to the prompt unchanged; may be None.
    acrcloudCoverSongId: dict | None
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class StructuredCitation(BaseModel):
    # One citation emitted by the LLM and checked against the context.

    trackId: str
    # Selects which context window the timestamp range is checked against.
    side: Literal["query", "match"]
    timestampRange: tuple[float, float]
    criterionIds: list[CriterionId]
    # Keys are "rawCosine" or "<criterionId>.queryValue" / "<criterionId>.matchValue".
    citedValues: dict[str, float | str]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
class NarrativeResponse(BaseModel):
    # Successful result: generated prose plus the citations backing it.

    # Discriminator tag; fixed to "narrative".
    kind: Literal["narrative"] = "narrative"
    mode: NarrativeMode
    prose: str
    citations: list[StructuredCitation]
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class LowConfidence(BaseModel):
    # Result returned when the context gate or the prompt-size cap stops the
    # request before any LLM call; `reason` carries the gate's reason code.

    # Discriminator tag; fixed to "low_confidence".
    kind: Literal["low_confidence"] = "low_confidence"
    reason: str
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
class NarrativeUnavailable(BaseModel):
    # Result returned when the LLM call failed or its output was rejected;
    # `reason` carries the failure code.

    # Discriminator tag; fixed to "unavailable".
    kind: Literal["unavailable"] = "unavailable"
    reason: str


# Any of the three outcomes generate_narrative can return.
NarrativeResult = NarrativeResponse | LowConfidence | NarrativeUnavailable
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# Guardrail sentences shared verbatim by every mode's system prompt. Kept in
# one place so the modes cannot drift apart; the resulting prompt strings are
# byte-identical to spelling the sentences out per mode.
_SHARED_SYSTEM_RULES = (
    "You receive structured metadata about two audio segments. "
    "You do not hear the audio. "
    "You do not determine copyright infringement, ownership, or legal status. "
    "Cite only tracks, criteria, and values present in the supplied context. "
    "Output a single JSON object matching the schema. "
    "No additional text, no markdown."
)

# Per-mode system prompt: a mode-specific role sentence followed by the
# shared guardrails.
SYSTEM_PROMPTS: dict[NarrativeMode, str] = {
    "whySimilar": (
        "You are PiedPiper, an expert assistant explaining acoustic similarity "
        "between music tracks. " + _SHARED_SYSTEM_RULES
    ),
    "creatorAdvice": (
        "You are PiedPiper, an expert assistant helping creators make a music "
        "generation more distinctive from a retrieved catalog match. " + _SHARED_SYSTEM_RULES
    ),
}
|
| 96 |
+
|
| 97 |
+
USER_PROMPT_TEMPLATE = """Mode: {mode}
|
| 98 |
+
|
| 99 |
+
Return JSON with exactly this shape:
|
| 100 |
+
{{
|
| 101 |
+
"kind": "narrative",
|
| 102 |
+
"mode": "{mode}",
|
| 103 |
+
"prose": "80-140 words for whySimilar, or 60-120 words for creatorAdvice",
|
| 104 |
+
"citations": [
|
| 105 |
+
{{
|
| 106 |
+
"trackId": "{track_id}",
|
| 107 |
+
"side": "query|match",
|
| 108 |
+
"timestampRange": [start_seconds, end_seconds],
|
| 109 |
+
"criterionIds": ["tempo|key|harmonic|timbre"],
|
| 110 |
+
"citedValues": {{
|
| 111 |
+
"<criterionId>.queryValue": "exact supplied value when cited",
|
| 112 |
+
"<criterionId>.matchValue": "exact supplied value when cited",
|
| 113 |
+
"rawCosine": 0.0
|
| 114 |
+
}}
|
| 115 |
+
}}
|
| 116 |
+
]
|
| 117 |
+
}}
|
| 118 |
+
|
| 119 |
+
Use the supplied context only. For creatorAdvice, write three concrete suggestion-style clauses in prose, each tied to a cited criterion.
|
| 120 |
+
|
| 121 |
+
Context:
|
| 122 |
+
{context_json}
|
| 123 |
+
"""
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def cache_key(
    context: NarrativeContext,
    mode: NarrativeMode,
    *,
    model_sha: str,
    catalog_sha: str,
    model_id: str,
) -> str:
    """Derive a stable SHA-256 key from the prompt-relevant narrative context.

    The key covers the model / catalog identifiers, the prompt-template hash,
    the schema and criteria-algorithm versions, the query fingerprint, the
    track id, the mode, the criteria (ordered by id, numbers rounded to three
    decimals), and the raw cosine rounded to three decimals.
    """
    ordered_criteria = sorted(context.criteria, key=lambda criterion: criterion.id)
    rounded_criteria = [_criterion_for_cache(criterion) for criterion in ordered_criteria]

    key_material = {
        "model_id": model_id,
        "model_sha": model_sha,
        "catalog_sha": catalog_sha,
        "prompt_template_hash": _prompt_template_hash(mode),
        "response_schema_version": RESPONSE_SCHEMA_VERSION,
        "criteria_algorithm_version": CRITERIA_ALGORITHM_VERSION,
        "query_fingerprint": context.queryFingerprint,
        "track_id": context.trackId,
        "mode": mode,
        "criteria_rounded": rounded_criteria,
        "raw_cosine": round(float(context.rawCosine), 3),
    }
    return _sha256_json(key_material)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def generate_narrative(
    context: NarrativeContext,
    mode: NarrativeMode,
    *,
    model_sha: str,
    catalog_sha: str,
    model_id: str = "gpt-4o-mini",
    openai_client=None,
) -> NarrativeResult:
    """Produce a narrative for one query/match pair, or explain why not.

    Steps, in order:
    1. Gate the context; a gate reason yields ``LowConfidence``.
    2. Build the prompts; exceeding MAX_PROMPT_CHARS yields
       ``LowConfidence("context-cap-exceeded")``.
    3. Call the LLM; a failed call yields ``NarrativeUnavailable("openai-error")``.
    4. Validate the payload, the echoed mode, and the citations; failures yield
       ``NarrativeUnavailable`` with "malformed-llm-output", "schema-mismatch",
       or "citation-hallucinated" respectively.

    Every exit path writes one INFO log line with the cache key, mode, gate
    result, latency, and success flag.
    """
    started = time.perf_counter()
    key = cache_key(context, mode, model_sha=model_sha, catalog_sha=catalog_sha, model_id=model_id)

    def finish(result: NarrativeResult, *, gate_result: str, success: bool) -> NarrativeResult:
        # Single exit point so each outcome is logged exactly once.
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.info(
            "rag_narrative cache_key=%s mode=%s gate_result=%s latency_ms=%.1f success=%s",
            key,
            mode,
            gate_result,
            elapsed_ms,
            success,
        )
        return result

    def gated(reason: str) -> NarrativeResult:
        # Stopped before the LLM call; the reason doubles as the gate result.
        return finish(LowConfidence(reason=reason), gate_result=reason, success=False)

    def unavailable(reason: str) -> NarrativeResult:
        # The LLM was called but no usable narrative came out of it.
        return finish(NarrativeUnavailable(reason=reason), gate_result="called", success=False)

    gate_reason = _context_gate_reason(context)
    if gate_reason is not None:
        return gated(gate_reason)

    system_prompt = SYSTEM_PROMPTS[mode]
    user_prompt = _build_user_prompt(context, mode)
    prompt_size = len(system_prompt) + len(user_prompt)
    if prompt_size > MAX_PROMPT_CHARS:
        return gated("context-cap-exceeded")

    payload = _call_openai_json(
        openai_client,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        max_tokens=MAX_COMPLETION_TOKENS,
        model_id=model_id,
    )
    if payload is None:
        return unavailable("openai-error")

    try:
        narrative = NarrativeResponse.model_validate(payload)
    except ValidationError:
        return unavailable("malformed-llm-output")

    if narrative.mode != mode:
        return unavailable("schema-mismatch")
    if not _citations_are_grounded(narrative.citations, context):
        return unavailable("citation-hallucinated")

    return finish(narrative, gate_result="called", success=True)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def _call_openai_json(
    client,
    *,
    system_prompt: str,
    user_prompt: str,
    max_tokens: int,
    model_id: str,
) -> dict | None:
    """Make one chat-completion request and return the decoded JSON object.

    When ``client`` is None an ``openai.OpenAI()`` client is created on the
    spot. Returns None when anything in the call or the JSON decoding raises
    (the exception is logged), or when the decoded JSON is not a dict.
    """
    try:
        active_client = client
        if active_client is None:
            from openai import OpenAI

            active_client = OpenAI()

        chat_messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        completion = active_client.chat.completions.create(
            model=model_id,
            messages=chat_messages,
            response_format={"type": "json_object"},
            max_tokens=max_tokens,
            temperature=0,
        )
        raw_text = completion.choices[0].message.content
        decoded = json.loads(raw_text)
    except Exception:
        logger.exception("rag_narrative OpenAI JSON call failed")
        return None

    if isinstance(decoded, dict):
        return decoded
    return None
|
| 259 |
+
|
| 260 |
+
|
| 261 |
+
def _context_gate_reason(context: NarrativeContext) -> str | None:
    """Return the reason the context is unfit for narration, or None if it is fit.

    Checks, in order:
    - no criteria at all                       -> "missing-criteria"
    - empty or whitespace-only title           -> "missing-metadata"
    - an invalid query or match window         -> "missing-metadata"
    - no criterion with agreement >= 0.55 while
      rawCosine is below 0.75                  -> "weak-evidence"
    """
    if not context.criteria:
        return "missing-criteria"

    title = context.title
    if not (title and title.strip()):
        return "missing-metadata"

    windows_ok = _window_is_valid(context.queryWindow) and _window_is_valid(context.matchWindow)
    if not windows_ok:
        return "missing-metadata"

    has_strong_criterion = any(float(criterion.agreement) >= 0.55 for criterion in context.criteria)
    if not has_strong_criterion and float(context.rawCosine) < 0.75:
        return "weak-evidence"
    return None
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _window_is_valid(window: tuple[float, float]) -> bool:
    """Return True when the window starts at or after 0 and ends after its start."""
    begin = float(window[0])
    finish = float(window[1])
    return 0 <= begin < finish
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
def _build_user_prompt(context: NarrativeContext, mode: NarrativeMode) -> str:
    """Render the user prompt for ``mode`` from the given context.

    The context is serialized as compact JSON with sorted keys; criteria are
    ordered by id and rawCosine is rounded to three decimals before being
    substituted into USER_PROMPT_TEMPLATE.
    """
    ordered_criteria = sorted(context.criteria, key=lambda criterion: criterion.id)
    prompt_context = {
        "queryFingerprint": context.queryFingerprint,
        "trackId": context.trackId,
        "title": context.title,
        "artist": context.artist,
        "queryWindow": list(context.queryWindow),
        "matchWindow": list(context.matchWindow),
        "rawCosine": round(float(context.rawCosine), 3),
        "criteria": [_criterion_for_prompt(criterion) for criterion in ordered_criteria],
        "acrcloudCoverSongId": context.acrcloudCoverSongId,
    }
    serialized = json.dumps(prompt_context, sort_keys=True, separators=(",", ":"))
    return USER_PROMPT_TEMPLATE.format(
        mode=mode,
        track_id=context.trackId,
        context_json=serialized,
    )
|
| 295 |
+
|
| 296 |
+
|
| 297 |
+
def _criterion_for_prompt(criterion: CriterionContext) -> dict[str, Any]:
    """Convert a criterion into the plain dict that is embedded in the prompt.

    Values are copied as-is except ``agreement``, which is rounded to three
    decimals.
    """
    prompt_view: dict[str, Any] = {
        field_name: getattr(criterion, field_name)
        for field_name in ("id", "queryValue", "matchValue")
    }
    prompt_view["agreement"] = round(float(criterion.agreement), 3)
    prompt_view["label"] = criterion.label
    return prompt_view
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
def _criterion_for_cache(criterion: CriterionContext) -> dict[str, Any]:
    """Return the prompt form of a criterion with every number rounded for hashing."""
    prompt_view = _criterion_for_prompt(criterion)
    return _round_numbers(prompt_view)
|
| 309 |
+
|
| 310 |
+
|
| 311 |
+
def _round_numbers(value: Any) -> Any:
    """Recursively round every number in a nested structure to three decimals.

    - bools are returned untouched (they are ints in Python, hence checked first)
    - ints and floats become floats rounded to three decimals
    - lists and tuples both become lists
    - dicts get string keys and are ordered by the string form of each key
    - anything else is returned unchanged
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, (int, float)):
        return round(float(value), 3)
    if isinstance(value, (list, tuple)):
        return [_round_numbers(item) for item in value]
    if isinstance(value, dict):
        ordered_items = sorted(value.items(), key=lambda pair: str(pair[0]))
        return {str(name): _round_numbers(inner) for name, inner in ordered_items}
    return value
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
def _citations_are_grounded(citations: list[StructuredCitation], context: NarrativeContext) -> bool:
    """Check that every citation refers only to data present in ``context``.

    Returns False when there are no citations at all, or when any citation
    - names a different track than the context,
    - lists a criterion id the context does not contain,
    - has a timestamp range that is not grounded in its window, or
    - cites a value that fails the per-key check below.

    Per-key checks on ``citedValues``:
    - "rawCosine" must be within 0.01 of the context's raw cosine
    - "<criterion>.queryValue" / "<criterion>.matchValue" must name a criterion
      in the context; tempo values must be within 2.0 of the supplied value,
      key values must match as strings, and harmonic / timbre entries only
      require the supplied value to be a dict (the cited value itself is not
      compared).
    """
    known = {criterion.id: criterion for criterion in context.criteria}
    if not citations:
        return False

    def value_is_grounded(key: str, cited_value) -> bool:
        if key == "rawCosine":
            return _numeric_close(cited_value, context.rawCosine, tolerance=0.01)

        criterion_id, separator, side = key.partition(".")
        if not separator:
            return False
        if criterion_id not in known or side not in {"queryValue", "matchValue"}:
            return False

        expected = getattr(known[criterion_id], side)
        if criterion_id == "tempo":
            return _numeric_close(cited_value, expected, tolerance=2.0)
        if criterion_id == "key":
            return str(cited_value) == str(expected)
        if criterion_id in {"harmonic", "timbre"}:
            return isinstance(expected, dict)
        return False

    for citation in citations:
        if citation.trackId != context.trackId:
            return False
        if any(criterion_id not in known for criterion_id in citation.criterionIds):
            return False
        if not _timestamp_is_grounded(citation, context):
            return False
        if not all(value_is_grounded(key, cited) for key, cited in citation.citedValues.items()):
            return False
    return True
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def _timestamp_is_grounded(citation: StructuredCitation, context: NarrativeContext) -> bool:
    """Check that a citation's time range lies within its side's window.

    The range must have positive length and may extend at most 0.5 beyond
    either edge of the query window (side "query") or the match window
    (any other side).
    """
    slack = 0.5
    start, end = citation.timestampRange
    if end <= start:
        return False

    if citation.side == "query":
        window = context.queryWindow
    else:
        window = context.matchWindow
    return start >= window[0] - slack and end <= window[1] + slack
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def _numeric_close(actual: Any, expected: Any, *, tolerance: float) -> bool:
    """Return True when both values convert to float and differ by at most ``tolerance``.

    Values that cannot be converted to float make the comparison False rather
    than raising.
    """
    try:
        difference = float(actual) - float(expected)
    except (TypeError, ValueError):
        return False
    else:
        return abs(difference) <= tolerance
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
def _prompt_template_hash(mode: NarrativeMode) -> str:
    """Return the SHA-256 hex digest of the mode's system prompt plus the user template.

    The two texts are joined with a single newline before hashing.
    """
    template_text = SYSTEM_PROMPTS[mode] + "\n" + USER_PROMPT_TEMPLATE
    return hashlib.sha256(template_text.encode("utf-8")).hexdigest()
|
| 380 |
+
|
| 381 |
+
|
| 382 |
+
def _sha256_json(payload: dict[str, Any]) -> str:
|
| 383 |
+
encoded = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
|
| 384 |
+
return hashlib.sha256(encoded).hexdigest()
|
backend/scripts/enrich_jamendo.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Patch corpus.json's Jamendo tracks with real metadata from Jamendo's public API.
|
| 2 |
+
|
| 3 |
+
The MTG-Jamendo research dataset (the ingest source for Tier-2) anonymizes
|
| 4 |
+
artist names per academic distribution convention — entries land in corpus.json
|
| 5 |
+
with placeholders like:
|
| 6 |
+
|
| 7 |
+
title: "Jamendo 382"
|
| 8 |
+
artist: "artist_000020"
|
| 9 |
+
artwork_url: null
|
| 10 |
+
external_ids: {"jamendoTrackId": "382"}
|
| 11 |
+
|
| 12 |
+
Jamendo's own public Catalog API (`api.jamendo.com/v3.0/tracks/`) returns the
|
| 13 |
+
real track name, real artist name, an MP3 stream URL, and an album cover URL
|
| 14 |
+
keyed by the same numeric track ID. This script reconciles them.
|
| 15 |
+
|
| 16 |
+
Frontend impact: the `audioUrlFor()` helper already reads
|
| 17 |
+
`external_ids.jamendoAudioUrl` and `artworkUrlFor()` reads `artwork_url`, so
|
| 18 |
+
the React rows light up automatically once corpus.json is patched.
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
JAMENDO_CLIENT_ID=your_client_id \\
|
| 22 |
+
python -m backend.scripts.enrich_jamendo
|
| 23 |
+
|
| 24 |
+
Options:
|
| 25 |
+
--corpus <path> Path to corpus.json (default: quality-scorer/public/corpus/corpus.json)
|
| 26 |
+
--dry-run Print what would change without writing
|
| 27 |
+
--sleep <seconds> Per-request sleep (default 0.1)
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
from __future__ import annotations
|
| 31 |
+
|
| 32 |
+
import argparse
|
| 33 |
+
import json
|
| 34 |
+
import os
|
| 35 |
+
import sys
|
| 36 |
+
import time
|
| 37 |
+
from pathlib import Path
|
| 38 |
+
|
| 39 |
+
import httpx
|
| 40 |
+
|
| 41 |
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
| 42 |
+
DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
|
| 43 |
+
API_BASE = "https://api.jamendo.com/v3.0/tracks/"
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _fetch_jamendo_track(client, client_id: str, jam_id) -> tuple[dict | None, str | None]:
    """Look up one track in the Jamendo catalog API.

    Returns ``(track, None)`` on success, or ``(None, reason)`` when the
    request fails, the API reports a non-success status, or the id is
    unknown to the catalog.
    """
    try:
        r = client.get(API_BASE, params={
            "client_id": client_id,
            "id": str(jam_id),
            "format": "json",
        })
        r.raise_for_status()
        data = r.json()
    except (httpx.HTTPError, json.JSONDecodeError) as exc:
        return None, f"{exc}"

    headers = data.get("headers", {})
    if headers.get("status") != "success":
        return None, f"{headers.get('error_message', 'unknown')}"

    results = data.get("results") or []
    if not results:
        return None, "not in Jamendo catalog"
    return results[0], None


def main() -> int:
    """Patch Jamendo entries in corpus.json with metadata from the Jamendo API.

    For every corpus track whose ``source`` is ``"jamendo"``, the track is
    looked up by ``external_ids.jamendoTrackId`` and its title, artist,
    artwork URL, track page URL, audio URL and album name are updated in
    place. With ``--dry-run`` the planned changes are printed and nothing is
    written.

    Returns:
        ``2`` when ``JAMENDO_CLIENT_ID`` is not set, otherwise ``0``.
    """
    args = _parse_args()
    client_id = os.environ.get("JAMENDO_CLIENT_ID")
    if not client_id:
        print("[enrich_jamendo] ERROR: JAMENDO_CLIENT_ID env var required", file=sys.stderr)
        return 2

    # Explicit UTF-8: the platform default encoding is not reliable for JSON.
    raw = json.loads(args.corpus.read_text(encoding="utf-8"))
    # The corpus is either a bare list of tracks or an object with "tracks".
    tracks = raw if isinstance(raw, list) else raw.get("tracks", [])
    jamendo_tracks = [t for t in tracks if t.get("source") == "jamendo"]
    print(f"[enrich_jamendo] {len(jamendo_tracks)} Jamendo tracks to enrich")

    patched = 0
    failed: list[str] = []
    with httpx.Client(timeout=20.0) as client:
        for i, t in enumerate(jamendo_tracks, start=1):
            jam_id = (t.get("external_ids") or {}).get("jamendoTrackId")
            if not jam_id:
                # No request was made, so there is nothing to throttle.
                failed.append(f"{t.get('track_id')}: missing jamendoTrackId")
                continue

            jam, reason = _fetch_jamendo_track(client, client_id, jam_id)
            if jam is None:
                failed.append(f"jamendo:{jam_id}: {reason}")
                time.sleep(args.sleep)
                continue

            # Fall back to the existing corpus value whenever the API field
            # is empty, so good data is never replaced by nothing.
            real_title = jam.get("name") or t.get("title")
            real_artist = jam.get("artist_name") or t.get("artist")
            audio_url = jam.get("audio") or None
            image_url = jam.get("image") or t.get("artwork_url")
            track_view = f"https://www.jamendo.com/track/{jam_id}"

            if args.dry_run:
                print(f" [{i:03d}/{len(jamendo_tracks)}] {t.get('track_id')}: {t.get('title')!r} -> {real_title!r} by {real_artist!r}")
            else:
                t["title"] = real_title
                t["artist"] = real_artist
                t["artwork_url"] = image_url
                t["track_view_url"] = track_view
                # Copy so keys already present in external_ids are preserved.
                ext = dict(t.get("external_ids") or {})
                if audio_url:
                    ext["jamendoAudioUrl"] = audio_url
                if jam.get("album_name"):
                    ext["jamendoAlbum"] = jam.get("album_name")
                t["external_ids"] = ext

            # Counted in dry-run mode too: it is the number of tracks that
            # would be (or were) patched.
            patched += 1
            if i % 20 == 0:
                print(f" [{i}/{len(jamendo_tracks)}] enriched so far: {patched}, failed: {len(failed)}")
            time.sleep(args.sleep)

    print(f"[enrich_jamendo] DONE: patched={patched} failed={len(failed)}")
    if failed:
        print("[enrich_jamendo] failures (first 10):")
        for f in failed[:10]:
            print(f" - {f}")

    if not args.dry_run and patched > 0:
        args.corpus.write_text(json.dumps(raw, indent=2), encoding="utf-8")
        print(f"[enrich_jamendo] wrote {args.corpus}")
    elif args.dry_run:
        print("[enrich_jamendo] dry-run — no file written")

    return 0
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _parse_args() -> argparse.Namespace:
    """Parse the command-line flags of the Jamendo enrichment script.

    Flags: ``--corpus`` (path, defaults to ``DEFAULT_CORPUS``), ``--dry-run``
    (boolean switch) and ``--sleep`` (float seconds, default 0.1).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    option_specs = (
        ("--corpus", {"type": Path, "default": DEFAULT_CORPUS}),
        ("--dry-run", {"action": "store_true"}),
        ("--sleep", {"type": float, "default": 0.1}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
if __name__ == "__main__":
|
| 140 |
+
sys.exit(main())
|
backend/scripts/enrich_mir_features.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Backfill the four ADR-0004 MIR features (tempo, key, chroma, MFCC) into
|
| 2 |
+
corpus.json without re-encoding the catalog through MuQ-MuLan.
|
| 3 |
+
|
| 4 |
+
Pattern matches `enrich_jamendo.py`: read corpus.json, for each entry that
|
| 5 |
+
lacks `mir_features`, download its audio, run `mir_features.compute()`,
|
| 6 |
+
write the result back. Idempotent — re-runs only touch un-enriched tracks.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python -m backend.scripts.enrich_mir_features
|
| 10 |
+
|
| 11 |
+
The audio source depends on the tier:
|
| 12 |
+
- tier1 (iTunes): external_ids.previewUrl — 30s AAC-LC preview
|
| 13 |
+
- tier2 (Jamendo): external_ids.jamendoAudioUrl — MP3 stream (set by
|
| 14 |
+
enrich_jamendo earlier; falls back to source_url if missing)
|
| 15 |
+
|
| 16 |
+
Cost: ~3 s per track on CPU (download + librosa decode + features). For the
|
| 17 |
+
current 155-track catalog: ~8 min wall-clock.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import io
|
| 24 |
+
import json
|
| 25 |
+
import sys
|
| 26 |
+
import tempfile
|
| 27 |
+
import time
|
| 28 |
+
import urllib.request
|
| 29 |
+
from pathlib import Path
|
| 30 |
+
|
| 31 |
+
import librosa
|
| 32 |
+
|
| 33 |
+
from backend.mir_features import compute as compute_mir
|
| 34 |
+
|
| 35 |
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
| 36 |
+
DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
|
| 37 |
+
APPLE_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 Safari/605.1.15"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def main() -> int:
    """Backfill ``mir_features`` for corpus tracks that do not have them yet.

    Each selected track's audio is downloaded, decoded and passed to
    ``compute_mir``; the result's ``to_dict()`` is stored under
    ``mir_features``. A failure on one track is recorded and does not stop
    the run. The corpus file is rewritten only when at least one track was
    patched.

    Returns:
        ``0`` when at least one track was patched or there was nothing to
        do, ``1`` when there was work to do and every track failed.
    """
    args = _parse_args()
    # Explicit UTF-8: the platform default encoding is not reliable for JSON.
    raw = json.loads(args.corpus.read_text(encoding="utf-8"))
    # The corpus is either a bare list of tracks or an object with "tracks".
    tracks = raw if isinstance(raw, list) else raw.get("tracks", [])
    total = len(tracks)
    print(f"[enrich_mir] {total} catalog tracks")

    todo = [t for t in tracks if args.force or not t.get("mir_features")]
    print(f"[enrich_mir] {len(todo)} to enrich (already-enriched skipped; pass --force to redo)")

    patched = 0
    failed: list[str] = []
    for i, t in enumerate(todo, start=1):
        track_id = t.get("track_id", "?")
        try:
            url = _audio_url_for(t)
            if not url:
                # No request was made, so the politeness sleep is skipped.
                failed.append(f"{track_id}: no audio URL in corpus entry")
                continue
            audio_bytes = _download(url, args.timeout)
            wav, sr = _decode(audio_bytes)
            features = compute_mir(wav, sr)
            t["mir_features"] = features.to_dict()
            patched += 1
            if i % 10 == 0 or i == len(todo):
                print(f" [{i:>3}/{len(todo)}] enriched={patched} failed={len(failed)}")
        except Exception as exc:
            # Per-track boundary: download, decode and feature extraction can
            # fail in many ways; record the failure and keep going.
            failed.append(f"{track_id}: {exc!r}")
        time.sleep(args.sleep)

    if patched > 0:
        # Skip the rewrite when nothing changed so the file is left untouched.
        args.corpus.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"[enrich_mir] DONE: patched={patched} failed={len(failed)}")
    if failed:
        print("[enrich_mir] first 10 failures:")
        for f in failed[:10]:
            print(f" - {f}")
    return 0 if patched > 0 or not todo else 1
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _audio_url_for(track: dict) -> str | None:
|
| 80 |
+
"""Pick the best audio source URL for a track per its tier."""
|
| 81 |
+
ext = track.get("external_ids") or {}
|
| 82 |
+
return (
|
| 83 |
+
ext.get("previewUrl") # iTunes Tier-1
|
| 84 |
+
or ext.get("jamendoAudioUrl") # Jamendo enriched
|
| 85 |
+
or ext.get("jamendoStreamUrl")
|
| 86 |
+
or track.get("source_url") # last resort
|
| 87 |
+
)
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def _download(url: str, timeout: float) -> bytes:
    """Fetch ``url`` and return the full response body as bytes.

    The request is sent with the ``APPLE_UA`` User-Agent header and the
    given per-request ``timeout``.
    """
    request = urllib.request.Request(url, headers={"User-Agent": APPLE_UA})
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        payload = response.read()
    return payload
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _decode(audio_bytes: bytes):
    """Decode arbitrary audio bytes to a mono waveform at 22050 Hz.

    Decoding from memory is tried first. If that raises or yields an empty
    array, the bytes are written to a ``.m4a`` file on disk and decoded from
    the path, so AAC-LC (.m4a) input works as well.

    Args:
        audio_bytes: Raw encoded audio.

    Returns:
        ``(wav, sr)`` as returned by ``librosa.load``.

    Raises:
        Whatever ``librosa.load`` raises when the on-disk fallback also fails.
    """
    target_sr = 22050

    # Try BytesIO first (works for mp3/wav/flac/ogg).
    try:
        wav, sr = librosa.load(io.BytesIO(audio_bytes), sr=target_sr, mono=True)
        if wav.size > 0:
            return wav, sr
    except Exception:
        # Deliberate best-effort: any in-memory failure just means the
        # temp-file path below gets its turn.
        pass

    # Fall through to the temp-file path. The file is created inside a
    # temporary directory and closed before decoding, because reopening a
    # still-open NamedTemporaryFile by name does not work on every platform
    # (notably Windows).
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "audio.m4a"
        tmp_path.write_bytes(audio_bytes)
        wav, sr = librosa.load(str(tmp_path), sr=target_sr, mono=True)
    return wav, sr
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def _parse_args() -> argparse.Namespace:
    """Parse the command-line flags of the MIR-feature enrichment script.

    Flags: ``--corpus`` (path, defaults to ``DEFAULT_CORPUS``), ``--force``
    (boolean switch), ``--sleep`` (float seconds, default 0.2) and
    ``--timeout`` (float seconds, default 30.0).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    option_specs = (
        ("--corpus", {"type": Path, "default": DEFAULT_CORPUS}),
        ("--force", {
            "action": "store_true",
            "help": "Re-enrich even tracks that already have mir_features",
        }),
        ("--sleep", {
            "type": float,
            "default": 0.2,
            "help": "Per-track sleep to be polite to source CDNs",
        }),
        ("--timeout", {
            "type": float,
            "default": 30.0,
            "help": "Per-request download timeout",
        }),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
|
| 128 |
+
sys.exit(main())
|
backend/scripts/run_rag_eval.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""RAG eval harness for the /narrative explanatory layer.
|
| 2 |
+
|
| 3 |
+
Reads `backend/tests/fixtures/narrative_golden_set.json`, runs each case
|
| 4 |
+
through `rag_narrative.generate_narrative()` with `_call_openai_json` mocked
|
| 5 |
+
to return the case's prepared LLM response, and scores aggregate metrics.
|
| 6 |
+
|
| 7 |
+
Metrics:
|
| 8 |
+
- kind_agreement_rate: result.kind == expected.kind
|
| 9 |
+
- reason_agreement_rate: (result.reason or null) == (expected.reason or null)
|
| 10 |
+
- citation_groundedness_rate: among happy_path cases, fraction returning
|
| 11 |
+
`kind=narrative` AND all citations validate.
|
| 12 |
+
- gate_correctness_rate: among low_context cases, fraction returning
|
| 13 |
+
`kind=low_confidence` with the right reason.
|
| 14 |
+
- hallucination_rejection_rate: among hallucinated_citation cases, fraction
|
| 15 |
+
returning `kind=unavailable, reason=citation-hallucinated`.
|
| 16 |
+
|
| 17 |
+
Writes a summary JSON to `factory/artifacts/RAG_EVAL_RESULT.json` so the
|
| 18 |
+
result is committable and reviewable in PRs that touch rag_narrative.py.
|
| 19 |
+
|
| 20 |
+
Run:
|
| 21 |
+
python -m backend.scripts.run_rag_eval [--out PATH] [--golden PATH]
|
| 22 |
+
|
| 23 |
+
Returns non-zero exit code if any baseline gate fails (kind_agreement < 1.0,
|
| 24 |
+
gate_correctness < 1.0, hallucination_rejection < 1.0). Those gates are
|
| 25 |
+
load-bearing — any regression in them is a real bug in the validation layer.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
import argparse
|
| 31 |
+
import json
|
| 32 |
+
import sys
|
| 33 |
+
from collections import Counter
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
from typing import Any
|
| 36 |
+
from unittest.mock import patch
|
| 37 |
+
|
| 38 |
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
| 39 |
+
GOLDEN_SET_PATH = REPO_ROOT / "backend" / "tests" / "fixtures" / "narrative_golden_set.json"
|
| 40 |
+
DEFAULT_OUT_PATH = REPO_ROOT / "factory" / "artifacts" / "RAG_EVAL_RESULT.json"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def _load_golden_set(path: Path) -> list[dict]:
|
| 44 |
+
data = json.loads(path.read_text())
|
| 45 |
+
cases = data.get("cases", [])
|
| 46 |
+
if not cases:
|
| 47 |
+
raise ValueError(f"empty golden set at {path}")
|
| 48 |
+
return cases
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _build_context(spec: dict):
    """Build a ``rag_narrative.NarrativeContext`` from a golden-set spec dict.

    ``artist`` and ``acrcloudCoverSongId`` are optional in the spec; every
    other field is required. Windows are converted to tuples, ``rawCosine``
    to a float, and each ``criteria`` entry is expanded into a
    ``CriterionContext``.
    """
    from backend import rag_narrative

    fields = {
        "queryFingerprint": spec["queryFingerprint"],
        "trackId": spec["trackId"],
        "title": spec["title"],
        "artist": spec.get("artist"),
        "queryWindow": tuple(spec["queryWindow"]),
        "matchWindow": tuple(spec["matchWindow"]),
        "rawCosine": float(spec["rawCosine"]),
    }
    criteria = []
    for criterion_spec in spec["criteria"]:
        criteria.append(rag_narrative.CriterionContext(**criterion_spec))
    fields["criteria"] = criteria
    fields["acrcloudCoverSongId"] = spec.get("acrcloudCoverSongId")
    return rag_narrative.NarrativeContext(**fields)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _run_case(case: dict) -> dict:
    """Execute a single golden-set case and compare the outcome to expectations.

    ``rag_narrative._call_openai_json`` is patched for the duration of the
    call so no real LLM request is made.

    Returns:
        A report row holding the expected and actual ``kind``/``reason``,
        whether the patched LLM helper was invoked, and the derived
        ``kind_match``, ``reason_match`` and ``gate_respected`` flags.
    """
    from backend import rag_narrative

    mocked = case["mocked_llm_response"]
    expected_kind = case["expected_kind"]
    expected_reason = case.get("expected_reason")
    must_not_call_llm = mocked == "MUST_NOT_BE_CALLED"

    # The "MUST_NOT_BE_CALLED" sentinel exists in the JSON purely for
    # readability. If the helper is wrongly invoked anyway it returns None,
    # which surfaces as openai-error and makes the case fail kind agreement.
    if must_not_call_llm:
        patch_return = None
    else:
        patch_return = mocked

    context = _build_context(case["context"])

    patcher = patch("backend.rag_narrative._call_openai_json", return_value=patch_return)
    with patcher as call_mock:
        result = rag_narrative.generate_narrative(
            context,
            case["mode"],
            model_sha="eval-model-sha",
            catalog_sha="eval-catalog-sha",
        )
    llm_was_called = call_mock.called

    def _field(name):
        # Attribute access first; fall back to key access for dict results.
        value = getattr(result, name, None)
        if value:
            return value
        if isinstance(result, dict):
            return result.get(name)
        return None

    actual_kind = _field("kind")
    actual_reason = _field("reason")

    row = {
        "name": case["name"],
        "category": case["category"],
        "mode": case["mode"],
        "expected_kind": expected_kind,
        "actual_kind": actual_kind,
        "expected_reason": expected_reason,
        "actual_reason": actual_reason,
        "llm_was_called": llm_was_called,
        "must_not_call_llm": must_not_call_llm,
    }
    row["kind_match"] = actual_kind == expected_kind
    # Falsy reasons are normalised to None so "" and None compare equal.
    row["reason_match"] = (actual_reason or None) == (expected_reason or None)
    row["gate_respected"] = (not must_not_call_llm) or (not llm_was_called)
    return row
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def _aggregate(rows: list[dict]) -> dict:
|
| 125 |
+
n = len(rows)
|
| 126 |
+
kind_correct = sum(1 for r in rows if r["kind_match"])
|
| 127 |
+
reason_correct = sum(1 for r in rows if r["reason_match"])
|
| 128 |
+
by_cat = {}
|
| 129 |
+
cat_total: Counter[str] = Counter()
|
| 130 |
+
cat_kind_correct: Counter[str] = Counter()
|
| 131 |
+
cat_reason_correct: Counter[str] = Counter()
|
| 132 |
+
cat_gate_respected: Counter[str] = Counter()
|
| 133 |
+
for r in rows:
|
| 134 |
+
cat_total[r["category"]] += 1
|
| 135 |
+
if r["kind_match"]:
|
| 136 |
+
cat_kind_correct[r["category"]] += 1
|
| 137 |
+
if r["reason_match"]:
|
| 138 |
+
cat_reason_correct[r["category"]] += 1
|
| 139 |
+
if r["gate_respected"]:
|
| 140 |
+
cat_gate_respected[r["category"]] += 1
|
| 141 |
+
|
| 142 |
+
for cat, total in cat_total.items():
|
| 143 |
+
by_cat[cat] = {
|
| 144 |
+
"total": total,
|
| 145 |
+
"kind_agreement_rate": round(cat_kind_correct[cat] / total, 4),
|
| 146 |
+
"reason_agreement_rate": round(cat_reason_correct[cat] / total, 4),
|
| 147 |
+
"gate_respected_rate": round(cat_gate_respected[cat] / total, 4),
|
| 148 |
+
}
|
| 149 |
+
|
| 150 |
+
return {
|
| 151 |
+
"n_cases": n,
|
| 152 |
+
"kind_agreement_rate": round(kind_correct / n, 4) if n else 0.0,
|
| 153 |
+
"reason_agreement_rate": round(reason_correct / n, 4) if n else 0.0,
|
| 154 |
+
"by_category": by_cat,
|
| 155 |
+
"baseline_gates": {
|
| 156 |
+
# Each gate is "must be 1.0 to pass" — any regression here is a
|
| 157 |
+
# real bug in validation / gating, not noise.
|
| 158 |
+
"happy_path_kind_agreement": by_cat.get("happy_path", {}).get("kind_agreement_rate", 0.0),
|
| 159 |
+
"low_context_gate_correctness": by_cat.get("low_context", {}).get("reason_agreement_rate", 0.0),
|
| 160 |
+
"hallucination_rejection": by_cat.get("hallucinated_citation", {}).get("reason_agreement_rate", 0.0),
|
| 161 |
+
"malformed_rejection": by_cat.get("malformed_output", {}).get("reason_agreement_rate", 0.0),
|
| 162 |
+
"openai_error_handling": by_cat.get("openai_error", {}).get("reason_agreement_rate", 0.0),
|
| 163 |
+
},
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def run_eval(golden_set_path: Path = GOLDEN_SET_PATH) -> dict:
    """Run every golden-set case and return the summary plus per-case rows.

    This is the public entry point: it loads the cases from
    ``golden_set_path``, runs each one, and returns
    ``{"summary": ..., "rows": ...}``. Writing the result to disk is left to
    the CLI wrapper.
    """
    rows = []
    for case in _load_golden_set(golden_set_path):
        rows.append(_run_case(case))
    return {"summary": _aggregate(rows), "rows": rows}
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
def main() -> int:
    """CLI wrapper: run the eval, write the result JSON and print a report.

    Returns:
        ``1`` when any baseline gate scores below 1.0, otherwise ``0``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--out", type=Path, default=DEFAULT_OUT_PATH,
        help="Where to write the eval result JSON.",
    )
    parser.add_argument(
        "--golden", type=Path, default=GOLDEN_SET_PATH,
        help="Override golden-set path (for experiments).",
    )
    args = parser.parse_args()

    result = run_eval(args.golden)
    out_path = args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(result, indent=2) + "\n")

    summary = result["summary"]
    gates = summary["baseline_gates"]

    print(f"RAG eval — {summary['n_cases']} cases")
    print(f" kind agreement : {summary['kind_agreement_rate']:.2%}")
    print(f" reason agreement : {summary['reason_agreement_rate']:.2%}")
    print()
    print(" Baseline gates (each MUST be 1.0):")
    for gate, score in gates.items():
        if score >= 1.0:
            status = "OK "
        else:
            status = "FAIL"
        print(f" [{status}] {gate}: {score:.2%}")
    failed = [gate for gate, score in gates.items() if score < 1.0]

    print()
    print(" By category:")
    for cat, stats in summary["by_category"].items():
        kind_rate = stats["kind_agreement_rate"]
        reason_rate = stats["reason_agreement_rate"]
        gate_rate = stats["gate_respected_rate"]
        print(
            f" {cat:<26} n={stats['total']:>2} kind={kind_rate:.2%} "
            f"reason={reason_rate:.2%} gate_respected={gate_rate:.2%}"
        )

    print()
    print(f"Result written to {args.out}")

    if not failed:
        return 0
    print(f"FAILED gates: {', '.join(failed)}", file=sys.stderr)
    return 1
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
if __name__ == "__main__":
|
| 230 |
+
sys.exit(main())
|
backend/scripts/verify_matching.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Self-retrieval verification for the PiedPiper matching pipeline.
|
| 2 |
+
|
| 3 |
+
For each iTunes Tier-1 track in the catalog: download the actual Apple
|
| 4 |
+
preview audio that was used to build the catalog embedding, POST it back
|
| 5 |
+
to /neighbors as a fresh query, and check that the same track is returned
|
| 6 |
+
at rank 1 with high cosine similarity.
|
| 7 |
+
|
| 8 |
+
This is the falsifiable answer to "are the matches accurate?" — if the
|
| 9 |
+
self-retrieval rate is high, the encoder + retrieval pipeline are doing
|
| 10 |
+
what they claim. If it isn't, there's a bug worth finding before we
|
| 11 |
+
trust any cross-track match.
|
| 12 |
+
|
| 13 |
+
Usage:
|
| 14 |
+
# Against the live HF Space (default):
|
| 15 |
+
python -m backend.scripts.verify_matching
|
| 16 |
+
|
| 17 |
+
# Against a local backend:
|
| 18 |
+
python -m backend.scripts.verify_matching --base-url http://localhost:8000
|
| 19 |
+
|
| 20 |
+
# Just one target:
|
| 21 |
+
python -m backend.scripts.verify_matching --target tier1:itunes:1488408568
|
| 22 |
+
|
| 23 |
+
ADR-0002 §"Verification" documents the methodology this harness implements.
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
from __future__ import annotations
|
| 27 |
+
|
| 28 |
+
import argparse
|
| 29 |
+
import json
|
| 30 |
+
import sys
|
| 31 |
+
import tempfile
|
| 32 |
+
import time
|
| 33 |
+
import urllib.request
|
| 34 |
+
from pathlib import Path
|
| 35 |
+
|
| 36 |
+
REPO_ROOT = Path(__file__).resolve().parents[3]
|
| 37 |
+
DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
|
| 38 |
+
DEFAULT_BASE_URL = "https://rajata98-piedpiper.hf.space"
|
| 39 |
+
APPLE_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def main() -> int:
    """Run self-retrieval verification over the selected catalog targets.

    Each target is verified through ``_verify_one``; a failure on one target
    is recorded as an error row and does not stop the run. A summary is
    printed at the end and, when ``--json-out`` is given, all rows are
    written to that path.

    Returns:
        ``2`` when no targets were selected, otherwise ``0``.
    """
    args = _parse_args()
    catalog = _load_catalog(args.corpus)
    targets = _select_targets(catalog, args.target)
    if not targets:
        print("[verify_matching] no Tier-1 iTunes targets found in catalog; aborting", file=sys.stderr)
        return 2
    print(f"[verify_matching] running against {args.base_url}")
    print(f"[verify_matching] {len(targets)} target(s) to verify")
    print()

    results: list[dict] = []
    for i, target in enumerate(targets, start=1):
        # .get(): a catalog entry missing an optional field must not abort
        # the whole run from outside the per-target try block.
        title = target.get("title")
        artist = target.get("artist")
        print(f"[{i}/{len(targets)}] {title} — {artist}")
        try:
            result = _verify_one(target, args.base_url, args.timeout)
        except Exception as exc:
            # Per-target boundary: record the failure and keep going.
            print(f" ERROR: {exc!r}")
            result = {
                "track_id": target.get("track_id"),
                "title": title,
                "artist": artist,
                # Some exceptions stringify to ""; fall back to repr() so the
                # row is never mistaken for a success by `r.get("error")`.
                "error": str(exc) or repr(exc),
                "self_rank": None,
                "self_cosine": None,
                # Same keys as the rows _verify_one returns.
                "self_timestamp": None,
                "top1_track_id": None,
                "top1_cosine": None,
                "top1_title": None,
                "top1_artist": None,
                # Legacy key, kept so existing readers of the JSON still work.
                "timestamp": None,
            }
        results.append(result)
        _print_one(result)
        time.sleep(args.sleep)

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    _print_summary(results)

    if args.json_out:
        Path(args.json_out).write_text(json.dumps(results, indent=2), encoding="utf-8")
        print(f"\n[verify_matching] full results written to {args.json_out}")

    return 0
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def _verify_one(target: dict, base_url: str, timeout: float) -> dict:
    """Round-trip one catalog track through ``/neighbors`` and report its rank.

    The track's preview audio is downloaded and posted back as a fresh query
    with ``k=5``; the response is searched for the track's own id.

    Args:
        target: Catalog entry; must carry ``track_id``, ``title``,
            ``artist`` and ``external_ids.previewUrl``.
        base_url: Backend base URL (a trailing slash is tolerated).
        timeout: Timeout in seconds for both the download and the POST.

    Returns:
        A result row with the self rank/cosine/timestamp (``None``/``{}``
        when the track is not among the returned neighbors) and the rank-1
        neighbor's id, cosine, title and artist.

    Raises:
        RuntimeError: If the entry has no ``previewUrl``, ``httpx`` is not
            installed, or the backend returns no neighbors.
    """
    preview_url = (target.get("external_ids") or {}).get("previewUrl")
    if not preview_url:
        raise RuntimeError("no previewUrl in catalog entry")

    # Check the optional dependency before spending time on the download.
    try:
        import httpx
    except ImportError as exc:
        raise RuntimeError(
            "httpx required: pip install -e 'backend/[ingest]'"
        ) from exc

    # Download the iTunes preview. Apple CDN rejects default Python UA so set Safari.
    req = urllib.request.Request(preview_url, headers={"User-Agent": APPLE_UA})
    with urllib.request.urlopen(req, timeout=timeout) as r:
        audio_bytes = r.read()

    # POST to /neighbors. The bytes are uploaded directly: the upload's
    # filename and content type are set explicitly in the multipart tuple, so
    # a temp file on disk adds nothing (and reopening an open
    # NamedTemporaryFile by name is not portable).
    with httpx.Client(timeout=timeout) as client:
        resp = client.post(
            f"{base_url.rstrip('/')}/neighbors",
            params={"k": 5},
            files={"file": (f"{target['track_id']}.m4a", audio_bytes, "audio/mp4")},
        )
        resp.raise_for_status()
        body = resp.json()

    neighbors = body.get("neighbors") or []
    if not neighbors:
        raise RuntimeError(f"empty neighbors response: {body}")

    # Find self-rank in the returned list (1-based).
    self_rank = None
    self_cosine = None
    self_ts: dict = {}
    for j, n in enumerate(neighbors, start=1):
        if n.get("trackId") == target["track_id"]:
            self_rank = j
            self_cosine = float(n.get("rawCosine") or 0)
            self_ts = n.get("matchTimestamp") or {}
            break

    top1 = neighbors[0]
    top1_track = top1.get("track") or {}
    return {
        "track_id": target["track_id"],
        "title": target["title"],
        "artist": target["artist"],
        "self_rank": self_rank,
        "self_cosine": self_cosine,
        "self_timestamp": self_ts,
        "top1_track_id": top1.get("trackId"),
        "top1_cosine": float(top1.get("rawCosine") or 0),
        "top1_title": top1_track.get("title"),
        "top1_artist": top1_track.get("artist"),
    }
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def _print_one(r: dict) -> None:
    """Print the per-target report lines for a single verification result.

    Nothing is printed when ``r`` carries a truthy ``"error"`` entry.
    Otherwise two lines are always emitted (the rank-1 neighbour and the
    self-retrieval verdict), plus a third line with the matched time
    ranges when the target was found and a timestamp dict is present.

    Args:
        r: Result dict. Keys read here: ``error``, ``self_rank``,
            ``self_cosine``, ``top1_track_id``, ``top1_cosine`` and
            ``self_timestamp``.
    """
    if r.get("error"):
        return

    rank = r.get("self_rank")
    cosine = r.get("self_cosine")

    # Verdict: rank 1 is a pass, any other rank is reported, and a missing
    # rank means the target was absent from the returned neighbours.
    if rank == 1:
        verdict = "PASS"
    elif rank is None:
        verdict = "SELF NOT IN TOP-5"
    else:
        verdict = f"SELF AT RANK {rank}"

    cosine_text = "—" if cosine is None else f"{cosine:.4f}"
    # Track ids are clipped to 40 characters and left-padded to that width.
    top1_label = (r.get("top1_track_id") or "")[:40]

    print(f" -> rank-1 returned: {top1_label:<40} cos={r.get('top1_cosine', 0):.4f}")
    print(f" self-retrieval: {verdict} self-cos={cosine_text}")

    stamp = r.get("self_timestamp") or {}
    if not stamp or rank is None:
        return

    q_start = stamp.get("queryStartSec", "?")
    q_end = stamp.get("queryEndSec", "?")
    c_start = stamp.get("catalogStartSec", "?")
    c_end = stamp.get("catalogEndSec", "?")
    print(f" self-timestamp: query {q_start}-{q_end}s ↔ catalog {c_start}-{c_end}s")
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def _print_summary(results: list[dict]) -> None:
    """Print aggregate self-retrieval statistics and the pass/fail verdicts.

    Results with a truthy ``"error"`` entry are excluded from every rate;
    percentages use the number of successful round-trips as denominator.
    If no result succeeded, a single diagnostic line is printed instead.

    Args:
        results: One result dict per target. Keys read here: ``error``,
            ``self_rank``, ``self_cosine`` and ``self_timestamp``.
    """
    n = len(results)
    succeeded = [r for r in results if not r.get("error")]
    n_ok = len(succeeded)
    if n_ok == 0:
        print("All targets errored out. Check the base URL + network.")
        return

    def _timestamp_aligned(result: dict) -> bool:
        # A missing timestamp must not count as aligned: comparing two
        # absent keys would otherwise evaluate None == None as a match.
        stamp = result.get("self_timestamp") or {}
        query_start = stamp.get("queryStartSec")
        return query_start is not None and query_start == stamp.get("catalogStartSec")

    self_at_1 = [r for r in succeeded if r.get("self_rank") == 1]
    self_in_top5 = [r for r in succeeded if r.get("self_rank") is not None]
    self_cosines = [r["self_cosine"] for r in succeeded if r.get("self_cosine") is not None]
    # Alignment is only meaningful when the target itself was retrieved.
    ts_align = [r for r in self_in_top5 if _timestamp_aligned(r)]
    # Computed once and reused by both the statistic and the verdict line.
    mean = sum(self_cosines) / len(self_cosines) if self_cosines else None

    print(f"Total targets: {n}")
    print(f"Successful round-trips: {n_ok}")
    print(f"Self at rank 1: {len(self_at_1):>3} / {n_ok} ({100*len(self_at_1)/n_ok:.0f}%)")
    print(f"Self in top 5: {len(self_in_top5):>3} / {n_ok} ({100*len(self_in_top5)/n_ok:.0f}%)")
    if mean is not None:
        print(f"Self-match cosine mean: {mean:.4f} (n={len(self_cosines)})")
        print(f"Self-match cosine min: {min(self_cosines):.4f}")
    print(f"Timestamp aligns to self: {len(ts_align):>3} / {n_ok} ({100*len(ts_align)/n_ok:.0f}%)")
    print()
    print("Pass criteria (ADR-0002 §Verification):")
    print(f" self-retrieval rate >= 90% ... {'PASS' if len(self_at_1) / n_ok >= 0.9 else 'FAIL'}")
    print(f" self-cosine mean >= 0.92 ... {'PASS' if mean is not None and mean >= 0.92 else 'FAIL'}")
    print(f" timestamp aligns in >= 8 cases ... {'PASS' if len(ts_align) >= 8 else 'FAIL (or n<8 successful)'}")
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _load_catalog(path: Path) -> list[dict]:
    """Load the track list from a corpus JSON file.

    Two layouts are accepted: a bare JSON array of track dicts, or an
    object that holds the array under the ``"tracks"`` key.

    Args:
        path: Location of the JSON file.

    Returns:
        The list of track dicts; an empty list when the object layout has
        no (or a null) ``"tracks"`` entry.
    """
    # Decode explicitly as UTF-8 rather than the platform default, so
    # non-ASCII titles/artists load the same way on every OS.
    raw = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(raw, list):
        return raw
    # `or []` covers an explicit `"tracks": null`, keeping the return a list.
    return raw.get("tracks") or []
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def _select_targets(catalog: list[dict], target_id: str | None) -> list[dict]:
|
| 206 |
+
tier1 = [t for t in catalog if t.get("source") == "itunes" and (t.get("external_ids") or {}).get("previewUrl")]
|
| 207 |
+
if target_id:
|
| 208 |
+
return [t for t in tier1 if t.get("track_id") == target_id]
|
| 209 |
+
return tier1
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def _parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse the process arguments.

    Returns:
        The parsed namespace with ``corpus``, ``base_url``, ``target``,
        ``timeout``, ``sleep`` and ``json_out`` attributes.
    """
    parser = argparse.ArgumentParser(description=__doc__)

    # Declared as a table so each option's settings sit side by side; the
    # order is the order in which options appear in --help.
    options = [
        ("--corpus", {
            "type": Path,
            "default": DEFAULT_CORPUS,
            "help": f"corpus.json path (default: {DEFAULT_CORPUS})",
        }),
        ("--base-url", {
            "default": DEFAULT_BASE_URL,
            "help": f"backend base URL (default: {DEFAULT_BASE_URL})",
        }),
        ("--target", {
            "default": None,
            "help": "single track_id to verify (default: all Tier-1 iTunes tracks)",
        }),
        ("--timeout", {
            "type": float,
            "default": 120.0,
            "help": "per-request timeout in seconds (default: 120)",
        }),
        ("--sleep", {
            "type": float,
            "default": 1.0,
            "help": "seconds between requests to be polite (default: 1.0)",
        }),
        ("--json-out", {
            "default": None,
            "help": "optional path to write full results as JSON",
        }),
    ]
    for flag, settings in options:
        parser.add_argument(flag, **settings)

    return parser.parse_args()
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
if __name__ == "__main__":
    # Script entry point: hand main()'s return value to the interpreter as
    # the process exit status.
    raise SystemExit(main())
|
requirements.txt
CHANGED
|
@@ -16,3 +16,8 @@ httpx>=0.27
|
|
| 16 |
pyyaml>=6.0
|
| 17 |
# Sentry error tracking — no-op when SENTRY_DSN env var is unset.
|
| 18 |
sentry-sdk[fastapi]>=2.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
pyyaml>=6.0
|
| 17 |
# Sentry error tracking — no-op when SENTRY_DSN env var is unset.
|
| 18 |
sentry-sdk[fastapi]>=2.0
|
| 19 |
+
# ADR-0005 (Commit C): GPT-4o-mini for /narrative explanatory layer.
|
| 20 |
+
# Endpoint returns 503 narrative-disabled when OPENAI_API_KEY is unset, so
|
| 21 |
+
# this dep is safe to add even before the Space secret is provisioned.
|
| 22 |
+
openai>=1.40
|
| 23 |
+
pydantic>=2.0
|