RajatA98 committed on
Commit
b43bd50
·
1 Parent(s): a57cff5

Deploy RAG narrative layer + telemetry + RAG eval harness

Browse files
backend/api.py CHANGED
@@ -20,6 +20,7 @@ Errors are returned as `{"error": "<code>"}` to match the frontend's `api.js`:
20
 
21
  from __future__ import annotations
22
 
 
23
  import io
24
  import json
25
  import os
@@ -33,12 +34,13 @@ import soundfile as sf
33
  from fastapi import FastAPI, File, UploadFile
34
  from fastapi.middleware.cors import CORSMiddleware
35
  from fastapi.responses import JSONResponse
 
36
 
37
  # ADR-0002: clap_engine is no longer the primary encoder; muq_engine took its
38
  # place via clap_windowed's swap. We still import clap_engine here only because
39
  # legacy code paths may reference it; the encoder load + genre tagging both go
40
  # through muq_engine.
41
- from . import __version__, acrcloud_engine, muq_engine, clap_windowed, config, mir_features, similarity
42
  from .librosa_engine import analyze_array
43
  from .scoring import compute_report
44
 
@@ -63,6 +65,7 @@ _corpus_by_id: dict[str, dict] = {}
63
  _flat_catalog: similarity.FlatCatalog | None = None
64
  _catalog_cosine_distribution: np.ndarray | None = None # sorted upper-tri off-diag pairwise cosines
65
  _model_sha: str = ""
 
66
  _threshold_default: float = config.SIMILARITY_THRESHOLD_DEFAULT
67
 
68
 
@@ -77,7 +80,7 @@ def _load_corpus() -> None:
77
  """Populate corpus globals from disk if all corpus artifacts are present."""
78
  global _corpus_tracks, _corpus_embeddings, _corpus_by_id, _flat_catalog
79
  global _catalog_cosine_distribution
80
- global _model_sha, _threshold_default
81
  corpus_dir = Path(os.getenv("CORPUS_DIR", str(_default_corpus_dir())))
82
  cpath = corpus_dir / "corpus.json"
83
  epath = corpus_dir / "embeddings.npy"
@@ -95,6 +98,7 @@ def _load_corpus() -> None:
95
  _corpus_by_id = {}
96
  _flat_catalog = None
97
  _model_sha = ""
 
98
  _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
99
  return
100
  try:
@@ -103,10 +107,16 @@ def _load_corpus() -> None:
103
  _corpus_embeddings = np.load(epath).astype(np.float32)
104
  with np.load(spath) as npz:
105
  segment_embeddings = {k: npz[k].astype(np.float32) for k in npz.files}
106
- manifest = json.loads(mpath.read_text())
 
107
  _model_sha = str(manifest.get("model_sha") or "unpinned")
108
  if _model_sha == "unpinned":
109
  print("[api] WARNING manifest missing model_sha; using 'unpinned'")
 
 
 
 
 
110
  _threshold_default = similarity.threshold_from_manifest(manifest)
111
  _flat_catalog = similarity.build_flat_catalog(_corpus_tracks, _corpus_embeddings, segment_embeddings)
112
  _catalog_cosine_distribution = similarity.compute_catalog_distribution(_flat_catalog)
@@ -128,6 +138,7 @@ def _load_corpus() -> None:
128
  _flat_catalog = None
129
  _catalog_cosine_distribution = None
130
  _model_sha = ""
 
131
  _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
132
 
133
 
@@ -332,6 +343,10 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
332
  raw = await file.read()
333
  if (err := _validate_upload(file, raw)) is not None:
334
  return err
 
 
 
 
335
  ext = Path(file.filename or "").suffix.lower()
336
  pipeline = _decode_and_pipeline(raw, ext=ext)
337
  if isinstance(pipeline, JSONResponse):
@@ -348,6 +363,8 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
348
  "modelSha": _model_sha,
349
  "thresholdDefault": _threshold_default,
350
  "acrcloud": acrcloud_engine.to_response_dict(acrcloud_engine.disabled_response()),
 
 
351
  }
352
 
353
  neighbors = similarity.top_k_neighbors(
@@ -394,6 +411,40 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
394
 
395
  specificity = float(similarity.query_specificity(pipeline["emb"].astype(np.float32), _flat_catalog))
396
  acr = acrcloud_engine.call_for_query(pipeline["acrcloud_audio"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
 
398
  return {
399
  "query": query_track,
@@ -405,10 +456,208 @@ async def neighbors_endpoint(file: UploadFile = File(...), k: int = 5):
405
  "querySpecificity": specificity,
406
  "modelSha": _model_sha,
407
  "thresholdDefault": _threshold_default,
408
- "acrcloud": acrcloud_engine.to_response_dict(acr),
 
 
409
  }
410
 
411
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  def run() -> None:
413
  """Convenience launcher: `python -m backend.api` or `uvicorn backend.api:app`."""
414
  import uvicorn
 
20
 
21
  from __future__ import annotations
22
 
23
+ import hashlib
24
  import io
25
  import json
26
  import os
 
34
  from fastapi import FastAPI, File, UploadFile
35
  from fastapi.middleware.cors import CORSMiddleware
36
  from fastapi.responses import JSONResponse
37
+ from pydantic import BaseModel, Field
38
 
39
  # ADR-0002: clap_engine is no longer the primary encoder; muq_engine took its
40
  # place via clap_windowed's swap. We still import clap_engine here only because
41
  # legacy code paths may reference it; the encoder load + genre tagging both go
42
  # through muq_engine.
43
+ from . import __version__, acrcloud_engine, context_token, muq_engine, narrative_telemetry, clap_windowed, config, mir_features, similarity
44
  from .librosa_engine import analyze_array
45
  from .scoring import compute_report
46
 
 
65
  _flat_catalog: similarity.FlatCatalog | None = None
66
  _catalog_cosine_distribution: np.ndarray | None = None # sorted upper-tri off-diag pairwise cosines
67
  _model_sha: str = ""
68
+ _catalog_sha: str = "" # sha256 of manifest.json bytes; used in contextToken claims
69
  _threshold_default: float = config.SIMILARITY_THRESHOLD_DEFAULT
70
 
71
 
 
80
  """Populate corpus globals from disk if all corpus artifacts are present."""
81
  global _corpus_tracks, _corpus_embeddings, _corpus_by_id, _flat_catalog
82
  global _catalog_cosine_distribution
83
+ global _model_sha, _catalog_sha, _threshold_default
84
  corpus_dir = Path(os.getenv("CORPUS_DIR", str(_default_corpus_dir())))
85
  cpath = corpus_dir / "corpus.json"
86
  epath = corpus_dir / "embeddings.npy"
 
98
  _corpus_by_id = {}
99
  _flat_catalog = None
100
  _model_sha = ""
101
+ _catalog_sha = ""
102
  _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
103
  return
104
  try:
 
107
  _corpus_embeddings = np.load(epath).astype(np.float32)
108
  with np.load(spath) as npz:
109
  segment_embeddings = {k: npz[k].astype(np.float32) for k in npz.files}
110
+ manifest_bytes = mpath.read_bytes()
111
+ manifest = json.loads(manifest_bytes.decode("utf-8"))
112
  _model_sha = str(manifest.get("model_sha") or "unpinned")
113
  if _model_sha == "unpinned":
114
  print("[api] WARNING manifest missing model_sha; using 'unpinned'")
115
+ # catalog_sha = sha256 of manifest.json bytes. Captures every
116
+ # meaningful catalog regeneration (model swap, threshold change,
117
+ # track count change) in a single stable hash. Embedded in every
118
+ # contextToken so /narrative can detect stale tokens after redeploy.
119
+ _catalog_sha = hashlib.sha256(manifest_bytes).hexdigest()
120
  _threshold_default = similarity.threshold_from_manifest(manifest)
121
  _flat_catalog = similarity.build_flat_catalog(_corpus_tracks, _corpus_embeddings, segment_embeddings)
122
  _catalog_cosine_distribution = similarity.compute_catalog_distribution(_flat_catalog)
 
138
  _flat_catalog = None
139
  _catalog_cosine_distribution = None
140
  _model_sha = ""
141
+ _catalog_sha = ""
142
  _threshold_default = config.SIMILARITY_THRESHOLD_DEFAULT
143
 
144
 
 
343
  raw = await file.read()
344
  if (err := _validate_upload(file, raw)) is not None:
345
  return err
346
+ # queryFingerprint: SHA-256 of the upload bytes. Embedded in contextToken
347
+ # so /narrative can verify the same query is still in play. Stable across
348
+ # re-uploads of the same file; cheap to compute.
349
+ query_fingerprint = hashlib.sha256(raw).hexdigest()
350
  ext = Path(file.filename or "").suffix.lower()
351
  pipeline = _decode_and_pipeline(raw, ext=ext)
352
  if isinstance(pipeline, JSONResponse):
 
363
  "modelSha": _model_sha,
364
  "thresholdDefault": _threshold_default,
365
  "acrcloud": acrcloud_engine.to_response_dict(acrcloud_engine.disabled_response()),
366
+ "queryFingerprint": query_fingerprint,
367
+ "contextToken": None,
368
  }
369
 
370
  neighbors = similarity.top_k_neighbors(
 
411
 
412
  specificity = float(similarity.query_specificity(pipeline["emb"].astype(np.float32), _flat_catalog))
413
  acr = acrcloud_engine.call_for_query(pipeline["acrcloud_audio"])
414
+ acr_response = acrcloud_engine.to_response_dict(acr)
415
+
416
+ # Codex round-2 Q3: stateless signed token replaces the in-memory cache.
417
+ # /narrative will verify this token and rebuild context server-side from
418
+ # the embedded claims. Token is None when HMAC key isn't configured —
419
+ # /narrative also 503s in that case so the gating is consistent.
420
+ ctx_token = None
421
+ if context_token.is_configured():
422
+ neighbor_fragments: dict[str, dict] = {}
423
+ for nb in neighbors:
424
+ track = nb.get("track") or {}
425
+ ts = nb.get("matchTimestamp") or {}
426
+ neighbor_fragments[str(nb["trackId"])] = context_token.neighbor_context_fragment(
427
+ track_id=str(nb["trackId"]),
428
+ title=str(track.get("title") or nb["trackId"]),
429
+ artist=track.get("artist"),
430
+ query_window=(
431
+ float(ts.get("queryStartSec", 0.0)),
432
+ float(ts.get("queryEndSec", 0.0)),
433
+ ),
434
+ match_window=(
435
+ float(ts.get("catalogStartSec", 0.0)),
436
+ float(ts.get("catalogEndSec", 0.0)),
437
+ ),
438
+ raw_cosine=float(nb.get("rawCosine", 0.0)),
439
+ criteria=_criteria_to_token_fragment(nb.get("criteria")),
440
+ )
441
+ ctx_token = context_token.issue(
442
+ query_fingerprint=query_fingerprint,
443
+ model_sha=_model_sha or "unpinned",
444
+ catalog_sha=_catalog_sha or "no-catalog",
445
+ neighbors=neighbor_fragments,
446
+ acrcloud_cover_song_id=acr_response.get("coverSongId"),
447
+ )
448
 
449
  return {
450
  "query": query_track,
 
456
  "querySpecificity": specificity,
457
  "modelSha": _model_sha,
458
  "thresholdDefault": _threshold_default,
459
+ "acrcloud": acr_response,
460
+ "queryFingerprint": query_fingerprint,
461
+ "contextToken": ctx_token,
462
  }
463
 
464
 
465
def _criteria_to_token_fragment(criteria_block: dict | None) -> list[dict] | None:
    """Reshape /neighbors' criteria block into a flat list of criterion dicts.

    The /neighbors response groups criteria by id under a top-level dict;
    the narrative layer takes a flat list of ``{id, queryValue, matchValue,
    agreement, label}``. Converting here lets the token payload carry that
    list shape directly.

    Args:
        criteria_block: Mapping of criterion id to its entry dict, or None.

    Returns:
        One dict per criterion present in ``criteria_block``, in the fixed
        order tempo, key, harmonic, timbre; ``None`` when the block is empty
        or none of those ids has a truthy entry.
    """
    if not criteria_block:
        return None
    out: list[dict] = []
    for cid in ("tempo", "key", "harmonic", "timbre"):
        entry = criteria_block.get(cid)
        if not entry:
            continue
        # harmonic + timbre come back from /neighbors without queryValue /
        # matchValue (only agreement + label) because the raw vectors are not
        # shipped. Substitute a shape marker so the downstream validator has
        # keys to check without exposing internals.
        q_val = entry.get("queryValue")
        m_val = entry.get("matchValue")
        if cid in ("harmonic", "timbre") and q_val is None and m_val is None:
            q_val = {"vector": "elided"}
            m_val = {"vector": "elided"}
        # An explicit null is treated like a missing key: float(None) would
        # raise TypeError and str(None) would leak the literal "None".
        agreement = entry.get("agreement")
        label = entry.get("label")
        out.append({
            "id": cid,
            "queryValue": q_val,
            "matchValue": m_val,
            "agreement": 0.0 if agreement is None else float(agreement),
            "label": "" if label is None else str(label),
        })
    return out or None
498
+
499
+
500
+ # --- /narrative -------------------------------------------------------------
501
+ #
502
+ # Stateless RAG explanatory layer over /neighbors. Client sends the
503
+ # contextToken received from /neighbors plus the trackId + mode it wants
504
+ # narrated; backend verifies the token (signature, expiry, model/catalog
505
+ # version), rebuilds NarrativeContext from the embedded claims, and delegates
506
+ # to Codex's rag_narrative module.
507
+ #
508
+ # Failure shape: typed `{"error": "<code>"}` JSON, status code by class:
509
+ # 503 narrative-disabled — OPENAI_API_KEY or CONTEXT_TOKEN_HMAC_KEY absent
510
+ # 401 invalid-token — signature mismatch (tampered or wrong secret)
511
+ # 412 token-expired — past expiresAt
512
+ # 412 stale-token — modelSha/catalogSha changed since issuance
513
+ # 400 malformed-token — bad shape; not <body>.<sig>
514
+ # 404 not-in-context — trackId wasn't part of the issued token
515
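+ # 422 unsupported-mode — mode wasn't "whySimilar" or "creatorAdvice"
+ # 422 malformed-context — verified token fragment didn't fit NarrativeContext
+ # 500 narrative-error — generate_narrative raised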
516
+
517
+
518
class NarrativeRequest(BaseModel):
    """JSON body accepted by POST /narrative.

    All three fields are required and must be non-empty strings. ``mode`` is
    deliberately a plain string here: the handler itself rejects unsupported
    values with its own typed error code.
    """

    # Signed token previously returned by /neighbors.
    contextToken: str = Field(..., min_length=1)
    # Neighbor to narrate; looked up inside the verified token claims.
    trackId: str = Field(..., min_length=1)
    # Narrative flavour requested by the client.
    mode: str = Field(..., min_length=1)
522
+
523
+
524
# Translation table from context_token.TokenError.code to the
# (HTTP status, public error code) pair returned by /narrative.
# Both staleness causes collapse into the single public "stale-token" code.
_TOKEN_ERROR_TO_HTTP: dict[str, tuple[int, str]] = {
    # 400 — the token could not be parsed
    "malformed": (400, "malformed-token"),
    # 401 — the signature did not match
    "invalid-signature": (401, "invalid-token"),
    # 412 — the token was valid once but no longer applies
    "token-expired": (412, "token-expired"),
    "stale-model": (412, "stale-token"),
    "stale-catalog": (412, "stale-token"),
    # 503 — the server has no signing key configured
    "hmac-key-missing": (503, "narrative-disabled"),
}
532
+
533
+
534
@app.post("/narrative")
async def narrative_endpoint(req: NarrativeRequest):
    """RAG explanatory layer — see ADR-0005 for the full spec.

    Flow: check the configuration gates (OPENAI_API_KEY, HMAC key, supported
    mode), verify ``req.contextToken`` against the currently loaded
    model/catalog hashes, look up ``req.trackId`` in the verified claims,
    build a ``NarrativeContext`` from that fragment and delegate to
    ``rag_narrative.generate_narrative``.

    Every exit path records its outcome on the telemetry handle yielded by
    ``narrative_telemetry.measure_call``.

    Returns:
        On success, ``result.model_dump()`` when the result exposes that
        method, otherwise the result object itself. On failure, whatever
        ``_err(status, code)`` builds, using these pairs:
        503 narrative-disabled, 422 unsupported-mode, the token errors mapped
        through ``_TOKEN_ERROR_TO_HTTP``, 404 not-in-context,
        422 malformed-context, 500 narrative-error.
    """
    with narrative_telemetry.measure_call(req.mode) as tel:
        # Gate 1: OpenAI key present. Without it we can't call GPT-4o-mini.
        if not os.getenv("OPENAI_API_KEY", "").strip():
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")
        # Gate 2: HMAC key present. Without it we can't trust the token.
        if not context_token.is_configured():
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")
        # Gate 3: mode is one of the supported values.
        if req.mode not in ("whySimilar", "creatorAdvice"):
            tel.set(error_code="unsupported-mode")
            return _err(422, "unsupported-mode")

        # Verify the token. TokenError.code maps to a typed HTTP response;
        # unknown codes fall back to 400 malformed-token.
        try:
            verified = context_token.verify(
                req.contextToken,
                expected_model_sha=_model_sha or "unpinned",
                expected_catalog_sha=_catalog_sha or "no-catalog",
            )
        except context_token.TokenError as exc:
            status, code = _TOKEN_ERROR_TO_HTTP.get(exc.code, (400, "malformed-token"))
            tel.set(error_code=code)
            return _err(status, code)

        # Look up the requested trackId inside the verified token claims.
        fragment = verified.neighbors.get(req.trackId)
        if not fragment:
            tel.set(error_code="not-in-context", trackId=req.trackId)
            return _err(404, "not-in-context")

        # Lazy import: the app boots and /neighbors keeps working even if
        # rag_narrative is absent. If it is missing at request time, surface
        # 503 narrative-disabled so the frontend's no-key fallback handles it.
        try:
            from . import rag_narrative
        except ImportError:
            tel.set(error_code="narrative-disabled")
            return _err(503, "narrative-disabled")

        # Build NarrativeContext from the verified fragment; instantiating the
        # model also validates the fragment's shape.
        try:
            context = rag_narrative.NarrativeContext(
                queryFingerprint=verified.queryFingerprint,
                trackId=fragment["trackId"],
                title=fragment.get("title", ""),
                artist=fragment.get("artist"),
                queryWindow=tuple(fragment["queryWindow"]),
                matchWindow=tuple(fragment["matchWindow"]),
                rawCosine=float(fragment["rawCosine"]),
                criteria=[
                    rag_narrative.CriterionContext(**c)
                    for c in (fragment.get("criteria") or [])
                ],
                acrcloudCoverSongId=verified.acrcloudCoverSongId,
            )
        except Exception as exc:
            # Surface as malformed rather than blowing up internally, but
            # leave a trace. Only the exception type is logged so token
            # content never reaches the logs.
            print(f"[api] /narrative context build failed: {type(exc).__name__}")
            tel.set(error_code="malformed-context", trackId=req.trackId)
            return _err(422, "malformed-context")

        model_id = os.getenv("OPENAI_MODEL_ID", "gpt-4o-mini")
        try:
            # NOTE(review): this is a synchronous call inside an async
            # handler; if it performs network I/O it blocks the event loop
            # for its duration. Confirm rag_narrative is thread-safe before
            # moving it to a worker thread.
            result = rag_narrative.generate_narrative(
                context,
                req.mode,
                model_sha=_model_sha or "unpinned",
                catalog_sha=_catalog_sha or "no-catalog",
                model_id=model_id,
            )
        except Exception as exc:
            print(f"[api] /narrative generate_narrative raised: {exc!r}")
            tel.set(error_code="narrative-error", trackId=req.trackId)
            return _err(500, "narrative-error")

        # result.kind is the discriminator across the result variants.
        # Sizes are character counts, not tokens: a directional signal for
        # the stats endpoint only.
        result_kind = getattr(result, "kind", None)
        openai_called = result_kind in ("narrative", "unavailable")
        completion_chars = 0
        if result_kind == "narrative":
            completion_chars = len(getattr(result, "prose", "") or "")
        # Rough prompt size: title length plus a fixed allowance for the base
        # prompt and metadata. Reported only when OpenAI was actually called,
        # so gate-short-circuited requests don't add to the cost estimate.
        # `or ""` / str() keep a null or non-string title from raising here.
        prompt_chars_estimate = 0
        if openai_called:
            prompt_chars_estimate = len(str(fragment.get("title") or "")) + 600
        tel.set(
            result_kind=result_kind,
            openai_called=openai_called,
            gate_short_circuit=(result_kind == "low_confidence"),
            prompt_chars=prompt_chars_estimate,
            completion_chars=completion_chars,
            trackId=req.trackId,
        )

        # Pydantic v2 .model_dump() gives a uniform dict whichever variant
        # came back; the `kind` discriminator lets the frontend route
        # rendering.
        if hasattr(result, "model_dump"):
            return result.model_dump()
        return result
644
+
645
+
646
@app.get("/narrative/stats")
def narrative_stats_endpoint() -> dict:
    """Expose the current in-process /narrative counters.

    Gives a "right now" view of call counts, latency percentiles, mode and
    error distribution, and a rough cost estimate. The counters live in
    process memory and reset on restart, so this is not a long-term metrics
    store.

    The cost figure is character-based × GPT-4o-mini pricing — directional,
    not accounting-grade (see ADR-0005).
    """
    stats = narrative_telemetry.snapshot()
    return stats
659
+
660
+
661
  def run() -> None:
662
  """Convenience launcher: `python -m backend.api` or `uvicorn backend.api:app`."""
663
  import uvicorn
backend/context_token.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """HMAC-signed opaque context token for the /narrative endpoint.
2
+
3
+ Codex round-2 review Q3 pushed back on the in-memory cache approach: a TTL
4
+ cache breaks across HF restarts, multiple workers, and page refreshes. This
5
+ module is the replacement — a stateless signed token.
6
+
7
+ `/neighbors` issues a token containing the full NarrativeContext payload for
8
+ every neighbor it returns. The token is HMAC-signed with a server secret so
9
+ the backend can later verify that the client didn't tamper with it. On
10
+ `/narrative`, the backend verifies the signature + expiry + model/catalog
11
+ hashes, looks up the requested trackId inside the token payload, and rebuilds
12
+ NarrativeContext server-side from the verified claims.
13
+
14
+ This means: zero server-side state, survives restarts, scales horizontally,
15
+ and the client can't inflate the prompt with garbage to drive up cost.
16
+
17
+ Token format:
18
+ base64url(json_payload) + "." + hex(hmac_sha256(payload_bytes, secret))
19
+
20
+ Payload shape (sorted keys for stability):
21
+ {
22
+ "queryFingerprint": "<sha256 of upload bytes>",
23
+ "modelSha": "<MuQ-MuLan model sha>",
24
+ "catalogSha": "<sha256 of manifest.json bytes>",
25
+ "expiresAt": <unix seconds>,
26
+ "acrcloudCoverSongId": <dict | null>,
27
+ "neighbors": {
28
+ "<trackId>": {full per-neighbor context fields},
29
+ ...
30
+ }
31
+ }
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import base64
37
+ import hashlib
38
+ import hmac
39
+ import json
40
+ import os
41
+ import time
42
+ from dataclasses import dataclass
43
+ from typing import Any
44
+
45
# Default token lifetime: 30 minutes. Long enough for a UI session, short
# enough that tokens left stale by a redeploy expire on their own without
# operator intervention.
DEFAULT_TTL_SECONDS = 30 * 60
48
+
49
+
50
class TokenError(Exception):
    """Signals a token that is malformed, tampered with, expired, or stale.

    The machine-readable reason is kept on ``code`` and is also the
    exception's message.
    """

    def __init__(self, code: str):
        self.code = code
        super().__init__(code)
56
+
57
+
58
@dataclass(frozen=True)
class VerifiedToken:
    """Immutable view of a token payload after it has passed verification."""

    # Fingerprint of the query the token was issued for.
    queryFingerprint: str
    # Model / catalog hashes embedded at issuance.
    modelSha: str
    catalogSha: str
    # Expiry as unix seconds.
    expiresAt: int
    # Query-level ACRCloud signal; may be absent.
    acrcloudCoverSongId: dict | None
    # Per-neighbor context fragments keyed by trackId.
    neighbors: dict[str, dict]
66
+
67
+
68
def _hmac_key() -> bytes:
    """Return the signing secret from CONTEXT_TOKEN_HMAC_KEY as UTF-8 bytes.

    Surrounding whitespace is stripped first.

    Raises:
        TokenError: with code ``hmac-key-missing`` when the variable is unset
            or blank.
    """
    secret = os.getenv("CONTEXT_TOKEN_HMAC_KEY", "").strip()
    if secret:
        return secret.encode("utf-8")
    raise TokenError("hmac-key-missing")
73
+
74
+
75
def issue(
    *,
    query_fingerprint: str,
    model_sha: str,
    catalog_sha: str,
    neighbors: dict[str, dict],
    acrcloud_cover_song_id: dict | None = None,
    ttl_seconds: int = DEFAULT_TTL_SECONDS,
    now: int | None = None,
) -> str:
    """Build and sign a context token, returned as ``<payload>.<sig>``.

    Args:
        query_fingerprint: Identifier of the query the token belongs to.
        model_sha: Model hash to embed.
        catalog_sha: Catalog hash to embed.
        neighbors: Per-neighbor context fragments keyed by trackId.
        acrcloud_cover_song_id: Query-level ACRCloud signal, if any.
        ttl_seconds: Lifetime added to the issue time to get ``expiresAt``.
        now: Issue time in unix seconds; defaults to the current time.

    Returns:
        Unpadded base64url of the compact, key-sorted JSON payload, a dot,
        then the hex HMAC-SHA256 of those JSON bytes.

    Raises:
        TokenError: ``hmac-key-missing`` when no signing key is configured.
    """
    key = _hmac_key()
    issued_at = int(time.time() if now is None else now)
    claims = {
        "queryFingerprint": query_fingerprint,
        "modelSha": model_sha,
        "catalogSha": catalog_sha,
        "expiresAt": issued_at + ttl_seconds,
        "acrcloudCoverSongId": acrcloud_cover_song_id,
        "neighbors": neighbors,
    }
    # Sorted keys + compact separators keep the signed bytes stable.
    raw = json.dumps(claims, sort_keys=True, separators=(",", ":")).encode("utf-8")
    encoded = base64.urlsafe_b64encode(raw).decode("ascii").rstrip("=")
    # The signature covers the raw JSON bytes, not their base64 form.
    signature = hmac.digest(key, raw, "sha256").hex()
    return encoded + "." + signature
105
+
106
+
107
def verify(
    token: str,
    *,
    expected_model_sha: str,
    expected_catalog_sha: str,
    now: int | None = None,
) -> VerifiedToken:
    """Verify signature + expiry + model/catalog hashes; return decoded payload.

    Args:
        token: The ``<body>.<sig>`` string produced by ``issue``.
        expected_model_sha: Model hash the token must carry.
        expected_catalog_sha: Catalog hash the token must carry.
        now: Current time in unix seconds; defaults to the current time.

    Returns:
        A ``VerifiedToken`` built from the signed payload.

    Raises:
        TokenError: with one of these codes, and never any other exception
            type for bad token input:
            - hmac-key-missing  — server isn't configured for tokens
            - malformed         — not ``<body>.<sig>``, undecodable body,
                                  or payload fields of the wrong shape
            - invalid-signature — HMAC mismatch (tampered or wrong secret)
            - token-expired     — past the embedded expiresAt
            - stale-model       — modelSha doesn't match the expected value
            - stale-catalog     — catalogSha doesn't match the expected value
    """
    secret = _hmac_key()

    if not isinstance(token, str) or token.count(".") != 1:
        raise TokenError("malformed")
    body_b64, sig = token.split(".", 1)
    try:
        # issue() strips base64 padding; restore it before decoding.
        pad = "=" * (-len(body_b64) % 4)
        body = base64.urlsafe_b64decode(body_b64 + pad)
    except Exception as exc:
        raise TokenError("malformed") from exc

    # The signature is checked before the body is parsed, so unauthenticated
    # JSON is never interpreted. Compare as bytes: compare_digest() raises
    # TypeError for str arguments containing non-ASCII characters, and `sig`
    # is client-controlled. errors="replace" also covers lone surrogates.
    expected_sig = hmac.new(secret, body, hashlib.sha256).hexdigest()
    supplied_sig = sig.encode("utf-8", errors="replace")
    if not hmac.compare_digest(expected_sig.encode("ascii"), supplied_sig):
        raise TokenError("invalid-signature")

    try:
        payload = json.loads(body)
    except Exception as exc:
        raise TokenError("malformed") from exc

    required = {"queryFingerprint", "modelSha", "catalogSha", "expiresAt", "neighbors"}
    if not isinstance(payload, dict) or not required.issubset(payload.keys()):
        raise TokenError("malformed")

    # A correctly signed payload can still have the wrong field shapes (for
    # example one signed by another code version); report that as malformed
    # instead of leaking a raw TypeError/ValueError.
    try:
        expires_at = int(payload["expiresAt"])
    except (TypeError, ValueError, OverflowError) as exc:
        raise TokenError("malformed") from exc

    now_ts = int(now if now is not None else time.time())
    if now_ts > expires_at:
        raise TokenError("token-expired")

    if payload["modelSha"] != expected_model_sha:
        raise TokenError("stale-model")
    if payload["catalogSha"] != expected_catalog_sha:
        raise TokenError("stale-catalog")

    try:
        neighbors = dict(payload["neighbors"])
    except (TypeError, ValueError) as exc:
        raise TokenError("malformed") from exc

    return VerifiedToken(
        queryFingerprint=str(payload["queryFingerprint"]),
        modelSha=str(payload["modelSha"]),
        catalogSha=str(payload["catalogSha"]),
        expiresAt=expires_at,
        acrcloudCoverSongId=payload.get("acrcloudCoverSongId"),
        neighbors=neighbors,
    )
165
+
166
+
167
def is_configured() -> bool:
    """Report whether CONTEXT_TOKEN_HMAC_KEY holds a non-blank value.

    /neighbors uses this to decide whether to attach a token at all.
    """
    configured_key = os.getenv("CONTEXT_TOKEN_HMAC_KEY", "")
    return configured_key.strip() != ""
171
+
172
+
173
def neighbor_context_fragment(
    *,
    track_id: str,
    title: str,
    artist: str | None,
    query_window: tuple[float, float],
    match_window: tuple[float, float],
    raw_cosine: float,
    criteria: list[dict[str, Any]] | None,
) -> dict:
    """Assemble the per-neighbor dict that a context token embeds.

    The query-level fields (queryFingerprint, acrcloudCoverSongId) are not
    part of this fragment; they live at the top level of the token payload.

    Returns:
        A dict with keys trackId, title, artist, queryWindow, matchWindow,
        rawCosine and criteria. Both windows become two-element lists of
        floats; a ``None`` criteria becomes an empty list.
    """
    fragment: dict = {"trackId": track_id, "title": title, "artist": artist}
    # Only the first two elements of each window are used, coerced to float.
    fragment["queryWindow"] = [float(query_window[i]) for i in (0, 1)]
    fragment["matchWindow"] = [float(match_window[i]) for i in (0, 1)]
    fragment["rawCosine"] = float(raw_cosine)
    fragment["criteria"] = [] if criteria is None else criteria
    return fragment
backend/narrative_telemetry.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """In-process telemetry for the /narrative RAG explanatory layer.
2
+
3
+ The right rung for this scale: in-process counters + structured logs + Sentry
4
+ tags. A Prometheus/Datadog stack would be overbuilt for a 155-track demo —
5
+ same "progressive complexity" principle ADR-0005 commits to for retrieval.
6
+
7
+ What this module owns:
8
+ - Counters: total_calls, by_mode, by_kind, by_error, openai_calls,
9
+ gate_short_circuits, token_invalid, token_expired, token_stale.
10
+ - Latency: a fixed-size sliding window of recent call durations, surfaced
11
+ as p50/p95/p99 in the stats snapshot.
12
+ - Cost estimate: a rough running total in cents, derived from prompt and
13
+ completion character counts × GPT-4o-mini pricing constants. Not an
14
+ accounting ledger — a directional cost-awareness signal for the
15
+ /narrative/stats endpoint.
16
+ - Structured logger: one INFO line per call with stable key=value fields
17
+ so the HF Space logs are grep-able without a parser.
18
+ - Sentry tags: when SENTRY_DSN is set, every call tags the current scope
19
+ with mode + result_kind so failures aggregate by category in the
20
+ existing dashboard.
21
+
22
+ The module is thread-safe (one lock around counter mutations + window
23
+ operations). All operations are O(1) except `snapshot()` which is O(N) over
24
+ the sliding window — N is bounded by `LATENCY_WINDOW_SIZE`.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import logging
30
+ import os
31
+ import threading
32
+ import time
33
+ from collections import deque
34
+ from dataclasses import dataclass, field
35
+ from typing import Iterable
36
+
37
# GPT-4o-mini pricing as of 2026-06 (rough; meant as a directional signal):
#   $0.15 per 1M input tokens, $0.60 per 1M output tokens,
#   i.e. 15 cents and 60 cents per 1M tokens.
# At ~4 chars per token (English average) that is 3.75e-6 cents per input
# char and 1.5e-5 cents per output char. The constants are derived from the
# prices instead of hand-converted, so the units can't drift from the stated
# pricing (the previous literals were 10x too high).
_CHARS_PER_TOKEN = 4
_CENTS_PER_1M_INPUT_TOKENS = 15.0
_CENTS_PER_1M_OUTPUT_TOKENS = 60.0
COST_CENTS_PER_INPUT_CHAR = _CENTS_PER_1M_INPUT_TOKENS / 1_000_000 / _CHARS_PER_TOKEN
COST_CENTS_PER_OUTPUT_CHAR = _CENTS_PER_1M_OUTPUT_TOKENS / 1_000_000 / _CHARS_PER_TOKEN

# Sliding window size for latency percentiles. ~30 minutes of activity at
# steady demo traffic; bounded so memory stays trivial.
LATENCY_WINDOW_SIZE = 256

# Result kinds we expect from rag_narrative. Anything else gets bucketed
# under "_other" in the counter so a bug doesn't silently grow a new key.
_KNOWN_KINDS = {"narrative", "low_confidence", "unavailable"}

# Backend error codes we surface in counters; HTTP-layer codes from
# api.py /narrative endpoint.
_KNOWN_ERROR_CODES = {
    "narrative-disabled",
    "invalid-token",
    "malformed-token",
    "token-expired",
    "stale-token",
    "not-in-context",
    "unsupported-mode",
    "malformed-context",
    "narrative-error",
}

_logger = logging.getLogger("piedpiper.narrative")
_logger.setLevel(logging.INFO)
# Don't add handlers here — the FastAPI app's uvicorn config already streams
# stdlib logging to stdout, which is the HF Space's log-collection surface.

# Guards counter mutations and latency-window operations.
_lock = threading.Lock()
72
+
73
+
74
def _new_latency_window() -> deque[float]:
    """Create the bounded buffer that holds recent call latencies."""
    return deque(maxlen=LATENCY_WINDOW_SIZE)


@dataclass
class _State:
    """Mutable in-process counters for the /narrative layer."""

    # Call counters, overall and broken down by mode / result kind / error.
    total_calls: int = 0
    by_mode: dict[str, int] = field(default_factory=dict)
    by_kind: dict[str, int] = field(default_factory=dict)
    by_error: dict[str, int] = field(default_factory=dict)
    openai_calls: int = 0
    gate_short_circuits: int = 0
    # Running cost estimate, in cents.
    cost_cents: float = 0.0
    # Most recent latencies; capped at LATENCY_WINDOW_SIZE entries.
    latencies_ms: deque[float] = field(default_factory=_new_latency_window)
    # Wall-clock time at which this state object was created.
    started_at: float = field(default_factory=time.time)
85
+
86
+
87
+ _state = _State()
88
+
89
+
90
def record_call(
    *,
    mode: str,
    latency_ms: float,
    result_kind: str | None = None,
    error_code: str | None = None,
    openai_called: bool = False,
    gate_short_circuit: bool = False,
    prompt_chars: int = 0,
    completion_chars: int = 0,
    trackId: str | None = None,
    cache_key: str | None = None,
) -> None:
    """Record a single /narrative call in the in-process counters and the log.

    Normally exactly one of ``result_kind`` and ``error_code`` is set. When
    both are None the call still increments ``total_calls`` and ``by_mode``
    and contributes a latency sample, but it is NOT added to ``by_kind`` or
    ``by_error`` -- a caller that wants such traffic (e.g. an early 503) to
    show up in ``by_error`` must pass the error code explicitly.
    NOTE(review): an earlier version of this docstring claimed the
    both-None case was bucketed under "narrative-disabled"; the code never
    did that.

    Values not present in ``_KNOWN_KINDS`` / ``_KNOWN_ERROR_CODES`` are counted
    under the ``"_other"`` sentinel so counter keys cannot grow without bound;
    a non-zero ``"_other"`` means those sets need updating.

    The cost estimate adds ``prompt_chars * COST_CENTS_PER_INPUT_CHAR`` and
    ``completion_chars * COST_CENTS_PER_OUTPUT_CHAR``; passing 0 for both
    leaves the cost unchanged.
    """
    mode_key = mode if mode else "unknown"

    # None stays None (nothing counted); known values pass through; anything
    # else collapses into the "_other" bucket.
    kind_key = None
    if result_kind is not None:
        kind_key = result_kind if result_kind in _KNOWN_KINDS else "_other"
    err_key = None
    if error_code is not None:
        err_key = error_code if error_code in _KNOWN_ERROR_CODES else "_other"

    call_cost = (
        prompt_chars * COST_CENTS_PER_INPUT_CHAR
        + completion_chars * COST_CENTS_PER_OUTPUT_CHAR
    )
    latency_value = float(latency_ms)

    with _lock:
        _state.total_calls += 1
        _state.by_mode[mode_key] = _state.by_mode.get(mode_key, 0) + 1
        if kind_key:
            _state.by_kind[kind_key] = _state.by_kind.get(kind_key, 0) + 1
        if err_key:
            _state.by_error[err_key] = _state.by_error.get(err_key, 0) + 1
        if openai_called:
            _state.openai_calls += 1
        if gate_short_circuit:
            _state.gate_short_circuits += 1
        _state.cost_cents += call_cost
        _state.latencies_ms.append(latency_value)

    # Structured log -- one line, grep-able. Prompt and response bodies are
    # deliberately not logged (content leak + log bloat). Only a prefix of the
    # cache key is logged because the full key is high-cardinality.
    _logger.info(
        "narrative.call mode=%s kind=%s error=%s latency_ms=%.1f openai_called=%s gate_short_circuit=%s prompt_chars=%d completion_chars=%d trackId=%s cache_key=%s",
        mode_key,
        kind_key or "-",
        err_key or "-",
        latency_value,
        openai_called,
        gate_short_circuit,
        prompt_chars,
        completion_chars,
        trackId or "-",
        (cache_key or "-")[:16],
    )

    # No-op unless SENTRY_DSN is set and sentry_sdk is importable.
    _set_sentry_tags(mode=mode_key, kind=kind_key, error=err_key)
168
+
169
+
170
+ def _set_sentry_tags(*, mode: str, kind: str | None, error: str | None) -> None:
171
+ """Tag the current Sentry scope (no-op when Sentry isn't active).
172
+
173
+ Tagging here means /narrative-layer Sentry events are filterable by
174
+ mode + result_kind + error in the existing dashboard without a new
175
+ integration.
176
+ """
177
+ if not os.getenv("SENTRY_DSN", "").strip():
178
+ return
179
+ try:
180
+ import sentry_sdk
181
+
182
+ sentry_sdk.set_tag("narrative.mode", mode)
183
+ if kind:
184
+ sentry_sdk.set_tag("narrative.kind", kind)
185
+ if error:
186
+ sentry_sdk.set_tag("narrative.error", error)
187
+ except Exception:
188
+ # If Sentry tagging fails for any reason, don't let it break the
189
+ # request flow. Telemetry is observability, not control flow.
190
+ pass
191
+
192
+
193
def snapshot() -> dict:
    """Return a JSON-serializable copy of the current counters.

    Per the original note this is the payload of ``GET /narrative/stats``.
    It contains every counter, the cost estimate rounded to 4 decimals,
    latency p50/p95/p99 over the sliding window (None when the window is
    empty) together with the sample count and window size, and the seconds
    elapsed since the state object was created.
    """
    with _lock:
        ordered = sorted(_state.latencies_ms)
        count = len(ordered)
        if count:
            p50, p95, p99 = (_percentile(ordered, q) for q in (0.50, 0.95, 0.99))
        else:
            p50 = p95 = p99 = None
        latency = {
            "p50": p50,
            "p95": p95,
            "p99": p99,
            "sample_n": count,
            "window_size": LATENCY_WINDOW_SIZE,
        }
        # Dicts are copied so callers cannot mutate the live counters.
        report = {
            "total_calls": _state.total_calls,
            "by_mode": dict(_state.by_mode),
            "by_kind": dict(_state.by_kind),
            "by_error": dict(_state.by_error),
            "openai_calls": _state.openai_calls,
            "gate_short_circuits": _state.gate_short_circuits,
            "cost_cents_estimate": round(_state.cost_cents, 4),
            "latency_ms": latency,
            "uptime_sec": round(time.time() - _state.started_at, 1),
        }
    return report
222
+
223
+
224
+ def _percentile(sorted_samples: list[float], p: float) -> float:
225
+ """Linear-interpolation percentile over a pre-sorted list.
226
+
227
+ Returns rounded to 1 decimal ms — sub-ms precision adds noise without
228
+ signal at this aggregation level.
229
+ """
230
+ if not sorted_samples:
231
+ return 0.0
232
+ if len(sorted_samples) == 1:
233
+ return round(sorted_samples[0], 1)
234
+ rank = p * (len(sorted_samples) - 1)
235
+ lo = int(rank)
236
+ hi = min(lo + 1, len(sorted_samples) - 1)
237
+ frac = rank - lo
238
+ return round(sorted_samples[lo] + frac * (sorted_samples[hi] - sorted_samples[lo]), 1)
239
+
240
+
241
def reset() -> None:
    """Replace the module-level state with a fresh, zeroed ``_State``.

    Intended for test isolation only; production code has no reason to call
    it, since the counters live in process memory and vanish on restart.
    """
    global _state
    fresh = _State()
    with _lock:
        _state = fresh
250
+
251
+
252
def measure_call(mode: str):
    """Return a context manager that times a block and records it once.

    On exit the timer calls ``record_call`` with the measured latency plus
    whatever fields were stashed through ``ctx.set(...)``.

    Usage in api.py:
        with measure_call("whySimilar") as ctx:
            ... do work ...
            ctx.set(result_kind="narrative", openai_called=True, prompt_chars=N, completion_chars=M)
    """
    timer = _CallTimer(mode)
    return timer
262
+
263
+
264
+ class _CallTimer:
265
+ def __init__(self, mode: str):
266
+ self.mode = mode
267
+ self.start_ts: float = 0.0
268
+ self._fields: dict = {}
269
+
270
+ def __enter__(self):
271
+ self.start_ts = time.time()
272
+ return self
273
+
274
+ def __exit__(self, exc_type, exc, tb):
275
+ latency_ms = (time.time() - self.start_ts) * 1000.0
276
+ # If the context block raised, mark as a narrative-error if no other
277
+ # error was set.
278
+ if exc_type is not None and "error_code" not in self._fields:
279
+ self._fields["error_code"] = "narrative-error"
280
+ record_call(mode=self.mode, latency_ms=latency_ms, **self._fields)
281
+ return False # don't suppress exceptions
282
+
283
+ def set(self, **kwargs) -> None:
284
+ """Stash fields for record_call. Last write wins per key."""
285
+ self._fields.update(kwargs)
286
+
287
+
288
+ __all__: Iterable[str] = (
289
+ "record_call",
290
+ "snapshot",
291
+ "reset",
292
+ "measure_call",
293
+ "LATENCY_WINDOW_SIZE",
294
+ "COST_CENTS_PER_INPUT_CHAR",
295
+ "COST_CENTS_PER_OUTPUT_CHAR",
296
+ )
backend/rag_narrative.py ADDED
@@ -0,0 +1,384 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Metadata-grounded narrative generation for PiedPiper match explanations.
2
+
3
+ This module is intentionally independent from FastAPI. `api.py` supplies a
4
+ trusted `NarrativeContext`; this module gates context quality, builds a
5
+ bounded prompt, calls OpenAI through one adapter, validates structured
6
+ citations, and returns typed Pydantic results for the frontend.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import hashlib
12
+ import json
13
+ import logging
14
+ import time
15
+ from typing import Any, Literal
16
+
17
+ from pydantic import BaseModel, ValidationError
18
+
19
+ NarrativeMode = Literal["whySimilar", "creatorAdvice"]
20
+ CriterionId = Literal["tempo", "key", "harmonic", "timbre"]
21
+
22
+ RESPONSE_SCHEMA_VERSION = "v1"
23
+ CRITERIA_ALGORITHM_VERSION = "adr-0004-v1"
24
+ MAX_PROMPT_CHARS = 8000
25
+ MAX_COMPLETION_TOKENS = 400
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
class CriterionContext(BaseModel):
    # One per-criterion comparison; NarrativeContext.criteria holds a list of these.
    id: CriterionId  # one of "tempo", "key", "harmonic", "timbre"
    queryValue: float | str | dict  # query-side value; citations refer to it as "<id>.queryValue"
    matchValue: float | str | dict  # match-side value; citations refer to it as "<id>.matchValue"
    agreement: float  # _context_gate_reason treats >= 0.55 as strong evidence; range not shown here
    label: str  # forwarded unchanged into the prompt context
36
+
37
+
38
class NarrativeContext(BaseModel):
    # Input to generate_narrative(); the module docstring says api.py supplies it.
    queryFingerprint: str  # part of the cache key and of the prompt context
    trackId: str  # every citation's trackId must equal this value
    title: str  # a blank title makes the gate return "missing-metadata"
    artist: str | None
    queryWindow: tuple[float, float]  # (start, end); presumably seconds -- confirm with caller
    matchWindow: tuple[float, float]  # (start, end); presumably seconds -- confirm with caller
    rawCosine: float  # rounded to 3 decimals for both prompt and cache key
    criteria: list[CriterionContext]  # an empty list makes the gate return "missing-criteria"
    acrcloudCoverSongId: dict | None  # forwarded unchanged into the prompt context
48
+
49
+
50
class StructuredCitation(BaseModel):
    # One citation parsed from the LLM output; checked by _citations_are_grounded().
    trackId: str  # must equal the context's trackId
    side: Literal["query", "match"]  # selects which context window the range is checked against
    timestampRange: tuple[float, float]  # (start, end); end must be greater than start
    criterionIds: list[CriterionId]  # each id must exist in the supplied criteria
    citedValues: dict[str, float | str]  # keys are "rawCosine" or "<criterionId>.queryValue|matchValue"
56
+
57
+
58
class NarrativeResponse(BaseModel):
    # Successful result: the validated LLM payload.
    kind: Literal["narrative"] = "narrative"  # discriminator shared by the NarrativeResult union
    mode: NarrativeMode  # generate_narrative rejects the payload if this differs from the requested mode
    prose: str
    citations: list[StructuredCitation]  # must be non-empty and grounded to be accepted
63
+
64
+
65
class LowConfidence(BaseModel):
    # Returned when the context gate or the prompt-size cap stops the call before OpenAI is contacted.
    kind: Literal["low_confidence"] = "low_confidence"
    reason: str  # e.g. "missing-criteria", "missing-metadata", "weak-evidence", "context-cap-exceeded"
68
+
69
+
70
class NarrativeUnavailable(BaseModel):
    # Returned when the OpenAI call fails or its output is rejected by validation.
    kind: Literal["unavailable"] = "unavailable"
    reason: str  # e.g. "openai-error", "malformed-llm-output", "schema-mismatch", "citation-hallucinated"
73
+
74
+
75
+ NarrativeResult = NarrativeResponse | LowConfidence | NarrativeUnavailable
76
+
77
+
78
+ SYSTEM_PROMPTS: dict[NarrativeMode, str] = {
79
+ "whySimilar": (
80
+ "You are PiedPiper, an expert assistant explaining acoustic similarity "
81
+ "between music tracks. You receive structured metadata about two audio "
82
+ "segments. You do not hear the audio. You do not determine copyright "
83
+ "infringement, ownership, or legal status. Cite only tracks, criteria, "
84
+ "and values present in the supplied context. Output a single JSON object "
85
+ "matching the schema. No additional text, no markdown."
86
+ ),
87
+ "creatorAdvice": (
88
+ "You are PiedPiper, an expert assistant helping creators make a music "
89
+ "generation more distinctive from a retrieved catalog match. You receive "
90
+ "structured metadata about two audio segments. You do not hear the audio. "
91
+ "You do not determine copyright infringement, ownership, or legal status. "
92
+ "Cite only tracks, criteria, and values present in the supplied context. "
93
+ "Output a single JSON object matching the schema. No additional text, no markdown."
94
+ ),
95
+ }
96
+
97
+ USER_PROMPT_TEMPLATE = """Mode: {mode}
98
+
99
+ Return JSON with exactly this shape:
100
+ {{
101
+ "kind": "narrative",
102
+ "mode": "{mode}",
103
+ "prose": "80-140 words for whySimilar, or 60-120 words for creatorAdvice",
104
+ "citations": [
105
+ {{
106
+ "trackId": "{track_id}",
107
+ "side": "query|match",
108
+ "timestampRange": [start_seconds, end_seconds],
109
+ "criterionIds": ["tempo|key|harmonic|timbre"],
110
+ "citedValues": {{
111
+ "<criterionId>.queryValue": "exact supplied value when cited",
112
+ "<criterionId>.matchValue": "exact supplied value when cited",
113
+ "rawCosine": 0.0
114
+ }}
115
+ }}
116
+ ]
117
+ }}
118
+
119
+ Use the supplied context only. For creatorAdvice, write three concrete suggestion-style clauses in prose, each tied to a cited criterion.
120
+
121
+ Context:
122
+ {context_json}
123
+ """
124
+
125
+
126
def cache_key(
    context: NarrativeContext,
    mode: NarrativeMode,
    *,
    model_sha: str,
    catalog_sha: str,
    model_id: str,
) -> str:
    """Return a stable SHA-256 key for this context, mode and model versions.

    The key covers the model/catalog identifiers, the prompt-template hash,
    the schema and criteria-algorithm versions, the query fingerprint, the
    track id, the mode, the criteria (sorted by id, numbers rounded to 3
    decimals) and the raw cosine rounded to 3 decimals.
    NOTE(review): title, artist and the two windows are part of the prompt but
    not of this key -- confirm that queryFingerprint/trackId pin them down.
    """
    rounded_criteria = []
    for criterion in sorted(context.criteria, key=lambda c: c.id):
        rounded_criteria.append(_criterion_for_cache(criterion))

    payload = dict(
        model_id=model_id,
        model_sha=model_sha,
        catalog_sha=catalog_sha,
        prompt_template_hash=_prompt_template_hash(mode),
        response_schema_version=RESPONSE_SCHEMA_VERSION,
        criteria_algorithm_version=CRITERIA_ALGORITHM_VERSION,
        query_fingerprint=context.queryFingerprint,
        track_id=context.trackId,
        mode=mode,
        criteria_rounded=rounded_criteria,
        raw_cosine=round(float(context.rawCosine), 3),
    )
    return _sha256_json(payload)
149
+
150
+
151
def generate_narrative(
    context: NarrativeContext,
    mode: NarrativeMode,
    *,
    model_sha: str,
    catalog_sha: str,
    model_id: str = "gpt-4o-mini",
    openai_client=None,
) -> NarrativeResult:
    """Gate the context, call OpenAI once, validate the output, return a typed result.

    Returns:
        ``LowConfidence`` when the context gate rejects the input or the
        prompt exceeds ``MAX_PROMPT_CHARS`` (OpenAI is not called);
        ``NarrativeUnavailable`` when the call fails, the payload does not
        validate, its mode differs from the requested one, or its citations
        are not grounded in the context; otherwise the validated
        ``NarrativeResponse``.

    Every exit path emits exactly one ``rag_narrative`` info log line with the
    cache key, mode, gate result, latency and success flag.
    """
    started = time.perf_counter()
    key = cache_key(context, mode, model_sha=model_sha, catalog_sha=catalog_sha, model_id=model_id)

    def finish(result: NarrativeResult, *, gate_result: str, success: bool) -> NarrativeResult:
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.info(
            "rag_narrative cache_key=%s mode=%s gate_result=%s latency_ms=%.1f success=%s",
            key,
            mode,
            gate_result,
            elapsed_ms,
            success,
        )
        return result

    def gated(reason: str) -> NarrativeResult:
        # Short-circuit before any LLM call; the reason doubles as the gate result.
        return finish(LowConfidence(reason=reason), gate_result=reason, success=False)

    def unavailable(reason: str) -> NarrativeResult:
        # The LLM was called but its result cannot be used.
        return finish(NarrativeUnavailable(reason=reason), gate_result="called", success=False)

    gate_reason = _context_gate_reason(context)
    if gate_reason is not None:
        return gated(gate_reason)

    system_prompt = SYSTEM_PROMPTS[mode]
    user_prompt = _build_user_prompt(context, mode)
    prompt_size = len(system_prompt) + len(user_prompt)
    if prompt_size > MAX_PROMPT_CHARS:
        return gated("context-cap-exceeded")

    payload = _call_openai_json(
        openai_client,
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        max_tokens=MAX_COMPLETION_TOKENS,
        model_id=model_id,
    )
    if payload is None:
        return unavailable("openai-error")

    try:
        narrative = NarrativeResponse.model_validate(payload)
    except ValidationError:
        return unavailable("malformed-llm-output")

    if narrative.mode != mode:
        return unavailable("schema-mismatch")
    if not _citations_are_grounded(narrative.citations, context):
        return unavailable("citation-hallucinated")

    return finish(narrative, gate_result="called", success=True)
226
+
227
+
228
+ def _call_openai_json(
229
+ client,
230
+ *,
231
+ system_prompt: str,
232
+ user_prompt: str,
233
+ max_tokens: int,
234
+ model_id: str,
235
+ ) -> dict | None:
236
+ """Call OpenAI once and return parsed JSON, or None on SDK/parse failure."""
237
+ try:
238
+ if client is None:
239
+ from openai import OpenAI
240
+
241
+ client = OpenAI()
242
+
243
+ response = client.chat.completions.create(
244
+ model=model_id,
245
+ messages=[
246
+ {"role": "system", "content": system_prompt},
247
+ {"role": "user", "content": user_prompt},
248
+ ],
249
+ response_format={"type": "json_object"},
250
+ max_tokens=max_tokens,
251
+ temperature=0,
252
+ )
253
+ content = response.choices[0].message.content
254
+ parsed = json.loads(content)
255
+ return parsed if isinstance(parsed, dict) else None
256
+ except Exception:
257
+ logger.exception("rag_narrative OpenAI JSON call failed")
258
+ return None
259
+
260
+
261
def _context_gate_reason(context: NarrativeContext) -> str | None:
    """Return the reason the context is too weak to narrate, or None if it passes.

    Checks, in order: no criteria ("missing-criteria"); blank title or an
    invalid query/match window ("missing-metadata"); no criterion with
    agreement >= 0.55 while rawCosine is below 0.75 ("weak-evidence").
    """
    if not context.criteria:
        return "missing-criteria"

    title = context.title
    if not title or not title.strip():
        return "missing-metadata"

    windows_ok = _window_is_valid(context.queryWindow) and _window_is_valid(context.matchWindow)
    if not windows_ok:
        return "missing-metadata"

    has_strong_criterion = any(float(c.agreement) >= 0.55 for c in context.criteria)
    if not has_strong_criterion and float(context.rawCosine) < 0.75:
        return "weak-evidence"
    return None
271
+
272
+
273
+ def _window_is_valid(window: tuple[float, float]) -> bool:
274
+ start, end = float(window[0]), float(window[1])
275
+ return start >= 0 and end > start
276
+
277
+
278
def _build_user_prompt(context: NarrativeContext, mode: NarrativeMode) -> str:
    """Render USER_PROMPT_TEMPLATE with the mode, track id and compact context JSON.

    Criteria are sorted by id and the JSON is dumped with sorted keys and no
    whitespace, so the same context always yields the same prompt text.
    """
    ordered_criteria = sorted(context.criteria, key=lambda c: c.id)
    context_payload = {
        "queryFingerprint": context.queryFingerprint,
        "trackId": context.trackId,
        "title": context.title,
        "artist": context.artist,
        "queryWindow": list(context.queryWindow),
        "matchWindow": list(context.matchWindow),
        "rawCosine": round(float(context.rawCosine), 3),
        "criteria": [_criterion_for_prompt(item) for item in ordered_criteria],
        "acrcloudCoverSongId": context.acrcloudCoverSongId,
    }
    context_json = json.dumps(context_payload, sort_keys=True, separators=(",", ":"))
    return USER_PROMPT_TEMPLATE.format(
        mode=mode,
        track_id=context.trackId,
        context_json=context_json,
    )
295
+
296
+
297
+ def _criterion_for_prompt(criterion: CriterionContext) -> dict[str, Any]:
298
+ return {
299
+ "id": criterion.id,
300
+ "queryValue": criterion.queryValue,
301
+ "matchValue": criterion.matchValue,
302
+ "agreement": round(float(criterion.agreement), 3),
303
+ "label": criterion.label,
304
+ }
305
+
306
+
307
def _criterion_for_cache(criterion: CriterionContext) -> dict[str, Any]:
    """Return the prompt form of a criterion with every number rounded to 3 decimals."""
    prompt_form = _criterion_for_prompt(criterion)
    return _round_numbers(prompt_form)
309
+
310
+
311
+ def _round_numbers(value: Any) -> Any:
312
+ if isinstance(value, bool):
313
+ return value
314
+ if isinstance(value, int | float):
315
+ return round(float(value), 3)
316
+ if isinstance(value, list):
317
+ return [_round_numbers(v) for v in value]
318
+ if isinstance(value, tuple):
319
+ return [_round_numbers(v) for v in value]
320
+ if isinstance(value, dict):
321
+ return {str(k): _round_numbers(v) for k, v in sorted(value.items(), key=lambda item: str(item[0]))}
322
+ return value
323
+
324
+
325
def _citations_are_grounded(citations: list[StructuredCitation], context: NarrativeContext) -> bool:
    """Return True only if every citation refers to data present in the context.

    An empty citation list is rejected. Each citation must name the context's
    track, use only criterion ids present in the context, and have a timestamp
    range inside the relevant window. Cited values are checked per key:
    ``rawCosine`` within 0.01; ``tempo.*`` within 2.0; ``key.*`` by string
    equality; for ``harmonic.*`` / ``timbre.*`` only that the supplied value
    is a dict (the cited value itself is not compared).
    """
    by_id = {c.id: c for c in context.criteria}
    if not citations:
        return False

    for cite in citations:
        if cite.trackId != context.trackId:
            return False
        if any(cid not in by_id for cid in cite.criterionIds):
            return False
        if not _timestamp_is_grounded(cite, context):
            return False

        for key, cited in cite.citedValues.items():
            if key == "rawCosine":
                if _numeric_close(cited, context.rawCosine, tolerance=0.01):
                    continue
                return False
            if "." not in key:
                return False
            cid, _, field_name = key.partition(".")
            if cid not in by_id or field_name not in {"queryValue", "matchValue"}:
                return False
            expected = getattr(by_id[cid], field_name)
            if cid == "tempo":
                grounded = _numeric_close(cited, expected, tolerance=2.0)
            elif cid == "key":
                grounded = str(cited) == str(expected)
            elif cid in {"harmonic", "timbre"}:
                grounded = isinstance(expected, dict)
            else:
                grounded = False
            if not grounded:
                return False
    return True
361
+
362
+
363
+ def _timestamp_is_grounded(citation: StructuredCitation, context: NarrativeContext) -> bool:
364
+ start, end = citation.timestampRange
365
+ if end <= start:
366
+ return False
367
+ window = context.queryWindow if citation.side == "query" else context.matchWindow
368
+ return start >= window[0] - 0.5 and end <= window[1] + 0.5
369
+
370
+
371
+ def _numeric_close(actual: Any, expected: Any, *, tolerance: float) -> bool:
372
+ try:
373
+ return abs(float(actual) - float(expected)) <= tolerance
374
+ except (TypeError, ValueError):
375
+ return False
376
+
377
+
378
def _prompt_template_hash(mode: NarrativeMode) -> str:
    """Return the SHA-256 hex digest of the mode's system prompt plus the user template.

    Included in the cache key so editing either prompt invalidates old entries.
    """
    combined = SYSTEM_PROMPTS[mode] + "\n" + USER_PROMPT_TEMPLATE
    digest = hashlib.sha256(combined.encode("utf-8"))
    return digest.hexdigest()
380
+
381
+
382
+ def _sha256_json(payload: dict[str, Any]) -> str:
383
+ encoded = json.dumps(payload, sort_keys=True, separators=(",", ":")).encode("utf-8")
384
+ return hashlib.sha256(encoded).hexdigest()
backend/scripts/enrich_jamendo.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Patch corpus.json's Jamendo tracks with real metadata from Jamendo's public API.
2
+
3
+ The MTG-Jamendo research dataset (the ingest source for Tier-2) anonymizes
4
+ artist names per academic distribution convention — entries land in corpus.json
5
+ with placeholders like:
6
+
7
+ title: "Jamendo 382"
8
+ artist: "artist_000020"
9
+ artwork_url: null
10
+ external_ids: {"jamendoTrackId": "382"}
11
+
12
+ Jamendo's own public Catalog API (`api.jamendo.com/v3.0/tracks/`) returns the
13
+ real track name, real artist name, an MP3 stream URL, and an album cover URL
14
+ keyed by the same numeric track ID. This script reconciles them.
15
+
16
+ Frontend impact: the `audioUrlFor()` helper already reads
17
+ `external_ids.jamendoAudioUrl` and `artworkUrlFor()` reads `artwork_url`, so
18
+ the React rows light up automatically once corpus.json is patched.
19
+
20
+ Usage:
21
+ JAMENDO_CLIENT_ID=<your-client-id> \\
22
+ python -m backend.scripts.enrich_jamendo
23
+
24
+ Options:
25
+ --corpus <path> Path to corpus.json (default: quality-scorer/public/corpus/corpus.json)
26
+ --dry-run Print what would change without writing
27
+ --sleep <seconds> Per-request sleep (default 0.1)
28
+ """
29
+
30
+ from __future__ import annotations
31
+
32
+ import argparse
33
+ import json
34
+ import os
35
+ import sys
36
+ import time
37
+ from pathlib import Path
38
+
39
+ import httpx
40
+
41
+ REPO_ROOT = Path(__file__).resolve().parents[3]
42
+ DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
43
+ API_BASE = "https://api.jamendo.com/v3.0/tracks/"
44
+
45
+
46
def main() -> int:
    """Enrich Jamendo entries in corpus.json with metadata from the Jamendo API.

    For each track whose ``source`` is "jamendo", looks the track up by
    ``external_ids.jamendoTrackId`` and patches title, artist, artwork URL,
    track-view URL and (when available) the audio URL and album name. With
    ``--dry-run`` the planned changes are printed and nothing is written.

    Returns:
        2 when JAMENDO_CLIENT_ID is not set, otherwise 0 (per-track failures
        are reported but do not change the exit code).
    """
    args = _parse_args()
    client_id = os.environ.get("JAMENDO_CLIENT_ID")
    if not client_id:
        print("[enrich_jamendo] ERROR: JAMENDO_CLIENT_ID env var required", file=sys.stderr)
        return 2

    # JSON is UTF-8 by specification; don't depend on the platform locale.
    raw = json.loads(args.corpus.read_text(encoding="utf-8"))
    tracks = raw if isinstance(raw, list) else raw.get("tracks", [])
    jamendo_tracks = [t for t in tracks if t.get("source") == "jamendo"]
    print(f"[enrich_jamendo] {len(jamendo_tracks)} Jamendo tracks to enrich")

    def fetch(client, jam_id):
        """Return (track_record, None) on success or (None, failure_reason)."""
        try:
            r = client.get(API_BASE, params={
                "client_id": client_id,
                "id": str(jam_id),
                "format": "json",
            })
            r.raise_for_status()
            data = r.json()
        except (httpx.HTTPError, json.JSONDecodeError) as exc:
            return None, str(exc)
        if not isinstance(data, dict):
            return None, "unexpected response shape"
        headers = data.get("headers") or {}
        if headers.get("status") != "success":
            return None, headers.get("error_message", "unknown")
        results = data.get("results") or []
        if not results:
            return None, "not in Jamendo catalog"
        return results[0], None

    patched = 0
    failed: list[str] = []
    with httpx.Client(timeout=20.0) as client:
        for i, t in enumerate(jamendo_tracks, start=1):
            jam_id = (t.get("external_ids") or {}).get("jamendoTrackId")
            if not jam_id:
                # No request was made, so no politeness sleep is needed.
                failed.append(f"{t.get('track_id')}: missing jamendoTrackId")
                continue

            jam, reason = fetch(client, jam_id)
            if jam is None:
                failed.append(f"jamendo:{jam_id}: {reason}")
                time.sleep(args.sleep)
                continue

            real_title = jam.get("name") or t.get("title")
            real_artist = jam.get("artist_name") or t.get("artist")
            audio_url = jam.get("audio") or None
            # Keep any artwork already on the entry when the API has none,
            # mirroring the title/artist fallbacks above.
            image_url = jam.get("image") or t.get("artwork_url")
            track_view = f"https://www.jamendo.com/track/{jam_id}"

            if args.dry_run:
                print(f" [{i:03d}/{len(jamendo_tracks)}] {t.get('track_id')}: {t.get('title')!r} -> {real_title!r} by {real_artist!r}")
            else:
                t["title"] = real_title
                t["artist"] = real_artist
                t["artwork_url"] = image_url
                t["track_view_url"] = track_view
                ext = dict(t.get("external_ids") or {})
                if audio_url:
                    ext["jamendoAudioUrl"] = audio_url
                if jam.get("album_name"):
                    ext["jamendoAlbum"] = jam.get("album_name")
                t["external_ids"] = ext

            patched += 1
            if i % 20 == 0:
                print(f" [{i}/{len(jamendo_tracks)}] enriched so far: {patched}, failed: {len(failed)}")
            time.sleep(args.sleep)

    print(f"[enrich_jamendo] DONE: patched={patched} failed={len(failed)}")
    if failed:
        print("[enrich_jamendo] failures (first 10):")
        for f in failed[:10]:
            print(f" - {f}")

    if not args.dry_run and patched > 0:
        args.corpus.write_text(json.dumps(raw, indent=2), encoding="utf-8")
        print(f"[enrich_jamendo] wrote {args.corpus}")
    elif args.dry_run:
        print("[enrich_jamendo] dry-run — no file written")

    return 0
129
+
130
+
131
def _parse_args() -> argparse.Namespace:
    """Parse the command line: --corpus PATH, --dry-run, --sleep SECONDS."""
    parser = argparse.ArgumentParser(description=__doc__)
    option_specs = (
        ("--corpus", {"type": Path, "default": DEFAULT_CORPUS}),
        ("--dry-run", {"action": "store_true"}),
        ("--sleep", {"type": float, "default": 0.1}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
137
+
138
+
139
+ if __name__ == "__main__":
140
+ sys.exit(main())
backend/scripts/enrich_mir_features.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Backfill the four ADR-0004 MIR features (tempo, key, chroma, MFCC) into
2
+ corpus.json without re-encoding the catalog through MuQ-MuLan.
3
+
4
+ Pattern matches `enrich_jamendo.py`: read corpus.json, for each entry that
5
+ lacks `mir_features`, download its audio, run `mir_features.compute()`,
6
+ write the result back. Idempotent — re-runs only touch un-enriched tracks.
7
+
8
+ Usage:
9
+ python -m backend.scripts.enrich_mir_features
10
+
11
+ The audio source depends on the tier:
12
+ - tier1 (iTunes): external_ids.previewUrl — 30s AAC-LC preview
13
+ - tier2 (Jamendo): external_ids.jamendoAudioUrl — MP3 stream (set by
14
+ enrich_jamendo earlier; falls back to source_url if missing)
15
+
16
+ Cost: ~3 s per track on CPU (download + librosa decode + features). For the
17
+ current 155-track catalog: ~8 min wall-clock.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import argparse
23
+ import io
24
+ import json
25
+ import sys
26
+ import tempfile
27
+ import time
28
+ import urllib.request
29
+ from pathlib import Path
30
+
31
+ import librosa
32
+
33
+ from backend.mir_features import compute as compute_mir
34
+
35
+ REPO_ROOT = Path(__file__).resolve().parents[3]
36
+ DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
37
+ APPLE_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 Safari/605.1.15"
38
+
39
+
40
def main() -> int:
    """Backfill ``mir_features`` for corpus entries that lack them.

    For every track without ``mir_features`` (or every track with
    ``--force``), downloads the audio, decodes it, runs ``compute_mir`` and
    stores ``features.to_dict()`` on the entry. The corpus file is rewritten
    only when at least one track was patched.

    Returns:
        0 when at least one track was patched or there was nothing to do,
        1 when there was work to do and every track failed.
    """
    args = _parse_args()
    # JSON is UTF-8 by specification; don't depend on the platform locale.
    raw = json.loads(args.corpus.read_text(encoding="utf-8"))
    tracks = raw if isinstance(raw, list) else raw.get("tracks", [])
    total = len(tracks)
    print(f"[enrich_mir] {total} catalog tracks")

    todo = [t for t in tracks if args.force or not t.get("mir_features")]
    print(f"[enrich_mir] {len(todo)} to enrich (already-enriched skipped; pass --force to redo)")

    patched = 0
    failed: list[str] = []
    for i, t in enumerate(todo, start=1):
        track_id = t.get("track_id", "?")
        attempted_download = False
        try:
            url = _audio_url_for(t)
            if not url:
                failed.append(f"{track_id}: no audio URL in corpus entry")
            else:
                attempted_download = True
                audio_bytes = _download(url, args.timeout)
                wav, sr = _decode(audio_bytes)
                features = compute_mir(wav, sr)
                t["mir_features"] = features.to_dict()
                patched += 1
        except Exception as exc:
            # Per-track boundary: one undecodable or unreachable track must
            # not abort the whole batch; record it and move on.
            failed.append(f"{track_id}: {exc!r}")
        # Report progress on every 10th track regardless of its outcome.
        if i % 10 == 0 or i == len(todo):
            print(f" [{i:>3}/{len(todo)}] enriched={patched} failed={len(failed)}")
        if attempted_download:
            time.sleep(args.sleep)

    if patched > 0:
        args.corpus.write_text(json.dumps(raw, indent=2), encoding="utf-8")
    print(f"[enrich_mir] DONE: patched={patched} failed={len(failed)}")
    if failed:
        print("[enrich_mir] first 10 failures:")
        for f in failed[:10]:
            print(f" - {f}")
    return 0 if patched > 0 or not todo else 1
77
+
78
+
79
+ def _audio_url_for(track: dict) -> str | None:
80
+ """Pick the best audio source URL for a track per its tier."""
81
+ ext = track.get("external_ids") or {}
82
+ return (
83
+ ext.get("previewUrl") # iTunes Tier-1
84
+ or ext.get("jamendoAudioUrl") # Jamendo enriched
85
+ or ext.get("jamendoStreamUrl")
86
+ or track.get("source_url") # last resort
87
+ )
88
+
89
+
90
def _download(url: str, timeout: float) -> bytes:
    """Fetch ``url`` with the module's browser-like User-Agent and return the body bytes."""
    request = urllib.request.Request(url, headers={"User-Agent": APPLE_UA})
    response = urllib.request.urlopen(request, timeout=timeout)
    with response:
        payload = response.read()
    return payload
94
+
95
+
96
def _decode(audio_bytes: bytes):
    """Decode audio bytes to a mono 22050 Hz array, returning ``(wav, sr)``.

    An in-memory decode is tried first (per the original note this covers
    mp3/wav/flac/ogg). If that raises or yields no samples, the bytes are
    written to a temporary ``.m4a`` file and decoded from the path, which is
    the route the original note says AAC-LC previews need.

    Raises:
        Whatever ``librosa.load`` raises when the file-path decode fails too.
    """
    try:
        wav, sr = librosa.load(io.BytesIO(audio_bytes), sr=22050, mono=True)
        if wav.size > 0:
            return wav, sr
    except Exception:
        # Deliberate best-effort: any in-memory failure falls through to the
        # temp-file path below, which raises if the audio is truly undecodable.
        pass

    # Write into a temporary directory rather than reopening an open
    # NamedTemporaryFile by name, which is not possible on Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_path = Path(tmp_dir) / "audio.m4a"
        tmp_path.write_bytes(audio_bytes)
        wav, sr = librosa.load(str(tmp_path), sr=22050, mono=True)
    return wav, sr
113
+
114
+
115
def _parse_args() -> argparse.Namespace:
    """Parse the command line: --corpus PATH, --force, --sleep SECONDS, --timeout SECONDS."""
    parser = argparse.ArgumentParser(description=__doc__)
    option_specs = (
        ("--corpus", {"type": Path, "default": DEFAULT_CORPUS}),
        ("--force", {"action": "store_true",
                     "help": "Re-enrich even tracks that already have mir_features"}),
        ("--sleep", {"type": float, "default": 0.2,
                     "help": "Per-track sleep to be polite to source CDNs"}),
        ("--timeout", {"type": float, "default": 30.0,
                       "help": "Per-request download timeout"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser.parse_args()
125
+
126
+
127
+ if __name__ == "__main__":
128
+ sys.exit(main())
backend/scripts/run_rag_eval.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """RAG eval harness for the /narrative explanatory layer.
2
+
3
+ Reads `backend/tests/fixtures/narrative_golden_set.json`, runs each case
4
+ through `rag_narrative.generate_narrative()` with `_call_openai_json` mocked
5
+ to return the case's prepared LLM response, and scores aggregate metrics.
6
+
7
+ Metrics:
8
+ - kind_agreement_rate: result.kind == expected.kind
9
+ - reason_agreement_rate: (result.reason or null) == (expected.reason or null)
10
+ - citation_groundedness_rate: among happy_path cases, fraction returning
11
+ `kind=narrative` AND all citations validate.
12
+ - gate_correctness_rate: among low_context cases, fraction returning
13
+ `kind=low_confidence` with the right reason.
14
+ - hallucination_rejection_rate: among hallucinated_citation cases, fraction
15
+ returning `kind=unavailable, reason=citation-hallucinated`.
16
+
17
+ Writes a summary JSON to `factory/artifacts/RAG_EVAL_RESULT.json` so the
18
+ result is committable and reviewable in PRs that touch rag_narrative.py.
19
+
20
+ Run:
21
+ python -m backend.scripts.run_rag_eval [--out PATH]
22
+
23
+ Returns non-zero exit code if any baseline gate fails (kind_agreement < 1.0,
24
+ gate_correctness < 1.0, hallucination_rejection < 1.0). Those gates are
25
+ load-bearing — any regression in them is a real bug in the validation layer.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import argparse
31
+ import json
32
+ import sys
33
+ from collections import Counter
34
+ from pathlib import Path
35
+ from typing import Any
36
+ from unittest.mock import patch
37
+
38
+ REPO_ROOT = Path(__file__).resolve().parents[3]
39
+ GOLDEN_SET_PATH = REPO_ROOT / "backend" / "tests" / "fixtures" / "narrative_golden_set.json"
40
+ DEFAULT_OUT_PATH = REPO_ROOT / "factory" / "artifacts" / "RAG_EVAL_RESULT.json"
41
+
42
+
43
+ def _load_golden_set(path: Path) -> list[dict]:
44
+ data = json.loads(path.read_text())
45
+ cases = data.get("cases", [])
46
+ if not cases:
47
+ raise ValueError(f"empty golden set at {path}")
48
+ return cases
49
+
50
+
51
def _build_context(spec: dict):
    """Turn one golden-set ``context`` dict into a ``NarrativeContext``.

    Required keys are read with ``spec[...]`` (a missing one raises
    ``KeyError``); ``artist`` and ``acrcloudCoverSongId`` are optional and
    default to ``None``. Each entry of ``spec["criteria"]`` is expanded into a
    ``CriterionContext``.
    """
    from backend import rag_narrative

    # Built in the same order the constructor arguments were originally
    # evaluated, so the first missing key reported stays the same.
    fields = {
        "queryFingerprint": spec["queryFingerprint"],
        "trackId": spec["trackId"],
        "title": spec["title"],
        "artist": spec.get("artist"),
        "queryWindow": tuple(spec["queryWindow"]),
        "matchWindow": tuple(spec["matchWindow"]),
        "rawCosine": float(spec["rawCosine"]),
        "criteria": [
            rag_narrative.CriterionContext(**criterion)
            for criterion in spec["criteria"]
        ],
        "acrcloudCoverSongId": spec.get("acrcloudCoverSongId"),
    }
    return rag_narrative.NarrativeContext(**fields)
68
+
69
+
70
def _run_case(case: dict) -> dict:
    """Execute a single golden-set case and report actual vs expected.

    ``rag_narrative._call_openai_json`` is patched for the duration of the
    call so it returns the case's prepared LLM response. The returned row
    records the expected and actual ``kind``/``reason``, whether the patched
    helper was invoked, and the derived match flags.
    """
    from backend import rag_narrative

    def _read(result_obj: Any, field: str):
        # Results may be attribute-style objects or plain dicts; try both.
        value = getattr(result_obj, field, None)
        if value:
            return value
        return result_obj.get(field) if isinstance(result_obj, dict) else None

    mocked = case["mocked_llm_response"]
    expected_kind = case["expected_kind"]
    expected_reason = case.get("expected_reason")
    must_not_call_llm = mocked == "MUST_NOT_BE_CALLED"

    # The "MUST_NOT_BE_CALLED" sentinel exists only for readability of the
    # JSON; the patched helper still needs a concrete return value if it is
    # called anyway. None makes a wrongful call surface as openai-error, so
    # the case then fails kind agreement and exposes the bug.
    if must_not_call_llm:
        patch_return = None
    else:
        patch_return = mocked

    context = _build_context(case["context"])

    llm_patch = patch(
        "backend.rag_narrative._call_openai_json", return_value=patch_return
    )
    with llm_patch as call_mock:
        result = rag_narrative.generate_narrative(
            context,
            case["mode"],
            model_sha="eval-model-sha",
            catalog_sha="eval-catalog-sha",
        )

    llm_was_called = call_mock.called
    actual_kind = _read(result, "kind")
    actual_reason = _read(result, "reason")

    return {
        "name": case["name"],
        "category": case["category"],
        "mode": case["mode"],
        "expected_kind": expected_kind,
        "actual_kind": actual_kind,
        "expected_reason": expected_reason,
        "actual_reason": actual_reason,
        "llm_was_called": llm_was_called,
        "must_not_call_llm": must_not_call_llm,
        "kind_match": actual_kind == expected_kind,
        # Falsy reasons ("" / None) are normalised to None before comparing.
        "reason_match": (actual_reason or None) == (expected_reason or None),
        # Violated only when the LLM was forbidden and was called regardless.
        "gate_respected": not (must_not_call_llm and llm_was_called),
    }
122
+
123
+
124
+ def _aggregate(rows: list[dict]) -> dict:
125
+ n = len(rows)
126
+ kind_correct = sum(1 for r in rows if r["kind_match"])
127
+ reason_correct = sum(1 for r in rows if r["reason_match"])
128
+ by_cat = {}
129
+ cat_total: Counter[str] = Counter()
130
+ cat_kind_correct: Counter[str] = Counter()
131
+ cat_reason_correct: Counter[str] = Counter()
132
+ cat_gate_respected: Counter[str] = Counter()
133
+ for r in rows:
134
+ cat_total[r["category"]] += 1
135
+ if r["kind_match"]:
136
+ cat_kind_correct[r["category"]] += 1
137
+ if r["reason_match"]:
138
+ cat_reason_correct[r["category"]] += 1
139
+ if r["gate_respected"]:
140
+ cat_gate_respected[r["category"]] += 1
141
+
142
+ for cat, total in cat_total.items():
143
+ by_cat[cat] = {
144
+ "total": total,
145
+ "kind_agreement_rate": round(cat_kind_correct[cat] / total, 4),
146
+ "reason_agreement_rate": round(cat_reason_correct[cat] / total, 4),
147
+ "gate_respected_rate": round(cat_gate_respected[cat] / total, 4),
148
+ }
149
+
150
+ return {
151
+ "n_cases": n,
152
+ "kind_agreement_rate": round(kind_correct / n, 4) if n else 0.0,
153
+ "reason_agreement_rate": round(reason_correct / n, 4) if n else 0.0,
154
+ "by_category": by_cat,
155
+ "baseline_gates": {
156
+ # Each gate is "must be 1.0 to pass" — any regression here is a
157
+ # real bug in validation / gating, not noise.
158
+ "happy_path_kind_agreement": by_cat.get("happy_path", {}).get("kind_agreement_rate", 0.0),
159
+ "low_context_gate_correctness": by_cat.get("low_context", {}).get("reason_agreement_rate", 0.0),
160
+ "hallucination_rejection": by_cat.get("hallucinated_citation", {}).get("reason_agreement_rate", 0.0),
161
+ "malformed_rejection": by_cat.get("malformed_output", {}).get("reason_agreement_rate", 0.0),
162
+ "openai_error_handling": by_cat.get("openai_error", {}).get("reason_agreement_rate", 0.0),
163
+ },
164
+ }
165
+
166
+
167
def run_eval(golden_set_path: Path = GOLDEN_SET_PATH) -> dict:
    """Run every golden-set case and return ``{"summary": ..., "rows": ...}``.

    This is the public entry point: it reads the golden set from
    ``golden_set_path`` and returns the results without writing anything to
    disk. Persisting the result is left to the CLI wrapper; the pytest gate
    calls this function directly and asserts on the return value.
    """
    rows = []
    for case in _load_golden_set(golden_set_path):
        rows.append(_run_case(case))
    return {"summary": _aggregate(rows), "rows": rows}
177
+
178
+
179
def main() -> int:
    """CLI wrapper: run the eval, write the result JSON, print a report.

    Returns 1 when any baseline gate scores below 1.0 (the failed gate names
    are printed to stderr), otherwise 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--out", type=Path, default=DEFAULT_OUT_PATH,
        help="Where to write the eval result JSON.",
    )
    parser.add_argument(
        "--golden", type=Path, default=GOLDEN_SET_PATH,
        help="Override golden-set path (for experiments).",
    )
    args = parser.parse_args()

    result = run_eval(args.golden)
    out_path = args.out
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(result, indent=2) + "\n")

    summary = result["summary"]
    print(f"RAG eval — {summary['n_cases']} cases")
    print(f" kind agreement : {summary['kind_agreement_rate']:.2%}")
    print(f" reason agreement : {summary['reason_agreement_rate']:.2%}")
    print()

    print(" Baseline gates (each MUST be 1.0):")
    gates = summary["baseline_gates"]
    for gate, score in gates.items():
        if score >= 1.0:
            status = "OK "
        else:
            status = "FAIL"
        print(f" [{status}] {gate}: {score:.2%}")
    failed: list[str] = [gate for gate, score in gates.items() if score < 1.0]

    print()
    print(" By category:")
    for cat, stats in summary["by_category"].items():
        left = (
            f" {cat:<26} n={stats['total']:>2} "
            f"kind={stats['kind_agreement_rate']:.2%} "
        )
        right = (
            f"reason={stats['reason_agreement_rate']:.2%} "
            f"gate_respected={stats['gate_respected_rate']:.2%}"
        )
        print(left + right)

    print()
    print(f"Result written to {args.out}")

    if not failed:
        return 0
    print(f"FAILED gates: {', '.join(failed)}", file=sys.stderr)
    return 1
227
+
228
+
229
+ if __name__ == "__main__":
230
+ sys.exit(main())
backend/scripts/verify_matching.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Self-retrieval verification for the PiedPiper matching pipeline.
2
+
3
+ For each iTunes Tier-1 track in the catalog: download the actual Apple
4
+ preview audio that was used to build the catalog embedding, POST it back
5
+ to /neighbors as a fresh query, and check that the same track is returned
6
+ at rank 1 with high cosine similarity.
7
+
8
+ This is the falsifiable answer to "are the matches accurate?" — if the
9
+ self-retrieval rate is high, the encoder + retrieval pipeline are doing
10
+ what they claim. If it isn't, there's a bug worth finding before we
11
+ trust any cross-track match.
12
+
13
+ Usage:
14
+ # Against the live HF Space (default):
15
+ python -m backend.scripts.verify_matching
16
+
17
+ # Against a local backend:
18
+ python -m backend.scripts.verify_matching --base-url http://localhost:8000
19
+
20
+ # Just one target:
21
+ python -m backend.scripts.verify_matching --target tier1:itunes:1488408568
22
+
23
+ ADR-0002 §"Verification" documents the methodology this harness implements.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import argparse
29
+ import json
30
+ import sys
31
+ import tempfile
32
+ import time
33
+ import urllib.request
34
+ from pathlib import Path
35
+
36
+ REPO_ROOT = Path(__file__).resolve().parents[3]
37
+ DEFAULT_CORPUS = REPO_ROOT / "quality-scorer" / "public" / "corpus" / "corpus.json"
38
+ DEFAULT_BASE_URL = "https://rajata98-piedpiper.hf.space"
39
+ APPLE_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_0) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
40
+
41
+
42
def main() -> int:
    """Verify self-retrieval for every selected target and print a summary.

    Each target is round-tripped through ``_verify_one``; a failure is
    recorded as an error row instead of aborting the run. With ``--json-out``
    the per-target rows are also written to that path.

    Returns 2 when no targets are selected, otherwise 0.
    """
    args = _parse_args()
    catalog = _load_catalog(args.corpus)
    targets = _select_targets(catalog, args.target)
    if not targets:
        print("[verify_matching] no Tier-1 iTunes targets found in catalog; aborting", file=sys.stderr)
        return 2
    total = len(targets)
    print(f"[verify_matching] running against {args.base_url}")
    print(f"[verify_matching] {total} target(s) to verify")
    print()

    results: list[dict] = []
    for i, target in enumerate(targets, start=1):
        # .get(): a catalog entry with missing metadata must not crash the
        # run before (or while) its error row is being recorded.
        print(f"[{i}/{total}] {target.get('title')} — {target.get('artist')}")
        try:
            result = _verify_one(target, args.base_url, args.timeout)
        except Exception as exc:
            print(f" ERROR: {exc!r}")
            # Same keys as a successful row so JSON consumers see one schema.
            # "timestamp" is kept only for backward compatibility.
            result = {
                "track_id": target.get("track_id"),
                "title": target.get("title"),
                "artist": target.get("artist"),
                "error": str(exc),
                "self_rank": None,
                "self_cosine": None,
                "self_timestamp": {},
                "top1_track_id": None,
                "top1_cosine": None,
                "top1_title": None,
                "top1_artist": None,
                "timestamp": None,
            }
        results.append(result)
        _print_one(result)
        if i < total:
            # Politeness delay between requests; pointless after the last one.
            time.sleep(args.sleep)

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    _print_summary(results)

    if args.json_out:
        Path(args.json_out).write_text(json.dumps(results, indent=2))
        print(f"\n[verify_matching] full results written to {args.json_out}")

    return 0
86
+
87
+
88
+ def _verify_one(target: dict, base_url: str, timeout: float) -> dict:
89
+ preview_url = (target.get("external_ids") or {}).get("previewUrl")
90
+ if not preview_url:
91
+ raise RuntimeError("no previewUrl in catalog entry")
92
+
93
+ # Download the iTunes preview. Apple CDN rejects default Python UA so set Safari.
94
+ req = urllib.request.Request(preview_url, headers={"User-Agent": APPLE_UA})
95
+ with urllib.request.urlopen(req, timeout=timeout) as r:
96
+ audio_bytes = r.read()
97
+
98
+ # POST to /neighbors. The temp file's .m4a suffix preserves Apple's AAC-LC
99
+ # format so the backend's audioread fallback picks the right decoder.
100
+ with tempfile.NamedTemporaryFile(suffix=".m4a", delete=True) as tmp:
101
+ tmp.write(audio_bytes)
102
+ tmp.flush()
103
+ try:
104
+ import httpx
105
+ except ImportError as exc:
106
+ raise RuntimeError(
107
+ "httpx required: pip install -e 'backend/[ingest]'"
108
+ ) from exc
109
+ with httpx.Client(timeout=timeout) as client:
110
+ with open(tmp.name, "rb") as f:
111
+ resp = client.post(
112
+ f"{base_url}/neighbors",
113
+ params={"k": 5},
114
+ files={"file": (f"{target['track_id']}.m4a", f, "audio/mp4")},
115
+ )
116
+ resp.raise_for_status()
117
+ body = resp.json()
118
+
119
+ neighbors = body.get("neighbors") or []
120
+ if not neighbors:
121
+ raise RuntimeError(f"empty neighbors response: {body}")
122
+
123
+ # Find self-rank in the returned list.
124
+ self_rank = None
125
+ self_cosine = None
126
+ for j, n in enumerate(neighbors, start=1):
127
+ if n.get("trackId") == target["track_id"]:
128
+ self_rank = j
129
+ self_cosine = float(n.get("rawCosine") or 0)
130
+ self_ts = n.get("matchTimestamp") or {}
131
+ break
132
+ else:
133
+ self_ts = {}
134
+
135
+ top1 = neighbors[0]
136
+ return {
137
+ "track_id": target["track_id"],
138
+ "title": target["title"],
139
+ "artist": target["artist"],
140
+ "self_rank": self_rank,
141
+ "self_cosine": self_cosine,
142
+ "self_timestamp": self_ts,
143
+ "top1_track_id": top1.get("trackId"),
144
+ "top1_cosine": float(top1.get("rawCosine") or 0),
145
+ "top1_title": (top1.get("track") or {}).get("title"),
146
+ "top1_artist": (top1.get("track") or {}).get("artist"),
147
+ }
148
+
149
+
150
+ def _print_one(r: dict) -> None:
151
+ if r.get("error"):
152
+ return
153
+ self_rank = r.get("self_rank")
154
+ self_cos = r.get("self_cosine")
155
+ top1_id = r.get("top1_track_id")
156
+ if self_rank == 1:
157
+ marker = "PASS"
158
+ elif self_rank is not None:
159
+ marker = f"SELF AT RANK {self_rank}"
160
+ else:
161
+ marker = "SELF NOT IN TOP-5"
162
+ cos_str = f"{self_cos:.4f}" if self_cos is not None else "—"
163
+ print(f" -> rank-1 returned: {(top1_id or '')[:40]:<40} cos={r.get('top1_cosine', 0):.4f}")
164
+ print(f" self-retrieval: {marker} self-cos={cos_str}")
165
+ ts = r.get("self_timestamp") or {}
166
+ if ts and self_rank is not None:
167
+ print(f" self-timestamp: query {ts.get('queryStartSec','?')}-{ts.get('queryEndSec','?')}s ↔ catalog {ts.get('catalogStartSec','?')}-{ts.get('catalogEndSec','?')}s")
168
+
169
+
170
+ def _print_summary(results: list[dict]) -> None:
171
+ n = len(results)
172
+ succeeded = [r for r in results if not r.get("error")]
173
+ n_ok = len(succeeded)
174
+ if n_ok == 0:
175
+ print("All targets errored out. Check the base URL + network.")
176
+ return
177
+ self_at_1 = [r for r in succeeded if r.get("self_rank") == 1]
178
+ self_in_top5 = [r for r in succeeded if r.get("self_rank") is not None]
179
+ self_cosines = [r["self_cosine"] for r in succeeded if r.get("self_cosine") is not None]
180
+ ts_align = [r for r in succeeded
181
+ if (r.get("self_timestamp") or {}).get("queryStartSec") == (r.get("self_timestamp") or {}).get("catalogStartSec")
182
+ and r.get("self_rank") is not None]
183
+
184
+ print(f"Total targets: {n}")
185
+ print(f"Successful round-trips: {n_ok}")
186
+ print(f"Self at rank 1: {len(self_at_1):>3} / {n_ok} ({100*len(self_at_1)/n_ok:.0f}%)")
187
+ print(f"Self in top 5: {len(self_in_top5):>3} / {n_ok} ({100*len(self_in_top5)/n_ok:.0f}%)")
188
+ if self_cosines:
189
+ mean = sum(self_cosines) / len(self_cosines)
190
+ print(f"Self-match cosine mean: {mean:.4f} (n={len(self_cosines)})")
191
+ print(f"Self-match cosine min: {min(self_cosines):.4f}")
192
+ print(f"Timestamp aligns to self: {len(ts_align):>3} / {n_ok} ({100*len(ts_align)/n_ok:.0f}%)")
193
+ print()
194
+ print("Pass criteria (ADR-0002 §Verification):")
195
+ print(f" self-retrieval rate >= 90% ... {'PASS' if len(self_at_1) / n_ok >= 0.9 else 'FAIL'}")
196
+ print(f" self-cosine mean >= 0.92 ... {'PASS' if self_cosines and sum(self_cosines)/len(self_cosines) >= 0.92 else 'FAIL'}")
197
+ print(f" timestamp aligns in >= 8 cases ... {'PASS' if len(ts_align) >= 8 else 'FAIL (or n<8 successful)'}")
198
+
199
+
200
+ def _load_catalog(path: Path) -> list[dict]:
201
+ raw = json.loads(path.read_text())
202
+ return raw if isinstance(raw, list) else raw.get("tracks", [])
203
+
204
+
205
+ def _select_targets(catalog: list[dict], target_id: str | None) -> list[dict]:
206
+ tier1 = [t for t in catalog if t.get("source") == "itunes" and (t.get("external_ids") or {}).get("previewUrl")]
207
+ if target_id:
208
+ return [t for t in tier1 if t.get("track_id") == target_id]
209
+ return tier1
210
+
211
+
212
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options for the verification run.

    Options: ``--corpus``, ``--base-url``, ``--target``, ``--timeout``,
    ``--sleep`` and ``--json-out``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    # (flag, add_argument keyword options), registered in display order.
    arguments = [
        ("--corpus", {
            "type": Path,
            "default": DEFAULT_CORPUS,
            "help": f"corpus.json path (default: {DEFAULT_CORPUS})",
        }),
        ("--base-url", {
            "default": DEFAULT_BASE_URL,
            "help": f"backend base URL (default: {DEFAULT_BASE_URL})",
        }),
        ("--target", {
            "default": None,
            "help": "single track_id to verify (default: all Tier-1 iTunes tracks)",
        }),
        ("--timeout", {
            "type": float,
            "default": 120.0,
            "help": "per-request timeout in seconds (default: 120)",
        }),
        ("--sleep", {
            "type": float,
            "default": 1.0,
            "help": "seconds between requests to be polite (default: 1.0)",
        }),
        ("--json-out", {
            "default": None,
            "help": "optional path to write full results as JSON",
        }),
    ]
    for flag, options in arguments:
        parser.add_argument(flag, **options)
    return parser.parse_args()
227
+
228
+
229
+ if __name__ == "__main__":
230
+ sys.exit(main())
requirements.txt CHANGED
@@ -16,3 +16,8 @@ httpx>=0.27
16
  pyyaml>=6.0
17
  # Sentry error tracking — no-op when SENTRY_DSN env var is unset.
18
  sentry-sdk[fastapi]>=2.0
 
 
 
 
 
 
16
  pyyaml>=6.0
17
  # Sentry error tracking — no-op when SENTRY_DSN env var is unset.
18
  sentry-sdk[fastapi]>=2.0
19
+ # ADR-0005 (Commit C): GPT-4o-mini for /narrative explanatory layer.
20
+ # Endpoint returns 503 narrative-disabled when OPENAI_API_KEY is unset, so
21
+ # this dep is safe to add even before the Space secret is provisioned.
22
+ openai>=1.40
23
+ pydantic>=2.0