{
  "metrics": {
    "recall_at_1": 0.6387096774193548,
    "recall_at_3": 0.7354838709677419,
    "mrr": 0.6922759856630825,
    "n_queries": 155
  },
  "negatives_histogram": {
    "bins": [
      0.0,
      0.05,
      0.1,
      0.15,
      0.2,
      0.25,
      0.3,
      0.35,
      0.4,
      0.45,
      0.5,
      0.55,
      0.6,
      0.65,
      0.7,
      0.75,
      0.8,
      0.85,
      0.9,
      0.95,
      1.0
    ],
    "counts": [
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      0,
      2,
      6,
      6,
      8,
      13,
      11,
      7,
      3
    ],
    "step": 0.05
  },
  "latency": {
    "p50_ms": 0.266,
    "p95_ms": 0.741,
    "p99_ms": 0.741,
    "n_samples": 20,
    "note": "Wall-clock per /neighbors ranking call against the in-memory catalog. Excludes audio decode + CLAP encode (those are bounded by file size, not index size)."
  },
  "named_examples": {
    "false_positives": [],
    "false_negatives": []
  },
  "methodology": "Retrieval check - leave-one-out over the existing catalog. Each catalog track is used as a query embedding while that exact row is held out of the index; the system then ranks the remaining catalog tracks with the same mean-pooled CLAP cosine used by the live /neighbors endpoint. Recall@k and MRR count whether another track by the same artist appears in the top-k. Because each LOO query has at most one ground-truth target, Precision@1 equals Recall@1 here; we report Recall@k by convention and Precision@k = Recall@k / k for any k. Latency is wall-clock per /neighbors ranking call against the in-memory catalog. Groundedness (entity extraction from generated text) is not applicable - this system retrieves, it does not generate. The histogram is a LOO top-1 score distribution for queries that did not retrieve a same-artist track at rank 1.",
  "limitations": "This is a retrieval sanity check, not a definitive AI-generation eval. It does not use Suno generations or unrelated human-labeled negatives, so the histogram should not be read as a production false-positive distribution. Many catalog artists have only one track, which makes same-artist recall strict and depresses the headline metrics. The check is still useful because it is reproducible, uses the shipped catalog, and exercises the same similarity path as the demo.",
  "manifest": {
    "model_sha": "2e01c796b71dca71b45251384c04cd7b237c9020",
    "generated_at": "2026-06-13T18:23:37.060991+00:00",
    "n_positives": 155,
    "n_negatives": 0,
    "eval_mode": "loo",
    "threshold_default": 0.7,
    "golden_set_version": "corpus@2e01c796b71d"
  }
}