"""Classical MIR features per ADR-0004 — tempo, key+mode, chroma, MFCC.

Four locked criteria for the multi-criterion similarity layer:

  - tempo      : librosa.beat.beat_track → BPM scalar
  - key, mode  : chroma_cens mean → Krumhansl-Schmuckler 24-profile correlation
  - chroma     : chroma_cens 12-d mean vector → cosine over catalog
  - mfcc       : 13 MFCCs + their stddevs → 26-d "timbre fingerprint" → cosine

All four are computed at ingest time per catalog track (stored alongside the
MuQ-MuLan embedding) and at query time per upload. Pure NumPy + librosa, no
new dependencies. ~350 ms total per 30-second clip on CPU.

The comparison helpers (compare_tempos, compare_keys, compare_chroma_vectors,
compare_timbre_vectors) live in `similarity.py` so all per-criterion math
stays next to the existing similarity primitives.

See `docs/decisions/0004-multi-criterion-similarity.md` for the design.
"""

from __future__ import annotations

from dataclasses import asdict, dataclass

import numpy as np


# Krumhansl-Schmuckler tonal-strength profiles (Krumhansl 1990, "Cognitive
# Foundations of Musical Pitch"). Only the two base profiles are stored here,
# one for major and one for minor, each holding 12 pitch-class weights in
# unrotated form. The key estimator rotates them to every tonic at call time
# and correlates a measured chroma mean against each rotation to find the
# most likely key.
_KS_MAJOR = np.asarray(
    (
        6.35, 2.23, 3.48, 2.33, 4.38, 4.09,
        2.52, 5.19, 2.39, 3.66, 2.29, 2.88,
    ),
    dtype=np.float32,
)
_KS_MINOR = np.asarray(
    (
        6.33, 2.68, 3.52, 5.38, 2.60, 3.53,
        2.54, 4.75, 3.98, 2.69, 3.34, 3.17,
    ),
    dtype=np.float32,
)
# Pitch-class names in chromatic order; the index is the pitch-class number.
_PITCH_CLASSES = [
    "C", "C#", "D", "D#",
    "E", "F", "F#", "G",
    "G#", "A", "A#", "B",
]


@dataclass
class MirFeatures:
    """Container for one track's classical MIR features.

    Holds a tempo scalar, the estimated key and mode with a confidence
    value, and two small float vectors (chroma and timbre). Every field is
    a plain Python scalar, string, or list, so `to_dict()` output can be
    serialized as JSON and fed back through `from_dict()`.
    """

    tempo_bpm: float
    key: str  # pitch-class name, e.g. "A"
    mode: str  # "major" or "minor"
    key_confidence: float  # 0-1, Krumhansl-Schmuckler correlation strength
    chroma_mean: list  # 12 floats, one per pitch class
    timbre_mean: list  # 26 floats: 13 MFCC means followed by 13 MFCC stddevs

    def to_dict(self) -> dict:
        """Return the fields as a plain dict keyed by field name."""
        as_mapping = asdict(self)
        return as_mapping

    @classmethod
    def from_dict(cls, payload: dict) -> "MirFeatures":
        """Rebuild an instance from a dict such as `to_dict()` produces.

        Every key is required except `key_confidence`, which falls back to
        0.0 when absent. Scalars are coerced with `float`/`str` and vector
        entries are coerced to `float` one by one.
        """
        # Lookups stay in field order so a missing key raises for the same
        # field regardless of how the payload dict is ordered.
        bpm = float(payload["tempo_bpm"])
        tonic = str(payload["key"])
        tonality = str(payload["mode"])
        confidence = float(payload.get("key_confidence", 0.0))
        chroma = list(map(float, payload["chroma_mean"]))
        timbre = list(map(float, payload["timbre_mean"]))
        return cls(
            tempo_bpm=bpm,
            key=tonic,
            mode=tonality,
            key_confidence=confidence,
            chroma_mean=chroma,
            timbre_mean=timbre,
        )


def _estimate_key(chroma_mean: np.ndarray) -> tuple[str, str, float]:
    """Estimate (key, mode, confidence) from a 12-d chroma vector.

    Krumhansl-Schmuckler: the chroma vector is Pearson-correlated against
    12 rotations of the major profile and 12 rotations of the minor
    profile; the rotation with the highest correlation wins. On ties the
    earliest candidate is kept (major before minor, lower shift first).

    Returns:
        key:        pitch-class name of the winning tonic.
        mode:       "major" or "minor".
        confidence: the winning correlation mapped from [-1, 1] to [0, 1].
                    0.0 when the chroma vector has no variance (or is not
                    finite), because the correlation is undefined there.
    """
    cm = np.asarray(chroma_mean, dtype=np.float64)
    cm_centered = cm - cm.mean()
    cm_norm = float(np.sqrt((cm_centered ** 2).sum()))
    if not cm_norm > 0.0:
        # Flat chroma (e.g. an all-zero vector) carries no key information.
        # Report the default key with zero confidence rather than the 0.5
        # that a zero correlation would otherwise map to. `not x > 0` also
        # catches NaN.
        return _PITCH_CLASSES[0], "major", 0.0

    best_r = -1.0
    best_idx = 0
    best_mode = "major"
    for mode_label, profile in (("major", _KS_MAJOR), ("minor", _KS_MINOR)):
        # Rotating a profile permutes its entries without changing its mean
        # or its norm, so center and normalize once per mode instead of
        # once per rotation.
        prof_centered = profile.astype(np.float64)
        prof_centered = prof_centered - prof_centered.mean()
        prof_norm = float(np.sqrt((prof_centered ** 2).sum())) or 1.0
        denom = cm_norm * prof_norm
        for shift in range(12):
            # np.roll moves the tonic weight (index 0) to index `shift`,
            # so `shift` is the pitch class of the candidate tonic.
            rotated = np.roll(prof_centered, shift)
            r = float((cm_centered * rotated).sum() / denom)
            if r > best_r:
                best_r = r
                best_idx = shift
                best_mode = mode_label

    # Pearson correlation ranges [-1, 1]; map to [0, 1] confidence.
    confidence = float(max(0.0, min(1.0, (best_r + 1.0) / 2.0)))
    return _PITCH_CLASSES[best_idx], best_mode, confidence


def compute(wav_mono: np.ndarray, sr: int) -> MirFeatures:
    """Run all four locked MIR features on a mono audio array.

    Args:
        wav_mono: 1-D float audio at any sample rate. The input is cast to
                  float32 and flattened before analysis.
        sr:       sample rate of `wav_mono`.

    Returns:
        MirFeatures dataclass with tempo, key, mode, key_confidence,
        chroma_mean (12-d), timbre_mean (26-d). An empty input yields an
        all-zero payload with key "C", mode "major" and confidence 0.0.

    Cost: ~350 ms on CPU for a 30-second clip.
    """
    wav = np.asarray(wav_mono, dtype=np.float32).reshape(-1)
    if wav.size == 0:
        return MirFeatures(
            tempo_bpm=0.0,
            key="C",
            mode="major",
            key_confidence=0.0,
            chroma_mean=[0.0] * 12,
            timbre_mean=[0.0] * 26,
        )

    # Imported only after the empty-input guard so that the trivial path
    # neither pays for nor depends on loading librosa.
    import librosa

    # --- tempo ----------------------------------------------------------
    # beat_track returns a scalar BPM. librosa 0.10+ returns it as a
    # 1-element ndarray; coerce to float.
    tempo_arr, _beats = librosa.beat.beat_track(y=wav, sr=sr)
    tempo_bpm = float(np.asarray(tempo_arr).flatten()[0])

    # --- chroma --------------------------------------------------------
    # chroma_cens is the smoothed CENS variant; more robust to articulation
    # and tempo variations than basic chroma_stft. 12 pitch-class energies.
    chroma = librosa.feature.chroma_cens(y=wav, sr=sr)
    chroma_mean_raw = chroma.mean(axis=1).astype(np.float32)
    # Normalize to a probability-ish distribution so downstream cosine
    # comparison is scale-invariant. A non-positive sum is left as-is to
    # avoid dividing by zero.
    total = float(chroma_mean_raw.sum())
    chroma_mean = chroma_mean_raw / total if total > 0 else chroma_mean_raw

    # --- key + mode + confidence ---------------------------------------
    key, mode, key_confidence = _estimate_key(chroma_mean)

    # --- MFCC (timbre fingerprint) -------------------------------------
    # 13 MFCC coefficients (standard; the 0th captures overall energy and
    # is sometimes dropped, but we keep it because the mean+std combination
    # carries useful texture information).
    mfcc = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=13)
    timbre_mean = np.concatenate(
        [mfcc.mean(axis=1), mfcc.std(axis=1)],
    ).astype(np.float32)

    return MirFeatures(
        tempo_bpm=tempo_bpm,
        key=key,
        mode=mode,
        key_confidence=key_confidence,
        chroma_mean=[float(v) for v in chroma_mean],
        timbre_mean=[float(v) for v in timbre_mean],
    )