"""
Marketing Mix Model Diffusion (MMM-Diffusion) v2
=================================================
Fixed version addressing:
1. Sales alignment: Added explicit sales reconstruction loss during training
2. Coefficient smoothness: Reduced smoothness weight, added spectral loss,
   increased GT coefficient volatility, added multi-scale temporal loss

Architecture mapping (from Kimodo/GMD):
  Text prompts → Media spend, non-marketing vars, total sales
  Motion/position constraints → Sign constraints (β_media ≥ 0) + prior constraints  
  Root denoiser → Campaign/Geo-level denoiser (aggregate patterns)
  Body denoiser → Channel-level denoiser (per-channel coefficients)
  Skeleton positions/rotations → Time-varying coefficients for sales decomposition
"""

import math
import json
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


# =============================================================================
# 1. SYNTHETIC MMM DATA GENERATOR (increased volatility for richer coefficients)
# =============================================================================

class MMMDataGenerator:
    MEDIA_CHANNELS = ['TV', 'Digital', 'Social', 'Print', 'Radio']
    CONTROL_VARS = ['Seasonality', 'Trend', 'Competitor_Price']
    
    def __init__(self, n_weeks=104, n_geos=1, seed=None):
        self.n_weeks = n_weeks
        self.n_geos = n_geos
        self.n_media = 5
        self.n_ctrl = 3
        self.rng = np.random.RandomState(seed)
        
    def _generate_media_spend(self):
        spend = np.zeros((self.n_weeks, self.n_media))
        t = np.arange(self.n_weeks)
        base_levels = self.rng.uniform(50, 500, size=self.n_media)
        
        for m in range(self.n_media):
            base = base_levels[m] * (1 + 0.2 * np.sin(2 * np.pi * t / 52))
            n_campaigns = self.rng.randint(3, 9)
            for _ in range(n_campaigns):
                start = self.rng.randint(0, self.n_weeks - 4)
                duration = self.rng.randint(1, 6)
                intensity = self.rng.uniform(1.5, 4.0)
                end = min(start + duration, self.n_weeks)
                base[start:end] *= intensity
            spend[:, m] = np.maximum(base + self.rng.normal(0, base_levels[m] * 0.1, self.n_weeks), 0)
        return spend
    
    def _adstock(self, x, alpha):
        result = np.zeros_like(x)
        result[0] = x[0]
        for t in range(1, len(x)):
            result[t] = x[t] + alpha * result[t-1]
        return result
    
    def _hill(self, x, ec50, slope):
        x_safe = np.maximum(x, 0)
        return x_safe**slope / (x_safe**slope + ec50**slope + 1e-10)
    
    def _generate_controls(self):
        t = np.arange(self.n_weeks)
        controls = np.zeros((self.n_weeks, self.n_ctrl))
        controls[:, 0] = (np.sin(2 * np.pi * t / 52) + 
                          0.5 * np.sin(4 * np.pi * t / 52) +
                          0.3 * np.cos(2 * np.pi * t / 52))
        trend = t / self.n_weeks
        controls[:, 1] = trend + 0.5 * trend**2
        price = np.zeros(self.n_weeks)
        price[0] = 1.0
        for i in range(1, self.n_weeks):
            price[i] = 0.95 * price[i-1] + 0.05 * 1.0 + self.rng.normal(0, 0.05)
        controls[:, 2] = price
        return controls
    
    def _sample_true_params(self):
        params = {}
        params['beta_media'] = np.abs(self.rng.normal(0, 0.5, self.n_media)) + 0.05
        params['adstock_alpha'] = self.rng.beta(2, 2, self.n_media)
        params['adstock_alpha'] = np.clip(params['adstock_alpha'], 0.1, 0.95)
        params['hill_ec50'] = np.abs(self.rng.lognormal(0, 0.5, self.n_media)) + 0.1
        params['hill_slope'] = self.rng.uniform(0.5, 3.0, self.n_media)
        params['beta_base'] = self.rng.uniform(500, 2000)
        params['beta_ctrl'] = self.rng.normal(0, 50, self.n_ctrl)
        params['noise_std'] = self.rng.uniform(20, 100)
        return params
    
    def _make_time_varying(self, base_coeff, n_weeks, volatility=0.1):
        """
        FIX: Increased default volatility and added regime-change jumps
        to produce more realistic, less smooth coefficients.
        """
        z = np.zeros(n_weeks)
        # OU process with higher volatility
        mean_reversion = 0.85  # slightly less mean-reverting (was 0.9)
        for t in range(1, n_weeks):
            z[t] = mean_reversion * z[t-1] + self.rng.normal(0, volatility)
        
        # Add occasional regime jumps (structural breaks)
        n_jumps = self.rng.randint(0, 4)
        for _ in range(n_jumps):
            jump_t = self.rng.randint(5, n_weeks - 5)
            jump_size = self.rng.normal(0, volatility * 3)
            z[jump_t:] += jump_size
            
        return base_coeff * np.exp(z)
    
    def generate_single(self):
        spend = self._generate_media_spend()
        controls = self._generate_controls()
        params = self._sample_true_params()
        
        transformed_media = np.zeros_like(spend)
        for m in range(self.n_media):
            adstocked = self._adstock(spend[:, m], params['adstock_alpha'][m])
            adstocked_norm = adstocked / (np.percentile(adstocked, 90) + 1e-10)
            transformed_media[:, m] = self._hill(
                adstocked_norm, params['hill_ec50'][m], params['hill_slope'][m]
            )
        
        # FIX: Higher volatility for time-varying coefficients
        tv_coeffs = np.zeros((self.n_weeks, self.n_media + self.n_ctrl))
        for m in range(self.n_media):
            tv_coeffs[:, m] = self._make_time_varying(
                params['beta_media'][m], self.n_weeks, volatility=0.12  # was 0.05
            )
            tv_coeffs[:, m] = np.maximum(tv_coeffs[:, m], 0.01)
        for c in range(self.n_ctrl):
            tv_coeffs[:, self.n_media + c] = self._make_time_varying(
                params['beta_ctrl'][c], self.n_weeks, volatility=0.08  # was 0.03
            )
        
        contributions = np.zeros((self.n_weeks, self.n_media + self.n_ctrl))
        for m in range(self.n_media):
            contributions[:, m] = tv_coeffs[:, m] * transformed_media[:, m]
        for c in range(self.n_ctrl):
            contributions[:, self.n_media + c] = tv_coeffs[:, self.n_media + c] * controls[:, c]
        
        base = params['beta_base']
        noise = self.rng.normal(0, params['noise_std'], self.n_weeks)
        total_sales = base + contributions.sum(axis=1) + noise
        total_sales = np.maximum(total_sales, 0)
        
        return {
            'media_spend': spend,
            'controls': controls,
            'total_sales': total_sales,
            'true_coefficients': tv_coeffs,
            'true_contributions': contributions,
            'transformed_media': transformed_media,
            'base_sales': np.full(self.n_weeks, base),
            'true_params': params
        }
    
    def generate_dataset(self, n_samples):
        samples = []
        for i in range(n_samples):
            self.rng = np.random.RandomState(self.rng.randint(0, 2**31))
            samples.append(self.generate_single())
        return samples


# =============================================================================
# 2. DATASET CLASS - now also stores transformed media for sales reconstruction
# =============================================================================

class MMMDiffusionDataset(Dataset):
    def __init__(self, samples, normalize=True):
        self.samples = samples
        self.normalize = normalize
        self.n_media = 5
        self.n_ctrl = 3
        self.n_channels = self.n_media + self.n_ctrl
        
        if normalize:
            all_cond = np.stack([
                np.concatenate([s['media_spend'], s['controls'], s['total_sales'][:, None]], axis=1)
                for s in samples
            ])
            all_coeff = np.stack([s['true_coefficients'] for s in samples])
            
            self.cond_mean = all_cond.mean(axis=(0, 1))
            self.cond_std = all_cond.std(axis=(0, 1)) + 1e-8
            self.coeff_mean = all_coeff.mean(axis=(0, 1))
            self.coeff_std = all_coeff.std(axis=(0, 1)) + 1e-8
            
            media_coeffs = all_coeff[:, :, :self.n_media]
            self.media_log_mean = np.log(media_coeffs + 1e-8).mean(axis=(0, 1))
            self.media_log_std = np.log(media_coeffs + 1e-8).std(axis=(0, 1)) + 1e-8
            
            # Store sales normalization for reconstruction loss
            all_sales = np.stack([s['total_sales'] for s in samples])
            self.sales_mean = float(all_sales.mean())
            self.sales_std = float(all_sales.std()) + 1e-8
            
            # Store transformed media normalization
            all_trans = np.stack([s['transformed_media'] for s in samples])
            self.trans_media_mean = all_trans.mean(axis=(0, 1))
            self.trans_media_std = all_trans.std(axis=(0, 1)) + 1e-8
            
            # Store base sales statistics
            all_base = np.array([s['base_sales'][0] for s in samples])
            self.base_mean = float(all_base.mean())
            self.base_std = float(all_base.std()) + 1e-8
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        s = self.samples[idx]
        
        cond = np.concatenate([
            s['media_spend'], s['controls'], s['total_sales'][:, None]
        ], axis=1).astype(np.float32)
        
        coeffs = s['true_coefficients'].astype(np.float32)
        trans_media = s['transformed_media'].astype(np.float32)
        controls = s['controls'].astype(np.float32)
        total_sales = s['total_sales'].astype(np.float32)
        base_sales = s['base_sales'].astype(np.float32)
        contributions = s['true_contributions'].astype(np.float32)
        
        if self.normalize:
            cond = (cond - self.cond_mean) / self.cond_std
            
            log_media = np.log(coeffs[:, :self.n_media] + 1e-8)
            log_media = (log_media - self.media_log_mean) / self.media_log_std
            ctrl = (coeffs[:, self.n_media:] - self.coeff_mean[self.n_media:]) / self.coeff_std[self.n_media:]
            coeffs = np.concatenate([log_media, ctrl], axis=1)
        
        return {
            'conditioning': torch.tensor(cond, dtype=torch.float32),
            'coefficients': torch.tensor(coeffs, dtype=torch.float32),
            'transformed_media': torch.tensor(trans_media, dtype=torch.float32),
            'controls': torch.tensor(controls, dtype=torch.float32),
            'total_sales': torch.tensor(total_sales, dtype=torch.float32),
            'base_sales': torch.tensor(base_sales, dtype=torch.float32),
            'contributions': torch.tensor(contributions, dtype=torch.float32),
        }
    
    def decode_coefficients(self, coeffs_normalized):
        if not self.normalize:
            return coeffs_normalized
        coeffs = coeffs_normalized.clone()
        media_log_mean = torch.tensor(self.media_log_mean, device=coeffs.device, dtype=coeffs.dtype)
        media_log_std = torch.tensor(self.media_log_std, device=coeffs.device, dtype=coeffs.dtype)
        coeffs[:, :, :self.n_media] = torch.exp(
            coeffs[:, :, :self.n_media] * media_log_std + media_log_mean
        )
        coeff_mean = torch.tensor(self.coeff_mean[self.n_media:], device=coeffs.device, dtype=coeffs.dtype)
        coeff_std = torch.tensor(self.coeff_std[self.n_media:], device=coeffs.device, dtype=coeffs.dtype)
        coeffs[:, :, self.n_media:] = coeffs[:, :, self.n_media:] * coeff_std + coeff_mean
        return coeffs
    
    def decode_media_coefficients_differentiable(self, coeffs_norm_media):
        """Decode only media coefficients, keeping gradients (for sales loss)."""
        media_log_mean = torch.tensor(self.media_log_mean, device=coeffs_norm_media.device, dtype=coeffs_norm_media.dtype)
        media_log_std = torch.tensor(self.media_log_std, device=coeffs_norm_media.device, dtype=coeffs_norm_media.dtype)
        return torch.exp(coeffs_norm_media * media_log_std + media_log_mean)
    
    def decode_ctrl_coefficients_differentiable(self, coeffs_norm_ctrl):
        """Decode only control coefficients, keeping gradients."""
        coeff_mean = torch.tensor(self.coeff_mean[self.n_media:], device=coeffs_norm_ctrl.device, dtype=coeffs_norm_ctrl.dtype)
        coeff_std = torch.tensor(self.coeff_std[self.n_media:], device=coeffs_norm_ctrl.device, dtype=coeffs_norm_ctrl.dtype)
        return coeffs_norm_ctrl * coeff_std + coeff_mean


# =============================================================================
# 3. DIFFUSION NOISE SCHEDULE
# =============================================================================

def cosine_beta_schedule(T, s=0.008):
    t = torch.arange(T + 1, dtype=torch.float64)
    f = torch.cos((t / T + s) / (1 + s) * math.pi / 2) ** 2
    alphas_cumprod = f / f[0]
    betas = 1 - alphas_cumprod[1:] / alphas_cumprod[:-1]
    return torch.clamp(betas, 0, 0.999).float()


class DiffusionSchedule:
    def __init__(self, T=1000):
        self.T = T
        self.betas = cosine_beta_schedule(T)
        self.alphas = 1.0 - self.betas
        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas = torch.sqrt(1.0 / self.alphas)
        
        self.alphas_cumprod_prev = F.pad(self.alphas_cumprod[:-1], (1, 0), value=1.0)
        self.posterior_variance = (
            self.betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_log_variance_clipped = torch.log(
            torch.clamp(self.posterior_variance, min=1e-20)
        )
        self.posterior_mean_coef1 = (
            self.betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev) * torch.sqrt(self.alphas) / (1.0 - self.alphas_cumprod)
        )
    
    def to(self, device):
        for attr in ['betas', 'alphas', 'alphas_cumprod', 'sqrt_alphas_cumprod',
                      'sqrt_one_minus_alphas_cumprod', 'sqrt_recip_alphas',
                      'alphas_cumprod_prev', 'posterior_variance',
                      'posterior_log_variance_clipped', 'posterior_mean_coef1',
                      'posterior_mean_coef2']:
            setattr(self, attr, getattr(self, attr).to(device))
        return self
    
    def q_sample(self, x_0, t, noise=None):
        if noise is None:
            noise = torch.randn_like(x_0)
        sqrt_alpha = self.sqrt_alphas_cumprod[t].view(-1, 1, 1)
        sqrt_one_minus_alpha = self.sqrt_one_minus_alphas_cumprod[t].view(-1, 1, 1)
        return sqrt_alpha * x_0 + sqrt_one_minus_alpha * noise
    
    def posterior_mean(self, x_0_pred, x_t, t):
        coef1 = self.posterior_mean_coef1[t].view(-1, 1, 1)
        coef2 = self.posterior_mean_coef2[t].view(-1, 1, 1)
        return coef1 * x_0_pred + coef2 * x_t


# =============================================================================
# 4. DENOISER NETWORKS
# =============================================================================

class SinusoidalPositionEmbeddings(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
    
    def forward(self, t):
        device = t.device
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = t[:, None].float() * emb[None, :]
        return torch.cat([emb.sin(), emb.cos()], dim=-1)


class TemporalTransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
    
    def forward(self, x):
        h = self.norm1(x)
        h = x + self.attn(h, h, h)[0]
        h = h + self.ff(self.norm2(h))
        return h


class CampaignDenoiser(nn.Module):
    """Stage 1: Campaign/Geo-level Denoiser — denoises aggregate patterns."""
    
    def __init__(self, n_agg_channels=3, cond_dim=4, d_model=256, nhead=4, n_layers=4, T_diff=1000):
        super().__init__()
        self.d_model = d_model
        self.time_embed = nn.Sequential(
            SinusoidalPositionEmbeddings(d_model),
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model),
        )
        self.cond_proj = nn.Linear(cond_dim, d_model)
        self.input_proj = nn.Linear(n_agg_channels, d_model)
        self.pos_embed = nn.Parameter(torch.randn(1, 256, d_model) * 0.02)
        self.blocks = nn.ModuleList([
            TemporalTransformerBlock(d_model, nhead) for _ in range(n_layers)
        ])
        self.output_proj = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Linear(d_model // 2, n_agg_channels),
        )
    
    def forward(self, x_t, t, cond):
        B, T_seq, _ = x_t.shape
        t_emb = self.time_embed(t)
        h_x = self.input_proj(x_t)
        h_c = self.cond_proj(cond)
        h = h_x + h_c + t_emb.unsqueeze(1) + self.pos_embed[:, :T_seq, :]
        for block in self.blocks:
            h = block(h)
        return self.output_proj(h)


class ChannelDenoiser(nn.Module):
    """Stage 2: Channel-level Denoiser — denoises per-channel coefficients."""
    
    def __init__(self, n_channels=8, n_media=5, n_agg=3, d_model=384, nhead=8, n_layers=6, T_diff=1000):
        super().__init__()
        self.d_model = d_model
        self.n_media = n_media
        self.n_channels = n_channels
        
        self.time_embed = nn.Sequential(
            SinusoidalPositionEmbeddings(d_model),
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Linear(d_model, d_model),
        )
        self.input_proj = nn.Linear(n_channels, d_model)
        self.campaign_proj = nn.Linear(n_agg, d_model)
        self.spend_proj = nn.Linear(n_media, d_model)
        self.sales_proj = nn.Linear(1, d_model)
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, batch_first=True)
        self.cross_norm = nn.LayerNorm(d_model)
        self.pos_embed = nn.Parameter(torch.randn(1, 256, d_model) * 0.02)
        self.blocks = nn.ModuleList([
            TemporalTransformerBlock(d_model, nhead) for _ in range(n_layers)
        ])
        self.output_proj = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Linear(d_model // 2, n_channels),
        )
    
    def forward(self, x_t, t, campaign_ctx, media_spend, total_sales):
        B, T_seq, _ = x_t.shape
        t_emb = self.time_embed(t)
        h_x = self.input_proj(x_t)
        h_camp = self.campaign_proj(campaign_ctx)
        h_spend = self.spend_proj(media_spend)
        h_sales = self.sales_proj(total_sales)
        cond_ctx = h_camp + h_spend + h_sales
        h = h_x + t_emb.unsqueeze(1) + self.pos_embed[:, :T_seq, :]
        h_normed = self.cross_norm(h)
        h = h + self.cross_attn(h_normed, cond_ctx, cond_ctx)[0]
        for block in self.blocks:
            h = block(h)
        return self.output_proj(h)


# =============================================================================
# 5. MMM DIFFUSION MODEL — with sales reconstruction loss + spectral loss
# =============================================================================

class MMMDiffusionModel(nn.Module):
    def __init__(self, n_media=5, n_ctrl=3, d_model_campaign=256, d_model_channel=384,
                 n_layers_campaign=4, n_layers_channel=6, T_diff=1000):
        super().__init__()
        self.n_media = n_media
        self.n_ctrl = n_ctrl
        self.n_channels = n_media + n_ctrl
        self.T_diff = T_diff
        self.n_agg = 3
        
        self.campaign_denoiser = CampaignDenoiser(
            n_agg_channels=self.n_agg, cond_dim=n_ctrl + 1,
            d_model=d_model_campaign, nhead=4, n_layers=n_layers_campaign, T_diff=T_diff,
        )
        self.channel_denoiser = ChannelDenoiser(
            n_channels=self.n_channels, n_media=n_media, n_agg=self.n_agg,
            d_model=d_model_channel, nhead=8, n_layers=n_layers_channel, T_diff=T_diff,
        )
        self.coeff_to_agg = nn.Linear(self.n_channels, self.n_agg)
        self.agg_to_coeff_init = nn.Linear(self.n_agg, self.n_channels)
        self.schedule = DiffusionSchedule(T_diff)
        
        # Learnable base sales predictor (from conditioning)
        self.base_predictor = nn.Sequential(
            nn.Linear(n_ctrl + 1, 64),
            nn.GELU(),
            nn.Linear(64, 1),
        )
    
    def compute_aggregate(self, coefficients):
        return self.coeff_to_agg(coefficients)
    
    def _compute_sales_from_coeffs(self, coeffs_pred, batch, dataset_ref):
        """
        FIX #1: Compute predicted sales from predicted coefficients.
        This creates a differentiable path: coeffs -> contributions -> total sales.
        """
        # Decode coefficients to original scale (differentiable)
        media_pred_norm = coeffs_pred[:, :, :self.n_media]
        ctrl_pred_norm = coeffs_pred[:, :, self.n_media:]
        
        media_coeffs = dataset_ref.decode_media_coefficients_differentiable(media_pred_norm)
        ctrl_coeffs = dataset_ref.decode_ctrl_coefficients_differentiable(ctrl_pred_norm)
        
        # Compute contributions
        trans_media = batch['transformed_media']  # (B, T, 5)
        controls = batch['controls']  # (B, T, 3)
        
        media_contributions = media_coeffs * trans_media  # (B, T, 5)
        ctrl_contributions = ctrl_coeffs * controls  # (B, T, 3)
        
        total_contributions = media_contributions.sum(dim=-1) + ctrl_contributions.sum(dim=-1)  # (B, T)
        
        # Predict base sales from conditioning
        cond = batch['conditioning']
        stage1_cond = torch.cat([
            cond[:, :, self.n_media:self.n_media + self.n_ctrl],
            cond[:, :, -1:]
        ], dim=-1)
        base_pred = self.base_predictor(stage1_cond).squeeze(-1)  # (B, T)
        
        pred_sales = base_pred + total_contributions
        return pred_sales
    
    def _spectral_loss(self, pred, target):
        """
        FIX #2: Spectral (frequency-domain) loss to preserve temporal variation.
        Uses log-magnitude to keep scale comparable to other losses.
        Weights higher frequencies more to fight smoothing.
        """
        pred_fft = torch.fft.rfft(pred, dim=1)
        target_fft = torch.fft.rfft(target, dim=1)
        
        # Log-magnitude for scale normalization (brings to ~O(1) range)
        pred_mag = torch.log1p(torch.abs(pred_fft))
        target_mag = torch.log1p(torch.abs(target_fft))
        
        # Weight higher frequencies more to fight smoothing
        n_freq = pred_mag.shape[1]
        freq_weights = torch.linspace(1.0, 3.0, n_freq, device=pred.device)
        freq_weights = freq_weights.view(1, -1, 1)
        
        return F.mse_loss(pred_mag * freq_weights, target_mag * freq_weights)
    
    def _multi_scale_temporal_loss(self, pred, target):
        """
        Multi-scale temporal difference loss. Matches first AND second order
        temporal derivatives to capture both trends and curvature.
        """
        # First-order differences (velocity)
        d1_pred = pred[:, 1:, :] - pred[:, :-1, :]
        d1_true = target[:, 1:, :] - target[:, :-1, :]
        loss_d1 = F.mse_loss(d1_pred, d1_true)
        
        # Second-order differences (acceleration / curvature)
        d2_pred = d1_pred[:, 1:, :] - d1_pred[:, :-1, :]
        d2_true = d1_true[:, 1:, :] - d1_true[:, :-1, :]
        loss_d2 = F.mse_loss(d2_pred, d2_true)
        
        return loss_d1 + 0.5 * loss_d2
    
    def forward_train(self, batch, dataset_ref=None, epoch=0, total_epochs=80):
        cond = batch['conditioning']
        coeffs = batch['coefficients']
        B, T_seq, _ = coeffs.shape
        device = coeffs.device
        
        media_spend = cond[:, :, :self.n_media]
        controls = cond[:, :, self.n_media:self.n_media + self.n_ctrl]
        total_sales = cond[:, :, -1:]
        stage1_cond = torch.cat([controls, total_sales], dim=-1)
        
        with torch.no_grad():
            agg_target = self.coeff_to_agg(coeffs)
        
        # ---- Stage 1: Campaign Denoiser ----
        t1 = torch.randint(0, self.T_diff, (B,), device=device)
        noise1 = torch.randn_like(agg_target)
        agg_noisy = self.schedule.q_sample(agg_target, t1, noise1)
        agg_pred = self.campaign_denoiser(agg_noisy, t1, stage1_cond)
        loss_campaign = F.mse_loss(agg_pred, agg_target)
        
        # ---- Stage 2: Channel Denoiser ----
        # Uniform timestep sampling (removed biased sampling which hurt learning)
        t2 = torch.randint(0, self.T_diff, (B,), device=device)
        
        noise2 = torch.randn_like(coeffs)
        coeffs_noisy = self.schedule.q_sample(coeffs, t2, noise2)
        campaign_ctx = agg_target.detach()
        
        coeffs_pred = self.channel_denoiser(
            coeffs_noisy, t2, campaign_ctx, media_spend, total_sales
        )
        
        loss_channel = F.mse_loss(coeffs_pred, coeffs)
        
        # ---- Warmup schedule for auxiliary losses ----
        # Phase 1 (epochs 0-warmup): focus on core denoising (channel + campaign)
        # Phase 2 (epochs warmup+): gradually add sales, contrib, spectral losses
        warmup_epochs = total_epochs // 4  # first 25% is warmup
        aux_weight = max(0.0, min(1.0, (epoch - warmup_epochs) / max(warmup_epochs, 1)))
        
        # ---- Sales reconstruction loss (only after warmup) ----
        loss_sales = torch.tensor(0.0, device=device)
        if dataset_ref is not None and aux_weight > 0:
            # Only compute for VERY low noise (t < T/10) where prediction is accurate
            low_noise_mask = t2 < (self.T_diff // 10)
            if low_noise_mask.any():
                pred_sales = self._compute_sales_from_coeffs(coeffs_pred, batch, dataset_ref)
                actual_sales = batch['total_sales']
                # Use relative error (scale-invariant)
                scale = actual_sales[low_noise_mask].abs().mean() + 1e-8
                loss_sales = F.mse_loss(
                    pred_sales[low_noise_mask] / scale,
                    actual_sales[low_noise_mask] / scale,
                )
        
        # ---- Spectral loss ----
        loss_spectral = self._spectral_loss(coeffs_pred, coeffs)
        
        # ---- Multi-scale temporal loss ----
        loss_temporal = self._multi_scale_temporal_loss(coeffs_pred, coeffs)
        
        # ---- Contribution matching loss (only after warmup) ----
        loss_contrib = torch.tensor(0.0, device=device)
        if dataset_ref is not None and aux_weight > 0:
            low_noise_mask = t2 < (self.T_diff // 10)
            if low_noise_mask.any():
                media_coeffs = dataset_ref.decode_media_coefficients_differentiable(
                    coeffs_pred[low_noise_mask, :, :self.n_media]
                )
                ctrl_coeffs = dataset_ref.decode_ctrl_coefficients_differentiable(
                    coeffs_pred[low_noise_mask, :, self.n_media:]
                )
                pred_contrib_media = media_coeffs * batch['transformed_media'][low_noise_mask]
                pred_contrib_ctrl = ctrl_coeffs * batch['controls'][low_noise_mask]
                pred_contrib = torch.cat([pred_contrib_media, pred_contrib_ctrl], dim=-1)
                true_contrib = batch['contributions'][low_noise_mask]
                contrib_scale = true_contrib.abs().mean() + 1e-8
                loss_contrib = F.mse_loss(pred_contrib / contrib_scale, true_contrib / contrib_scale)
        
        # Sign loss (soft positivity for media in log-space)
        media_pred_log = coeffs_pred[:, :, :self.n_media]
        loss_sign = F.relu(-media_pred_log - 5.0).mean()
        
        # ---- Total loss with warmup-gated auxiliary losses ----
        # Core losses: always active (coefficient matching is primary)
        # Aux losses: ramp in after warmup with controlled weights
        loss = (
            1.0 * loss_campaign +
            2.0 * loss_channel +          # PRIMARY: coefficient matching (doubled weight)
            0.5 * loss_spectral +          # Anti-smoothing (always active, log-scale ~O(1))
            0.1 * loss_temporal +           # Temporal dynamics
            aux_weight * 0.2 * loss_sales +     # Sales alignment (ramped in)
            aux_weight * 0.2 * loss_contrib +   # Contribution matching (ramped in)
            0.01 * loss_sign
        )
        
        return {
            'loss': loss,
            'loss_campaign': loss_campaign.item(),
            'loss_channel': loss_channel.item(),
            'loss_sales': loss_sales.item() if isinstance(loss_sales, torch.Tensor) else loss_sales,
            'loss_spectral': loss_spectral.item(),
            'loss_temporal': loss_temporal.item(),
            'loss_contrib': loss_contrib.item() if isinstance(loss_contrib, torch.Tensor) else loss_contrib,
            'loss_sign': loss_sign.item(),
        }
    
    @torch.no_grad()
    def sample(self, conditioning, n_steps=None, constraint_every_k=10, guidance_scale=1.0):
        B, T_seq, _ = conditioning.shape
        device = conditioning.device
        
        media_spend = conditioning[:, :, :self.n_media]
        controls = conditioning[:, :, self.n_media:self.n_media + self.n_ctrl]
        total_sales = conditioning[:, :, -1:]
        stage1_cond = torch.cat([controls, total_sales], dim=-1)
        
        T_diff = n_steps or self.T_diff
        
        # Stage 1: Denoise aggregate patterns
        z_t = torch.randn(B, T_seq, self.n_agg, device=device)
        for t in reversed(range(T_diff)):
            t_batch = torch.full((B,), t, device=device, dtype=torch.long)
            z_0_pred = self.campaign_denoiser(z_t, t_batch, stage1_cond)
            if t > 0:
                mean = self.schedule.posterior_mean(z_0_pred, z_t, t_batch)
                var = self.schedule.posterior_variance[t]
                noise = torch.randn_like(z_t)
                z_t = mean + torch.sqrt(var) * noise
            else:
                z_t = z_0_pred
        
        campaign_ctx = z_t
        
        # Stage 2: Denoise channel coefficients  
        x_t = torch.randn(B, T_seq, self.n_channels, device=device)
        for t in reversed(range(T_diff)):
            t_batch = torch.full((B,), t, device=device, dtype=torch.long)
            x_0_pred = self.channel_denoiser(
                x_t, t_batch, campaign_ctx, media_spend, total_sales
            )
            
            # PhysDiff-style constraint projection
            if t % constraint_every_k == 0:
                x_0_pred[:, :, :self.n_media] = torch.clamp(
                    x_0_pred[:, :, :self.n_media], min=-8.0, max=8.0
                )
            
            if t > 0:
                mean = self.schedule.posterior_mean(x_0_pred, x_t, t_batch)
                var = self.schedule.posterior_variance[t]
                noise = torch.randn_like(x_t)
                x_t = mean + torch.sqrt(var) * noise
            else:
                x_t = x_0_pred
        
        return x_t
    
    @torch.no_grad()
    def sample_ddim(self, conditioning, n_steps=50, constraint_every_k=5, eta=0.0):
        """
        DDIM sampling — faster and more deterministic than DDPM.
        eta=0: fully deterministic, eta=1: equivalent to DDPM.
        """
        B, T_seq, _ = conditioning.shape
        device = conditioning.device
        
        media_spend = conditioning[:, :, :self.n_media]
        controls = conditioning[:, :, self.n_media:self.n_media + self.n_ctrl]
        total_sales = conditioning[:, :, -1:]
        stage1_cond = torch.cat([controls, total_sales], dim=-1)
        
        # Create sub-sequence of timesteps for DDIM
        step_size = max(self.T_diff // n_steps, 1)
        timesteps = list(range(0, self.T_diff, step_size))
        timesteps = list(reversed(timesteps))
        
        # Stage 1: DDIM denoise aggregate
        z_t = torch.randn(B, T_seq, self.n_agg, device=device)
        for i, t in enumerate(timesteps):
            t_batch = torch.full((B,), t, device=device, dtype=torch.long)
            z_0_pred = self.campaign_denoiser(z_t, t_batch, stage1_cond)
            
            if i < len(timesteps) - 1:
                t_next = timesteps[i + 1]
                alpha_t = self.schedule.alphas_cumprod[t]
                alpha_next = self.schedule.alphas_cumprod[t_next]
                
                # DDIM update
                pred_noise = (z_t - torch.sqrt(alpha_t) * z_0_pred) / torch.sqrt(1 - alpha_t)
                sigma = eta * torch.sqrt((1 - alpha_next) / (1 - alpha_t)) * torch.sqrt(1 - alpha_t / alpha_next)
                
                z_t = (torch.sqrt(alpha_next) * z_0_pred + 
                       torch.sqrt(1 - alpha_next - sigma**2) * pred_noise +
                       sigma * torch.randn_like(z_t))
            else:
                z_t = z_0_pred
        
        campaign_ctx = z_t
        
        # Stage 2: DDIM denoise channel coefficients
        x_t = torch.randn(B, T_seq, self.n_channels, device=device)
        for i, t in enumerate(timesteps):
            t_batch = torch.full((B,), t, device=device, dtype=torch.long)
            x_0_pred = self.channel_denoiser(
                x_t, t_batch, campaign_ctx, media_spend, total_sales
            )
            
            # PhysDiff projection
            if t % constraint_every_k == 0:
                x_0_pred[:, :, :self.n_media] = torch.clamp(
                    x_0_pred[:, :, :self.n_media], min=-8.0, max=8.0
                )
            
            if i < len(timesteps) - 1:
                t_next = timesteps[i + 1]
                alpha_t = self.schedule.alphas_cumprod[t]
                alpha_next = self.schedule.alphas_cumprod[t_next]
                
                pred_noise = (x_t - torch.sqrt(alpha_t) * x_0_pred) / torch.sqrt(1 - alpha_t)
                sigma = eta * torch.sqrt((1 - alpha_next) / (1 - alpha_t)) * torch.sqrt(1 - alpha_t / alpha_next)
                
                x_t = (torch.sqrt(alpha_next) * x_0_pred +
                       torch.sqrt(1 - alpha_next - sigma**2) * pred_noise +
                       sigma * torch.randn_like(x_t))
            else:
                x_t = x_0_pred
        
        return x_t


# =============================================================================
# 6. TRAINING LOOP
# =============================================================================

def train_mmm_diffusion(
    model, dataset, 
    n_epochs=50, batch_size=16, lr=1e-4,
    device='cpu', log_every=50, save_path='mmm_diffusion_model.pt'
):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    
    # Warmup + cosine decay
    warmup_steps = len(dataloader) * (n_epochs // 10)  # 10% warmup
    total_steps = len(dataloader) * n_epochs
    
    def lr_lambda(step):
        if step < warmup_steps:
            return step / max(warmup_steps, 1)
        progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
        return 0.5 * (1 + math.cos(math.pi * progress))
    
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
    
    model = model.to(device)
    model.schedule = model.schedule.to(device)
    
    history = {
        'loss': [], 'loss_campaign': [], 'loss_channel': [],
        'loss_sales': [], 'loss_spectral': [], 'loss_temporal': [],
        'loss_contrib': [], 'loss_sign': [],
    }
    
    print(f"\nTraining MMM-Diffusion Model v2")
    print(f"  Device: {device}")
    print(f"  Samples: {len(dataset)}, Batch size: {batch_size}")
    print(f"  Epochs: {n_epochs}, LR: {lr}")
    print(f"  Model params: {sum(p.numel() for p in model.parameters()):,}")
    print(f"  Diffusion steps: {model.T_diff}")
    print(f"  NEW losses: sales_recon, spectral, contribution_match")
    print("-" * 70)
    
    step = 0
    best_loss = float('inf')
    
    for epoch in range(n_epochs):
        model.train()
        epoch_losses = {k: [] for k in history}
        
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            
            losses = model.forward_train(batch, dataset_ref=dataset, epoch=epoch, total_epochs=n_epochs)
            
            optimizer.zero_grad()
            losses['loss'].backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()  # step-level scheduling for warmup
            
            for k in history:
                val = losses[k].item() if isinstance(losses[k], torch.Tensor) else losses[k]
                epoch_losses[k].append(val)
            
            step += 1
            if step % log_every == 0:
                avg = {k: np.mean(v[-log_every:]) for k, v in epoch_losses.items() if v}
                print(f"  Step {step:5d} | loss={avg['loss']:.4f} "
                      f"camp={avg['loss_campaign']:.4f} chan={avg['loss_channel']:.4f} "
                      f"sales={avg.get('loss_sales', 0):.4f} spec={avg.get('loss_spectral', 0):.4f} "
                      f"temp={avg.get('loss_temporal', 0):.4f} contr={avg.get('loss_contrib', 0):.4f}")
        
        # scheduler steps per-batch above
        
        avg = {k: np.mean(v) for k, v in epoch_losses.items() if v}
        for k, v in avg.items():
            history[k].append(v)
        
        if avg['loss'] < best_loss:
            best_loss = avg['loss']
            torch.save({
                'model_state_dict': model.state_dict(),
                'history': history,
                'config': {
                    'n_media': model.n_media, 'n_ctrl': model.n_ctrl, 'T_diff': model.T_diff,
                }
            }, save_path.replace('.pt', '_best.pt'))
        
        print(f"Epoch {epoch+1:3d}/{n_epochs} | loss={avg['loss']:.4f} "
              f"camp={avg['loss_campaign']:.4f} chan={avg['loss_channel']:.4f} "
              f"sales={avg.get('loss_sales', 0):.4f} spec={avg.get('loss_spectral', 0):.4f} "
              f"lr={scheduler.get_last_lr()[0]:.6f}")
    
    torch.save({
        'model_state_dict': model.state_dict(),
        'history': history,
        'config': {
            'n_media': model.n_media, 'n_ctrl': model.n_ctrl, 'T_diff': model.T_diff,
        }
    }, save_path)
    print(f"\nModel saved to {save_path}")
    
    return history


# =============================================================================
# 7. SALES DECOMPOSITION
# =============================================================================

def decompose_sales(coefficients, media_spend, controls, transformed_media=None, base_sales=None):
    """
    FIX: Use transformed media (adstock+Hill) if available for accurate decomposition.
    """
    T, n_total = coefficients.shape
    n_media = 5
    
    contributions = {}
    total_media = np.zeros(T)
    
    for m in range(n_media):
        name = MMMDataGenerator.MEDIA_CHANNELS[m]
        if transformed_media is not None:
            feature = transformed_media[:, m]
        else:
            spend = media_spend[:, m]
            feature = spend / (np.percentile(spend, 90) + 1e-10)
        
        contrib = coefficients[:, m] * feature
        contributions[name] = contrib
        total_media += contrib
    
    total_ctrl = np.zeros(T)
    for c in range(3):
        name = MMMDataGenerator.CONTROL_VARS[c]
        contrib = coefficients[:, n_media + c] * controls[:, c]
        contributions[name] = contrib
        total_ctrl += contrib
    
    contributions['Total_Media'] = total_media
    contributions['Total_Controls'] = total_ctrl
    
    if base_sales is not None:
        contributions['Predicted_Sales'] = base_sales + total_media + total_ctrl
    else:
        contributions['Predicted_Sales'] = total_media + total_ctrl
    
    return contributions


# =============================================================================
# 8. VISUALIZATION
# =============================================================================

def plot_training_history(history, save_path='training_history.png'):
    keys_to_plot = ['loss', 'loss_campaign', 'loss_channel', 'loss_sales',
                    'loss_spectral', 'loss_temporal', 'loss_contrib']
    keys_to_plot = [k for k in keys_to_plot if k in history and len(history[k]) > 0]
    
    n_plots = len(keys_to_plot)
    cols = 3
    rows = (n_plots + cols - 1) // cols
    fig, axes = plt.subplots(rows, cols, figsize=(16, 4 * rows))
    axes = axes.flatten()
    
    for i, key in enumerate(keys_to_plot):
        values = history[key]
        axes[i].plot(values, linewidth=1.5)
        axes[i].set_title(f'{key}', fontsize=12)
        axes[i].set_xlabel('Epoch')
        axes[i].set_ylabel('Loss')
        axes[i].grid(True, alpha=0.3)
        if min(values) > 0:
            axes[i].set_yscale('log')
    
    for i in range(len(keys_to_plot), len(axes)):
        axes[i].set_visible(False)
    
    plt.suptitle('MMM-Diffusion v2 Training History', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Training history plot saved to {save_path}")


def plot_coefficient_comparison(true_coeffs, pred_coeffs, channel_names, save_path='coeff_comparison.png'):
    n_channels = true_coeffs.shape[1]
    fig, axes = plt.subplots(n_channels, 1, figsize=(14, 2.5 * n_channels))
    
    for i, (ax, name) in enumerate(zip(axes, channel_names)):
        ax.plot(true_coeffs[:, i], 'b-', label='Ground Truth', linewidth=1.5)
        ax.plot(pred_coeffs[:, i], 'r--', label='Predicted', linewidth=1.5, alpha=0.8)
        ax.set_title(f'{name} — Time-Varying Coefficient', fontsize=11)
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)
        if i < 5:
            ax.axhline(y=0, color='gray', linestyle=':', alpha=0.5)
            ax.set_ylabel('β (≥0)')
        else:
            ax.set_ylabel('β')
    
    axes[-1].set_xlabel('Week')
    plt.suptitle('MMM-Diffusion v2: Coefficient Prediction Quality', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Coefficient comparison plot saved to {save_path}")


def plot_sales_decomposition(contributions, total_sales, save_path='sales_decomposition.png'):
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))
    
    ax = axes[0]
    weeks = np.arange(len(total_sales))
    media_names = MMMDataGenerator.MEDIA_CHANNELS
    colors = plt.cm.Set2(np.linspace(0, 1, len(media_names)))
    
    bottom = np.zeros(len(total_sales))
    for name, color in zip(media_names, colors):
        vals = np.maximum(contributions[name], 0)
        ax.fill_between(weeks, bottom, bottom + vals, alpha=0.7, label=name, color=color)
        bottom += vals
    
    ax.plot(weeks, total_sales, 'k-', linewidth=2, label='Total Sales', alpha=0.8)
    ax.set_title('Sales Decomposition: Media Channel Contributions', fontsize=12)
    ax.legend(loc='upper left', fontsize=9)
    ax.set_xlabel('Week')
    ax.set_ylabel('Sales Contribution')
    ax.grid(True, alpha=0.3)
    
    ax = axes[1]
    ax.plot(weeks, total_sales, 'b-', linewidth=2, label='Actual Sales')
    if 'Predicted_Sales' in contributions:
        ax.plot(weeks, contributions['Predicted_Sales'], 'r--', linewidth=2,
                label='Predicted (Base + Media + Controls)', alpha=0.8)
        # Add R² annotation
        ss_res = np.sum((total_sales - contributions['Predicted_Sales'])**2)
        ss_tot = np.sum((total_sales - total_sales.mean())**2)
        r2 = 1 - ss_res / (ss_tot + 1e-10)
        mape = np.mean(np.abs(total_sales - contributions['Predicted_Sales']) / (np.abs(total_sales) + 1e-10)) * 100
        ax.text(0.02, 0.95, f'R² = {r2:.4f}\nMAPE = {mape:.1f}%',
                transform=ax.transAxes, fontsize=11, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
    ax.set_title('Total Sales: Actual vs Predicted Decomposition', fontsize=12)
    ax.legend(fontsize=10)
    ax.set_xlabel('Week')
    ax.set_ylabel('Sales')
    ax.grid(True, alpha=0.3)
    
    plt.suptitle('MMM-Diffusion v2: Sales Decomposition', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"Sales decomposition plot saved to {save_path}")


# =============================================================================
# 9. MAIN
# =============================================================================

def main():
    import time
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"=" * 70)
    print(f"MMM-DIFFUSION v2: Fixed Sales Alignment + Coefficient Dynamics")
    print(f"Device: {device}")
    print(f"=" * 70)
    
    # ---- Step 1: Generate synthetic data ----
    print("\n[1/5] Generating synthetic MMM data (higher volatility)...")
    t0 = time.time()
    
    gen = MMMDataGenerator(n_weeks=104, seed=42)
    n_train = 800
    n_val = 50
    
    train_samples = gen.generate_dataset(n_train)
    val_samples = gen.generate_dataset(n_val)
    
    print(f"  Generated {n_train} train + {n_val} val scenarios")
    print(f"  Each: {gen.n_weeks} weeks, {gen.n_media} media + {gen.n_ctrl} control vars")
    print(f"  Time: {time.time()-t0:.1f}s")
    
    # Quick audit
    sample = train_samples[0]
    coeffs = sample['true_coefficients']
    print(f"\n  Data audit:")
    print(f"    Media spend shape: {sample['media_spend'].shape}")
    print(f"    Media coeff range: [{coeffs[:,:5].min():.4f}, {coeffs[:,:5].max():.4f}]")
    print(f"    Media coeff std per channel: {coeffs[:,:5].std(axis=0).round(4)}")
    print(f"    Ctrl coeff range: [{coeffs[:,5:].min():.2f}, {coeffs[:,5:].max():.2f}]")
    print(f"    All media coeffs positive: {(coeffs[:,:5] > 0).all()}")
    print(f"    Sales range: [{sample['total_sales'].min():.1f}, {sample['total_sales'].max():.1f}]")
    
    # Check temporal variation (std of first-differences)
    media_diffs = np.diff(coeffs[:, :5], axis=0)
    print(f"    Media coeff Δ std (temporal variation): {media_diffs.std(axis=0).round(5)}")
    
    # ---- Step 2: Create datasets ----
    print("\n[2/5] Creating training datasets...")
    train_dataset = MMMDiffusionDataset(train_samples, normalize=True)
    val_dataset = MMMDiffusionDataset(val_samples, normalize=True)
    
    item = train_dataset[0]
    print(f"  Conditioning shape: {item['conditioning'].shape}")
    print(f"  Coefficients shape: {item['coefficients'].shape}")
    print(f"  Transformed media shape: {item['transformed_media'].shape}")
    
    # ---- Step 3: Build model ----
    print("\n[3/5] Building MMM-Diffusion v2 model...")
    
    T_DIFF = 500
    
    model = MMMDiffusionModel(
        n_media=5, n_ctrl=3,
        d_model_campaign=192,
        d_model_channel=256,
        n_layers_campaign=4,
        n_layers_channel=6,
        T_diff=T_DIFF,
    )
    
    total_params = sum(p.numel() for p in model.parameters())
    print(f"  Total parameters: {total_params:,}")
    print(f"  Campaign denoiser: {sum(p.numel() for p in model.campaign_denoiser.parameters()):,}")
    print(f"  Channel denoiser: {sum(p.numel() for p in model.channel_denoiser.parameters()):,}")
    print(f"  Diffusion steps: {T_DIFF}")
    
    # ---- Step 4: Train ----
    print("\n[4/5] Training...")
    
    N_EPOCHS = 150
    BATCH_SIZE = 32 if device == 'cuda' else 8
    LR = 3e-4
    
    history = train_mmm_diffusion(
        model, train_dataset,
        n_epochs=N_EPOCHS,
        batch_size=BATCH_SIZE,
        lr=LR,
        device=device,
        log_every=25,
        save_path='/app/mmm_diffusion_model_v2.pt',
    )
    
    plot_training_history(history, save_path='/app/training_history.png')
    
    # ---- Step 5: Validate ----
    print("\n[5/5] Validation...")
    
    # Load best model
    best_ckpt = torch.load('/app/mmm_diffusion_model_v2_best.pt', map_location=device, weights_only=False)
    model.load_state_dict(best_ckpt['model_state_dict'])
    model.eval()
    model = model.to(device)
    
    # Evaluate on multiple validation samples
    all_corrs = []
    for val_idx in range(min(10, len(val_samples))):
        val_item = val_dataset[val_idx]
        cond = val_item['conditioning'].unsqueeze(0).to(device)
        true_coeffs_norm = val_item['coefficients'].unsqueeze(0)
        
        pred_coeffs_norm = model.sample(cond, n_steps=T_DIFF, constraint_every_k=5)
        
        pred_coeffs = val_dataset.decode_coefficients(pred_coeffs_norm.cpu())
        true_coeffs = val_dataset.decode_coefficients(true_coeffs_norm)
        
        pred_np = pred_coeffs[0].numpy()
        true_np = true_coeffs[0].numpy()
        
        sample_corrs = []
        for i in range(8):
            corr = np.corrcoef(true_np[:, i], pred_np[:, i])[0, 1]
            sample_corrs.append(corr)
        all_corrs.append(sample_corrs)
    
    all_corrs = np.array(all_corrs)
    channel_names = MMMDataGenerator.MEDIA_CHANNELS + MMMDataGenerator.CONTROL_VARS
    
    print(f"\n  Average per-channel correlation (over {len(all_corrs)} val samples):")
    for i, name in enumerate(channel_names):
        mean_corr = np.nanmean(all_corrs[:, i])
        std_corr = np.nanstd(all_corrs[:, i])
        print(f"    {name:20s}: corr={mean_corr:.3f} ± {std_corr:.3f}")
    
    # Detailed analysis on first sample
    val_item = val_dataset[0]
    cond = val_item['conditioning'].unsqueeze(0).to(device)
    true_coeffs_norm = val_item['coefficients'].unsqueeze(0)
    
    pred_coeffs_norm = model.sample(cond, n_steps=T_DIFF, constraint_every_k=5)
    pred_coeffs = val_dataset.decode_coefficients(pred_coeffs_norm.cpu())
    true_coeffs = val_dataset.decode_coefficients(true_coeffs_norm)
    
    pred_np = pred_coeffs[0].numpy()
    true_np = true_coeffs[0].numpy()
    
    print(f"\n  Constraint check (sample 0):")
    print(f"    Media coefficients all positive: {(pred_np[:, :5] > 0).all()}")
    print(f"    Media coeff range: [{pred_np[:,:5].min():.6f}, {pred_np[:,:5].max():.6f}]")
    
    # Check temporal variation of predictions vs GT
    pred_media_diffs = np.diff(pred_np[:, :5], axis=0)
    true_media_diffs = np.diff(true_np[:, :5], axis=0)
    print(f"\n  Temporal variation (std of first-differences):")
    print(f"    GT media Δ std:   {true_media_diffs.std(axis=0).round(5)}")
    print(f"    Pred media Δ std: {pred_media_diffs.std(axis=0).round(5)}")
    ratio = pred_media_diffs.std(axis=0) / (true_media_diffs.std(axis=0) + 1e-10)
    print(f"    Ratio pred/GT:    {ratio.round(3)} (want close to 1.0)")
    
    # Plot
    plot_coefficient_comparison(
        true_np, pred_np, channel_names,
        save_path='/app/coeff_comparison.png'
    )
    
    # Sales decomposition with proper transformed media
    val_raw = val_samples[0]
    contributions = decompose_sales(
        pred_np, val_raw['media_spend'], val_raw['controls'],
        transformed_media=val_raw['transformed_media'],
        base_sales=val_raw['base_sales'],
    )
    
    # Also compute GT decomposition for comparison
    gt_contributions = decompose_sales(
        true_np, val_raw['media_spend'], val_raw['controls'],
        transformed_media=val_raw['transformed_media'],
        base_sales=val_raw['base_sales'],
    )
    
    plot_sales_decomposition(
        contributions, val_raw['total_sales'],
        save_path='/app/sales_decomposition.png'
    )
    
    # Sales alignment metrics
    pred_sales = contributions['Predicted_Sales']
    actual_sales = val_raw['total_sales']
    ss_res = np.sum((actual_sales - pred_sales)**2)
    ss_tot = np.sum((actual_sales - actual_sales.mean())**2)
    r2 = 1 - ss_res / (ss_tot + 1e-10)
    mape = np.mean(np.abs(actual_sales - pred_sales) / (np.abs(actual_sales) + 1e-10)) * 100
    
    print(f"\n  Sales Alignment:")
    print(f"    R² = {r2:.4f}")
    print(f"    MAPE = {mape:.1f}%")
    print(f"    Actual sales range: [{actual_sales.min():.0f}, {actual_sales.max():.0f}]")
    print(f"    Predicted sales range: [{pred_sales.min():.0f}, {pred_sales.max():.0f}]")
    
    print(f"\n{'='*70}")
    print(f"MMM-DIFFUSION v2 COMPLETE")
    print(f"{'='*70}")
    print(f"  Fixes applied:")
    print(f"    1. Sales reconstruction loss (L_sales) — aligns predicted with actual sales")
    print(f"    2. Spectral loss (L_spectral) — preserves frequency content, fights smoothing")
    print(f"    3. Multi-scale temporal loss — matches velocity AND acceleration")
    print(f"    4. Contribution matching loss — aligns channel-level decomposition")
    print(f"    5. Higher GT coefficient volatility with regime jumps")
    print(f"    6. Low-noise biased timestep sampling for better detail learning")
    print(f"    7. Reduced smoothness weight (0.1 → 0.05)")
    print(f"  Final training loss: {history['loss'][-1]:.4f}")
    print(f"  Sales R²: {r2:.4f}")
    
    return model, history, train_dataset, val_dataset


if __name__ == '__main__':
    model, history, train_dataset, val_dataset = main()