"""
Neural network layers for TopoHyper.

Implements:
- SimplicialConv: Hodge Laplacian-based convolution on simplicial complexes
- HypergraphConv: Spectral convolution on hypergraphs
- CrossStructureAttention: Attention-gated fusion between simplicial and hypergraph views
- TopoHyperConv: Three-phase integrated convolution layer
- TopoHyperPool: Graph-level readout with both structures
- GCNLayer, GATLayer: Baselines
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class SimplicialConv(nn.Module):
    """
    Simplicial convolution using the unsigned, row-normalized Hodge Laplacian.

    Implements: X' = L_norm X W + X W_self
    where L = |B1| |B1|^T and L_norm = L / rowsum(L).

    Uses |B1| (unsigned) for compatibility with HGNN's non-negative space.
    NOTE(review): if B1 is a node-by-edge incidence matrix with +/-1 entries,
    |B1| |B1|^T equals D + A (degree on the diagonal, adjacency elsewhere),
    not the signed Laplacian B1 B1^T -- confirm against SimplicialComplex.

    No activation is applied inside this layer; the caller is expected to
    apply its own non-linearity to the returned tensor.
    """

    def __init__(self, in_dim, out_dim):
        """
        Args:
            in_dim: size of each input node feature vector
            out_dim: size of each output node feature vector
        """
        super().__init__()
        # Weight applied to the neighborhood-aggregated features.
        self.W = nn.Linear(in_dim, out_dim, bias=False)
        # Separate weight (with bias) applied to each node's own features.
        self.W_self = nn.Linear(in_dim, out_dim, bias=True)

    def forward(self, x, sc):
        """
        Args:
            x: (N, in_dim) node features
            sc: object exposing a ``B1`` tensor with N rows
                (a SimplicialComplex in this project)
        Returns:
            (N, out_dim) updated features (pre-activation)
        """
        # Match both device AND dtype of the features: B1 is built once by the
        # complex and may be stored in a different precision than the model
        # runs in, which would make the matmul below raise.
        B1_abs = sc.B1.abs().to(device=x.device, dtype=x.dtype)

        # L0_unsigned = |B1| |B1|^T, shape (N, N)
        L0 = B1_abs @ B1_abs.t()

        # Row-normalize; the clamp keeps all-zero rows from dividing by zero.
        D = L0.sum(dim=1, keepdim=True).clamp(min=1e-7)
        L0_norm = L0 / D

        # Convolution: aggregated neighborhood term plus self term.
        h_neigh = L0_norm @ x
        return self.W(h_neigh) + self.W_self(x)


class HypergraphConv(nn.Module):
    """
    Hypergraph convolution using spectral propagation.

    Implements: X' = P X Theta + b
    where P is whatever ``hg.propagation_matrix()`` returns.
    NOTE(review): presumably P = D_v^{-1/2} H W D_e^{-1} H^T D_v^{-1/2}
    (the HGNN operator) -- confirm against the Hypergraph implementation.

    No activation is applied inside this layer; the caller is expected to
    apply its own non-linearity to the returned tensor.
    """

    def __init__(self, in_dim, out_dim):
        """
        Args:
            in_dim: size of each input node feature vector
            out_dim: size of each output node feature vector
        """
        super().__init__()
        self.theta = nn.Linear(in_dim, out_dim, bias=True)

    def forward(self, x, hg):
        """
        Args:
            x: (N, in_dim) node features
            hg: object exposing ``propagation_matrix()`` returning a matrix
                that can left-multiply ``x`` (a Hypergraph in this project)
        Returns:
            (N, out_dim) updated features (pre-activation)
        """
        # Match both device AND dtype of the features: the propagation matrix
        # is produced by the hypergraph object and may not share the model's
        # precision, which would make the matmul below raise.
        P = hg.propagation_matrix().to(device=x.device, dtype=x.dtype)
        return self.theta(P @ x)


class CrossStructureAttention(nn.Module):
    """
    Per-node gated blend of the simplicial and hypergraph embeddings.

    A small MLP looks at both embeddings of a node and emits one scalar
    alpha_i in [0, 1] (sigmoid output). The result is the convex combination
    alpha_i * h_sc_i + (1 - alpha_i) * h_hg_i.
    """

    def __init__(self, dim):
        """
        Args:
            dim: feature size shared by both input views
        """
        super().__init__()
        gate_layers = [
            nn.Linear(2 * dim, dim),
            nn.ReLU(),
            nn.Linear(dim, 1),
            nn.Sigmoid(),
        ]
        self.gate = nn.Sequential(*gate_layers)

    def forward(self, h_sc, h_hg):
        """
        Args:
            h_sc: (N, dim) simplicial features
            h_hg: (N, dim) hypergraph features
        Returns:
            (N, dim) fused features
        """
        # The gate sees both views side by side along the feature axis.
        stacked = torch.cat((h_sc, h_hg), dim=-1)
        weight_sc = self.gate(stacked)  # (N, 1), broadcast over features
        weight_hg = 1 - weight_sc
        return weight_sc * h_sc + weight_hg * h_hg


class TopoHyperConv(nn.Module):
    """
    Three-phase TopoHyper convolution layer.

    Phase 1: Simplicial convolution via unsigned Hodge Laplacian, then ReLU
    Phase 2: Hypergraph convolution via spectral propagation, then ReLU
    Phase 3: Cross-structure fusion with attention gate + bridge matrix

    The bridge matrix B = A_sc * A_hg (element-wise product) is non-zero only
    for node pairs connected in BOTH views, enabling topological constraints
    to inform hypergraph propagation and vice versa. The output is
    layer-normalized.
    """

    def __init__(self, in_dim, out_dim, use_bridge=True, use_attention=True):
        """
        Args:
            in_dim: size of each input node feature vector
            out_dim: size of each output node feature vector
            use_bridge: add the residual term propagated through the bridge
            use_attention: fuse the two views with a learned per-node gate;
                when False the views are simply averaged
        """
        super().__init__()
        self.use_bridge = use_bridge
        self.use_attention = use_attention

        self.sc_conv = SimplicialConv(in_dim, out_dim)
        self.hg_conv = HypergraphConv(in_dim, out_dim)

        # Optional sub-modules are only registered when enabled, so the
        # state_dict differs between configurations.
        if use_attention:
            self.attention = CrossStructureAttention(out_dim)

        if use_bridge:
            self.bridge_proj = nn.Linear(out_dim, out_dim, bias=False)

        self.norm = nn.LayerNorm(out_dim)

    def forward(self, x, sc, hg):
        """
        Args:
            x: (N, in_dim) node features
            sc: SimplicialComplex (must provide what SimplicialConv reads and
                an ``adjacency_matrix()`` method when the bridge is enabled)
            hg: Hypergraph (must provide what HypergraphConv reads and an
                ``adjacency_matrix()`` method when the bridge is enabled)
        Returns:
            (N, out_dim) updated features
        """
        # Phase 1: Simplicial convolution
        h_sc = F.relu(self.sc_conv(x, sc))

        # Phase 2: Hypergraph convolution
        h_hg = F.relu(self.hg_conv(x, hg))

        # Phase 3: Fusion
        if self.use_attention:
            h_fused = self.attention(h_sc, h_hg)
        else:
            h_fused = (h_sc + h_hg) / 2.0

        # Bridge: propagate through shared structure
        if self.use_bridge:
            # Cast to the dtype of the fused features as well as the device:
            # the adjacency matrices come from the structure objects and may
            # be stored in another precision (or as bool/int), which would
            # break the normalization and the matmul below.
            A_sc = sc.adjacency_matrix().to(device=x.device, dtype=h_fused.dtype)
            A_hg = hg.adjacency_matrix().to(device=x.device, dtype=h_fused.dtype)
            bridge = A_sc * A_hg  # element-wise: nodes connected in BOTH views

            # Row-normalize; the clamp leaves rows without shared neighbors
            # as all-zero instead of dividing by zero.
            D_bridge = bridge.sum(dim=1, keepdim=True).clamp(min=1e-7)
            bridge_norm = bridge / D_bridge

            # Residual update from neighbors shared by both structures.
            h_bridge = bridge_norm @ h_fused
            h_fused = h_fused + self.bridge_proj(h_bridge)

        return self.norm(h_fused)


class TopoHyperPool(nn.Module):
    """
    Graph-level readout: concatenation of mean, max and attention-weighted
    sum pooling over the node axis.
    """

    def __init__(self, dim):
        """
        Args:
            dim: node feature size
        """
        super().__init__()
        # Scores each node with a single scalar used for attention pooling.
        self.att = nn.Linear(dim, 1)

    def forward(self, x):
        """
        Args:
            x: (N, dim) node features
        Returns:
            (3*dim,) graph-level feature vector laid out as [mean | max | attention]
        """
        # Softmax across nodes (dim 0) so the weights of one graph sum to 1.
        node_scores = self.att(x)
        node_weights = torch.softmax(node_scores, dim=0)

        readouts = (
            x.mean(dim=0),
            x.max(dim=0).values,
            (node_weights * x).sum(dim=0),
        )
        return torch.cat(readouts)


# ==================== Baseline Layers ====================

class GCNLayer(nn.Module):
    """Standard GCN layer: X' = D^{-1/2} (A + I) D^{-1/2} X W + b

    The adjacency is built densely from ``edge_index`` on every call, with
    exactly one self-loop per node. D is the row-sum degree of (A + I).
    """

    def __init__(self, in_dim, out_dim):
        """
        Args:
            in_dim: size of each input node feature vector
            out_dim: size of each output node feature vector
        """
        super().__init__()
        self.W = nn.Linear(in_dim, out_dim, bias=True)

    def forward(self, x, edge_index):
        """
        Args:
            x: (N, in_dim) node features
            edge_index: (2, E) integer tensor; column k marks the entry
                A[edge_index[0, k], edge_index[1, k]]. Duplicate edges
                collapse to a single entry. Edges are used as given and are
                not symmetrized.
        Returns:
            (N, out_dim) updated features
        """
        N = x.shape[0]
        device = x.device

        # Build the adjacency in the dtype of the features so the layer also
        # works when the model runs in double/half precision.
        A = torch.zeros(N, N, device=device, dtype=x.dtype)
        A[edge_index[0], edge_index[1]] = 1.0

        # Add self-loops, capping at 1 so a self-loop that is already listed
        # in edge_index is not counted twice on the diagonal.
        A = (A + torch.eye(N, device=device, dtype=x.dtype)).clamp(max=1.0)

        # Symmetric normalization by broadcasting instead of materializing a
        # dense diagonal matrix and doing two N x N matmuls. Every row holds
        # its self-loop, so the row sum is >= 1 and needs no epsilon.
        d_inv_sqrt = 1.0 / A.sum(dim=1).sqrt()
        A_norm = d_inv_sqrt.unsqueeze(1) * A * d_inv_sqrt.unsqueeze(0)

        return self.W(A_norm @ x)


class GATLayer(nn.Module):
    """Graph Attention Network layer (dense, multi-head, heads concatenated).

    Each node i attends over the nodes j with an entry (i, j) in
    ``edge_index`` plus itself. The per-head logit is
    e_ij = LeakyReLU(a_src . h_i + a_dst . h_j), normalized with a softmax
    over j. Attention is materialized as a dense (N, N, heads) tensor.
    """

    def __init__(self, in_dim, out_dim, heads=4):
        """
        Args:
            in_dim: size of each input node feature vector
            out_dim: total output size across all heads; must be divisible
                by ``heads``
            heads: number of attention heads
        """
        super().__init__()
        assert out_dim % heads == 0, "out_dim must be divisible by heads"
        self.heads = heads
        self.head_dim = out_dim // heads

        self.W = nn.Linear(in_dim, out_dim, bias=False)
        self.a_src = nn.Parameter(torch.randn(heads, self.head_dim))
        self.a_dst = nn.Parameter(torch.randn(heads, self.head_dim))
        self.bias = nn.Parameter(torch.zeros(out_dim))

        nn.init.xavier_uniform_(self.W.weight)
        # Initialize through a (1, heads, head_dim) view so the Xavier fan
        # computation sees a 3-D shape; the in-place init writes through to
        # the underlying parameters.
        nn.init.xavier_normal_(self.a_src.unsqueeze(0))
        nn.init.xavier_normal_(self.a_dst.unsqueeze(0))

    def forward(self, x, edge_index):
        """
        Args:
            x: (N, in_dim) node features
            edge_index: (2, E) integer tensor; column k lets node
                edge_index[0, k] attend to node edge_index[1, k]
        Returns:
            (N, out_dim) updated features (heads concatenated, bias added)
        """
        N = x.shape[0]
        device = x.device

        h = self.W(x).view(N, self.heads, self.head_dim)

        # Build adjacency with self-loops (only used as a mask, so a diagonal
        # value of 2 from a pre-existing self-loop is harmless).
        A = torch.zeros(N, N, device=device)
        A[edge_index[0], edge_index[1]] = 1.0
        A = A + torch.eye(N, device=device)

        # Per-node, per-head attention scores
        e_src = (h * self.a_src.unsqueeze(0)).sum(-1)  # (N, heads)
        e_dst = (h * self.a_dst.unsqueeze(0)).sum(-1)  # (N, heads)

        # e_ij = LeakyReLU(e_src_i + e_dst_j): broadcast (N, 1, heads) against
        # (1, N, heads) so the result is indexed [i, j, head].
        attn = F.leaky_relu(e_src.unsqueeze(1) + e_dst.unsqueeze(0), 0.2)  # (N, N, heads)

        # Mask non-edges; the (N, N, 1) mask broadcasts across heads.
        attn = attn.masked_fill((A == 0).unsqueeze(-1), float('-inf'))

        # Normalize over the neighbor axis j, the axis the aggregation sums
        # over. The self-loop guarantees at least one finite logit per row,
        # so the softmax cannot produce NaN from an all-masked row.
        attn = F.softmax(attn, dim=1)

        # Aggregate: out[i, head] = sum_j attn[i, j, head] * h[j, head]
        out = torch.einsum('ijh,jhd->ihd', attn, h)
        out = out.reshape(N, -1) + self.bias

        return out