Kernels
File size: 10,379 Bytes
82f6f0e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# SPDX-License-Identifier: Apache-2.0
# MegaBlocks CPU Fused MoE Implementation
#
# This is a pure Python/PyTorch implementation for CPU.
# For better performance, consider using the C++ kernel implementation.
#
import torch
import torch.nn.functional as F


def swigluoai_activation(
    gate: torch.Tensor,
    up: torch.Tensor,
    alpha: float = 1.702,
    limit: float = 7.0,
) -> torch.Tensor:
    """
    Clamped SwiGLU variant ("SwigluOAI") used in GptOss models.

    The gate is bounded from above only, the up projection is bounded on
    both sides, and the result is::

        (clamp(up, -limit, limit) + 1) * g * sigmoid(alpha * g)

    where ``g = clamp(gate, max=limit)``.

    Args:
        gate: Output of the gate projection.
        up: Output of the up projection (broadcast-compatible with ``gate``).
        alpha: Multiplier applied to the gate inside the sigmoid.
        limit: Clamp bound (upper bound for ``gate``, symmetric for ``up``).

    Returns:
        The activated tensor.
    """
    bounded_gate = torch.clamp(gate, max=limit)
    bounded_up = torch.clamp(up, min=-limit, max=limit)
    # Sigmoid-gated linear unit on the clamped gate.
    gated = bounded_gate * torch.sigmoid(bounded_gate * alpha)
    # The "+ 1" offsets the up projection before gating.
    return (bounded_up + 1) * gated


def silu_and_mul_activation(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    """
    Gate the up projection with a SiLU (Swish) activated gate.

    Computes ``silu(gate) * up`` element-wise.

    Args:
        gate: Tensor passed through SiLU.
        up: Tensor multiplied with the activated gate.

    Returns:
        The element-wise product ``silu(gate) * up``.
    """
    activated_gate = F.silu(gate)
    return activated_gate * up


def route_tokens_cpu(
    x: torch.Tensor,
    router_weight: torch.Tensor,
    router_bias: torch.Tensor | None,
    moe_top_k: int,
    moe_num_experts: int,
    moe_normalize_expert_weights: int | None = None,
) -> tuple:
    """
    Route tokens to experts and compute expert weights and indices (CPU version).
    
    Args:
        x: Input tensor [batch, seq, hidden] or [tokens, hidden]
        router_weight: Router weight [num_experts, hidden]
        router_bias: Router bias [num_experts] or None
        moe_top_k: Number of experts per token
        moe_num_experts: Total number of experts
        moe_normalize_expert_weights: Normalization order or None
    
    Returns:
        Tuple of (logits, expert_weights, expert_indices)
    """
    x_flat = x.view(-1, x.shape[-1])
    logits = F.linear(x_flat, router_weight, router_bias)
    
    if moe_top_k == 1:
        expert_weights, expert_indices = logits.max(dim=-1, keepdim=True)
    else:
        expert_weights, expert_indices = torch.topk(logits, moe_top_k, dim=-1)
    
    expert_weights = expert_weights.softmax(dim=-1)
    
    if moe_normalize_expert_weights is not None:
        expert_weights = expert_weights / torch.norm(
            expert_weights,
            p=moe_normalize_expert_weights,
            dim=-1,
            keepdim=True,
        )
    
    return logits, expert_weights, expert_indices


def cpu_fused_moe(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    w1_bias: torch.Tensor | None = None,
    w2_bias: torch.Tensor | None = None,
    activation: str = "silu",
    alpha: float = 1.702,
    limit: float = 7.0,
    is_interleaved: bool = True,
) -> torch.Tensor:
    """
    CPU mixture-of-experts MLP built from plain PyTorch operations.

    Experts are visited one at a time in a Python loop. For each expert that
    received at least one (token, top-k slot) assignment, the assigned tokens
    are gathered, run through that expert's two projections as one batched
    matmul pair, scaled by their routing weight and scatter-added into the
    output. A token routed to several experts therefore receives the weighted
    sum of those experts' outputs.

    Args:
        hidden_states: [num_tokens, hidden_size]
        w1: [num_experts, hidden_size, 2*inter_size] - gate_up_proj weights
        w2: [num_experts, inter_size, hidden_size] - down_proj weights
        topk_weights: [num_tokens, topk] - routing weights
        topk_ids: [num_tokens, topk] - expert indices. Ids outside
            ``range(num_experts)`` are never matched and contribute nothing.
        w1_bias: [num_experts, 2*inter_size] or None
        w2_bias: [num_experts, hidden_size] or None
        activation: "swigluoai" selects the clamped SwiGLU variant; any other
            value (including the default "silu") selects SiLU-and-multiply.
        alpha: swigluoai alpha parameter
        limit: swigluoai limit parameter
        is_interleaved: whether gate_up is interleaved [g0,u0,g1,u1,...]
            (True for GptOss) or laid out as [gate_all, up_all].

    Returns:
        output: [num_tokens, hidden_size], same dtype as ``hidden_states``.

    Raises:
        ValueError: if ``hidden_states`` is not 2-dimensional.
    """
    if hidden_states.dim() != 2:
        raise ValueError(
            "hidden_states must be 2-D [num_tokens, hidden_size], "
            f"got shape {tuple(hidden_states.shape)}"
        )

    num_experts = w1.shape[0]
    inter_size = w2.shape[1]

    output = torch.zeros_like(hidden_states)

    for expert_idx in range(num_experts):
        # Row index = token, column index = top-k slot, for every place this
        # expert was selected. A single where() doubles as the emptiness test.
        token_indices, topk_positions = torch.where(topk_ids == expert_idx)
        if token_indices.numel() == 0:
            continue

        # Gather the tokens routed to this expert: [num_selected, hidden_size]
        current_hidden = hidden_states[token_indices]

        # First projection -> [num_selected, 2*inter_size]
        gate_up = current_hidden @ w1[expert_idx]
        if w1_bias is not None:
            gate_up = gate_up + w1_bias[expert_idx]

        # Split the fused projection into its gate and up halves.
        if is_interleaved:
            # GptOss layout: [g0, u0, g1, u1, ...]
            gate = gate_up[..., ::2]
            up = gate_up[..., 1::2]
        else:
            # Standard layout: [gate_all, up_all]
            gate = gate_up[..., :inter_size]
            up = gate_up[..., inter_size:]

        if activation == "swigluoai":
            activated = swigluoai_activation(gate, up, alpha, limit)
        else:
            # Every other activation string falls back to SiLU-and-multiply.
            activated = silu_and_mul_activation(gate, up)

        # Second projection -> [num_selected, hidden_size]
        expert_out = activated @ w2[expert_idx]
        if w2_bias is not None:
            expert_out = expert_out + w2_bias[expert_idx]

        # Per-assignment routing weight, shaped [num_selected, 1] to broadcast.
        weights = topk_weights[token_indices, topk_positions].unsqueeze(-1)
        weighted_out = expert_out * weights

        # index_add_ accumulates, so contributions from several experts to
        # the same token are summed; cast back in case the weights promoted
        # the dtype.
        output.index_add_(0, token_indices, weighted_out.to(output.dtype))

    return output


class MegaBlocksMoeMLP(torch.nn.Module):
    """
    CPU MoE MLP module that can be used as a drop-in replacement for
    the transformers GptOssMLP when using @use_kernel_forward_from_hub.

    The class defines no ``__init__`` and owns no parameters of its own:
    ``forward`` reads everything from ``self.router`` and ``self.experts``,
    which must already be present on the instance it runs on.
    """
    # NOTE(review): presumably read by the kernel-hub integration to signal
    # torch.compile compatibility; nothing in this file consumes it. Confirm.
    can_torch_compile: bool = True

    def forward(self, x: torch.Tensor) -> tuple:
        """
        Forward pass through the MoE layer.

        Reads the routing and expert configuration from ``self.router`` and
        ``self.experts``, routes every token to its top-k experts, and runs
        the CPU fused-MoE computation.

        Args:
            x: Input tensor of shape [batch_size, seq_len, hidden_size] or
                [tokens, hidden_size]. It does not need to be contiguous.

        Returns:
            Tuple of (output, expert_weights) where:
                - output: Tensor of same shape as input
                - expert_weights: Expert weights for each token [tokens, top_k]

        Raises:
            AttributeError: if ``self.experts`` has neither ``gate_up_proj``
                nor ``w1``, or neither ``down_proj`` nor ``w2``.
        """
        # MoE hyper-parameters, with fallbacks when the wrapped modules do
        # not expose them.
        moe_top_k = getattr(self.router, "top_k", 4)
        moe_num_experts = getattr(self.experts, "num_experts", 128)
        moe_normalize_expert_weights = getattr(
            self.experts, "normalize_expert_weights", None
        )

        # Experts carrying both `alpha` and `limit` are treated as SwigluOAI;
        # otherwise use the declared activation with the default constants.
        if hasattr(self.experts, "alpha") and hasattr(self.experts, "limit"):
            activation = "swigluoai"
            alpha = self.experts.alpha
            limit = self.experts.limit
        else:
            activation = getattr(self.experts, "activation", "silu")
            alpha = 1.702
            limit = 7.0

        # Up/gate projection weights: either one fused interleaved tensor, or
        # separate w1 (and optional w3) concatenated along the last dimension.
        if hasattr(self.experts, "gate_up_proj"):
            w1 = self.experts.gate_up_proj
            is_interleaved = True  # GptOss uses interleaved layout
        elif hasattr(self.experts, "w1"):
            w1 = self.experts.w1
            w3 = getattr(self.experts, "w3", None)
            if w3 is not None:
                w1 = torch.cat([w1, w3], dim=-1)
            is_interleaved = False
        else:
            raise AttributeError("experts module must have 'gate_up_proj' or 'w1' attribute")

        # Down projection weights.
        if hasattr(self.experts, "down_proj"):
            w2 = self.experts.down_proj
        elif hasattr(self.experts, "w2"):
            w2 = self.experts.w2
        else:
            raise AttributeError("experts module must have 'down_proj' or 'w2' attribute")

        # Optional bias tensors (None when the experts define none).
        w1_bias = getattr(self.experts, "gate_up_proj_bias", None)
        w2_bias = getattr(self.experts, "down_proj_bias", None)

        in_shape = x.size()

        # Flatten once to [tokens, hidden] and reuse it for both routing and
        # expert compute. reshape() rather than view(), so non-contiguous
        # inputs (e.g. after a transpose) do not raise.
        x_flat = x.reshape(-1, x.shape[-1])

        # The raw router logits are not needed here.
        _, expert_weights, expert_indices = route_tokens_cpu(
            x_flat,
            self.router.weight,
            getattr(self.router, "bias", None),
            moe_top_k,
            moe_num_experts,
            moe_normalize_expert_weights,
        )

        output = cpu_fused_moe(
            hidden_states=x_flat,
            w1=w1,
            w2=w2,
            topk_weights=expert_weights,
            topk_ids=expert_indices,
            w1_bias=w1_bias,
            w2_bias=w2_bias,
            activation=activation,
            alpha=alpha,
            limit=limit,
            is_interleaved=is_interleaved,
        )

        # Restore the caller's original shape.
        output = output.reshape(in_shape)

        return output, expert_weights


# Public API: the names exported by a star-import of this module.
__all__ = [
    "MegaBlocksMoeMLP",
    "cpu_fused_moe",
    "route_tokens_cpu",
    "swigluoai_activation",
    "silu_and_mul_activation",
]