Upload the code and the weights

dc84c90 verified about 2 months ago

19.4 kB


	from gc import enable

	from functorch import dim
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import numpy as np
	from torch.utils.data import Dataset, DataLoader

	class SwiGLU(nn.Module):
	    """Gated SiLU activation that halves the last dimension of its input.

	    The last axis is split in two with ``chunk``: the first piece is the
	    value path, the second piece goes through SiLU and acts as the gate.
	    """

	    def __init__(self):
	        super().__init__()
	        # nn.SiLU is PyTorch's name for the Swish activation.
	        self.silu = nn.SiLU()

	    def forward(self, x):
	        """Return ``value * silu(gate)`` for the two chunks of ``x``."""
	        value, gate = torch.chunk(x, 2, dim=-1)
	        return value * self.silu(gate)

	class GatedResidual(nn.Module):
	def __init__(self, dim):
	super().__init__()
	# This layer looks at both the current state and the new info
	# to decide what to keep.
	#layernorm
	self.gate_layer = nn.Linear(dim * 2, dim)
	self.output_norm = nn.RMSNorm(dim)
	self.gate_x = nn.Linear(dim, dim, bias=False)
	self.gate_r = nn.Linear(dim, dim, bias=False)

	def forward(self, x, residual):
	"""
	x: The new information (e.g., from Attention)
	residual: The current memory/state (the 'highway')
	"""
	# 1. Concatenate them and calculate the 'Valve' (0 to 1)


	# 2. The 'Convex Combination' - pure stability
	# If gate is 0, we keep only the old memory.
	# If gate is 1, we take only the new info.
	gate = torch.sigmoid(self.gate_x(x) + self.gate_r(residual)) # no cat needed
	mixed = (1 - gate) * residual + gate * x
	# 3. Final cleanup
	return self.output_norm(mixed)

	def rotate_half(x):
	    """Swap the two halves of the last axis, negating the moved second half.

	    For a last axis laid out as ``(a, b)`` the result is ``(-b, a)``.
	    """
	    half = x.shape[-1] // 2
	    front = x[..., :half]
	    back = x[..., half:]
	    return torch.cat([-back, front], dim=-1)

	class RoPE(nn.Module):
	    """Rotary position embedding backed by cached cos/sin tables.

	    Args:
	        head_dim: width used to derive the inverse frequencies.
	        max_seq_len: number of positions precomputed at construction.
	        base: base of the geometric frequency progression.
	    """

	    def __init__(self, head_dim, max_seq_len=2048, base=10000):
	        super().__init__()
	        inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
	        self.register_buffer("inv_freq", inv_freq, persistent=False)
	        self.max_seq_len = max_seq_len
	        self._build_cache(max_seq_len)

	    def _build_cache(self, seq_len):
	        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1``."""
	        t = torch.arange(seq_len, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
	        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
	        # Frequencies are duplicated so the table lines up with rotate_half,
	        # which pairs element i with element i + width/2.
	        emb = torch.cat((freqs, freqs), dim=-1)
	        # Leading singleton axes let the tables broadcast over the first
	        # two axes of the input.
	        self.register_buffer("cos_cached", emb.cos()[None, None, :, :], persistent=False)
	        self.register_buffer("sin_cached", emb.sin()[None, None, :, :], persistent=False)
	        # Record how many positions the tables now cover. Without this,
	        # forward() would rebuild the cache on every call whose seq_len
	        # exceeds the length given at construction.
	        self.max_seq_len = seq_len

	    def forward(self, x, seq_len):
	        """Rotate ``x`` by position; ``seq_len`` is the number of positions used.

	        The cache grows on demand when ``seq_len`` exceeds what is stored.
	        """
	        if seq_len > self.max_seq_len:
	            self._build_cache(seq_len)
	        # NOTE(review): when x is narrower than the cached table, only the
	        # leading columns are used -- confirm this matches the frequency
	        # layout the caller expects.
	        cos = self.cos_cached[:, :, :seq_len, :x.shape[-1]]
	        sin = self.sin_cached[:, :, :seq_len, :x.shape[-1]]
	        return (x * cos) + (rotate_half(x) * sin)

	class RoPEAttention(nn.Module):
	def __init__(self, dim, heads, kv_heads=None, bottleneck=256):
	super().__init__()

	self.dim = dim
	self.bottleneck = bottleneck
	self.heads = heads
	self.kv_heads = kv_heads or heads

	assert heads % self.kv_heads == 0

	self.head_dim = bottleneck // heads

	# single latent projection
	self.latent = nn.Linear(dim, bottleneck, bias=False)

	self.q_proj = nn.Linear(bottleneck, bottleneck, bias=False)
	self.k_proj = nn.Linear(bottleneck, self.kv_heads * self.head_dim, bias=False)
	self.v_proj = nn.Linear(bottleneck, self.kv_heads * self.head_dim, bias=False)

	self.out_proj = nn.Linear(bottleneck, dim, bias=False)

	def forward(self, x, rope):
	x = self.latent(x)

	b, t, _ = x.shape

	q = self.q_proj(x).view(b, t, self.heads, self.head_dim).transpose(1, 2)
	k = self.k_proj(x).view(b, t, self.kv_heads, self.head_dim).transpose(1, 2)
	v = self.v_proj(x).view(b, t, self.kv_heads, self.head_dim).transpose(1, 2)

	split_size = self.head_dim // 2


	q_c, q_p = q.split([split_size, split_size], dim=-1)
	k_c, k_p = k.split([split_size, split_size], dim=-1)

	# Apply RoPE ONLY to the Position half
	q_p = rope(q_p, t)
	k_p = rope(k_p, t)

	q = torch.cat([q_c, q_p], dim=-1)
	k = torch.cat([k_c, k_p], dim=-1)

	out = F.scaled_dot_product_attention(
	q, k, v,
	enable_gqa=True,
	is_causal=True
	)

	out = out.transpose(1, 2).contiguous().view(b, t, self.bottleneck)
	return self.out_proj(out)

	class StreamDataset(Dataset):
	    """Map-style dataset of next-token pairs over a flat uint16 token file.

	    The file is memory-mapped and cut into non-overlapping windows of
	    ``seq_len + 1`` tokens; tokens left over at the end that do not fill a
	    whole window are dropped.

	    Args:
	        bin_file: path of the raw token file.
	        seq_len: length of each returned input/target sequence.
	    """

	    def __init__(self, bin_file, seq_len):
	        # dtype MUST match what was used in tofile()
	        self.data = np.memmap(bin_file, dtype=np.uint16, mode='r')
	        self.seq_len = seq_len
	        # seq_len + 1 tokens are needed to form one (input, target) pair
	        self.n_samples = len(self.data) // (seq_len + 1)

	    def __len__(self):
	        return self.n_samples

	    def __getitem__(self, idx):
	        """Return ``(x, y)`` where ``y`` is ``x`` shifted one token ahead.

	        Negative indices count from the end.

	        Raises:
	            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
	        """
	        if idx < 0:
	            idx += self.n_samples
	        # Without this check an out-of-range index silently produced empty
	        # or truncated tensors, and index-based iteration never stopped.
	        if not 0 <= idx < self.n_samples:
	            raise IndexError(
	                f"index {idx} out of range for dataset of {self.n_samples} samples"
	            )

	        start = idx * (self.seq_len + 1)
	        end = start + self.seq_len + 1
	        # Copy the window out of the read-only memmap as int64 once; both
	        # returned tensors are then independent copies.
	        window = np.array(self.data[start:end], dtype=np.int64)
	        x = torch.tensor(window[:-1], dtype=torch.long)
	        y = torch.tensor(window[1:], dtype=torch.long)
	        return x, y

	class FastMemoryCell(nn.Module):
	"""
	Drop-in replacement for memory1 + GatedResidual.

	Speed over your original:
	Original: 4 matmuls (gate_x, gate_r x2 GR calls) + lin1 = 5 total
	This: 1 matmul (fused_proj) = 5x fewer weight multiplications

	Quality over a vanilla GRU:
	- Bidirectional: returns BOTH a new hidden state AND a context vector
	- Reset gate (like GRU) for selective forgetting
	- Shared candidate prevents the two gate paths from fighting each other
	- RMSNorm instead of LayerNorm (no mean subtraction = ~20% faster norm)
	"""

	def __init__(self, dim: int):
	super().__init__()

	# ONE big Linear replaces:
	# self.GR.gate_x (dim -> dim)
	# self.GR.gate_r (dim -> dim)
	# self.GR1.gate_x (dim -> dim)
	# self.GR1.gate_r (dim -> dim)
	# self.lin1 (dim -> dim)
	#
	# In C++ terms: instead of 4 small GEMM calls,
	# we do 1 large GEMM — GPU loves wide matmuls.
	self.fused_proj = nn.Linear(dim * 2, dim * 3, bias=True)

	# RMSNorm: skips mean subtraction vs LayerNorm, ~20% faster
	# Requires PyTorch >= 2.1. Fall back to nn.LayerNorm if needed.
	self.norm = nn.RMSNorm(dim)

	def forward(self, x: torch.Tensor, state: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
	# Single cat + single matmul for ALL gate logic
	# Shape: [batch, dim*3]
	proj = self.fused_proj(torch.cat([x, state], dim=-1))

	# Slice into 3 equal parts along last dim — no memory copy, just views
	g_update, g_reset, g_context = proj.chunk(3, dim=-1)

	g_update = g_update.sigmoid() # How much new info enters the state
	g_reset = g_reset.sigmoid() # GRU-style: what old state to use for candidate
	g_context = g_context.sigmoid() # How much updated state leaks into context output

	# GRU-style candidate: reset gate filters what old state matters
	candidate = (g_reset * state).tanh()

	# --- Two outputs, shared computation ---

	# 1. New hidden state (equivalent to your: x = GR(input, state) → lin1)
	new_state = (1.0 - g_update) * state + g_update * candidate

	# 2. Context vector (equivalent to your: w = GR(state, input))
	# Blends raw input with the freshly updated state
	context = (1.0 - g_context) * x + g_context * new_state

	return self.norm(new_state), context

	#My rms norm implementation. This upcasts to fp32, then casts to whatever the input dtype was.
	class RMSNorm(nn.Module):
	def __init__(self, dim, eps=1e-5):
	super().__init__()
	self.eps = eps
	self.weight = nn.Parameter(torch.ones(dim)) # stays float32 always

	def forward(self, x):
	# Upcast input to float32 for norm (more numerically stable anyway)
	# then cast result back to whatever dtype x was
	return torch.rms_norm(x.to(self.weight.dtype), x.shape[-1:], self.weight, self.eps)

	class FlashCrossAttention(nn.Module):
	"""
	Drop-in replacement for nn.MultiheadAttention that uses
	Flash Attention (O(seq) memory instead of O(seq²)).

	Usage identical to your existing MA1/MA2:
	self.layerMA1 = AI_ex.FlashCrossAttention(dim, heads)
	y, _ = self.layerMA1(query, key, value, attn_mask=mask)

	The attn_mask parameter is accepted but ignored —
	Flash Attention handles causality internally and is
	always more memory efficient than passing an explicit mask.
	"""
	def __init__(self, dim, heads, bottleneck=256):
	super().__init__()
	self.heads = heads
	self.head_dim = dim // heads
	self.dim = dim

	self.q_proj = nn.Linear(dim, dim, bias=False)
	self.k_proj = nn.Linear(dim, dim, bias=False)
	self.v_proj = nn.Linear(dim, dim, bias=False)
	self.out_proj = nn.Linear(dim, dim, bias=False)

	def forward(self, query, key, value, attn_mask=None):
	B, Sq, D = query.shape
	Skv = key.size(1)
	H, Hd = self.heads, self.head_dim

	q = self.q_proj(query).view(B, Sq, H, Hd).transpose(1, 2)
	k = self.k_proj(key).view(B, Skv, H, Hd).transpose(1, 2)
	v = self.v_proj(value).view(B, Skv, H, Hd).transpose(1, 2)

	# Flash Attention — O(seq) memory, same result as standard attention
	out = F.scaled_dot_product_attention(
	q, k, v,
	attn_mask=None, # cross attention doesn't need causal mask
	is_causal=False,
	enable_gqa=True # use GQA optimization for even more speed and memory savings
	)

	out = out.transpose(1, 2).contiguous().view(B, Sq, D)
	return self.out_proj(out), None # None matches nn.MultiheadAttention signature

	class ThinkingRouter(nn.Module):
	    """
	    Produces per-sample expert mixing weights from three summary signals
	    describing the current refinement step, rather than from token content:

	    1. Delta: mean L2 distance between y and y_prev
	    2. Drift: mean L2 distance between y and the linguistic anchor
	    3. Iter:  a learned embedding of the (clamped) iteration index
	    """

	    def __init__(self, dim: int, n_experts: int = 2, max_iter: int = 3):
	        super().__init__()
	        self.n_experts = n_experts
	        self.max_iter = max_iter

	        # `dim` is not used here: the router only sees two scalars plus the
	        # iteration embedding.
	        iter_width = 16
	        hidden = 64

	        # One learned vector per iteration index
	        self.iter_embed = nn.Embedding(max_iter, iter_width)

	        # delta (1) + drift (1) + iteration embedding (16) = 18 inputs.
	        # SwiGLU halves the hidden width, hence hidden // 2 going out.
	        self.router = nn.Sequential(
	            nn.Linear(iter_width + 2, hidden),
	            SwiGLU(),
	            nn.Linear(hidden // 2, n_experts, bias=False),
	        )

	        # Small initial weights keep the logits near zero, so the softmax
	        # starts close to uniform across experts.
	        for linear in (self.router[0], self.router[2]):
	            nn.init.normal_(linear.weight, std=0.01)

	        # Placeholder for a balancing loss; nothing in this class writes it.
	        self.last_weights = None

	    @staticmethod
	    def _relative_distance(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
	        """Mean L2 distance per sample, detached and scaled by the batch mean."""
	        dist = (a - b).norm(dim=-1).mean(dim=-1, keepdim=True).detach()
	        return dist / (dist.mean() + 1e-8)

	    def forward(self,
	                y: torch.Tensor,                  # current hidden state
	                y_prev: torch.Tensor,             # hidden state from last iter
	                linguistic_anchor: torch.Tensor,  # what the input said
	                iter_idx: int                     # which iteration (0-indexed)
	                ) -> torch.Tensor:
	        """
	        Returns routing weights [batch, n_experts].
	        """
	        # Both distance signals are detached, so no gradient reaches y
	        # through the router, and each is divided by its batch mean so the
	        # two are on a comparable scale.
	        delta = self._relative_distance(y, y_prev)             # [batch, 1]
	        drift = self._relative_distance(y, linguistic_anchor)  # [batch, 1]

	        # Accept either a Python int or a scalar tensor; indices past the
	        # embedding table reuse the last row.
	        step = iter_idx if isinstance(iter_idx, int) else iter_idx.item()
	        step = min(step, self.max_iter - 1)

	        step_tensor = torch.as_tensor(step, device=y.device, dtype=torch.long)
	        iter_emb = self.iter_embed(step_tensor)
	        iter_emb = iter_emb[None, :].expand(y.size(0), -1)  # [batch, 16]

	        features = torch.cat([delta, drift, iter_emb], dim=-1)  # [batch, 18]
	        logits = self.router(features)  # [batch, n_experts]

	        # dim=-1 is the expert axis, so each row sums to one.
	        return torch.softmax(logits, dim=-1)


	class MoLLayer(nn.Module):
	    """Dense mixture of SwiGLU feed-forward experts weighted by a ThinkingRouter.

	    Every expert runs on every input; their outputs are summed with the
	    per-sample weights returned by the router.
	    """

	    def __init__(self, dim: int, ffndim: int,
	                 n_experts: int = 2, max_iter: int = 3, bias=True):
	        super().__init__()
	        self.n_experts = n_experts

	        self.router = ThinkingRouter(dim, n_experts, max_iter)

	        expert_list = []
	        for _ in range(n_experts):
	            # The up-projection is twice ffndim wide because SwiGLU halves it.
	            expert_list.append(
	                nn.Sequential(
	                    nn.Linear(dim, ffndim * 2, bias=bias),
	                    SwiGLU(),
	                    nn.Linear(ffndim, dim, bias=bias),
	                )
	            )
	        self.experts = nn.ModuleList(expert_list)

	    def forward(self, x: torch.Tensor, x_prev: torch.Tensor, linguistic_anchor: torch.Tensor, iter_idx: int):
	        """Return the router-weighted sum of all expert outputs for ``x``."""
	        weights = self.router(x, x_prev, linguistic_anchor, iter_idx)

	        # Accumulate in place so the result keeps x's dtype and device.
	        out = torch.zeros_like(x)
	        for gate, expert in zip(weights.unbind(dim=1), self.experts):
	            # gate holds one weight per sample; add two trailing axes so it
	            # broadcasts over the remaining axes of the expert output.
	            out.add_(gate[:, None, None] * expert(x))

	        return out

	class custom_mem(nn.Module):
	    """Fuses the current, previous and anchor states through a gated MLP.

	    Each input is RMS-normalised, the three are concatenated and projected
	    to ``num_heads * head_dim`` channels, passed through a SiLU-gated unit
	    and projected back to ``dim``.

	    Args:
	        dim: width of each of the three inputs and of the output.
	        num_heads: with ``head_dim``, sets the hidden width.
	        head_dim: per-head hidden width; defaults to ``(dim // num_heads) // 2``.
	        dtype: dtype of the Linear weights.
	    """

	    def __init__(self, dim, num_heads, head_dim=None, dtype=torch.bfloat16):
	        super().__init__()
	        self.dim = dim
	        self.heads = num_heads
	        # If head_dim isn't set, default to a light bottleneck
	        self.head_dim = head_dim if head_dim else (dim // num_heads) // 2

	        self.total_mid_dim = self.heads * self.head_dim

	        self.norm_curr = RMSNorm(dim)
	        self.norm_prev = RMSNorm(dim)
	        self.norm_anch = RMSNorm(dim)

	        self.input_proj = nn.Linear(dim * 3, self.total_mid_dim, bias=False, dtype=dtype)

	        self.w12 = nn.Linear(self.total_mid_dim, self.total_mid_dim * 2, bias=False, dtype=dtype)

	        self.w3 = nn.Linear(self.total_mid_dim, dim, bias=False, dtype=dtype)
	        # Not used by forward(), which gates with the FIRST half of w12's
	        # output whereas SwiGLU gates with the second. Kept as an attribute
	        # for compatibility.
	        self.swiglu = SwiGLU()

	    def forward(self, current, previous, anchor):
	        """Return the fused representation of the three inputs (last axis = dim)."""
	        c = self.norm_curr(current)
	        p = self.norm_prev(previous)
	        a = self.norm_anch(anchor)

	        combined = torch.cat([c, p, a], dim=-1)
	        # The norms may return a different dtype than the projection
	        # weights (float32 vs. the default bfloat16). Align them so the
	        # matmul does not raise a dtype-mismatch error outside autocast.
	        combined = combined.to(self.input_proj.weight.dtype)
	        x = self.input_proj(combined)

	        # One projection yields gate and value side by side
	        gate_val = self.w12(x)
	        gate, val = gate_val.chunk(2, dim=-1)

	        x = F.silu(gate) * val

	        # w3 mixes all hidden channels and maps them back to dim
	        return self.w3(x)

	class engram(nn.Module):
	    """Hashed n-gram memory: looks up learned vectors keyed by recent token ids.

	    For every position the trailing ``ngram`` token ids are hashed into
	    ``num_heads`` slots of an embedding table. The retrieved vectors are
	    mixed with softmax gates computed from ``x``, normalised, passed through
	    a small SwiGLU MLP and scaled by a learned sigmoid gate.

	    Args:
	        dim: width of ``x`` and of the memory vectors.
	        num_heads: hash lookups per position (at most 4, the number of salts).
	        ngram: token ids per hash window (at most 4, the number of coefficients).
	        memory_size: number of rows in the memory table.
	        bottleneck: hidden width of the post-MLP after SwiGLU.
	        dtype: dtype of the learned weights.

	    Raises:
	        ValueError: if ``num_heads`` or ``ngram`` exceeds the size of the
	            built-in salt / coefficient tables.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_heads: int = 4,
	        ngram: int = 3,
	        memory_size: int = 16384,
	        bottleneck: int = 128,
	        dtype=torch.bfloat16,
	    ):
	        super().__init__()
	        assert dim % num_heads == 0, "dim must be divisible by num_heads"

	        coeffs = torch.tensor([1, 1315423911, 2654435761, 2246822519], dtype=torch.long)
	        salts = torch.tensor([0, 97, 193, 389], dtype=torch.long)
	        # The tables above are fixed-size. Larger settings used to surface
	        # only in forward() as a broadcasting error, so reject them here.
	        if num_heads > salts.numel():
	            raise ValueError(
	                f"num_heads ({num_heads}) exceeds the {salts.numel()} available hash salts"
	            )
	        if ngram > coeffs.numel():
	            raise ValueError(
	                f"ngram ({ngram}) exceeds the {coeffs.numel()} available hash coefficients"
	            )

	        self.dim = dim
	        self.num_heads = num_heads
	        self.ngram = ngram
	        self.memory_size = memory_size

	        self.key_gate = nn.Linear(dim, num_heads, bias=False, dtype=dtype)
	        self.memory = nn.Embedding(memory_size, dim, dtype=dtype)

	        self.post = nn.Sequential(
	            nn.Linear(dim, bottleneck * 2, bias=False, dtype=dtype),
	            SwiGLU(),
	            nn.Linear(bottleneck, dim, bias=False, dtype=dtype),
	        )

	        self.norm = RMSNorm(dim)
	        self.percent = nn.Linear(dim, 1, bias=False, dtype=dtype)

	        # Non-persistent: these are constants, not learned state.
	        self.register_buffer("coeffs", coeffs, persistent=False)
	        self.register_buffer("salts", salts, persistent=False)

	    def _hash_ngrams(self, input_ids: torch.Tensor) -> torch.Tensor:
	        """Map ``input_ids`` [B, S] to memory row indices [B, S, num_heads]."""
	        B, S = input_ids.shape

	        if self.ngram > 1:
	            # Left-pad with id 0 so every position has a full window.
	            pad = torch.zeros((B, self.ngram - 1), device=input_ids.device, dtype=input_ids.dtype)
	            ids = torch.cat([pad, input_ids], dim=1)
	            windows = ids.unfold(1, self.ngram, 1).long()  # [B, S, ngram]
	        else:
	            windows = input_ids.long().unsqueeze(-1)  # [B, S, 1]

	        coeffs = self.coeffs[:windows.size(-1)].view(1, 1, -1)
	        base = (windows * coeffs).sum(dim=-1)  # [B, S]

	        # One broadcast add covers all heads: each head's index is the
	        # shared base hash shifted by that head's salt.
	        salts = self.salts[:self.num_heads]
	        return torch.remainder(base.unsqueeze(-1) + salts, self.memory_size)  # [B, S, H]

	    def forward(self, x: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
	        """Return the gated memory read-out for hidden states ``x`` and token ``input_ids``."""
	        idx = self._hash_ngrams(input_ids)  # [B, S, H]
	        mem = self.memory(idx)  # [B, S, H, D]

	        x = x.to(self.key_gate.weight.dtype)

	        gates = torch.softmax(self.key_gate(x), dim=-1).unsqueeze(-1)  # [B, S, H, 1]
	        out = (mem * gates).sum(dim=2)  # [B, S, D]

	        # The norm may return a different dtype than the post-MLP weights;
	        # align them so the matmul also works outside autocast.
	        normed = self.norm(out).to(self.post[0].weight.dtype)
	        out = self.post(normed)
	        return torch.sigmoid(self.percent(out)) * out