Upload 2 files

e00bb21 verified about 2 months ago

30.8 kB

	from datetime import datetime
	import os
	from pyexpat import model
	from httpx import get
	os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
	#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments=True"
	import torch
	import torch.nn as nn
	import torch.optim as optim
	import random
	import tiktoken
	import ai_extras as AI_ex
	from torch.utils.data import DataLoader, SubsetRandomSampler, TensorDataset, random_split
	import numpy as np
	import torch._dynamo
	import bitsandbytes as bnb
	import torch.utils.checkpoint as cp
	from safetensors.torch import save_model as sf_save_model

	# Global PyTorch backend configuration, applied once at import time.

	# Allow the flash scaled-dot-product-attention kernel.
	torch.backends.cuda.enable_flash_sdp(True)

	torch._dynamo.config.suppress_errors = False # make errors visible
	torch._dynamo.config.verbose = True # print detailed compilation logs

	#torch._inductor.config.triton.cudagraphs = True

	# Trade a little matmul accuracy for speed.
	torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True
	torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = True
	torch.set_float32_matmul_precision('medium')

	# enable_mem_efficient_sdp is a function and has to be *called*. Assigning
	# True to the name only replaces the function object and leaves the backend
	# flag untouched.
	torch.backends.cuda.enable_mem_efficient_sdp(True)

	# Same for the reduced-precision switch of the math SDP backend. Not every
	# PyTorch release ships it, so only call it when it is available.
	_allow_reduced_math_sdp = getattr(torch.backends.cuda, "allow_fp16_bf16_reduction_math_sdp", None)
	if callable(_allow_reduced_math_sdp):
	    _allow_reduced_math_sdp(True)

	# Module-wide GPT-2 tokenizer. The model below sizes its embedding and output
	# layers from enc.max_token_value + 1 and stops generation on enc.eot_token.
	enc = tiktoken.get_encoding("gpt2")

	def save_model(model, filename):
	    """Write `model` to `filename` in safetensors format via a staged swap.

	    The weights are first written to `filename + ".tmp"`. Any existing
	    `filename` is then renamed to `filename + ".old"` (replacing a previous
	    `.old` file) and the staged file is renamed into place.

	    An OSError during the swap is reported on stdout and not re-raised; in
	    that case the freshly written weights stay in the `.tmp` file.
	    """
	    staged_path = filename + ".tmp"

	    # safetensors' save_model (rather than save_file) copes with tensors that
	    # are shared between modules, e.g. the tied embed/layerO1 weights.
	    sf_save_model(model, staged_path)
	    print("saving model...")

	    try:
	        # Move the current file out of the way instead of overwriting it;
	        # this is the workaround for the Windows 1224 locking error.
	        if os.path.exists(filename):
	            backup_path = filename + ".old"
	            if os.path.exists(backup_path):
	                os.remove(backup_path)
	            os.rename(filename, backup_path)

	        os.rename(staged_path, filename)
	    except OSError as e:
	        print(f"Windows I/O Lock: {e}. Checkpoint kept at {staged_path}")

	def get_optimizer(self, lr):
	    """Build the bitsandbytes 8-bit AdamW optimizer for the module `self`.

	    Trainable parameters are split by name into three groups:

	    * scalars/gates (name contains 'alpha' or 'mem_gate'): no weight decay,
	      group tagged with 'optim_bits': 32;
	    * biases, norms, embeddings, the tied output layer and 'engram' names:
	      no weight decay;
	    * everything else: weight decay 0.04.

	    Args:
	        self: module whose `named_parameters()` are optimised.
	        lr: learning rate shared by all groups.

	    Returns:
	        A `bnb.optim.AdamW8bit` instance with betas (0.9, 0.95).
	    """
	    high_precision_markers = ('alpha', 'mem_gate')
	    # NOTE(review): these substring checks are case-sensitive. The model in
	    # this file registers its memory module as `Engram`, so 'engram' only
	    # matches if a nested parameter name contains the lowercase spelling -
	    # confirm this is intended. Left unchanged because regrouping parameters
	    # would presumably make previously saved optimizer state unloadable.
	    no_decay_markers = ('bias', 'embed', 'layerO1', 'engram')

	    groups = {'decay': [], 'no_decay': [], 'high_precision': []}
	    for name, param in self.named_parameters():
	        if not param.requires_grad:
	            continue

	        if any(marker in name for marker in high_precision_markers):
	            target = 'high_precision'
	        elif 'norm' in name.lower() or any(marker in name for marker in no_decay_markers):
	            target = 'no_decay'
	        else:
	            target = 'decay'
	        groups[target].append(param)

	    # NOTE(review): assumes bitsandbytes honours a per-group 'optim_bits'
	    # key as an override of the 8-bit default - verify against the installed
	    # bitsandbytes version.
	    return bnb.optim.AdamW8bit(
	        [
	            {'params': groups['decay'], 'weight_decay': 0.04},
	            {'params': groups['no_decay'], 'weight_decay': 0.0},
	            {'params': groups['high_precision'], 'weight_decay': 0.0, 'optim_bits': 32},
	        ],
	        lr=lr,
	        betas=(0.9, 0.95),
	    )

	class biggerbrain(nn.Module):
	def __init__(self, device, sequence_length=640):
	    """Create all sub-modules and initialise their weights.

	    Args:
	        device: device the model is expected to live on; stored on the
	            instance and used by the forward passes to place tensors.
	        sequence_length: maximum number of tokens the forward passes keep
	            from their input.

	    The attribute registration order below is kept exactly as it was,
	    because it determines the parameter order seen by the optimizer.
	    """
	    super().__init__()
	    self.dim1 = 768
	    self.ffndim = int(self.dim1 * 2.7)
	    self.T_heads = 8
	    self.P_heads = 12
	    self.kv_heads = 4
	    self.sequencelength = sequence_length
	    self.device = device
	    self.local_seq_len = 256  # window used by the "local" blocks (last N tokens)
	    self.MLA_dim = 384
	    self.debugprints = False

	    vocab_size = enc.max_token_value + 1

	    # A bit of extra rope for dependencies that exceed the training context.
	    self.rope = AI_ex.RoPE(self.dim1 // self.T_heads, max_seq_len=self.sequencelength + 20)

	    self.embed = nn.Embedding(vocab_size, self.dim1)
	    self.Engram = AI_ex.engram(self.dim1, self.kv_heads, 3, memory_size=12192, bottleneck=self.MLA_dim)

	    self.layerO1 = nn.Linear(self.dim1, vocab_size, bias=False)

	    # Gate deciding how much of the Engram output is mixed in.
	    self.mem_gate = nn.Linear(self.dim1, 1, bias=True)

	    # Block weighting parameters
	    self.alpha_pre = nn.Parameter(torch.tensor(0.5))
	    self.alpha_loop = nn.Parameter(torch.tensor(0.5))
	    self.alpha_post = nn.Parameter(torch.tensor(0.5))
	    self.alpha_mem = nn.Parameter(torch.tensor(0.5))

	    # Pre block 1
	    self.layerPreA1 = AI_ex.RoPEAttention(self.dim1, self.P_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.preffn1 = AI_ex.MoLLayer(self.dim1, self.ffndim, n_experts=3, max_iter=1)
	    self.normPre1 = AI_ex.RMSNorm(self.dim1)

	    # Pre block 2 (applied to the last `local_seq_len` tokens only)
	    self.layerPreA2 = AI_ex.RoPEAttention(self.dim1, self.P_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.preffn2 = nn.Sequential(nn.Linear(self.dim1, self.ffndim * 2), AI_ex.SwiGLU(), nn.Linear(self.ffndim, self.dim1))
	    self.normPre2 = AI_ex.RMSNorm(self.dim1)

	    # Post block 1
	    self.layerPostA1 = AI_ex.RoPEAttention(self.dim1, self.P_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.postffn1 = nn.Sequential(nn.Linear(self.dim1, self.ffndim * 2), AI_ex.SwiGLU(), nn.Linear(self.ffndim, self.dim1))
	    self.normPost1 = AI_ex.RMSNorm(self.dim1)

	    # Post block 2
	    self.layerPostA2 = AI_ex.RoPEAttention(self.dim1, self.P_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.postffn2 = nn.Sequential(nn.Linear(self.dim1, self.ffndim * 2), AI_ex.SwiGLU(), nn.Linear(self.ffndim, self.dim1))
	    self.normPost2 = AI_ex.RMSNorm(self.dim1)

	    # Middle (looped) attention. A2 and A4 are the local layers: the
	    # forward passes feed them only the last `local_seq_len` tokens.
	    self.layerA1 = AI_ex.RoPEAttention(self.dim1, self.T_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.layerA2 = AI_ex.RoPEAttention(self.dim1, self.T_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.layerA3 = AI_ex.RoPEAttention(self.dim1, self.T_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.layerA4 = AI_ex.RoPEAttention(self.dim1, self.T_heads, self.kv_heads, bottleneck=self.MLA_dim)
	    self.layerA5 = AI_ex.RoPEAttention(self.dim1, self.T_heads, self.kv_heads, bottleneck=self.MLA_dim)

	    # Middle (looped) MoL layers; Mi2 and Mi4 belong to the local blocks.
	    self.layerMi1 = AI_ex.MoLLayer(self.dim1, self.ffndim, n_experts=2)
	    self.layerMi2 = AI_ex.MoLLayer(self.dim1, self.ffndim, n_experts=2)
	    self.layerMi3 = AI_ex.MoLLayer(self.dim1, self.ffndim, n_experts=2)
	    self.layerMi4 = AI_ex.MoLLayer(self.dim1, self.ffndim, n_experts=2)
	    self.layerMi5 = AI_ex.MoLLayer(self.dim1, self.ffndim, n_experts=2)

	    self.norm0 = AI_ex.RMSNorm(self.dim1)
	    self.norm1 = AI_ex.RMSNorm(self.dim1)
	    self.norm2 = AI_ex.RMSNorm(self.dim1)
	    self.norm3 = AI_ex.RMSNorm(self.dim1)

	    # Normalisation in front of each looped block.
	    self.normM1 = AI_ex.RMSNorm(self.dim1)
	    self.normM2 = AI_ex.RMSNorm(self.dim1)
	    self.normM3 = AI_ex.RMSNorm(self.dim1)
	    self.normM4 = AI_ex.RMSNorm(self.dim1)
	    self.normM5 = AI_ex.RMSNorm(self.dim1)

	    self.scratchpad_gate = nn.Linear(self.dim1, 1, bias=True)

	    self.Mnorm = AI_ex.RMSNorm(self.dim1)
	    self.M1 = AI_ex.custom_mem(self.dim1, self.T_heads) # for memory integration
	    self.MM2 = AI_ex.GatedResidual(self.dim1) # for linguistic anchor pull

	    self.layerO1.weight = self.embed.weight # tied weights

	    # Generic init for every Linear/Embedding in the tree ...
	    self.apply(self._init_weights)

	    # ... followed by the gate-specific init. This has to come AFTER
	    # self.apply(), otherwise _init_weights overwrites it again and the
	    # gates start from the generic random weights and a zero bias.
	    # NOTE(review): scratchpad_gate is used without a sigmoid in the
	    # forward passes, so a -1.0 bias means a multiplier of about -1 at
	    # start - confirm that is the intended starting point.
	    for gate in (self.mem_gate, self.scratchpad_gate):
	        nn.init.zeros_(gate.weight)
	        nn.init.constant_(gate.bias, -1.0)

	def pick_word(self, output, k=50, temperature=0.8,
	              prev_tokens=None, rep_penalty=1.1):
	    """Sample one token id from a (batch, vocab) tensor of logits.

	    Steps: temperature scaling, optional token blacklist, repetition
	    penalty on recently generated tokens, top-k filtering, multinomial
	    sampling.

	    Args:
	        output: 2-D logits tensor; it is not modified.
	        k: number of highest-scoring tokens kept before sampling.
	        temperature: divisor applied to the logits (a small epsilon is
	            added, so 0 is allowed).
	        prev_tokens: previously generated tokens; entries may be ints,
	            one-element tensors or lists of ints. Only the last 64 entries
	            are considered.
	        rep_penalty: factor by which recently used tokens are made less
	            likely (positive logits are divided, others multiplied).

	    Returns:
	        Tensor of shape (batch, 1) with the sampled token ids.

	    The blacklist and the repetition penalty act on the first row only.
	    """
	    # The division allocates a new tensor, so the in-place edits below never
	    # touch the caller's `output`.
	    logits = output / (temperature + 1e-8)
	    vocab_size = logits.size(-1)

	    # Permanently blacklist chat-format artifact tokens
	    # These have high probability from training data contamination
	    # but should never appear in normal prose output
	    BAD_TOKENS = [
	        #25, # ":" single colon
	        #3712, # "::" double colon
	        #1058, # ":" alternate encoding
	    ]
	    for tok in BAD_TOKENS:
	        if tok < vocab_size:
	            logits[0, tok] = float('-inf')

	    # Repetition penalty
	    if prev_tokens is not None:
	        # Flatten to plain ints, whatever mix of tensors / lists / ints
	        # was passed in.
	        recent = set()
	        for t in prev_tokens[-64:]:
	            if isinstance(t, torch.Tensor):
	                recent.add(int(t.item()))
	            elif isinstance(t, list):
	                recent.update(int(x) for x in t)
	            else:
	                recent.add(int(t))

	        # Ignore ids outside the vocabulary; a negative id would otherwise
	        # silently index from the end.
	        valid = sorted(tok for tok in recent if 0 <= tok < vocab_size)
	        if valid:
	            # One gather/scatter instead of a Python loop of scalar tensor
	            # reads and writes.
	            idx = torch.tensor(valid, dtype=torch.long, device=logits.device)
	            picked = logits[0, idx]
	            logits[0, idx] = torch.where(picked > 0, picked / rep_penalty, picked * rep_penalty)

	    # Top-K filtering: everything below the k-th best logit of its row is removed.
	    top_values, _ = torch.topk(logits, min(k, vocab_size))
	    logits[logits < top_values[..., -1].view(-1, 1)] = float('-inf')

	    # Sample
	    probabilities = torch.softmax(logits, dim=-1)
	    word_id = torch.multinomial(probabilities, num_samples=1)

	    return word_id

	def _init_weights(self, module):
	if isinstance(module, (nn.Linear, nn.Embedding)):
	# Standard GPT-2 initialization
	module.weight.data.normal_(mean=0.0, std=0.01)
	if isinstance(module, nn.Linear) and module.bias is not None:
	module.bias.data.zero_()

	def trainingloop(self, data, epochs=50, lr=3e-4, batchsize=32,
	                 accumulation_steps=4,
	                 subset_fraction=1.0, warmup_steps=2000):
	    """Train the model on `data` with gradient accumulation and checkpointing.

	    Args:
	        data: dataset yielding (inputs, targets) pairs; must support len().
	        epochs: index of the last epoch (exclusive). When a checkpoint is
	            resumed, training continues from the checkpointed epoch.
	        lr: peak learning rate.
	        batchsize: micro-batch size of the DataLoader.
	        accumulation_steps: micro-batches per optimizer step.
	        subset_fraction: fraction of the dataset visited per epoch; the
	            shuffled dataset is cut into int(1 / subset_fraction) chunks
	            that are cycled through, one per epoch.
	        warmup_steps: optimizer steps of linear learning-rate warm-up.

	    Side effects: reads and writes "checkpoint_full.pth", writes
	    "best_model.safetensors", "model.safetensors",
	    "stopped_model.safetensors" and, on Ctrl-C,
	    "emergency_checkpoint.safetensors". May block on input() when the
	    embedding statistics look unhealthy.

	    Raises:
	        ValueError: if `subset_fraction` is not positive.
	    """
	    if subset_fraction <= 0:
	        raise ValueError("subset_fraction must be > 0")

	    ITER_OPTIONS = [1, 2, 3]
	    ITER_WEIGHTS = [0.75, 0.15, 0.1]
	    checkpoint_path = "checkpoint_full.pth"

	    try:
	        self.train()
	        batchloss = 1000
	        start_epoch = 0

	        dataset_size = len(data)
	        all_indices = np.arange(dataset_size)
	        # How many samples per epoch
	        subset_size = int(dataset_size * subset_fraction)
	        # At least one chunk, otherwise `epoch % len(chunks)` divides by zero.
	        num_chunks = max(1, int(1.0 / subset_fraction))

	        def build_chunks():
	            # Reshuffle, then cut into per-epoch chunks. Copies, so a later
	            # in-place shuffle cannot change a chunk that is still in use.
	            np.random.shuffle(all_indices)
	            return [
	                all_indices[c * subset_size : (c + 1) * subset_size].copy()
	                for c in range(num_chunks)
	            ]

	        chunks = build_chunks()

	        optimizer = get_optimizer(self, lr)
	        criterion = nn.CrossEntropyLoss(ignore_index=-1)

	        # LinearLR defaults to total_iters=5, which would collapse the
	        # learning rate within five steps after warm-up. Spread the decay
	        # over the optimizer steps that remain after warm-up instead.
	        steps_per_epoch = (len(chunks[0]) // batchsize) // accumulation_steps
	        decay_steps = max(1, epochs * steps_per_epoch - warmup_steps)

	        warmup_scheduler = torch.optim.lr_scheduler.LinearLR(
	            optimizer, start_factor=0.01, end_factor=1.0, total_iters=warmup_steps
	        )
	        decay_scheduler = torch.optim.lr_scheduler.LinearLR(
	            optimizer, start_factor=1.0, end_factor=0.001, total_iters=decay_steps
	        )
	        # The Unified Scheduler
	        scheduler = torch.optim.lr_scheduler.SequentialLR(
	            optimizer,
	            schedulers=[warmup_scheduler, decay_scheduler],
	            milestones=[warmup_steps]
	        )

	        #self.forward_training = torch.compile(self.forward_training, dynamic=True, backend ='inductor')#, options=['shape_padding':True] model#self
	        if os.path.exists(checkpoint_path):
	            # SECURITY: weights_only=False unpickles arbitrary objects. Only
	            # resume from checkpoint files this script wrote itself.
	            checkpoint = torch.load(checkpoint_path, weights_only=False, map_location='cpu')
	            self.load_state_dict(checkpoint['model'])
	            optimizer.load_state_dict(checkpoint['optimizer'])
	            scheduler.load_state_dict(checkpoint['scheduler'])
	            start_epoch = checkpoint['epoch']
	            #all_indices = checkpoint['all_indices']
	            batchloss = checkpoint['batchloss']
	            print(f"Loaded checkpoint for training. {scheduler.get_last_lr()}")
	            print(f"Restored batchloss: {batchloss:.4f}")

	        for epoch in range(start_epoch, epochs):
	            epoch_loss = 0.0
	            batches_run = 0
	            optimizer.zero_grad(set_to_none=True)

	            chunk_idx = epoch % len(chunks)
	            # Start of a new cycle: reshuffle BEFORE the sampler is built.
	            if chunk_idx == 0 and epoch > start_epoch:
	                chunks = build_chunks()
	                print(f"Reshuffled data chunks for next cycle")

	            sampler = SubsetRandomSampler(chunks[chunk_idx])
	            loader = DataLoader(
	                data,
	                batch_size=batchsize,
	                sampler=sampler,
	                num_workers=0,
	                pin_memory=True,
	                drop_last=True
	            )

	            for i, (batch_inputs, batch_targets) in enumerate(loader):
	                if i == 0:
	                    print(f"Epoch {epoch} | Chunk {chunk_idx+1}/{len(chunks)} | "
	                          f"Starting training...")

	                # Periodic embedding health check
	                if (i + 1) % (accumulation_steps * 1000) == 0:
	                    std = self.embed.weight.std().item()
	                    print(f" Embed std: {std:.4f} (healthy = ~0.02, bad = <0.005 OR > 0.05)")
	                    if std < 0.005 or std > 0.05:
	                        print(" WARNING: embedding problems detected!")
	                        if std < 0.005:
	                            print("Embeddings are collapsing. Warning! Attempt to pause.")
	                        else:
	                            print("Embeddings are too large. Warning! Attempt to pause.")
	                        if input("Press Enter to continue...").lower() == "stop":
	                            print("Stopping training loop. Saving model...")
	                            save_model(self, "stopped_model.safetensors")
	                            return

	                batch_inputs = batch_inputs.to(self.device, non_blocking=True)
	                batch_targets = batch_targets.to(self.device, non_blocking=True)

	                with torch.amp.autocast('cuda', dtype=torch.bfloat16):
	                    # Random number of loop iterations per batch
	                    num_iter = random.choices(ITER_OPTIONS, weights=ITER_WEIGHTS, k=1)[0]
	                    logits = self.forward_training(batch_inputs, iters=num_iter)
	                    lm_loss = criterion(
	                        logits.view(-1, enc.max_token_value + 1),
	                        batch_targets.reshape(-1)
	                    )
	                    # Scaled so the accumulated gradient is an average.
	                    lm_loss = lm_loss / accumulation_steps
	                lm_loss.backward()

	                # Kept as a tensor so the running sum does not force a
	                # device sync on every micro-batch.
	                epoch_loss = epoch_loss + lm_loss.detach().float()
	                batches_run += 1

	                if (i + 1) % accumulation_steps == 0:
	                    #print(f"Alpha_pre Grad: {self.alpha_pre.grad}")
	                    torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=1.0, foreach=True)
	                    optimizer.step()
	                    scheduler.step()
	                    optimizer.zero_grad(set_to_none=True)

	                if (i + 1) % (accumulation_steps * 4) == 0:
	                    # batchloss is stored in the same scaled units as lm_loss.
	                    current = lm_loss.detach().item()
	                    if batchloss > current:
	                        batchloss = current
	                        print(f"New best loss: {batchloss * accumulation_steps:.4f}. Saving model...")
	                        save_model(self, f"best_model.safetensors")

	                if (i + 1) % (accumulation_steps * 100) == 0:
	                    print(f"Epoch {epoch} | Batch {(i + 1) // accumulation_steps} | loss={lm_loss.item() * accumulation_steps:.4f}")
	                    torch.save({
	                        'model': self.state_dict(),
	                        'optimizer': optimizer.state_dict(),
	                        'scheduler': scheduler.state_dict(),
	                        'epoch': epoch,
	                        'chunk_idx': chunk_idx,
	                        #'all_indices': all_indices, # save the shuffle order
	                        'batchloss': batchloss,
	                    }, checkpoint_path)
	                    print(f"Saved state.")

	                if (i + 1) % (accumulation_steps * 50000) == 0:
	                    save_model(self, "model.safetensors")

	            # drop_last can leave an epoch without a single batch.
	            if batches_run:
	                avg_loss = float(epoch_loss) * accumulation_steps / batches_run
	            else:
	                avg_loss = float('nan')
	            timestamp = datetime.now().strftime("%H:%M:%S")
	            print(f"[{timestamp}] Epoch {epoch:3d} | "
	                  f"Loss: {avg_loss:.4f} | "
	                  f"Chunk: {chunk_idx+1}/{len(chunks)}")
	    except KeyboardInterrupt:
	        print("\n[!] Manual Interrupt detected. Saving safety checkpoint...")
	        # Must be `self`: the module-level name `model` is an unrelated import.
	        save_model(self, "emergency_checkpoint.safetensors")
	        print("Done. Safe to exit.")


	def forward_training(self, input_ids, iters=3):
	    """Run one full pass over a batch of token ids and return the logits.

	    Args:
	        input_ids: 2-D tensor of token ids; only the last
	            `self.sequencelength` positions are used.
	        iters: number of passes through the looped middle blocks.

	    Returns:
	        Output of the weight-tied output layer for every kept position.

	    Layout: two pre blocks, the Engram memory mix-in, `iters` rounds of five
	    looped blocks, then two post blocks. Pre block 2 and looped blocks 2 and
	    4 only update the last `self.local_seq_len` positions.

	    NOTE(review): the listing this was restored from had lost its
	    indentation. The carry (`z`) and scratchpad updates are taken to belong
	    inside the iteration loop, because otherwise neither would influence
	    the result - confirm against the original file.
	    """
	    rope = self.rope
	    tail = self.local_seq_len

	    def tail_residual(base, update, scale):
	        # Copy `base` and add the scaled update to its last `tail`
	        # positions; earlier positions stay as they are.
	        patched = base.clone()
	        patched[:, -tail:] = base[:, -tail:] + (scale * update)
	        return patched

	    tokens = input_ids[:, -self.sequencelength:]
	    emb = self.embed(tokens.long())

	    # Pre block 1 (the residual is taken from the normalised input)
	    h = self.norm0(emb)
	    attn = self.layerPreA1(h, rope=rope)
	    ffn = self.preffn1(attn, emb, emb, 1)
	    h = h + (self.alpha_pre * (attn + ffn))

	    # Pre block 2 (local)
	    h = self.norm1(h)
	    attn = self.layerPreA2(h[:, -tail:], rope=rope)
	    ffn = self.preffn2(attn)
	    h = tail_residual(h, attn + ffn, self.alpha_pre)

	    # Engram block: a learned gate decides how much memory to mix in.
	    mem = self.Engram(h, tokens)
	    h = h + (torch.sigmoid(self.mem_gate(h)) * mem)
	    carry = h
	    anchor = h

	    for step in range(iters):
	        prev = h
	        h = (self.M1(h, carry, anchor) * self.alpha_mem) + h
	        h = self.Mnorm(h)

	        step_t = torch.tensor(step, device=self.device)

	        # ---- BLOCK 1 ---- (the MoL layer reads the block input, not the attention output)
	        h = self.normM1(h)
	        attn = self.layerA1(h, rope=rope)
	        moe = self.layerMi1(h, prev, anchor, step_t)
	        h = h + (self.alpha_loop * (moe + attn))

	        # ---- BLOCK 2 ---- (local)
	        h = self.normM2(h)
	        attn = self.layerA2(h[:, -tail:], rope=rope)
	        moe = self.layerMi2(attn, prev[:, -tail:], anchor[:, -tail:], step_t)
	        h = tail_residual(h, attn + moe, self.alpha_loop)

	        # ---- BLOCK 3 ----
	        h = self.normM3(h)
	        attn = self.layerA3(h, rope=rope)
	        moe = self.layerMi3(attn, prev, anchor, step_t)
	        h = h + (self.alpha_loop * (attn + moe))

	        # ---- BLOCK 4 ---- (local)
	        h = self.normM4(h)
	        attn = self.layerA4(h[:, -tail:], rope=rope)
	        moe = self.layerMi4(attn, prev[:, -tail:], anchor[:, -tail:], step_t)
	        h = tail_residual(h, attn + moe, self.alpha_loop)

	        # ---- BLOCK 5 ----
	        h = self.normM5(h)
	        attn = self.layerA5(h, rope=rope)
	        moe = self.layerMi5(attn, prev, anchor, step_t)
	        h = h + (self.alpha_loop * (attn + moe))

	        # The post blocks and the next M1 call read the un-gated block output.
	        carry = h

	        # Gated scratchpad residual feeding the next iteration.
	        h = (self.scratchpad_gate(h) * h) + prev

	    # Post block 1
	    out = self.norm2(carry)
	    attn = self.layerPostA1(out, rope=rope)
	    ffn = self.postffn1(attn)
	    out = out + (self.alpha_post * (attn + ffn))

	    # Post block 2
	    out = self.norm3(out)
	    attn = self.layerPostA2(out, rope=rope)
	    ffn = self.postffn2(attn)
	    out = out + (self.alpha_post * (attn + ffn))

	    # Output layer shares its weight with the embedding.
	    return self.layerO1(out)

	def forward(self, input_ids, max_tokens ,iters=3, top_k=10, temperature=1.0, rep_penalty=1.5):
	    """Autoregressively generate up to `max_tokens` tokens after `input_ids`.

	    Args:
	        input_ids: 2-D tensor holding the prompt token ids.
	        max_tokens: upper bound on the number of generated tokens.
	        iters: loop iterations passed on to `forward_training`.
	        top_k, temperature, rep_penalty: sampling settings passed on to
	            `pick_word`.

	    Returns:
	        List of the sampled tokens in generation order, each as returned by
	        `pick_word` (one tensor per step). Generation stops early after the
	        end-of-text token, which is included in the list.

	    The whole context is re-encoded for every new token. The network pass
	    itself is delegated to `forward_training`, so training and generation
	    cannot drift apart.
	    """
	    generated_tokens = []

	    # Move the prompt once, so the concatenation with sampled tokens below
	    # never mixes devices.
	    input_ids = input_ids.to(self.device)

	    for _ in range(max_tokens):
	        # Keep the context within the supported window.
	        input_ids = input_ids[:, -self.sequencelength:]

	        logits = self.forward_training(input_ids, iters=iters)

	        # Only the last position predicts the next token.
	        token1 = self.pick_word(logits[:, -1, :], k=top_k, temperature=temperature, prev_tokens=generated_tokens, rep_penalty=rep_penalty)
	        generated_tokens.append(token1)
	        input_ids = torch.cat([input_ids, token1], dim=1)

	        if token1.item() == enc.eot_token:
	            break

	    return generated_tokens

	#Setup model
	##This returns the compiled model, moved to BF16 and to the specified device.
	def initmodel(device):
	    """Create a `biggerbrain` model on `device` in bfloat16.

	    Args:
	        device: device spec as a string (e.g. 'cpu', 'cuda') or a
	            `torch.device`.

	    Returns:
	        The model, wrapped by `torch.compile(dynamic=True)` unless the device
	        is a CPU device.
	    """
	    model = biggerbrain(device).to(device).to(torch.bfloat16)
	    # Compare the device *type*: a plain `device != 'cpu'` is also true for
	    # torch.device('cpu') objects and for strings such as 'cpu:0'.
	    if torch.device(device).type != 'cpu':
	        model = torch.compile(model, dynamic=True)
	    return model

	def think(prompt, model, max_length=100, iter=3, top_k=10, temperature=1.0, raw=False, rep_penalty=1.5):
	    """Generate a text continuation for `prompt`.

	    Args:
	        prompt: user text.
	        model: a `biggerbrain` instance, optionally wrapped by torch.compile
	            (the wrapper is unwrapped through `_orig_mod`).
	        max_length: maximum number of generated tokens.
	        iter: loop iterations of the model per generated token.
	        top_k, temperature, rep_penalty: sampling settings.
	        raw: when True the prompt is used as-is, otherwise it is wrapped in
	            the "User: ... Assistant:" chat template.

	    Returns:
	        The decoded generated text (without the prompt).

	    Note: the model is switched to eval mode and left there.
	    """
	    model = getattr(model, '_orig_mod', model)

	    formatted = prompt if raw else f"User: \n\n{prompt}\n\nAssistant:"

	    # Let an end-of-text marker inside the prompt be encoded as the special token.
	    prompt_ids = enc.encode(formatted, allowed_special={'<|endoftext|>'})
	    input_ids = torch.tensor([prompt_ids]).to(model.device)

	    model.eval()
	    with torch.no_grad():
	        generated_tokens = model.forward(
	            input_ids,
	            max_tokens=max_length,
	            iters=iter,
	            top_k=top_k,
	            temperature=temperature,
	            rep_penalty=rep_penalty
	        )

	    # forward() returns one single-element tensor per step; the tokenizer
	    # expects plain ints.
	    return enc.decode([int(tok) for tok in generated_tokens])

	def print_parameter_breakdown(model):
	    """Print the element count of every trainable parameter plus the total.

	    A torch.compile wrapper is unwrapped through `_orig_mod` first.
	    Parameters with more than one million elements are flagged in the
	    output; frozen parameters are skipped and not counted.
	    """
	    base_model = getattr(model, '_orig_mod', model)

	    print("--- Parameter Breakdown ---")
	    total_params = 0
	    trainable_sizes = (
	        (name, param.numel())
	        for name, param in base_model.named_parameters()
	        if param.requires_grad
	    )
	    for name, params in trainable_sizes:
	        total_params += params
	        is_massive = params > 1_000_000
	        if not is_massive:
	            print(f"{name}: {params:,} parameters")
	        else:
	            print(f"MASSIVE LAYER DETECTED -> {name}: {params:,} parameters")

	    print(f"---------------------------\nTrue Total: {total_params:,}")