"""Minimal training loop for the Qovaryx scratch base (single GPU, bf16). This is a nanoGPT-style starter. You bring the data; the model is random-init. Usage: python train_quickstart.py \ --model tjarvis91/qovaryx-50m-scratch-base \ --data path/to/your_corpus.jsonl \ --tokens-per-batch 4096 \ --grad-accum 8 \ --max-steps 50000 """ from __future__ import annotations import argparse import json import time from pathlib import Path import torch from torch.utils.data import Dataset, DataLoader from transformers import AutoModelForCausalLM, AutoTokenizer class JsonlChunkDataset(Dataset): def __init__(self, path: str, tokenizer, max_seq_len: int = 2048): self.rows = [] with open(path, "r", encoding="utf-8") as f: for line in f: r = json.loads(line) # accept either {"text": "..."} or chat-format messages if "text" in r: txt = r["text"] elif "messages" in r: txt = "\n".join(m["content"] for m in r["messages"]) else: continue self.rows.append(txt) self.tok = tokenizer self.max_seq_len = max_seq_len def __len__(self): return len(self.rows) def __getitem__(self, i): enc = self.tok(self.rows[i], truncation=True, max_length=self.max_seq_len, return_tensors="pt") ids = enc["input_ids"][0] return {"input_ids": ids, "labels": ids.clone()} def collate(batch, pad_id): max_len = max(b["input_ids"].size(0) for b in batch) out = {"input_ids": [], "labels": []} for b in batch: ids = b["input_ids"] pad = max_len - ids.size(0) out["input_ids"].append(torch.cat([ids, torch.full((pad,), pad_id, dtype=ids.dtype)])) out["labels"].append(torch.cat([b["labels"], torch.full((pad,), -100, dtype=ids.dtype)])) return {k: torch.stack(v) for k, v in out.items()} def main(): ap = argparse.ArgumentParser() ap.add_argument("--model", default="tjarvis91/qovaryx-50m-scratch-base") ap.add_argument("--data", required=True) ap.add_argument("--tokens-per-batch", type=int, default=4096) ap.add_argument("--grad-accum", type=int, default=8) ap.add_argument("--max-steps", type=int, default=50_000) ap.add_argument("--lr", type=float, default=2e-4) ap.add_argument("--warmup-steps", type=int, default=500) ap.add_argument("--max-seq-len", type=int, default=2048) ap.add_argument("--save-steps", type=int, default=1000) ap.add_argument("--save-dir", default="./qovaryx_run") args = ap.parse_args() tok = AutoTokenizer.from_pretrained(args.model, trust_remote_code=True) pad_id = tok.pad_token_id or tok.eos_token_id or 0 model = AutoModelForCausalLM.from_pretrained( args.model, trust_remote_code=True, torch_dtype=torch.bfloat16, ).cuda() model.train() ds = JsonlChunkDataset(args.data, tok, max_seq_len=args.max_seq_len) loader = DataLoader( ds, batch_size=1, shuffle=True, collate_fn=lambda b: collate(b, pad_id), ) opt = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=0.1, betas=(0.9, 0.95)) sched = torch.optim.lr_scheduler.LambdaLR( opt, lambda s: min(1.0, s / args.warmup_steps), ) save_dir = Path(args.save_dir); save_dir.mkdir(parents=True, exist_ok=True) step = 0; t0 = time.time(); accum_loss = 0.0 for epoch in range(10**9): for batch in loader: batch = {k: v.cuda() for k, v in batch.items()} with torch.amp.autocast("cuda", dtype=torch.bfloat16): out = model(**batch) (out.loss / args.grad_accum).backward() accum_loss += float(out.loss.item()) if (step + 1) % args.grad_accum == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) opt.step(); sched.step(); opt.zero_grad() if (step + 1) % 10 == 0: avg = accum_loss / 10 accum_loss = 0.0 print(f"step={step+1} loss={avg:.4f} elapsed={time.time()-t0:.1f}s lr={sched.get_last_lr()[0]:.2e}") if (step + 1) % args.save_steps == 0: ckpt = save_dir / f"step_{step+1:07d}" model.save_pretrained(str(ckpt)) tok.save_pretrained(str(ckpt)) print(f"saved {ckpt}") step += 1 if step >= args.max_steps: model.save_pretrained(str(save_dir / "final")) tok.save_pretrained(str(save_dir / "final")) return if __name__ == "__main__": main()