{ "seed": 11, "backbone": "Qwen/Qwen3-1.7B", "backbone_substitution_note": "chal.md spec listed Qwen 3.5 (0.8B/2B/4B band); Qwen 3.5 family was not released as of 2026-05-01. Substituted with Qwen3-1.7B as the closest current open-weight match within the spec's band.", "adapter_id": "nahdes/tenacious-critic-qwen3-1-7b-simpo-seed11", "loss_type": "simpo", "beta": 2.0, "simpo_gamma": 1.0, "learning_rate": 5e-06, "lr_scheduler": "cosine", "warmup_ratio": 0.1, "epochs": 8, "per_device_batch": 2, "grad_accum": 4, "effective_batch": 8, "total_steps": 280, "max_length": 2048, "max_prompt_length": 1536, "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.0, "target_modules": [ "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj" ], "trainable_parameters": 17432576, "total_parameters": 1738007552, "trainable_pct": 1.00, "precision": "fp16", "train_pairs": 275, "val_pairs": 31, "wall_time_min": 35.9, "compute": "free Colab T4 (Tesla T4, 16 GB VRAM)", "loss_curve": [ {"step": 20, "train_loss": 1.196, "val_loss": 1.481, "reward_margin": 0.034, "reward_accuracy": 0.542}, {"step": 40, "train_loss": 1.096, "val_loss": 1.313, "reward_margin": 0.309, "reward_accuracy": 0.573}, {"step": 60, "train_loss": 1.046, "val_loss": 1.127, "reward_margin": 0.644, "reward_accuracy": 0.635}, {"step": 80, "train_loss": 0.716, "val_loss": 0.917, "reward_margin": 1.079, "reward_accuracy": 0.740}, {"step": 100, "train_loss": 0.468, "val_loss": 0.672, "reward_margin": 1.745, "reward_accuracy": 0.813}, {"step": 120, "train_loss": 0.299, "val_loss": 0.443, "reward_margin": 2.978, "reward_accuracy": 0.906}, {"step": 140, "train_loss": 0.168, "val_loss": 0.341, "reward_margin": 4.492, "reward_accuracy": 0.906}, {"step": 160, "train_loss": 0.149, "val_loss": 0.295, "reward_margin": 5.275, "reward_accuracy": 0.906}, {"step": 180, "train_loss": 0.058, "val_loss": 0.260, "reward_margin": 5.907, "reward_accuracy": 0.938}, {"step": 200, "train_loss": 0.060, "val_loss": 0.220, 
"reward_margin": 6.170, "reward_accuracy": 0.969}, {"step": 220, "train_loss": 0.008, "val_loss": 0.197, "reward_margin": 6.349, "reward_accuracy": 0.969}, {"step": 240, "train_loss": 0.036, "val_loss": 0.189, "reward_margin": 6.452, "reward_accuracy": 0.969}, {"step": 260, "train_loss": 0.087, "val_loss": 0.187, "reward_margin": 6.491, "reward_accuracy": 0.969}, {"step": 280, "train_loss": 0.107, "val_loss": 0.187, "reward_margin": 6.496, "reward_accuracy": 0.969} ], "final_metrics": { "train_loss": 0.107, "val_loss": 0.187, "reward_margin": 6.496, "reward_accuracy": 0.969, "rewards_chosen": -10.244, "rewards_rejected": -16.740, "logps_chosen": -5.122, "logps_rejected": -8.370 }, "smoke_test_val_set": { "wins": 5, "total": 5, "samples": [ {"pair": 0, "chosen": -3.820, "rejected": -5.731, "win": true}, {"pair": 1, "chosen": -4.104, "rejected": -5.966, "win": true}, {"pair": 2, "chosen": -4.177, "rejected": -4.697, "win": true}, {"pair": 3, "chosen": -4.262, "rejected": -4.701, "win": true}, {"pair": 4, "chosen": -3.926, "rejected": -5.690, "win": true} ] }, "convergence_note": "Train loss bottomed at step 220 (0.008) and drifted up slightly through step 280 (0.107); val loss kept improving from step 220 to step 260 (0.197 → 0.187) and then held flat at 0.187 through step 280, so the final adapter is at the convergence asymptote with no overfitting signature. Step 280 was selected as the final checkpoint.", "leakage_policy": { "trained_critic_family": "qwen", "chosen_rewriter_families": ["human_v2_style_guide", "deepseek"], "labeling_judge": "scoring_evaluator.py offline (rule-based, no LLM)", "li_2025_compliance": true, "qwen_family_pairs_in_dataset": 0 } }