skomadinajs commited on
Commit
29bf1fc
·
verified ·
1 Parent(s): 78dcd46

Training in progress, step 81

Browse files
Files changed (2) hide show
  1. adapter_model.safetensors +1 -1
  2. debug.log +19 -0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2a6dd3095ba4b5767a82be65a11db213a724d85cc054e14addf62315aef2e98
3
  size 369150544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fb871ec57fdd5a075afc1bbb81610c1d059794283533e521e8b19dce5fc1d80
3
  size 369150544
debug.log CHANGED
@@ -389,3 +389,22 @@ wandb: WARNING Saving files without folders. If you want to preserve subdirector
389
  {'eval_loss': 0.7018095850944519, 'eval_runtime': 11.3624, 'eval_samples_per_second': 8.977, 'eval_steps_per_second': 2.288, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 3.9}
390
  {'loss': 0.5013, 'grad_norm': 0.3766796588897705, 'learning_rate': 1.587464671688187e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 6091.58, 'epoch': 4.0}
391
  [2025-12-16 23:00:53,957] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-72
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  {'eval_loss': 0.7018095850944519, 'eval_runtime': 11.3624, 'eval_samples_per_second': 8.977, 'eval_steps_per_second': 2.288, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 3.9}
390
  {'loss': 0.5013, 'grad_norm': 0.3766796588897705, 'learning_rate': 1.587464671688187e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 6091.58, 'epoch': 4.0}
391
  [2025-12-16 23:00:53,957] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-72
392
+ {'loss': 0.4905, 'grad_norm': 0.2340284287929535, 'learning_rate': 1.1747842321367886e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 4514.34, 'epoch': 4.11}
393
+ [2025-12-16 23:01:36,261] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step...
394
+ [2025-12-16 23:01:37,429] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.553159236907959
395
+ [2025-12-16 23:01:37,981] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5507926940917969
396
+ [2025-12-16 23:01:38,572] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5903477668762207
397
+ [2025-12-16 23:01:39,119] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5467860698699951
398
+ [2025-12-16 23:01:39,120] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9]
399
+ {'eval_loss': 0.7022753357887268, 'eval_runtime': 11.3827, 'eval_samples_per_second': 8.961, 'eval_steps_per_second': 2.284, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'epoch': 4.17}
400
+ {'loss': 0.5351, 'grad_norm': 0.21539896726608276, 'learning_rate': 8.208341474624071e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2274.18, 'epoch': 4.23}
401
+ {'loss': 0.5053, 'grad_norm': 0.21500085294246674, 'learning_rate': 5.27969897080901e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4638.83, 'epoch': 4.34}
402
+ {'loss': 0.5374, 'grad_norm': 0.24092087149620056, 'learning_rate': 2.9814044425935606e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4547.21, 'epoch': 4.45}
403
+ [2025-12-16 23:02:52,287] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step...
404
+ [2025-12-16 23:02:53,601] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6551158428192139
405
+ [2025-12-16 23:02:54,241] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6379897594451904
406
+ [2025-12-16 23:02:54,909] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6671915054321289
407
+ [2025-12-16 23:02:55,571] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6615691184997559
408
+ [2025-12-16 23:02:55,572] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9]
409
+ {'eval_loss': 0.7037167549133301, 'eval_runtime': 11.6258, 'eval_samples_per_second': 8.774, 'eval_steps_per_second': 2.236, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 4.45}
410
+ [2025-12-16 23:03:19,526] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-81