[2025-12-16 22:40:42,182] [DEBUG] [axolotl.utils.config.resolve_dtype:66] [PID:27] bf16 support detected, enabling for this configuration. [2025-12-16 22:40:42,529] [DEBUG] [axolotl.utils.config.log_gpu_memory_usage:127] [PID:27] baseline 0.000GB () [2025-12-16 22:40:42,530] [INFO] [axolotl.cli.config.load_cfg:248] [PID:27] config: { "activation_offloading": false, "adapter": "qlora", "axolotl_config_path": "/workspace-data/config/config.yml", "base_model": "nvidia/Nemotron-Mini-4B-Instruct", "base_model_config": "nvidia/Nemotron-Mini-4B-Instruct", "batch_size": 16, "bf16": true, "capabilities": { "bf16": true, "compute_capability": "sm_80", "fp8": false, "n_gpu": 1, "n_node": 1 }, "context_parallel_size": 1, "dataloader_num_workers": 1, "dataloader_pin_memory": true, "dataloader_prefetch_factor": 256, "dataset_num_proc": 20, "datasets": [ { "message_property_mappings": { "content": "content", "role": "role" }, "path": "jalasoft/typst-instruct", "trust_remote_code": false, "type": { "field_instruction": "prompt", "field_output": "completion", "format": "System\n{system}\n\nUser\n{instruction}\nAssistant\n", "system_prompt": "You are an expert in Typst markup language. Generate clean, well-formatted Typst code based on user instructions." } } ], "ddp": false, "device": "cuda:0", "dion_rank_fraction": 1.0, "dion_rank_multiple_of": 1, "env_capabilities": { "torch_version": "2.8.0" }, "eot_tokens": [ "" ], "eval_batch_size": 4, "eval_causal_lm_metrics": [ "sacrebleu", "comet", "ter", "chrf" ], "eval_max_new_tokens": 128, "eval_sample_packing": true, "eval_steps": 0.05, "eval_table_size": 0, "evals_per_epoch": 4, "experimental_skip_move_to_device": true, "flash_attention": true, "fp16": false, "gradient_accumulation_steps": 4, "gradient_checkpointing": true, "gradient_checkpointing_kwargs": { "use_reentrant": false }, "hub_model_id": "jalasoft/nemotron-mini-4B-it-ft-typ", "include_tkps": true, "is_falcon_derived_model": false, "is_llama_derived_model": false, "is_mistral_derived_model": false, "learning_rate": 0.0002, "lisa_layers_attribute": "model.layers", "load_best_model_at_end": false, "load_in_4bit": true, "load_in_8bit": false, "local_rank": 0, "logging_steps": 2, "lora_alpha": 128, "lora_dropout": 0.1, "lora_r": 64, "lora_target_linear": true, "loraplus_lr_embedding": 1e-06, "lr_scheduler": "cosine", "mean_resizing_embeddings": false, "micro_batch_size": 4, "model_config_type": "nemotron", "multipack_real_batches": false, "num_epochs": 5.0, "optimizer": "adamw_torch_fused", "output_dir": "/workspace-data/output", "pad_to_sequence_len": true, "pretrain_multipack_attn": true, "profiler_steps_start": 0, "qlora_sharded_model_loading": false, "ray_num_workers": 1, "resources_per_worker": { "GPU": 1 }, "sample_packing": true, "sample_packing_bin_size": 200, "sample_packing_group_size": 100000, "save_only_model": false, "save_safetensors": true, "save_steps": 0.1, "saves_per_epoch": 2, "sequence_len": 4096, "shuffle_before_merging_datasets": false, "shuffle_merged_datasets": true, "skip_prepare_dataset": false, "special_tokens": { "pad_token": "" }, "streaming_multipack_buffer_size": 10000, "strict": false, "tensor_parallel_size": 1, "tf32": true, "tiled_mlp_use_original_mlp": true, "tokenizer_config": "nvidia/Nemotron-Mini-4B-Instruct", "tokenizer_save_jinja_files": true, "tokenizer_type": "AutoTokenizer", "torch_dtype": "torch.bfloat16", "train_on_inputs": false, "trl": { "log_completions": false, "mask_truncated_completions": false, "ref_model_mixup_alpha": 0.9, "ref_model_sync_steps": 64, "scale_rewards": true, "sync_ref_model": false, "use_vllm": false, "vllm_server_host": "0.0.0.0", "vllm_server_port": 8000 }, "type_of_model": "AutoModelForCausalLM", "use_ray": false, "use_wandb": true, "val_set_size": 0.1, "vllm": { "device": "auto", "dtype": "auto", "gpu_memory_utilization": 0.9, "host": "0.0.0.0", "port": 8000 }, "wandb_project": "nemotron-mini-4B-it-ft-typ", "warmup_ratio": 0.1, "warmup_steps": 0, "weight_decay": 0.01, "world_size": 1 } [2025-12-16 22:40:45,256] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:27] EOS: 3 / [2025-12-16 22:40:45,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:27] BOS: 2 / [2025-12-16 22:40:45,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:27] PAD: 5 / [2025-12-16 22:40:45,257] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:27] UNK: None / None [2025-12-16 22:40:45,258] [INFO] [axolotl.utils.data.shared.load_preprocessed_dataset:481] [PID:27] Unable to find prepared dataset in last_run_prepared/438fce615f8256908523b2639d484352 [2025-12-16 22:40:45,259] [INFO] [axolotl.utils.data.sft._load_raw_datasets:320] [PID:27] Loading raw datasets... [2025-12-16 22:40:45,259] [WARNING] [axolotl.utils.data.sft._load_raw_datasets:322] [PID:27] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset using `axolotl preprocess path/to/config.yml`. [2025-12-16 22:40:46,167] [INFO] [axolotl.utils.data.wrappers.get_dataset_wrapper:87] [PID:27] Loading dataset: jalasoft/typst-instruct with base_type: None and prompt_style: None [2025-12-16 22:40:55,889] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:218] [PID:27] min_input_len: 144 [2025-12-16 22:40:55,893] [INFO] [axolotl.utils.data.utils.handle_long_seq_in_dataset:220] [PID:27] max_input_len: 4356 [2025-12-16 22:40:57,188] [WARNING] [axolotl.utils.data.utils.handle_long_seq_in_dataset:260] [PID:27] Dropped 3 samples from dataset [2025-12-16 22:41:00,352] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:27] total_num_tokens: 142_890 [2025-12-16 22:41:00,357] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:27] `total_supervised_tokens: 123_911` [2025-12-16 22:41:00,364] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:01,114] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:01,267] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.15308165550231934 [2025-12-16 22:41:01,268] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:01,421] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.15324163436889648 [2025-12-16 22:41:01,421] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:01,581] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.15950298309326172 [2025-12-16 22:41:01,581] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:01,739] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.15772557258605957 [2025-12-16 22:41:01,767] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] [2025-12-16 22:41:01,767] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:27] data_loader_len: 2 [2025-12-16 22:41:01,768] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:27] sample_packing_eff_est across ranks: [0.9690348307291666] [2025-12-16 22:41:01,768] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:27] sample_packing_eff_est: None [2025-12-16 22:41:01,768] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:27] total_num_steps: 10 [2025-12-16 22:41:01,776] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:406] [PID:27] total_num_tokens: 1_144_624 [2025-12-16 22:41:01,785] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:424] [PID:27] `total_supervised_tokens: 979_724` [2025-12-16 22:41:01,798] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:01,955] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:02,107] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.15265655517578125 [2025-12-16 22:41:02,108] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:02,322] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.21393156051635742 [2025-12-16 22:41:02,322] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:02,483] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.16082262992858887 [2025-12-16 22:41:02,484] [DEBUG] [axolotl.utils.samplers.multipack.pack_parallel:177] [PID:27] Using single process for pack_parallel, running sequentially. [2025-12-16 22:41:02,640] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.1570568084716797 [2025-12-16 22:41:02,641] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [71] [2025-12-16 22:41:02,641] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:483] [PID:27] data_loader_len: 17 [2025-12-16 22:41:02,641] [INFO] [axolotl.utils.trainer.calc_sample_packing_eff_est:499] [PID:27] sample_packing_eff_est across ranks: [0.9839761223591549] [2025-12-16 22:41:02,642] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:511] [PID:27] sample_packing_eff_est: 0.99 [2025-12-16 22:41:02,642] [DEBUG] [axolotl.utils.trainer.calculate_total_num_steps:522] [PID:27] total_num_steps: 85 [2025-12-16 22:41:02,642] [INFO] [axolotl.utils.data.sft._prepare_standard_dataset:121] [PID:27] Maximum number of steps set at 85 [2025-12-16 22:41:02,685] [DEBUG] [axolotl.train.setup_model_and_tokenizer:65] [PID:27] Loading tokenizer... nvidia/Nemotron-Mini-4B-Instruct [2025-12-16 22:41:03,685] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:278] [PID:27] EOS: 3 / [2025-12-16 22:41:03,686] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:279] [PID:27] BOS: 2 / [2025-12-16 22:41:03,686] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:280] [PID:27] PAD: 5 / [2025-12-16 22:41:03,686] [DEBUG] [axolotl.loaders.tokenizer.load_tokenizer:281] [PID:27] UNK: None / None [2025-12-16 22:41:03,687] [DEBUG] [axolotl.train.setup_model_and_tokenizer:74] [PID:27] Loading model [2025-12-16 22:41:03,732] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_evaluation_loop:87] [PID:27] Patched Trainer.evaluation_loop with nanmean loss calculation [2025-12-16 22:41:03,734] [DEBUG] [axolotl.monkeypatch.transformers.trainer_loss_calc.patch_maybe_log_save_evaluate:138] [PID:27] Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation [2025-12-16 22:41:03,735] [INFO] [axolotl.loaders.patch_manager._apply_multipack_patches:301] [PID:27] Applying multipack dataloader patch for sample packing... [2025-12-16 22:41:56,068] [INFO] [axolotl.loaders.model._prepare_model_for_quantization:851] [PID:27] converting PEFT model w/ prepare_model_for_kbit_training [2025-12-16 22:41:56,072] [INFO] [axolotl.loaders.model._configure_embedding_dtypes:345] [PID:27] Converting modules to torch.bfloat16 [2025-12-16 22:41:56,077] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:27] Memory usage after model load 8.584GB (+8.584GB allocated, +10.197GB reserved) [2025-12-16 22:41:56,078] [INFO] [axolotl.loaders.adapter.load_lora:80] [PID:27] found linear modules: ['down_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj'] trainable params: 92,274,688 || all params: 4,282,783,744 || trainable%: 2.1545 [2025-12-16 22:41:57,304] [DEBUG] [axolotl.loaders.model.log_gpu_memory_usage:127] [PID:27] after adapters 4.533GB (+4.533GB allocated, +10.400GB reserved) [2025-12-16 22:41:59,850] [WARNING] [py.warnings._showwarnmsg:110] [PID:27] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'repr' attribute with value False was provided to the `Field()` function, which has no effect in the context it was used. 'repr' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. warnings.warn( [2025-12-16 22:41:59,850] [WARNING] [py.warnings._showwarnmsg:110] [PID:27] /root/miniconda3/envs/py3.11/lib/python3.11/site-packages/pydantic/_internal/_generate_schema.py:2249: UnsupportedFieldAttributeWarning: The 'frozen' attribute with value True was provided to the `Field()` function, which has no effect in the context it was used. 'frozen' is field-specific metadata, and can only be attached to a model field using `Annotated` metadata or by assignment. This may have happened because an `Annotated` type alias using the `type` statement was used, or if the `Field()` function was attached to a single member of a union type. warnings.warn( [2025-12-16 22:42:07,321] [WARNING] [accelerate.utils.other.check_os_kernel:512] [PID:27] Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. [2025-12-16 22:42:11,621] [INFO] [axolotl.train.save_initial_configs:398] [PID:27] Pre-saving adapter config to /workspace-data/output... [2025-12-16 22:42:11,624] [INFO] [axolotl.train.save_initial_configs:402] [PID:27] Pre-saving tokenizer to /workspace-data/output... [2025-12-16 22:42:11,962] [INFO] [axolotl.train.save_initial_configs:407] [PID:27] Pre-saving model config to /workspace-data/output... [2025-12-16 22:42:11,967] [INFO] [axolotl.train.execute_training:196] [PID:27] Starting trainer... [2025-12-16 22:42:13,670] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6428604125976562 [2025-12-16 22:42:14,293] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6221218109130859 [2025-12-16 22:42:14,918] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6248421669006348 [2025-12-16 22:42:15,544] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6253554821014404 [2025-12-16 22:42:15,545] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [71] wandb: Currently logged in as: santiago-komadina (santiago-komadina-jalasoft) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin wandb: setting up run 0upn0naz wandb: Tracking run with wandb version 0.22.2 wandb: Run data is saved locally in /root/wandb/run-20251216_224215-0upn0naz wandb: Run `wandb offline` to turn off syncing. wandb: Syncing run warm-cherry-1 wandb: ⭐️ View project at https://wandb.ai/santiago-komadina-jalasoft/nemotron-mini-4B-it-ft-typ wandb: 🚀 View run at https://wandb.ai/santiago-komadina-jalasoft/nemotron-mini-4B-it-ft-typ/runs/0upn0naz wandb: Detected [huggingface_hub.inference] in use. wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ wandb: WARNING Saving files without folders. If you want to preserve subdirectories pass base_path to wandb.save, i.e. wandb.save("/mnt/folder/file.h5", base_path="/mnt") [2025-12-16 22:42:17,312] [INFO] [axolotl.utils.callbacks.on_train_begin:757] [PID:27] The Axolotl config has been saved to the WandB run under files. [2025-12-16 22:42:17,318] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:42:18,520] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5991508960723877 [2025-12-16 22:42:19,464] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5996401309967041 [2025-12-16 22:42:20,083] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6178188323974609 [2025-12-16 22:42:20,697] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6140866279602051 [2025-12-16 22:42:20,698] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 1.2447901964187622, 'eval_runtime': 11.3439, 'eval_samples_per_second': 8.992, 'eval_steps_per_second': 2.292, 'memory/max_active (GiB)': 43.87, 'memory/max_allocated (GiB)': 43.87, 'memory/device_reserved (GiB)': 44.33, 'epoch': 0} {'loss': 1.2263, 'grad_norm': 1.0431029796600342, 'learning_rate': 2.5e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.99, 'tokens_per_second_per_gpu': 9742.9, 'epoch': 0.11} {'loss': 1.216, 'grad_norm': 0.5182428359985352, 'learning_rate': 7.500000000000001e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.99, 'tokens_per_second_per_gpu': 4473.57, 'epoch': 0.23} [2025-12-16 22:43:35,611] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:43:36,859] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5805962085723877 [2025-12-16 22:43:37,450] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5903089046478271 [2025-12-16 22:43:38,008] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.557340145111084 [2025-12-16 22:43:38,613] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6043202877044678 [2025-12-16 22:43:38,614] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 1.1019586324691772, 'eval_runtime': 11.5084, 'eval_samples_per_second': 8.863, 'eval_steps_per_second': 2.259, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.99, 'epoch': 0.28} {'loss': 1.0577, 'grad_norm': 0.3546995520591736, 'learning_rate': 0.000125, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2269.03, 'epoch': 0.34} {'loss': 1.0454, 'grad_norm': 0.440667986869812, 'learning_rate': 0.000175, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4557.72, 'epoch': 0.45} [2025-12-16 22:44:39,676] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-9 {'loss': 1.0001, 'grad_norm': 0.3028908669948578, 'learning_rate': 0.0001999167799344583, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4434.28, 'epoch': 0.56} [2025-12-16 22:44:55,206] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:44:56,738] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.760429859161377 [2025-12-16 22:44:57,595] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.8560998439788818 [2025-12-16 22:44:58,448] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.8528122901916504 [2025-12-16 22:44:59,290] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.8404862880706787 [2025-12-16 22:44:59,290] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.9463796019554138, 'eval_runtime': 12.2941, 'eval_samples_per_second': 8.297, 'eval_steps_per_second': 2.115, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 0.56} {'loss': 0.8968, 'grad_norm': 0.2737530767917633, 'learning_rate': 0.00019925185024910277, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4502.37, 'epoch': 0.68} {'loss': 0.8833, 'grad_norm': 0.2401425987482071, 'learning_rate': 0.00019792641587574212, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4414.93, 'epoch': 0.79} [2025-12-16 22:46:13,827] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:46:15,308] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.7395086288452148 [2025-12-16 22:46:16,063] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.7533645629882812 [2025-12-16 22:46:16,835] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.7701354026794434 [2025-12-16 22:46:17,693] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.8576903343200684 [2025-12-16 22:46:17,694] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.8602458834648132, 'eval_runtime': 11.8591, 'eval_samples_per_second': 8.601, 'eval_steps_per_second': 2.192, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'epoch': 0.85} {'loss': 0.8453, 'grad_norm': 0.23792307078838348, 'learning_rate': 0.00019594929736144976, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2303.62, 'epoch': 0.9} {'loss': 0.7881, 'grad_norm': 0.2736661732196808, 'learning_rate': 0.0001933336521037367, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 6103.21, 'epoch': 1.0} [2025-12-16 22:47:01,434] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-18 {'loss': 0.7723, 'grad_norm': 0.2326890528202057, 'learning_rate': 0.0001900968867902419, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 4474.71, 'epoch': 1.11} [2025-12-16 22:47:31,173] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:47:32,438] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6057627201080322 [2025-12-16 22:47:33,133] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6939413547515869 [2025-12-16 22:47:33,788] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6543259620666504 [2025-12-16 22:47:34,427] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.638103723526001 [2025-12-16 22:47:34,427] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.8047566413879395, 'eval_runtime': 11.5173, 'eval_samples_per_second': 8.856, 'eval_steps_per_second': 2.257, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.49, 'epoch': 1.11} {'loss': 0.7225, 'grad_norm': 0.20799821615219116, 'learning_rate': 0.00018626054156009806, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4517.53, 'epoch': 1.23} {'loss': 0.7031, 'grad_norm': 0.2009187638759613, 'learning_rate': 0.00018185014665785936, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4433.33, 'epoch': 1.34} [2025-12-16 22:48:47,768] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:48:48,952] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.587979793548584 [2025-12-16 22:48:49,541] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5882759094238281 [2025-12-16 22:48:50,108] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5667750835418701 [2025-12-16 22:48:50,671] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5623953342437744 [2025-12-16 22:48:50,672] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7722324132919312, 'eval_runtime': 11.4867, 'eval_samples_per_second': 8.88, 'eval_steps_per_second': 2.263, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'epoch': 1.39} {'loss': 0.719, 'grad_norm': 0.21798835694789886, 'learning_rate': 0.0001768950525339362, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2290.77, 'epoch': 1.45} [2025-12-16 22:49:26,967] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-27 {'loss': 0.7063, 'grad_norm': 0.21198105812072754, 'learning_rate': 0.00017142823452219038, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4481.31, 'epoch': 1.56} {'loss': 0.6842, 'grad_norm': 0.20669737458229065, 'learning_rate': 0.00016548607339452853, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4551.98, 'epoch': 1.68} [2025-12-16 22:50:07,271] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:50:08,411] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5734491348266602 [2025-12-16 22:50:08,964] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5519306659698486 [2025-12-16 22:50:09,542] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5771615505218506 [2025-12-16 22:50:10,111] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5690045356750488 [2025-12-16 22:50:10,112] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7488055229187012, 'eval_runtime': 11.4272, 'eval_samples_per_second': 8.926, 'eval_steps_per_second': 2.275, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 1.68} {'loss': 0.7048, 'grad_norm': 0.1963784545660019, 'learning_rate': 0.00015910811325286768, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4556.99, 'epoch': 1.79} {'loss': 0.7134, 'grad_norm': 0.19664043188095093, 'learning_rate': 0.00015233679836966122, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4555.74, 'epoch': 1.9} [2025-12-16 22:51:23,266] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:51:24,618] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6728262901306152 [2025-12-16 22:51:25,291] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6715450286865234 [2025-12-16 22:51:25,928] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6365022659301758 [2025-12-16 22:51:26,584] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6554775238037109 [2025-12-16 22:51:26,584] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7312036156654358, 'eval_runtime': 11.6385, 'eval_samples_per_second': 8.764, 'eval_steps_per_second': 2.234, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'epoch': 1.96} {'loss': 0.6924, 'grad_norm': 0.27984705567359924, 'learning_rate': 0.00014521719072826858, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 2164.48, 'epoch': 2.0} [2025-12-16 22:51:45,260] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-36 {'loss': 0.6301, 'grad_norm': 0.20510244369506836, 'learning_rate': 0.00013779667014289065, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 4521.82, 'epoch': 2.11} {'loss': 0.6013, 'grad_norm': 0.2081460803747177, 'learning_rate': 0.00013012461895372344, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 4480.82, 'epoch': 2.23} [2025-12-16 22:52:39,901] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:52:41,086] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5562489032745361 [2025-12-16 22:52:41,631] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5433411598205566 [2025-12-16 22:52:42,175] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5441112518310547 [2025-12-16 22:52:42,714] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5376300811767578 [2025-12-16 22:52:42,714] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7242206931114197, 'eval_runtime': 11.4052, 'eval_samples_per_second': 8.943, 'eval_steps_per_second': 2.28, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.49, 'epoch': 2.23} {'loss': 0.6345, 'grad_norm': 0.20908962190151215, 'learning_rate': 0.00012225209339563145, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4464.71, 'epoch': 2.34} {'loss': 0.606, 'grad_norm': 0.21036918461322784, 'learning_rate': 0.00011423148382732853, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4610.24, 'epoch': 2.45} [2025-12-16 22:53:55,802] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:53:56,924] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5670294761657715 [2025-12-16 22:53:57,495] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5697734355926514 [2025-12-16 22:53:58,102] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.606992244720459 [2025-12-16 22:53:58,655] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5524764060974121 [2025-12-16 22:53:58,656] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7133870720863342, 'eval_runtime': 11.4191, 'eval_samples_per_second': 8.932, 'eval_steps_per_second': 2.277, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'epoch': 2.51} [2025-12-16 22:54:10,083] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-45 {'loss': 0.5699, 'grad_norm': 0.22012656927108765, 'learning_rate': 0.00010611616608218429, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2257.77, 'epoch': 2.56} {'loss': 0.5996, 'grad_norm': 0.21454322338104248, 'learning_rate': 9.79601462608595e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4404.17, 'epoch': 2.68} {'loss': 0.5736, 'grad_norm': 0.21601669490337372, 'learning_rate': 8.981770132961649e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4449.49, 'epoch': 2.79} [2025-12-16 22:55:14,460] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:55:15,571] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5595951080322266 [2025-12-16 22:55:16,117] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5449485778808594 [2025-12-16 22:55:16,674] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5567653179168701 [2025-12-16 22:55:17,218] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5431113243103027 [2025-12-16 22:55:17,219] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7081390023231506, 'eval_runtime': 11.4267, 'eval_samples_per_second': 8.926, 'eval_steps_per_second': 2.275, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 2.79} {'loss': 0.581, 'grad_norm': 0.2167098969221115, 'learning_rate': 8.174301791606385e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4608.25, 'epoch': 2.9} {'loss': 0.6049, 'grad_norm': 0.3036252558231354, 'learning_rate': 7.378983170608982e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 6340.01, 'epoch': 3.0} [2025-12-16 22:56:12,732] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-54 [2025-12-16 22:56:29,903] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:56:31,105] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5644886493682861 [2025-12-16 22:56:31,651] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5451717376708984 [2025-12-16 22:56:32,229] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5778226852416992 [2025-12-16 22:56:32,817] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5867812633514404 [2025-12-16 22:56:32,817] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7016817927360535, 'eval_runtime': 11.386, 'eval_samples_per_second': 8.958, 'eval_steps_per_second': 2.284, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'epoch': 3.06} {'loss': 0.5623, 'grad_norm': 0.23229679465293884, 'learning_rate': 6.601106984173835e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2249.63, 'epoch': 3.11} {'loss': 0.5422, 'grad_norm': 0.20941545069217682, 'learning_rate': 5.845849869981137e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4662.79, 'epoch': 3.23} {'loss': 0.5023, 'grad_norm': 0.22417426109313965, 'learning_rate': 5.11823793951719e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4506.08, 'epoch': 3.34} [2025-12-16 22:57:45,685] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:57:46,797] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5598804950714111 [2025-12-16 22:57:47,358] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5603907108306885 [2025-12-16 22:57:47,944] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5853786468505859 [2025-12-16 22:57:48,556] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6114795207977295 [2025-12-16 22:57:48,557] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7072933316230774, 'eval_runtime': 11.3794, 'eval_samples_per_second': 8.964, 'eval_steps_per_second': 2.285, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 3.34} {'loss': 0.5672, 'grad_norm': 0.22416777908802032, 'learning_rate': 4.423113330131707e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4504.23, 'epoch': 3.45} [2025-12-16 22:58:36,901] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-63 {'loss': 0.5452, 'grad_norm': 0.24252445995807648, 'learning_rate': 3.7651019814126654e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4601.17, 'epoch': 3.56} [2025-12-16 22:59:04,397] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 22:59:05,525] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5673832893371582 [2025-12-16 22:59:06,092] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.566525936126709 [2025-12-16 22:59:06,659] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.566476583480835 [2025-12-16 22:59:07,243] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.582695722579956 [2025-12-16 22:59:07,243] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7047077417373657, 'eval_runtime': 11.4286, 'eval_samples_per_second': 8.925, 'eval_steps_per_second': 2.275, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'epoch': 3.62} {'loss': 0.5324, 'grad_norm': 0.2310749590396881, 'learning_rate': 3.1485828503215585e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2222.68, 'epoch': 3.68} {'loss': 0.5322, 'grad_norm': 0.22174930572509766, 'learning_rate': 2.5776587699573006e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4499.22, 'epoch': 3.79} {'loss': 0.5051, 'grad_norm': 0.22046604752540588, 'learning_rate': 2.0561291458788733e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4445.6, 'epoch': 3.9} [2025-12-16 23:00:20,447] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 23:00:21,577] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5424764156341553 [2025-12-16 23:00:22,125] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5474584102630615 [2025-12-16 23:00:22,678] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5522243976593018 [2025-12-16 23:00:23,237] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5580763816833496 [2025-12-16 23:00:23,237] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7018095850944519, 'eval_runtime': 11.3624, 'eval_samples_per_second': 8.977, 'eval_steps_per_second': 2.288, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 3.9} {'loss': 0.5013, 'grad_norm': 0.3766796588897705, 'learning_rate': 1.587464671688187e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 6091.58, 'epoch': 4.0} [2025-12-16 23:00:53,957] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-72 {'loss': 0.4905, 'grad_norm': 0.2340284287929535, 'learning_rate': 1.1747842321367886e-05, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'tokens_per_second_per_gpu': 4514.34, 'epoch': 4.11} [2025-12-16 23:01:36,261] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 23:01:37,429] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.553159236907959 [2025-12-16 23:01:37,981] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5507926940917969 [2025-12-16 23:01:38,572] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5903477668762207 [2025-12-16 23:01:39,119] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5467860698699951 [2025-12-16 23:01:39,120] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7022753357887268, 'eval_runtime': 11.3827, 'eval_samples_per_second': 8.961, 'eval_steps_per_second': 2.284, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.49, 'epoch': 4.17} {'loss': 0.5351, 'grad_norm': 0.21539896726608276, 'learning_rate': 8.208341474624071e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 2274.18, 'epoch': 4.23} {'loss': 0.5053, 'grad_norm': 0.21500085294246674, 'learning_rate': 5.27969897080901e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4638.83, 'epoch': 4.34} {'loss': 0.5374, 'grad_norm': 0.24092087149620056, 'learning_rate': 2.9814044425935606e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4547.21, 'epoch': 4.45} [2025-12-16 23:02:52,287] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 23:02:53,601] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6551158428192139 [2025-12-16 23:02:54,241] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6379897594451904 [2025-12-16 23:02:54,909] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6671915054321289 [2025-12-16 23:02:55,571] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6615691184997559 [2025-12-16 23:02:55,572] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7037167549133301, 'eval_runtime': 11.6258, 'eval_samples_per_second': 8.774, 'eval_steps_per_second': 2.236, 'memory/max_active (GiB)': 44.57, 'memory/max_allocated (GiB)': 44.57, 'memory/device_reserved (GiB)': 67.48, 'epoch': 4.45} [2025-12-16 23:03:19,526] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-81 {'loss': 0.495, 'grad_norm': 0.22582735121250153, 'learning_rate': 1.3287526608711131e-06, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4492.37, 'epoch': 4.56} {'loss': 0.494, 'grad_norm': 0.22348730266094208, 'learning_rate': 3.3274175058067846e-07, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'tokens_per_second_per_gpu': 4489.26, 'epoch': 4.68} [2025-12-16 23:04:13,132] [INFO] [axolotl.core.trainers.base.evaluate:377] [PID:27] Running evaluation step... [2025-12-16 23:04:14,299] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5913589000701904 [2025-12-16 23:04:14,901] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.6010839939117432 [2025-12-16 23:04:15,471] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5690572261810303 [2025-12-16 23:04:16,044] [DEBUG] [axolotl.utils.samplers.multipack.__len__:462] [PID:27] generate_batches time: 0.5720643997192383 [2025-12-16 23:04:16,044] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:438] [PID:27] gather_len_batches: [9] {'eval_loss': 0.7037572264671326, 'eval_runtime': 11.4254, 'eval_samples_per_second': 8.927, 'eval_steps_per_second': 2.276, 'memory/max_active (GiB)': 58.46, 'memory/max_allocated (GiB)': 58.46, 'memory/device_reserved (GiB)': 67.48, 'epoch': 4.73} [2025-12-16 23:04:27,478] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output/checkpoint-85 {'train_runtime': 1334.7133, 'train_samples_per_second': 1.019, 'train_steps_per_second': 0.064, 'train_loss': 0.679589033126831, 'memory/max_active (GiB)': 5.24, 'memory/max_allocated (GiB)': 5.24, 'memory/device_reserved (GiB)': 47.45, 'epoch': 4.73} [2025-12-16 23:04:36,294] [INFO] [axolotl.train.save_trained_model:218] [PID:27] Training completed! Saving trained model to /workspace-data/output. [2025-12-16 23:04:36,950] [INFO] [axolotl.train.save_trained_model:336] [PID:27] Model successfully saved to /workspace-data/output [2025-12-16 23:04:37,226] [INFO] [axolotl.core.trainers.base._save:665] [PID:27] Saving model checkpoint to /workspace-data/output