---
# Full-parameter SFT run configuration for Qwen2.5-VL-7B-Instruct.
# NOTE(review): key names look like LLaMA-Factory / HF Trainer arguments;
# confirm against the tool that consumes this file.
# One key per line: a plain scalar may not contain ": ", so the mapping
# cannot be written on a single line without flow-style braces.

# Model
# Absolute, machine-specific path to the base checkpoint.
model_name_or_path: /gemini-3/space/thu/zhaozhiyuan/wfy-mptsnet/Qwen2.5-VL-7B-Instruct
trust_remote_code: true
flash_attn: auto

# Method
stage: sft
do_train: true
finetuning_type: full
deepspeed: cache/ds_z3_config.json

# Dataset
# Comma-separated list kept as a single string scalar (not a YAML sequence).
dataset: critic_training_data,refined,omnisvg,svgen,llm4svg
dataset_dir: data
template: qwen2_vl
cutoff_len: 6000
max_samples: 1000000
packing: false
preprocessing_num_workers: 16
val_size: 0.03

# Output, logging and checkpointing
output_dir: saves/Qwen2.5-VL-7B-Instruct/full/all_stage_123
logging_steps: 21
save_steps: 5000
plot_loss: true
# Plain `none` is the string "none" in YAML, not null.
report_to: none
include_num_input_tokens_seen: true

# Training
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
learning_rate: 0.0001
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0
max_grad_norm: 1.0
optim: adamw_torch
bf16: true
ddp_timeout: 180000000

# Evaluation
eval_strategy: steps
eval_steps: 5000
per_device_eval_batch_size: 2