---
# Full-parameter supervised fine-tuning (stage: sft, finetuning_type: full)
# of Qwen/Qwen3.5-4B-Base on the pumlGenV3 dataset.
# Key names look like LLaMA-Factory training arguments (the DeepSpeed config
# path points into llamaboard_cache) - TODO confirm the consuming tool.
#
# One key per line: a block mapping cannot carry several "key: value" pairs
# on the same line, so the single-line form fails to parse.

# --- Model ---
model_name_or_path: Qwen/Qwen3.5-4B-Base
trust_remote_code: true
flash_attn: auto

# --- Method ---
stage: sft
do_train: true
finetuning_type: full
deepspeed: llamaboard_cache/ds_z3_config.json
# Vision-side components are kept frozen during this run.
freeze_vision_tower: true
freeze_multi_modal_projector: true

# --- Dataset ---
dataset: pumlGenV3
dataset_dir: data
template: qwen3_5_nothink
enable_thinking: false
cutoff_len: 3000
max_samples: 100000
packing: false
preprocessing_num_workers: 16
image_max_pixels: 589824  # = 768 * 768
image_min_pixels: 1024  # = 32 * 32
video_max_pixels: 65536  # = 256 * 256
video_min_pixels: 256  # = 16 * 16

# --- Output and logging ---
output_dir: saves/Qwen3.5-4B-Base/full/Qwen3.5-4B-Base-PumlGenV3
logging_steps: 1
save_steps: 2500
plot_loss: true
report_to: wandb
include_num_input_tokens_seen: true

# --- Optimisation ---
# 1 sample per device x 16 accumulation steps; the global batch size also
# depends on the number of devices, which is not set in this file.
per_device_train_batch_size: 1
gradient_accumulation_steps: 16
learning_rate: 5.0e-05
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_steps: 0
max_grad_norm: 1.0
optim: paged_adamw_8bit
bf16: true
ddp_timeout: 180000000