---
# Training-run configuration (run.method: lora_sdpo) for
# allenai/OLMo-2-1124-7B-Instruct on the ARC-Challenge config of
# allenai/ai2_arc.
#
# NOTE(review): this file previously had every key collapsed onto a single
# line, which is not loadable YAML. The nesting below is reconstructed.
# `model`, `dataset`, `phase` and `run` are top-level because the
# interpolations reference them by absolute path; the placement of the other
# sections, and of `tags`, `model_short` and `run_slug`, is the most natural
# reading -- confirm against the consuming schema.
#
# Values written as '${...}' look like OmegaConf-style interpolations and are
# kept as literal strings for the config loader to resolve -- confirm.

# Base model and the projection modules listed for adaptation.
model:
  base_model_id: allenai/OLMo-2-1124-7B-Instruct
  model_family: olmo
  target_modules:
    - q_proj
    - k_proj
    - v_proj
    - o_proj
    - gate_proj
    - up_proj
    - down_proj

# LoRA adapter hyperparameters.
lora:
  r: 16
  alpha: 32
  dropout: 0.05

# Dataset selection and sample limits.
dataset:
  name: allenai/ai2_arc
  config: ARC-Challenge
  slug: arc_challenge
  split: train
  text_field: question
  max_samples: 50
  eval_samples: 256
  domain: science
  format: arc

# Prompt template. The system prompt is single-quoted so the backslash in
# \boxed{} stays a literal character.
prompt_style:
  style: boxed
  system_prompt: 'Please reason step by step, and put your final answer within \boxed{}.'

# SDPO settings.
sdpo:
  # NOTE(review): reward is named gsm8k_match while the dataset is
  # ARC-Challenge -- presumably intentional (boxed-answer matching); verify.
  reward: gsm8k_match
  num_generations: 4
  generation_batch_size: 64
  steps_per_generation: null
  # 512 + 1024 = 1536 tokens, below sequence.max_length (2048).
  max_prompt_length: 512
  max_completion_length: 1024
  mask_truncated_completions: true
  distillation_alpha: 1.0
  distillation_topk: null
  distillation_weight: 1.0
  distillation_is_clip: 2.0
  full_logit_distillation: false
  policy_loss_mode: distillation_only
  # NOTE(review): teacher_regularization is ema but teacher_update_rate is
  # null -- presumably the consumer supplies a default rate; confirm.
  teacher_regularization: ema
  teacher_update_rate: null
  success_reward_threshold: 1.0
  use_successful_as_teacher: true
  include_environment_feedback: false
  feedback_column: null
  beta: 0.0
  epsilon: 0.2
  scale_rewards: group
  temperature: 1.3
  dump_rollouts: true

# Optimizer and learning-rate schedule.
optimization:
  num_train_epochs: 1
  per_device_batch_size: 1
  gradient_accumulation_steps: 2
  learning_rate: 5.0e-05
  warmup_ratio: 0.0
  weight_decay: 0.0
  lr_scheduler_type: cosine
  max_grad_norm: 1.0

# Sequence length and packing.
sequence:
  max_length: 2048
  packing: true

# Checkpoint count, retention and schedule.
checkpointing:
  num_checkpoints: 8
  save_total_limit: 64
  schedule: log
  save_steps: null

# Logging, precision and publishing switches.
runtime:
  logging_steps: 20
  bf16: true
  gradient_checkpointing: true
  wandb: true
  wandb_project: amr-fma-train
  # hf_push is enabled with public visibility.
  hf_push: true
  hf_org: tkwiecinski
  hf_visibility: public
  force_restart: false

# Evaluation is switched off; the step and strategy fields are unset.
evaluation:
  enabled: false
  eval_steps: null
  strategy: null

phase:
  name: P1

# Run identity. The tags copy values from phase.name and dataset.domain.
run:
  method: lora_sdpo
  exp_name: p1_sdpo_multimodel_trial
  seed: 42
  tags:
    phase: '${phase.name}'
    domain: '${dataset.domain}'

# Filesystem layout and derived names.
paths:
  # Reads AMR_FMA_BASE from the environment; the text after the comma is the
  # fallback path, which itself reads USER from the environment.
  base: '${oc.env:AMR_FMA_BASE,/capstor/scratch/cscs/${oc.env:USER}/amr-fma}'
  # NOTE(review): `hf_last` is not defined in this file -- presumably a custom
  # resolver registered by the consuming code; confirm.
  model_short: '${hf_last:${model.base_model_id}}'
  run_slug: '${run.exp_name}__s${run.seed}'