File size: 3,488 Bytes

627e5d7

#!/usr/bin/env bash
#
# Gatekeeper for the full FSDP LoRA training run.
#
# Polls the run's status.jsonl for the smoke run's exit event and, once the
# smoke run has finished with return code 0, launches the full training run
# through `accelerate launch`, recording start/exit events in the same log.
#
# Every setting below can be overridden from the environment; the value
# after ":=" is only used when the variable is unset or empty.
set -euo pipefail

# --- Locations -------------------------------------------------------------
: "${ROPEDIA_WORKSPACE:=$HOME/Ropedia}"
: "${PROJECT_DIR:=$ROPEDIA_WORKSPACE/ropedia-episode-task-suite}"

# --- Run identifiers -------------------------------------------------------
: "${BASE_RUN_ID:=xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu}"
: "${SMOKE_RUN_ID:=xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_smoke_v3}"
: "${SMOKE_EXIT_EVENT:=train_exit_fsdp_smoke_v3_no_val}"
: "${FULL_RUN_ID:=xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_full_train_valmon}"

# --- Model and backbone ----------------------------------------------------
: "${MODEL_ID:=$ROPEDIA_WORKSPACE/modelscope_models/Qwen__Qwen3-Omni-30B-A3B-Instruct}"
: "${BACKBONE_CONFIG:=configs/omni_backbones/qwen3_omni_lora.json}"

# --- FSDP switches forwarded verbatim to `accelerate launch` ----------------
: "${FSDP_CPU_RAM_EFFICIENT_LOADING:=true}"
: "${FSDP_SYNC_MODULE_STATES:=true}"
: "${FSDP_ACTIVATION_CHECKPOINTING:=true}"

# All relative paths from here on are resolved against the project directory.
cd "$PROJECT_DIR"

# --- Derived paths (relative to PROJECT_DIR) --------------------------------
RUN_DIR="results/omni_finetune/${BASE_RUN_ID}"
STATUS="${RUN_DIR}/status.jsonl"
DATASET_JSONL="results/omni_finetune/${BASE_RUN_ID}_dataset/dataset.jsonl"
# The training log defaults to a file inside RUN_DIR but may be redirected.
: "${FULL_LOG:=${RUN_DIR}/train_${FULL_RUN_ID}.log}"

# Make sure the directory holding the status log exists before it is used.
mkdir -p "$RUN_DIR"

#######################################
# Reports whether a full training process is currently running.
# A process counts when its full command line contains the training script
# path followed, anywhere later on the line, by FULL_RUN_ID.
# Globals:   FULL_RUN_ID (read)
# Outputs:   nothing (pgrep's output is discarded)
# Returns:   pgrep's exit status: 0 when at least one process matches,
#            non-zero otherwise
#######################################
is_full_training_active() {
  local cmdline_regex="scripts/omni/train_qwen3_omni_lora.py.*${FULL_RUN_ID}"
  pgrep -a -f "$cmdline_regex" &>/dev/null
}

#######################################
# Reports whether the smoke training process is currently running.
# A process counts when its full command line contains the training script
# path followed, anywhere later on the line, by SMOKE_RUN_ID.
# Globals:   SMOKE_RUN_ID (read)
# Outputs:   nothing (pgrep's output is discarded)
# Returns:   pgrep's exit status: 0 when at least one process matches,
#            non-zero otherwise
#######################################
is_smoke_active() {
  local cmdline_regex="scripts/omni/train_qwen3_omni_lora.py.*${SMOKE_RUN_ID}"
  pgrep -a -f "$cmdline_regex" &>/dev/null
}

# Gate 1: never start a second copy of the full run.
if is_full_training_active; then
  echo "full run already active"
  exit 0
fi

#######################################
# Tests whether the status log contains a smoke exit event whose return
# code matches the given pattern. Whitespace around the JSON colon is
# tolerated so both `"returncode":0` and `"returncode": 0` are recognised.
# Globals:   SMOKE_EXIT_EVENT, STATUS (read)
# Arguments: $1 - basic regular expression matching the return code value
# Returns:   0 when a matching line exists; non-zero when none does or the
#            status file has not been created yet
#######################################
smoke_exit_recorded() {
  # A missing log simply means "nothing recorded yet"; skip grep so it does
  # not print a "No such file" error on every poll.
  [[ -f "$STATUS" ]] || return 1
  grep -q "${SMOKE_EXIT_EVENT}.*returncode\"[[:space:]]*:[[:space:]]*${1}" "$STATUS"
}

# Wait until the smoke run has recorded its exit event.
#   returncode 0        -> fall through and start the full run
#   non-zero returncode -> log train_full_blocked_smoke_failed, exit 1
#   process gone and still no exit event after one grace re-check
#                       -> log train_full_blocked_smoke_missing_exit, exit 2
smoke_gone_checks=0
while true; do
  if smoke_exit_recorded '0'; then
    break
  fi
  # Optional leading minus so negative codes also count as failures.
  if smoke_exit_recorded '-\{0,1\}[1-9]'; then
    echo "{\"event\":\"train_full_blocked_smoke_failed\",\"time\":$(date +%s),\"smoke_run_id\":\"${SMOKE_RUN_ID}\"}" >> "$STATUS"
    exit 1
  fi
  if is_smoke_active; then
    smoke_gone_checks=0
    sleep 30
    continue
  fi
  # The smoke process is gone but no exit event is visible. Its launcher may
  # still be between "process exited" and "event appended", so re-read the
  # log once after a short grace period before declaring the event missing.
  if (( smoke_gone_checks >= 1 )); then
    echo "{\"event\":\"train_full_blocked_smoke_missing_exit\",\"time\":$(date +%s),\"smoke_run_id\":\"${SMOKE_RUN_ID}\"}" >> "$STATUS"
    exit 2
  fi
  smoke_gone_checks=$(( smoke_gone_checks + 1 ))
  sleep "${SMOKE_EXIT_GRACE_SECONDS:-5}"
done

# Gate 2: another launcher may have started the full run while we waited.
if is_full_training_active; then
  echo "full run already active after smoke"
  exit 0
fi

# Append the "full run started" event (one JSON object per line) to the
# status log before the command line is assembled.
printf '%s\n' "{\"event\":\"train_start_fsdp_full_train_valmon\",\"time\":$(date +%s),\"run_id\":\"${FULL_RUN_ID}\",\"train_split\":\"train\",\"val_split\":\"val\",\"test_split_reserved\":\"test\",\"num_processes\":8}" >> "$STATUS"

# Assemble the launch command as an array so every argument stays a single
# word regardless of its content.
train_cmd=()

# 1) Launcher: accelerate from the project's virtualenv, 8 processes, bf16.
train_cmd+=(.venv/bin/python -m accelerate.commands.launch)
train_cmd+=(--num_processes 8)
train_cmd+=(--mixed_precision bf16)

# 2) FSDP options; the last three take their values from the environment.
train_cmd+=(--use_fsdp)
train_cmd+=(--fsdp_sharding_strategy FULL_SHARD)
train_cmd+=(--fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP)
train_cmd+=(--fsdp_transformer_layer_cls_to_wrap Qwen3OmniMoeThinkerTextDecoderLayer)
train_cmd+=(--fsdp_use_orig_params true)
train_cmd+=(--fsdp_cpu_ram_efficient_loading "$FSDP_CPU_RAM_EFFICIENT_LOADING")
train_cmd+=(--fsdp_sync_module_states "$FSDP_SYNC_MODULE_STATES")
train_cmd+=(--fsdp_activation_checkpointing "$FSDP_ACTIVATION_CHECKPOINTING")

# 3) Training script and its own arguments.
train_cmd+=(scripts/omni/train_qwen3_omni_lora.py)
train_cmd+=(--dataset-jsonl "$DATASET_JSONL")
train_cmd+=(--model-id "$MODEL_ID")
train_cmd+=(--backbone-config "$BACKBONE_CONFIG")
train_cmd+=(--run-id "$FULL_RUN_ID")
train_cmd+=(--train-split train)
train_cmd+=(--val-split val)
train_cmd+=(--epochs 1)
train_cmd+=(--batch-size 1)
train_cmd+=(--gradient-accumulation-steps 8)
train_cmd+=(--max-train-samples 0)
train_cmd+=(--max-val-samples 512)
train_cmd+=(--local-files-only)
train_cmd+=(--gradient-checkpointing)
train_cmd+=(--progress-every 10)

# Run the training command on GPUs 0-7 with all of its output (stdout and
# stderr) captured in FULL_LOG. A failure must not abort the script under
# errexit, because the exit event below still has to be written; the
# `|| rc=$?` form captures the status instead.
rc=0
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  "${train_cmd[@]}" > "$FULL_LOG" 2>&1 || rc=$?

# Record the outcome in the status log, then exit with the training
# command's own return code.
printf '%s\n' "{\"event\":\"train_exit_fsdp_full_train_valmon\",\"time\":$(date +%s),\"run_id\":\"${FULL_RUN_ID}\",\"returncode\":${rc}}" >> "$STATUS"
exit "$rc"