ropedia-xperience-10m-task-baselines/scripts/omni/auto_start_128_fsdp_full_after_smoke.sh

Update final Qwen model scripts

627e5d7 verified 23 days ago

3.49 kB

	#!/usr/bin/env bash
	# Starts the full FSDP LoRA training run after the smoke run has reported
	# its exit status in the shared status log.
	#
	# Every setting in the first three groups can be overridden from the
	# environment; the value after ":=" applies only when the variable is
	# unset or empty.
	set -euo pipefail

	# Workspace layout.
	: "${ROPEDIA_WORKSPACE:=$HOME/Ropedia}"
	: "${PROJECT_DIR:=$ROPEDIA_WORKSPACE/ropedia-episode-task-suite}"

	# Run identifiers and the status event written when the smoke run exits.
	: "${BASE_RUN_ID:=xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu}"
	: "${SMOKE_RUN_ID:=xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_smoke_v3}"
	: "${SMOKE_EXIT_EVENT:=train_exit_fsdp_smoke_v3_no_val}"
	: "${FULL_RUN_ID:=xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_full_train_valmon}"

	# Model location, backbone config, and FSDP switches passed to the launcher.
	: "${MODEL_ID:=$ROPEDIA_WORKSPACE/modelscope_models/Qwen__Qwen3-Omni-30B-A3B-Instruct}"
	: "${BACKBONE_CONFIG:=configs/omni_backbones/qwen3_omni_lora.json}"
	: "${FSDP_CPU_RAM_EFFICIENT_LOADING:=true}"
	: "${FSDP_SYNC_MODULE_STATES:=true}"
	: "${FSDP_ACTIVATION_CHECKPOINTING:=true}"

	# Paths below are relative to PROJECT_DIR, which becomes the working directory.
	RUN_DIR="results/omni_finetune/${BASE_RUN_ID}"
	STATUS="${RUN_DIR}/status.jsonl"
	DATASET_JSONL="results/omni_finetune/${BASE_RUN_ID}_dataset/dataset.jsonl"
	: "${FULL_LOG:=${RUN_DIR}/train_${FULL_RUN_ID}.log}"

	cd "$PROJECT_DIR"
	mkdir -p "$RUN_DIR"

	#######################################
	# Reports whether a training process for the full run is currently running.
	# Globals:   FULL_RUN_ID (read)
	# Outputs:   none (pgrep output and errors are discarded)
	# Returns:   pgrep's status: 0 when some process command line matches the
	#            trainer script path followed by FULL_RUN_ID, non-zero otherwise
	#######################################
	is_full_training_active() {
	  local cmdline_re="scripts/omni/train_qwen3_omni_lora.py.*${FULL_RUN_ID}"
	  pgrep -a -f "$cmdline_re" &>/dev/null
	}

	#######################################
	# Reports whether a training process for the smoke run is currently running.
	# Globals:   SMOKE_RUN_ID (read)
	# Outputs:   none (pgrep output and errors are discarded)
	# Returns:   pgrep's status: 0 when some process command line matches the
	#            trainer script path followed by SMOKE_RUN_ID, non-zero otherwise
	#######################################
	is_smoke_active() {
	  local cmdline_re="scripts/omni/train_qwen3_omni_lora.py.*${SMOKE_RUN_ID}"
	  pgrep -a -f "$cmdline_re" &>/dev/null
	}

	# Bail out successfully when the full run is already in progress, so a
	# second copy is never started.
	is_full_training_active && {
	  printf '%s\n' "full run already active"
	  exit 0
	}

	# Block until the smoke run's exit event appears in $STATUS.
	#   exit event with returncode 0         -> leave the loop and continue
	#   exit event with a non-zero code      -> log train_full_blocked_smoke_failed, exit 1
	#   smoke process gone and no exit event -> log train_full_blocked_smoke_missing_exit, exit 2
	#
	# The status greps and the process probe are not atomic: the smoke run can
	# exit and have its exit event appended between the two. So when the process
	# is found gone, the status is read once more after a short grace period
	# (SMOKE_EXIT_GRACE_SECONDS, default 5) before declaring the event missing.
	#
	# The patterns accept optional whitespace after the colon and a leading
	# minus sign, so `"returncode": 0` and negative codes are recognised too.
	smoke_ok_re="${SMOKE_EXIT_EVENT}.*returncode\":[[:space:]]*0"
	smoke_failed_re="${SMOKE_EXIT_EVENT}.*returncode\":[[:space:]]*[-1-9]"
	smoke_gone_grace="${SMOKE_EXIT_GRACE_SECONDS:-5}"
	smoke_gone=0

	while true; do
	  # -s: the status file may not exist yet; treat that as "no event so far"
	  # without printing an error on every poll.
	  if grep -qs -- "$smoke_ok_re" "$STATUS"; then
	    break
	  fi
	  if grep -qs -- "$smoke_failed_re" "$STATUS"; then
	    echo "{\"event\":\"train_full_blocked_smoke_failed\",\"time\":$(date +%s),\"smoke_run_id\":\"${SMOKE_RUN_ID}\"}" >> "$STATUS"
	    exit 1
	  fi
	  if (( smoke_gone )); then
	    # Process was already gone on the previous pass and the re-read above
	    # still found no exit event.
	    echo "{\"event\":\"train_full_blocked_smoke_missing_exit\",\"time\":$(date +%s),\"smoke_run_id\":\"${SMOKE_RUN_ID}\"}" >> "$STATUS"
	    exit 2
	  fi
	  if is_smoke_active; then
	    sleep 30
	  else
	    smoke_gone=1
	    sleep "$smoke_gone_grace"
	  fi
	done

	# Check again after the wait: the full run may have been started by
	# something else while this script was polling the smoke status.
	is_full_training_active && {
	  printf '%s\n' "full run already active after smoke"
	  exit 0
	}

	# Append the start event to the status log before launching.
	printf '{"event":"train_start_fsdp_full_train_valmon","time":%s,"run_id":"%s","train_split":"train","val_split":"val","test_split_reserved":"test","num_processes":8}\n' \
	  "$(date +%s)" "$FULL_RUN_ID" >> "$STATUS"

	# Options consumed by the accelerate launcher (process count, precision, FSDP).
	launcher_opts=(
	  --num_processes 8
	  --mixed_precision bf16
	  --use_fsdp
	  --fsdp_sharding_strategy FULL_SHARD
	  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP
	  --fsdp_transformer_layer_cls_to_wrap Qwen3OmniMoeThinkerTextDecoderLayer
	  --fsdp_use_orig_params true
	  --fsdp_cpu_ram_efficient_loading "$FSDP_CPU_RAM_EFFICIENT_LOADING"
	  --fsdp_sync_module_states "$FSDP_SYNC_MODULE_STATES"
	  --fsdp_activation_checkpointing "$FSDP_ACTIVATION_CHECKPOINTING"
	)

	# Options consumed by the training script itself.
	trainer_opts=(
	  --dataset-jsonl "$DATASET_JSONL"
	  --model-id "$MODEL_ID"
	  --backbone-config "$BACKBONE_CONFIG"
	  --run-id "$FULL_RUN_ID"
	  --train-split train
	  --val-split val
	  --epochs 1
	  --batch-size 1
	  --gradient-accumulation-steps 8
	  --max-train-samples 0
	  --max-val-samples 512
	  --local-files-only
	  --gradient-checkpointing
	  --progress-every 10
	)

	# Run the launcher with stdout and stderr captured in FULL_LOG. The
	# "|| full_rc=$?" keeps errexit from aborting the script on a failed run,
	# so the exit event below is always written.
	full_rc=0
	CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
	PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
	  .venv/bin/python -m accelerate.commands.launch \
	  "${launcher_opts[@]}" \
	  scripts/omni/train_qwen3_omni_lora.py \
	  "${trainer_opts[@]}" > "$FULL_LOG" 2>&1 || full_rc=$?

	# Append the exit event and hand the launcher's status to the caller.
	printf '{"event":"train_exit_fsdp_full_train_valmon","time":%s,"run_id":"%s","returncode":%s}\n' \
	  "$(date +%s)" "$FULL_RUN_ID" "$full_rc" >> "$STATUS"
	exit "$full_rc"