ropedia-xperience-10m-task-baselines / scripts /omni /run_qwen3_omni_v4_4epoch_8gpu.sh

Publish Ropedia Xperience-10M task baseline cards

3a10443 verified 26 days ago

3.48 kB

	#!/usr/bin/env bash
	# Launcher preamble: strict mode, project-root resolution, and run settings.
	#
	# Stronger Qwen3-Omni LoRA continuation on the previously exported 128-episode
	# dataset (96 train / 16 val / 16 test). The sealed split is reused on purpose,
	# and the run id is distinct from the public v3 diagnostic so that run cannot
	# be overwritten.
	#
	# Overridable from the environment: PROJECT_DIR, RUN_ID, DATASET_JSONL,
	# MODEL_ID, BACKBONE_CONFIG, EPOCHS, GRADIENT_ACCUMULATION_STEPS,
	# MAX_VAL_SAMPLES.
	set -o errexit -o nounset -o pipefail

	# Work from the project root (two levels above this script by default) so the
	# relative paths used below do not depend on the caller's working directory.
	SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
	# Kept as a plain assignment: a failing `cd` inside the substitution must still
	# abort the script under errexit.
	PROJECT_DIR="${PROJECT_DIR:-$(cd "$SCRIPT_DIR/../.." && pwd)}"
	cd "$PROJECT_DIR"

	# Run settings; a value already present (and non-empty) in the environment wins.
	: "${RUN_ID:=xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora}"
	: "${DATASET_JSONL:=results/omni_finetune/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_dataset/dataset.jsonl}"
	: "${MODEL_ID:=$HOME/Ropedia/modelscope_models/Qwen__Qwen3-Omni-30B-A3B-Instruct}"
	: "${BACKBONE_CONFIG:=configs/omni_backbones/qwen3_omni_lora.json}"
	: "${EPOCHS:=4}"
	: "${GRADIENT_ACCUMULATION_STEPS:=8}"
	: "${MAX_VAL_SAMPLES:=512}"

	# Per-run output locations (relative to PROJECT_DIR): the launch log and the
	# JSON-lines status file both live inside the run directory.
	RUN_DIR="results/omni_finetune/${RUN_ID}"
	LOG="$RUN_DIR/train.launch.log"
	STATUS="$RUN_DIR/launch_status.jsonl"
	mkdir -p "$RUN_DIR"

	#######################################
	# Append one JSON status record to the status file and echo it to stdout.
	# Globals:   STATUS (read) - path of the JSON-lines file appended to
	# Arguments: zero or more key=value pairs; each is split on the first "=",
	#            and a value made only of digits is stored as an integer
	# Outputs:   the record (keys sorted, plus a "time" epoch timestamp) on stdout
	# Returns:   exit status of the embedded Python program
	#######################################
	json_status() {
	  # `<<-` strips leading tabs from the here-document and from its terminator,
	  # so the embedded program may be tab-indented along with the function.
	  # Python's own block indentation below therefore uses spaces, never tabs.
	  .venv/bin/python - "$STATUS" "$@" <<-'PY'
	import json
	import sys
	import time

	path = sys.argv[1]
	payload = {"time": time.time()}
	for item in sys.argv[2:]:
	    key, value = item.split("=", 1)
	    if value.isdigit():
	        value = int(value)
	    payload[key] = value
	line = json.dumps(payload, sort_keys=True)
	with open(path, "a", encoding="utf-8") as handle:
	    handle.write(line + "\n")
	print(line, flush=True)
	PY
	}

	# Preflight guards. Each one records a status event and stops the launcher:
	#   exit 2 - dataset file is missing or empty
	#   exit 0 - a trainer with this run id is already running (nothing to do)
	#   exit 3 - some other train_qwen3_omni_lora.py process is running
	if [[ ! -s "$DATASET_JSONL" ]]; then
	  json_status event=blocked_missing_dataset dataset_jsonl="$DATASET_JSONL"
	  exit 2
	fi

	# pgrep is run once per check and its output captured. Calling it a second
	# time just to print the listing would abort the script under `set -e` if the
	# matched process exited in between. The trailing `( |$)` makes the run id an
	# exact match, so a different run whose id merely starts with RUN_ID is not
	# reported as "already running". RUN_ID is interpolated into the regex as-is.
	if same_run_procs="$(pgrep -af "train_qwen3_omni_lora.py.*--run-id ${RUN_ID}( |\$)" 2>/dev/null)"; then
	  json_status event=already_running run_id="$RUN_ID"
	  printf '%s\n' "$same_run_procs"
	  exit 0
	fi

	if other_train_procs="$(pgrep -af "train_qwen3_omni_lora.py" 2>/dev/null)"; then
	  json_status event=blocked_other_training run_id="$RUN_ID"
	  printf '%s\n' "$other_train_procs"
	  exit 3
	fi

	# Training command line, assembled in three groups: the interpreter running
	# the accelerate launcher module, the launcher's own options, and the training
	# script followed by its arguments.
	cmd=(.venv/bin/python -m accelerate.commands.launch)

	# Launcher options: 8 processes, bf16 mixed precision, FSDP settings.
	cmd+=(
	  --num_processes 8
	  --mixed_precision bf16
	  --use_fsdp
	  --fsdp_sharding_strategy FULL_SHARD
	  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP
	  --fsdp_transformer_layer_cls_to_wrap Qwen3OmniMoeThinkerTextDecoderLayer
	  --fsdp_use_orig_params true
	  --fsdp_cpu_ram_efficient_loading true
	  --fsdp_sync_module_states true
	  --fsdp_activation_checkpointing true
	)

	# Training script and its arguments; the variable-backed values come from the
	# run settings defined at the top of the file.
	cmd+=(
	  scripts/omni/train_qwen3_omni_lora.py
	  --dataset-jsonl "$DATASET_JSONL"
	  --model-id "$MODEL_ID"
	  --backbone-config "$BACKBONE_CONFIG"
	  --run-id "$RUN_ID"
	  --train-split train
	  --val-split val
	  --epochs "$EPOCHS"
	  --batch-size 1
	  --gradient-accumulation-steps "$GRADIENT_ACCUMULATION_STEPS"
	  --max-train-samples 0
	  --max-val-samples "$MAX_VAL_SAMPLES"
	  --local-files-only
	  --gradient-checkpointing
	  --progress-every 10
	)

	# Launch the trainer detached from this shell and report the outcome.
	# stdout and stderr go to $LOG (truncated on every launch) and stdin is
	# /dev/null; nohup keeps the job alive after the launching terminal goes away.
	# CUDA_VISIBLE_DEVICES and PYTORCH_CUDA_ALLOC_CONF can be overridden from the
	# environment and otherwise take the defaults shown.
	json_status event=launch_start run_id="$RUN_ID" epochs="$EPOCHS" dataset_jsonl="$DATASET_JSONL"
	CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" \
	PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" \
	nohup "${cmd[@]}" > "$LOG" 2>&1 < /dev/null &
	pid=$!

	# Short grace period so an immediate startup failure is caught here rather
	# than being reported as a successful detach.
	sleep 3

	# `kill -0` delivers no signal; it only tests that the process still exists.
	if kill -0 "$pid" 2>/dev/null; then
	  json_status event=launch_detached run_id="$RUN_ID" pid="$pid" log="$LOG"
	  printf '%s\n' "launched run_id=${RUN_ID} pid=${pid} log=${LOG}"
	  exit 0
	fi

	json_status event=launch_failed run_id="$RUN_ID" log="$LOG"
	# Best effort: show the end of the log for diagnosis. A missing or unreadable
	# log must not replace the failure exit code below.
	tail -n 120 "$LOG" || true
	exit 1