ropedia-xperience-10m-task-baselines / scripts /omni /run_qwen3_omni_v4_4epoch_8gpu.sh
cy0307's picture
Publish Ropedia Xperience-10M task baseline cards
3a10443 verified
Raw
History Blame
3.48 kB
#!/usr/bin/env bash
set -euo pipefail

# Launcher for a stronger Qwen3-Omni LoRA continuation run over the already
# exported 128-episode dataset (96 train / 16 val / 16 test). The sealed split
# is reused on purpose, and the run id is distinct so that this run cannot
# overwrite the public v3 diagnostic.
#
# PROJECT_DIR, RUN_ID, DATASET_JSONL, MODEL_ID, BACKBONE_CONFIG, EPOCHS,
# GRADIENT_ACCUMULATION_STEPS and MAX_VAL_SAMPLES may be preset in the
# environment; an unset or empty value falls back to the default below.

# Directory that contains this script, as an absolute path.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Default project root is two levels above the script directory. All relative
# paths used from here on are resolved against it.
if [[ -z "${PROJECT_DIR:-}" ]]; then
  PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
fi
cd "$PROJECT_DIR"

# Run identity and inputs.
: "${RUN_ID:=xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora}"
: "${DATASET_JSONL:=results/omni_finetune/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_dataset/dataset.jsonl}"
: "${MODEL_ID:=$HOME/Ropedia/modelscope_models/Qwen__Qwen3-Omni-30B-A3B-Instruct}"
: "${BACKBONE_CONFIG:=configs/omni_backbones/qwen3_omni_lora.json}"

# Training knobs forwarded to the trainer command line.
: "${EPOCHS:=4}"
: "${GRADIENT_ACCUMULATION_STEPS:=8}"
: "${MAX_VAL_SAMPLES:=512}"

# Per-run output directory holding the launch log and the JSONL status file.
RUN_DIR="results/omni_finetune/${RUN_ID}"
LOG="$RUN_DIR/train.launch.log"
STATUS="$RUN_DIR/launch_status.jsonl"
mkdir -p "$RUN_DIR"
#######################################
# Append one JSON status record to the launch status file and echo it.
# The record always carries a "time" field (time.time() of the helper) plus
# one field per key=value argument.
# Globals:   STATUS (read) - path of the JSONL file the record is appended to
# Arguments: key=value pairs; the value is everything after the first "=".
#            Values consisting only of digits are stored as JSON integers,
#            everything else as strings.
# Outputs:   the JSON record (keys sorted) on stdout
# Returns:   non-zero if the Python helper fails, e.g. when an argument has
#            no "=" or the status file cannot be opened
# Note:      uses the relative interpreter path .venv/bin/python, so it must
#            be called with the working directory that contains .venv.
#######################################
json_status() {
  .venv/bin/python - "$STATUS" "$@" <<'PY'
import json
import sys
import time

path = sys.argv[1]
payload = {"time": time.time()}
for item in sys.argv[2:]:
    key, sep, value = item.partition("=")
    if not sep:
        sys.exit(f"json_status: expected key=value, got {item!r}")
    if value.isdigit():
        try:
            value = int(value)
        except ValueError:
            # str.isdigit() also accepts characters (e.g. superscript digits)
            # that int() rejects; keep such values as plain strings.
            pass
    payload[key] = value

# Serialize once so the file and stdout receive the same text.
line = json.dumps(payload, sort_keys=True)
with open(path, "a", encoding="utf-8") as handle:
    handle.write(line + "\n")
print(line, flush=True)
PY
}
# --- Preflight guards -------------------------------------------------------

# Refuse to launch when the dataset file is missing or empty (exit 2).
if [[ ! -s "$DATASET_JSONL" ]]; then
  json_status event=blocked_missing_dataset dataset_jsonl="$DATASET_JSONL"
  exit 2
fi

# pgrep -f matches an extended regex against the full command line. Escape
# regex metacharacters in RUN_ID and anchor it on a following space or end of
# line, so a different run whose id merely starts with RUN_ID is not mistaken
# for this one.
run_id_re="$(printf '%s' "$RUN_ID" | sed -e 's/[][(){}.*+?^$|\\]/\\&/g')"

# Each pgrep runs once and its output is captured: calling it a second time
# just to print the matches could fail under `set -e` if the process exited
# in between, replacing the intended exit status.

# Same run already active: report it and exit successfully (exit 0).
if same_run_procs="$(pgrep -af "train_qwen3_omni_lora\.py.*--run-id ${run_id_re}( |\$)" 2>/dev/null)"; then
  json_status event=already_running run_id="$RUN_ID"
  printf '%s\n' "$same_run_procs"
  exit 0
fi

# Any other trainer process active: refuse to start a second one (exit 3).
if other_procs="$(pgrep -af "train_qwen3_omni_lora\.py" 2>/dev/null)"; then
  json_status event=blocked_other_training run_id="$RUN_ID"
  printf '%s\n' "$other_procs"
  exit 3
fi

# --- Training command -------------------------------------------------------

# accelerate launch options first, then the trainer script and its options.
# NOTE(review): --num_processes is fixed at 8 while CUDA_VISIBLE_DEVICES can be
# overridden below; keep the two consistent when overriding the device list.
cmd=(
  .venv/bin/python -m accelerate.commands.launch
  --num_processes 8
  --mixed_precision bf16
  --use_fsdp
  --fsdp_sharding_strategy FULL_SHARD
  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP
  --fsdp_transformer_layer_cls_to_wrap Qwen3OmniMoeThinkerTextDecoderLayer
  --fsdp_use_orig_params true
  --fsdp_cpu_ram_efficient_loading true
  --fsdp_sync_module_states true
  --fsdp_activation_checkpointing true
  scripts/omni/train_qwen3_omni_lora.py
  --dataset-jsonl "$DATASET_JSONL"
  --model-id "$MODEL_ID"
  --backbone-config "$BACKBONE_CONFIG"
  --run-id "$RUN_ID"
  --train-split train
  --val-split val
  --epochs "$EPOCHS"
  --batch-size 1
  --gradient-accumulation-steps "$GRADIENT_ACCUMULATION_STEPS"
  --max-train-samples 0
  --max-val-samples "$MAX_VAL_SAMPLES"
  --local-files-only
  --gradient-checkpointing
  --progress-every 10
)

# --- Detached launch --------------------------------------------------------

json_status event=launch_start run_id="$RUN_ID" epochs="$EPOCHS" dataset_jsonl="$DATASET_JSONL"

# Start the trainer in the background under nohup with stdin detached.
# stdout and stderr go to $LOG, which is truncated on every launch.
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" \
PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" \
  nohup "${cmd[@]}" > "$LOG" 2>&1 < /dev/null &
pid=$!

# Give the process a few seconds, then treat "still present" as a successful
# detached launch.
sleep 3
if ps -p "$pid" >/dev/null 2>&1; then
  json_status event=launch_detached run_id="$RUN_ID" pid="$pid" log="$LOG"
  echo "launched run_id=${RUN_ID} pid=${pid} log=${LOG}"
  exit 0
fi

# The process is already gone: record the failure and show the end of the log.
json_status event=launch_failed run_id="$RUN_ID" log="$LOG"
tail -n 120 "$LOG" || true
exit 1