#!/usr/bin/env bash
set -euo pipefail

# Stronger Qwen3-Omni LoRA continuation over the already exported 128-episode
# 96/16/16 dataset. This launcher intentionally reuses the sealed split and
# writes a distinct run id so it cannot overwrite the public v3 diagnostic.

# Locate this launcher on disk; the project root defaults to two directories
# above it unless PROJECT_DIR is already set to a non-empty value.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
if [[ -z "${PROJECT_DIR:-}" ]]; then
  PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
fi
# Every relative path below (dataset, config, .venv, results) is resolved
# against the project root.
cd "$PROJECT_DIR"

# Run settings. Each one keeps a non-empty value inherited from the
# environment and otherwise falls back to the default given here.
: "${RUN_ID:=xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora}"
: "${DATASET_JSONL:=results/omni_finetune/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_dataset/dataset.jsonl}"
: "${MODEL_ID:=$HOME/Ropedia/modelscope_models/Qwen__Qwen3-Omni-30B-A3B-Instruct}"
: "${BACKBONE_CONFIG:=configs/omni_backbones/qwen3_omni_lora.json}"
: "${EPOCHS:=4}"
: "${GRADIENT_ACCUMULATION_STEPS:=8}"
: "${MAX_VAL_SAMPLES:=512}"

# Per-run output directory holding the launch log and the JSONL status trail.
RUN_DIR="results/omni_finetune/$RUN_ID"
LOG="$RUN_DIR/train.launch.log"
STATUS="$RUN_DIR/launch_status.jsonl"
mkdir -p "$RUN_DIR"

#######################################
# Append one JSON status record to the launch status file and echo it.
# Globals:   STATUS (read) - path of the JSONL file that is appended to
# Arguments: one or more key=value pairs; a value made only of ASCII digits
#            is stored as a JSON integer, anything else as a string
# Outputs:   the JSON record (sorted keys, plus a "time" epoch stamp) on stdout
# Returns:   non-zero if the interpreter fails or an argument lacks '='
#######################################
json_status() {
  .venv/bin/python - "$STATUS" "$@" <<'PY'
import json
import re
import sys
import time

path = sys.argv[1]
payload = {"time": time.time()}
for item in sys.argv[2:]:
    # Split on the first '=' only, so values may themselves contain '='.
    key, sep, value = item.partition("=")
    if not sep:
        sys.exit("json_status: expected key=value, got %r" % item)
    # str.isdigit() is also true for characters such as superscript digits
    # that int() rejects; restrict the integer conversion to ASCII digits so
    # an unusual value can never crash status logging.
    if re.fullmatch(r"[0-9]+", value):
        value = int(value)
    payload[key] = value
with open(path, "a", encoding="utf-8") as handle:
    handle.write(json.dumps(payload, sort_keys=True) + "\n")
print(json.dumps(payload, sort_keys=True), flush=True)
PY
}

# --- Preconditions -----------------------------------------------------------
# Exit codes: 2 = dataset missing or empty, 0 = this run id is already
# training, 3 = a different trainer process is running.

# Refuse to launch when the dataset export is absent or empty.
if [[ ! -s "$DATASET_JSONL" ]]; then
  json_status event=blocked_missing_dataset dataset_jsonl="$DATASET_JSONL"
  exit 2
fi

# pgrep -f matches an extended regex against the full command line, so the
# run id is escaped before interpolation and must be followed by a space or
# end of line; otherwise a run id that is a prefix of another one (or that
# contains regex metacharacters) would be misreported as already running.
trainer_pattern='train_qwen3_omni_lora\.py'
run_id_regex="$(printf '%s' "$RUN_ID" | sed 's/[][\\.*^$+?(){}|]/\\&/g')"

# Each pgrep result is captured once and reused for the report: calling pgrep
# a second time could find nothing if the process exited in between, and the
# resulting non-zero status would abort the script under `set -e` with the
# wrong exit code.
if same_run="$(pgrep -af "${trainer_pattern}.*--run-id ${run_id_regex}( |\$)" 2>/dev/null)"; then
  json_status event=already_running run_id="$RUN_ID"
  printf '%s\n' "$same_run"
  exit 0
fi

if any_trainer="$(pgrep -af "$trainer_pattern" 2>/dev/null)"; then
  json_status event=blocked_other_training run_id="$RUN_ID"
  printf '%s\n' "$any_trainer"
  exit 3
fi

# Number of accelerate worker processes. Defaults to 8; override NUM_PROCESSES
# when CUDA_VISIBLE_DEVICES is narrowed so the process count can be kept in
# step with the devices actually exposed to the job.
NUM_PROCESSES="${NUM_PROCESSES:-8}"

# Full training command, kept as an array so every argument survives quoting.
cmd=(
  # accelerate launcher and its distributed / FSDP settings
  .venv/bin/python -m accelerate.commands.launch
  --num_processes "$NUM_PROCESSES"
  --mixed_precision bf16
  --use_fsdp
  --fsdp_sharding_strategy FULL_SHARD
  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP
  --fsdp_transformer_layer_cls_to_wrap Qwen3OmniMoeThinkerTextDecoderLayer
  --fsdp_use_orig_params true
  --fsdp_cpu_ram_efficient_loading true
  --fsdp_sync_module_states true
  --fsdp_activation_checkpointing true
  # training script and its own arguments
  scripts/omni/train_qwen3_omni_lora.py
  --dataset-jsonl "$DATASET_JSONL"
  --model-id "$MODEL_ID"
  --backbone-config "$BACKBONE_CONFIG"
  --run-id "$RUN_ID"
  --train-split train
  --val-split val
  --epochs "$EPOCHS"
  --batch-size 1
  --gradient-accumulation-steps "$GRADIENT_ACCUMULATION_STEPS"
  --max-train-samples 0
  --max-val-samples "$MAX_VAL_SAMPLES"
  --local-files-only
  --gradient-checkpointing
  --progress-every 10
)

# --- Launch ------------------------------------------------------------------
json_status event=launch_start run_id="$RUN_ID" epochs="$EPOCHS" dataset_jsonl="$DATASET_JSONL"

# Start the trainer in the background under nohup with stdin closed and
# stdout/stderr sent to the launch log. The GPU list and CUDA allocator
# setting keep any value already present in the environment.
CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}" \
PYTORCH_CUDA_ALLOC_CONF="${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}" \
nohup "${cmd[@]}" > "$LOG" 2>&1 < /dev/null &
pid=$!
# Short grace period so an immediate startup failure is noticed below.
sleep 3

if ps -p "$pid" >/dev/null 2>&1; then
  json_status event=launch_detached run_id="$RUN_ID" pid="$pid" log="$LOG"
  echo "launched run_id=${RUN_ID} pid=${pid} log=${LOG}"
  exit 0
fi

# The child is already gone: collect its exit status so the failure record
# says how it died, then show the end of the log for quick diagnosis.
exit_code=0
wait "$pid" || exit_code=$?
json_status event=launch_failed run_id="$RUN_ID" log="$LOG" exit_code="$exit_code"
tail -n 120 "$LOG" || true
exit 1