Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
File size: 3,488 Bytes
627e5d7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | #!/usr/bin/env bash
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration. Each NAME="${NAME:-default}" assignment keeps a non-empty
# value supplied by the environment and otherwise falls back to the default.
# ---------------------------------------------------------------------------

# Workspace root and the project checkout; relative paths below are resolved
# against PROJECT_DIR because the script changes into it.
ROPEDIA_WORKSPACE="${ROPEDIA_WORKSPACE:-$HOME/Ropedia}"
PROJECT_DIR="${PROJECT_DIR:-$ROPEDIA_WORKSPACE/ropedia-episode-task-suite}"

# Run identifiers: the base run owns the results directory and status log,
# the smoke run is the one whose exit event is awaited, the full run is the
# one this script launches.
BASE_RUN_ID="${BASE_RUN_ID:-xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu}"
SMOKE_RUN_ID="${SMOKE_RUN_ID:-xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_smoke_v3}"
SMOKE_EXIT_EVENT="${SMOKE_EXIT_EVENT:-train_exit_fsdp_smoke_v3_no_val}"
FULL_RUN_ID="${FULL_RUN_ID:-xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_full_train_valmon}"

# Model location and backbone config handed to the training script.
MODEL_ID="${MODEL_ID:-$ROPEDIA_WORKSPACE/modelscope_models/Qwen__Qwen3-Omni-30B-A3B-Instruct}"
BACKBONE_CONFIG="${BACKBONE_CONFIG:-configs/omni_backbones/qwen3_omni_lora.json}"

# Values forwarded verbatim to the matching --fsdp_* launcher options.
FSDP_CPU_RAM_EFFICIENT_LOADING="${FSDP_CPU_RAM_EFFICIENT_LOADING:-true}"
FSDP_SYNC_MODULE_STATES="${FSDP_SYNC_MODULE_STATES:-true}"
FSDP_ACTIVATION_CHECKPOINTING="${FSDP_ACTIVATION_CHECKPOINTING:-true}"

# With `set -e` a failing cd aborts the script here.
cd "$PROJECT_DIR"

# Derived paths (not overridable, except FULL_LOG).
RUN_DIR="results/omni_finetune/${BASE_RUN_ID}"
STATUS="${RUN_DIR}/status.jsonl"
DATASET_JSONL="results/omni_finetune/${BASE_RUN_ID}_dataset/dataset.jsonl"
FULL_LOG="${FULL_LOG:-${RUN_DIR}/train_${FULL_RUN_ID}.log}"

# FULL_LOG may be pointed outside RUN_DIR through the environment, so create
# its parent directory as well; otherwise the training output redirect fails
# before anything is logged.
mkdir -p -- "$RUN_DIR" "$(dirname -- "$FULL_LOG")"
#######################################
# Report whether a full-training process is currently running.
# Globals:   FULL_RUN_ID (read)
# Outputs:   nothing (pgrep output and errors are discarded)
# Returns:   pgrep's status: 0 when some process command line matches the
#            training script path followed by FULL_RUN_ID, non-zero otherwise
#######################################
is_full_training_active() {
  local cmdline_regex="scripts/omni/train_qwen3_omni_lora.py.*${FULL_RUN_ID}"
  pgrep -af "$cmdline_regex" > /dev/null 2>&1
}
#######################################
# Report whether the smoke-test training process is currently running.
# Globals:   SMOKE_RUN_ID (read)
# Outputs:   nothing (pgrep output and errors are discarded)
# Returns:   pgrep's status: 0 when some process command line matches the
#            training script path followed by SMOKE_RUN_ID, non-zero otherwise
#######################################
is_smoke_active() {
  local cmdline_regex="scripts/omni/train_qwen3_omni_lora.py.*${SMOKE_RUN_ID}"
  pgrep -af "$cmdline_regex" > /dev/null 2>&1
}
# Nothing to do when a full run is already in flight: report it and leave
# with success so a repeated invocation is harmless.
if is_full_training_active; then
  printf '%s\n' "full run already active"
  exit 0
fi
# ---------------------------------------------------------------------------
# Gate on the smoke run: poll the status log until the smoke exit event shows
# up, and only fall through when it recorded returncode 0.
#
# Reads:  STATUS, SMOKE_EXIT_EVENT, SMOKE_RUN_ID, is_smoke_active
# Env:    SMOKE_EXIT_GRACE_SECONDS (default 5) - how long to wait for the exit
#         event after the smoke process has disappeared
# Exits:  1 when the smoke exit event carries a non-zero returncode
#         2 when the smoke process is gone and no exit event was recorded
# ---------------------------------------------------------------------------
SMOKE_EXIT_GRACE_SECONDS="${SMOKE_EXIT_GRACE_SECONDS:-5}"

#######################################
# Test whether STATUS holds a SMOKE_EXIT_EVENT line with a given returncode.
# Globals:   STATUS, SMOKE_EXIT_EVENT (read)
# Arguments: $1 - basic regular expression for the returncode value
# Returns:   0 when such a line exists, non-zero otherwise (including when
#            the status log has not been created yet)
#######################################
smoke_exit_recorded() {
  local rc_pattern=$1
  # The status log may not exist before the first event is appended.
  [[ -f "$STATUS" ]] || return 1
  # Whitespace around the colon is tolerated so both compact and
  # pretty-printed JSON lines are recognised.
  grep -q -- "${SMOKE_EXIT_EVENT}.*returncode\"[[:space:]]*:[[:space:]]*${rc_pattern}" "$STATUS"
}

smoke_gone_polls=0
while true; do
  # Sample liveness BEFORE reading the log: if the process was still alive
  # here, an event missing from the log below is simply not written yet.
  if is_smoke_active; then
    smoke_alive=1
  else
    smoke_alive=0
  fi

  if smoke_exit_recorded '0'; then
    break
  fi

  # Optional minus sign: a process killed by a signal can be recorded with a
  # negative returncode.
  if smoke_exit_recorded '-\{0,1\}[1-9]'; then
    echo "{\"event\":\"train_full_blocked_smoke_failed\",\"time\":$(date +%s),\"smoke_run_id\":\"${SMOKE_RUN_ID}\"}" >> "$STATUS"
    exit 1
  fi

  if (( smoke_alive )); then
    smoke_gone_polls=0
    sleep 30
    continue
  fi

  # The process is gone but no exit event is visible. Whoever launched it may
  # still be about to append the event, so re-read the log once after a short
  # grace period before declaring the exit event missing.
  if (( smoke_gone_polls >= 1 )); then
    echo "{\"event\":\"train_full_blocked_smoke_missing_exit\",\"time\":$(date +%s),\"smoke_run_id\":\"${SMOKE_RUN_ID}\"}" >> "$STATUS"
    exit 2
  fi
  smoke_gone_polls=$(( smoke_gone_polls + 1 ))
  sleep "$SMOKE_EXIT_GRACE_SECONDS"
done
# The wait above can take a long time; check again that nobody else started
# the full run in the meantime before launching it.
if is_full_training_active; then
  printf '%s\n' "full run already active after smoke"
  exit 0
fi
# Record the launch in the status log before starting the trainer.
printf '%s\n' "{\"event\":\"train_start_fsdp_full_train_valmon\",\"time\":$(date +%s),\"run_id\":\"${FULL_RUN_ID}\",\"train_split\":\"train\",\"val_split\":\"val\",\"test_split_reserved\":\"test\",\"num_processes\":8}" >> "$STATUS"

# Options consumed by the accelerate launcher itself.
launcher_opts=(
  --num_processes 8
  --mixed_precision bf16
  --use_fsdp
  --fsdp_sharding_strategy FULL_SHARD
  --fsdp_auto_wrap_policy TRANSFORMER_BASED_WRAP
  --fsdp_transformer_layer_cls_to_wrap Qwen3OmniMoeThinkerTextDecoderLayer
  --fsdp_use_orig_params true
  --fsdp_cpu_ram_efficient_loading "$FSDP_CPU_RAM_EFFICIENT_LOADING"
  --fsdp_sync_module_states "$FSDP_SYNC_MODULE_STATES"
  --fsdp_activation_checkpointing "$FSDP_ACTIVATION_CHECKPOINTING"
)

# Options passed through to the training script.
trainer_opts=(
  --dataset-jsonl "$DATASET_JSONL"
  --model-id "$MODEL_ID"
  --backbone-config "$BACKBONE_CONFIG"
  --run-id "$FULL_RUN_ID"
  --train-split train
  --val-split val
  --epochs 1
  --batch-size 1
  --gradient-accumulation-steps 8
  --max-train-samples 0
  --max-val-samples 512
  --local-files-only
  --gradient-checkpointing
  --progress-every 10
)

# Full argv: interpreter + launcher module, launcher options, script, script options.
train_cmd=(
  .venv/bin/python -m accelerate.commands.launch
  "${launcher_opts[@]}"
  scripts/omni/train_qwen3_omni_lora.py
  "${trainer_opts[@]}"
)

# Run the trainer with stdout and stderr captured in FULL_LOG. The `|| rc=$?`
# keeps a failing run from aborting the script under `set -e`, so the exit
# event below is always attempted.
rc=0
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
  "${train_cmd[@]}" > "$FULL_LOG" 2>&1 || rc=$?

# Append the exit event, then propagate the trainer's status to the caller.
printf '%s\n' "{\"event\":\"train_exit_fsdp_full_train_valmon\",\"time\":$(date +%s),\"run_id\":\"${FULL_RUN_ID}\",\"returncode\":${rc}}" >> "$STATUS"
exit "$rc"
|