File size: 4,223 Bytes
e3f9da9
 
 
 
 
 
 
 
 
 
 
 
3506172
e3f9da9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3506172
e3f9da9
 
 
 
 
 
 
3506172
e3f9da9
 
3a3e7ac
 
e3f9da9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3a3e7ac
e3f9da9
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env bash
# Collect a finished task-probe run from a remote GPU host and validate it.
#
# Every setting below may be overridden through the environment:
#   GPU_HOST_SUFFIX  suffix appended to "ANGEL-" to form the default host
#   REMOTE_HOST      ssh host that holds the run
#   REMOTE_ROOT      project checkout on the remote host
#   RUN_ID           name of the run directory under RESULT_ROOT
#   RESULT_ROOT      results directory, relative to both project roots
#   TASKS_CSV        comma-separated task ids expected in the run
#   MODEL_LABEL      label used in the validation report title
set -euo pipefail

# Directory holding this script, and the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# Fill in a default for each setting that is unset or empty.
: "${GPU_HOST_SUFFIX:=A100-80Gx4}"
: "${REMOTE_HOST:=ANGEL-${GPU_HOST_SUFFIX}}"
: "${REMOTE_ROOT:=/mnt/kgc/chaoyue/ropedia-h20-side/ropedia-episode-task-suite}"
: "${RUN_ID:=xperience10m_cosmos3_super_future_task_probes_a100_textonly_v1_20260620}"
: "${RESULT_ROOT:=results/omni_finetune}"
: "${TASKS_CSV:=temporal_order,misalignment_detection,next_subtask_forecast,object_set_forecast}"
: "${MODEL_LABEL:=Cosmos3-Super}"

# Remote and local locations derived from the settings above.
REMOTE_RUN_DIR="${REMOTE_ROOT}/${RESULT_ROOT}/${RUN_ID}"
LOCAL_RUN_DIR="${PROJECT_ROOT}/${RESULT_ROOT}/${RUN_ID}"
LOCAL_LAUNCHER_DIR="${PROJECT_ROOT}/${RESULT_ROOT}/deferred_launchers"

# Candidate launcher log paths on the remote host: three log name variants,
# first directly under the results root, then under deferred_launchers/.
REMOTE_LAUNCHER_LOGS=()
for launcher_subdir in "" "deferred_launchers/"; do
  for log_suffix in launch launcher runner; do
    REMOTE_LAUNCHER_LOGS+=(
      "${REMOTE_ROOT}/${RESULT_ROOT}/${launcher_subdir}${RUN_ID}.${log_suffix}.log"
    )
  done
done
unset launcher_subdir log_suffix

# Split TASKS_CSV on commas, trimming surrounding whitespace and dropping empty
# entries, so the remote checks use the same task ids as the Python validator
# (which strips and filters the same way).
TASKS=()
IFS=',' read -r -a raw_tasks <<< "$TASKS_CSV"
for raw_task in ${raw_tasks[@]+"${raw_tasks[@]}"}; do
  task_id="${raw_task#"${raw_task%%[![:space:]]*}"}"
  task_id="${task_id%"${task_id##*[![:space:]]}"}"
  if [[ -n "$task_id" ]]; then
    TASKS+=("$task_id")
  fi
done
unset raw_tasks raw_task

# Refuse to copy anything until the remote run has a non-empty summary.json and
# a non-empty metrics.json for every requested task. ssh's exit status is
# preserved so callers can still tell a failed test (1) from an ssh error (255).
echo "checking remote run ${REMOTE_HOST}:${REMOTE_RUN_DIR}"
ssh "$REMOTE_HOST" "cd '$REMOTE_ROOT' && test -s '${RESULT_ROOT}/${RUN_ID}/summary.json'" || {
  status=$?
  echo "error: remote check failed for ${REMOTE_HOST}:${REMOTE_RUN_DIR}/summary.json" >&2
  exit "$status"
}
# The ${arr[@]+...} form keeps an empty task list safe under `set -u` on older bash.
for task_id in ${TASKS[@]+"${TASKS[@]}"}; do
  ssh "$REMOTE_HOST" "cd '$REMOTE_ROOT' && test -s '${RESULT_ROOT}/${RUN_ID}/${task_id}/metrics.json'" || {
    status=$?
    echo "error: remote check failed for ${REMOTE_HOST}:${REMOTE_RUN_DIR}/${task_id}/metrics.json" >&2
    exit "$status"
  }
done

mkdir -p "$LOCAL_RUN_DIR" "$LOCAL_LAUNCHER_DIR"
rsync -av "${REMOTE_HOST}:${REMOTE_RUN_DIR}/" "$LOCAL_RUN_DIR/"

# Launcher logs are optional: copy each candidate that exists and is non-empty.
# A failed copy stays non-fatal, but is reported instead of silently discarded.
for remote_launcher_log in "${REMOTE_LAUNCHER_LOGS[@]}"; do
  if ssh "$REMOTE_HOST" "test -s '$remote_launcher_log'" >/dev/null 2>&1; then
    rsync -av "${REMOTE_HOST}:${remote_launcher_log}" "$LOCAL_LAUNCHER_DIR/" \
      || echo "warning: failed to copy launcher log ${REMOTE_HOST}:${remote_launcher_log}" >&2
  fi
done

#######################################
# Validate a collected run and write collection_validation.json into it.
# Arguments:
#   $1 - project root; report paths are shown relative to it when possible
#   $2 - run id
#   $3 - comma-separated task ids to validate
#   $4 - model label used in the report title
#   $5 - run directory to validate (optional; defaults to
#        <project root>/results/omni_finetune/<run id>)
# Outputs:
#   Writes <run dir>/collection_validation.json and prints the same JSON.
# Returns:
#   Non-zero, with a one-line message on stderr, when validation fails.
#######################################
validate_collection() {
  python3 - "$@" <<'PY'
import json
import math
import sys
from pathlib import Path

if len(sys.argv) < 5:
    raise SystemExit("usage: <project_root> <run_id> <tasks_csv> <model_label> [run_dir]")

root = Path(sys.argv[1])
run_id = sys.argv[2]
task_ids = [item.strip() for item in sys.argv[3].split(",") if item.strip()]
model_label = sys.argv[4]
# Use the run directory computed by the shell side so that a RESULT_ROOT
# override is honored; fall back to the default layout when it is not given.
if len(sys.argv) > 5 and sys.argv[5]:
    run_dir = Path(sys.argv[5])
else:
    run_dir = root / "results/omni_finetune" / run_id

metric_key_by_task = {
    "timeline_subtask": "timeline_subtask_macro_f1",
    "object_relevance": "object_relevance_micro_f1",
    "temporal_order": "temporal_order_f1",
    "misalignment_detection": "misalignment_detection_f1",
    "next_subtask_forecast": "next_subtask_forecast_macro_f1",
    "object_set_forecast": "object_set_forecast_micro_f1",
}


def display_path(path):
    """Return path relative to the project root, or unchanged if outside it."""
    try:
        return str(path.relative_to(root))
    except ValueError:
        return str(path)


# An empty task list would otherwise produce a "pass" report with no records.
if not task_ids:
    raise SystemExit("no task ids given; nothing to validate")
unknown = [task_id for task_id in task_ids if task_id not in metric_key_by_task]
if unknown:
    raise SystemExit(
        "unknown task id(s): " + ", ".join(unknown)
        + "; known: " + ", ".join(sorted(metric_key_by_task))
    )
expected = {task_id: metric_key_by_task[task_id] for task_id in task_ids}

summary_path = run_dir / "summary.json"
if not summary_path.exists():
    raise SystemExit(f"missing summary: {summary_path}")
summary = json.loads(summary_path.read_text(encoding="utf-8"))
if summary.get("status") != "pass":
    raise SystemExit(f"run summary is not pass: {summary.get('status')}")

records = []
for task_id, metric_key in expected.items():
    metrics_path = run_dir / task_id / "metrics.json"
    if not metrics_path.exists():
        raise SystemExit(f"missing metrics: {metrics_path}")
    metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
    if metrics.get("status") != "pass":
        raise SystemExit(
            f"{task_id} metrics status is not pass: {metrics.get('status')!r}"
        )
    score = metrics.get(metric_key)
    # bool is a subclass of int, and NaN/Infinity parse as floats; neither is
    # a usable score (non-finite values would also make the report invalid JSON).
    is_number = isinstance(score, (int, float)) and not isinstance(score, bool)
    if not is_number or (isinstance(score, float) and not math.isfinite(score)):
        raise SystemExit(f"invalid {task_id} metric {metric_key}: {score!r}")
    records.append(
        {
            "task_id": task_id,
            "metric_key": metric_key,
            "primary_score": score,
            "num_samples": metrics.get("num_samples"),
            "source": display_path(metrics_path),
        }
    )

validation = {
    "title": f"{model_label} Future/Current Task Probe Collection Validation",
    "status": "pass",
    "run_id": run_id,
    "summary": display_path(summary_path),
    "validated_task_count": len(records),
    "records": records,
}
(run_dir / "collection_validation.json").write_text(
    json.dumps(validation, indent=2, sort_keys=True) + "\n",
    encoding="utf-8",
)
print(json.dumps(validation, indent=2, sort_keys=True))
PY
}

validate_collection "$PROJECT_ROOT" "$RUN_ID" "$TASKS_CSV" "$MODEL_LABEL" "$LOCAL_RUN_DIR"

echo "collected and validated ${LOCAL_RUN_DIR}"