File size: 2,616 Bytes
f590d7e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 | #!/usr/bin/env python3
"""Print a compact progress snapshot for an omni fine-tuning run."""
from __future__ import annotations
import argparse
import json
import subprocess
from pathlib import Path
def parse_args() -> argparse.Namespace:
    """Parse the monitor's command-line options.

    Recognised options:
        --workspace: Workspace root as a ``Path``. Defaults to the directory
            two levels above the directory that contains this script.
        --run-id: Identifier of the run to inspect.
        --last: How many of the most recent records to show (int, default 5).

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(description="Monitor an omni fine-tuning run.")
    script_path = Path(__file__).resolve()
    cli.add_argument("--workspace", type=Path, default=script_path.parents[2])
    cli.add_argument("--run-id", default="xperience10m_qwen3_omni_32ep")
    cli.add_argument("--last", type=int, default=5)
    return cli.parse_args()
def read_jsonl(path: Path, limit: int) -> list[dict]:
    """Return up to the last ``limit`` parseable JSON records of a JSONL file.

    Blank lines and lines that are not valid JSON (for example a partially
    written last line) are skipped rather than reported.

    Args:
        path: JSONL file to read. A missing file yields an empty list.
        limit: Maximum number of trailing records to return. Zero or a
            negative value yields an empty list.

    Returns:
        The most recent records in file order (oldest first).
    """
    # Function-scope import keeps this block self-contained.
    from collections import deque

    # ``rows[-0:]`` is the whole list, so a zero limit must be handled
    # explicitly; negative limits are treated the same way.
    if limit <= 0:
        return []

    # A bounded deque keeps only the tail, so memory use is O(limit) instead
    # of growing with the size of the log file.
    tail: deque = deque(maxlen=limit)
    try:
        with path.open("r", encoding="utf-8") as fp:
            for raw in fp:
                text = raw.strip()
                if not text:
                    continue
                try:
                    tail.append(json.loads(text))
                except json.JSONDecodeError:
                    # Tolerate corrupt or half-written lines.
                    continue
    except FileNotFoundError:
        # Opening directly (instead of checking exists() first) avoids a race
        # with the file appearing or disappearing between check and open.
        return []
    return list(tail)
def nvidia_smi() -> str:
    """Return per-GPU index, memory used/total and utilisation as CSV text.

    Runs ``nvidia-smi`` with a header-less, unit-less CSV query. This is a
    best-effort probe: it never raises for an unusable ``nvidia-smi``.

    Returns:
        The stripped command output on success. Otherwise a one-line message
        starting with ``"nvidia-smi unavailable: "`` that describes the
        failure (and includes whatever the tool printed, if anything).
    """
    cmd = [
        "nvidia-smi",
        "--query-gpu=index,memory.used,memory.total,utilization.gpu",
        "--format=csv,noheader,nounits",
    ]
    try:
        # stderr is folded into stdout so diagnostics are captured too; the
        # timeout stops a wedged driver from hanging the whole snapshot.
        return subprocess.check_output(
            cmd, text=True, stderr=subprocess.STDOUT, timeout=30
        ).strip()
    except subprocess.CalledProcessError as exc:
        # str(exc) only carries the exit status; append the captured output
        # so the tool's own error text is not lost.
        detail = (exc.output or "").strip()
        suffix = f" ({detail})" if detail else ""
        return f"nvidia-smi unavailable: {exc}{suffix}"
    except (OSError, subprocess.TimeoutExpired) as exc:
        # OSError covers a missing binary as well as e.g. a non-executable one.
        return f"nvidia-smi unavailable: {exc}"
def main() -> int:
    """Print a progress snapshot for the selected run.

    Prints, in order: the run id and pipeline log path, the GPU status from
    ``nvidia_smi()``, the last ``--last`` records of the pipeline status and
    training progress JSONL files, and (when an eval metrics file is present)
    a fixed selection of its metrics.

    All paths are resolved under ``<workspace>/results/omni_finetune``.

    Returns:
        Always 0. A missing, unreadable or malformed metrics file is reported
        (or silently skipped when missing) rather than raised.
    """
    args = parse_args()
    root = args.workspace / "results" / "omni_finetune"
    run_dir = root / args.run_id
    log_path = run_dir / "logs" / "pipeline.log"
    metrics = root / f"{args.run_id}_eval" / "metrics.json"
    # (heading, JSONL file) pairs, printed in this order.
    sections = (
        ("\nRecent pipeline phases:", run_dir / "pipeline_status.jsonl"),
        ("\nRecent training progress:", root / f"{args.run_id}_lora" / "progress.jsonl"),
    )

    print(f"Run: {args.run_id}")
    print(f"Pipeline log: {log_path}")
    print("\nGPU status: index, used MiB, total MiB, util %")
    print(nvidia_smi())

    for heading, jsonl_path in sections:
        print(heading)
        for row in read_jsonl(jsonl_path, args.last):
            print(json.dumps(row, ensure_ascii=False))

    try:
        payload = json.loads(metrics.read_text(encoding="utf-8"))
    except FileNotFoundError:
        # No metrics file yet: nothing to show.
        return 0
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError, e.g.
        # a metrics file that is truncated or still being written.
        print(f"\nEval metrics unreadable: {exc}")
        return 0
    if not isinstance(payload, dict):
        print(
            "\nEval metrics unreadable: expected a JSON object, "
            f"got {type(payload).__name__}"
        )
        return 0

    print("\nEval metrics:")
    keys = ["accuracy", "action_macro_f1", "json_validity_rate", "subtask_accuracy", "object_micro_f1"]
    # Metrics absent from the file are shown as null.
    print(json.dumps({key: payload.get(key) for key in keys}, indent=2))
    return 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|