#!/usr/bin/env python3
"""
Startup script: runs the sandbox server (for health checks)
and launches FSD training in the background.
"""

import os
import subprocess
import sys
import threading
import time

# Where the model repo is cloned from and checked out to.
MODEL_REPO_URL = "https://huggingface.co/Reality123b/FSD-Level5-CoT"
REPO_DIR = "/app/fsd_repo"

# Directory the sandbox server is launched from.
APP_DIR = "/app"

# Seconds the training thread waits before doing any work, so the server
# (started right after the thread) has time to come up first.
SERVER_STARTUP_DELAY_S = 10

# Training entry point (relative to REPO_DIR) and its arguments.
# Small subset so the run is CPU-feasible.
TRAINING_SCRIPT = "train_sadc_e2e.py"
TRAINING_ARGS = (
    "--train_samples", "200",
    "--val_samples", "50",
    "--epochs", "2",
    "--batch_size", "2",
    "--grad_accum", "4",
    "--lr", "3e-4",
    "--num_workers", "0",
    "--log_every", "5",
    "--eval_every", "50",
    "--push_to_hub",
    "--hub_model_id", "Reality123b/FSD-Level5-CoT",
    "--save_dir", "/app/checkpoints",
    "--data_dir", "/app/sadc_subset",
)


def _sync_repo():
    """Clone the model repo, or pull the latest commits if it already exists.

    A failed ``git pull`` is reported but tolerated: the existing checkout is
    still usable. A failed ``git clone`` is not, because there would be
    nothing to train from.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` exits non-zero.
        OSError: if ``git`` (or the working directory) cannot be used.
    """
    if not os.path.exists(REPO_DIR):
        print("[TRAINING] Cloning model repo...", flush=True)
        subprocess.run(["git", "clone", MODEL_REPO_URL, REPO_DIR], check=True)
        print("[TRAINING] Repo cloned.", flush=True)
        return

    print("[TRAINING] Repo already exists, pulling latest...", flush=True)
    pull = subprocess.run(["git", "pull"], cwd=REPO_DIR)
    if pull.returncode != 0:
        print(
            f"[TRAINING] WARNING: git pull failed with exit code "
            f"{pull.returncode}; continuing with existing checkout.",
            flush=True,
        )


def run_training():
    """Clone FSD repo and run training.

    Intended to run in a background thread. Every failure is logged with a
    ``[TRAINING]`` prefix and ends the function instead of raising, so the
    outcome is visible in the logs rather than as a bare thread traceback.
    """
    time.sleep(SERVER_STARTUP_DELAY_S)  # Wait for server to start

    print("\n" + "=" * 60, flush=True)
    print("  [TRAINING] Starting FSD-Level5-CoT training on SADC data", flush=True)
    print("=" * 60 + "\n", flush=True)

    try:
        _sync_repo()
    except (subprocess.CalledProcessError, OSError) as err:
        print(
            f"[TRAINING] Could not prepare model repo, training aborted: {err}",
            flush=True,
        )
        return

    # Run training with small subset (CPU-feasible)
    cmd = [sys.executable, TRAINING_SCRIPT, *TRAINING_ARGS]
    print(f"[TRAINING] Command: {' '.join(cmd)}", flush=True)

    try:
        result = subprocess.run(cmd, cwd=REPO_DIR)
    except OSError as err:
        print(f"[TRAINING] Could not launch training: {err}", flush=True)
        return

    print(
        f"\n[TRAINING] Training finished with exit code: {result.returncode}",
        flush=True,
    )


def run_server():
    """Run the sandbox server and block until it exits.

    Returns:
        The server process's exit code.
    """
    print("[SERVER] Starting sandbox server on port 7860...", flush=True)
    completed = subprocess.run([sys.executable, "sandbox_server.py"], cwd=APP_DIR)
    return completed.returncode


def main():
    """Start training in the background, then run the server in the foreground.

    Returns:
        The server's exit code, to be used as this script's exit status.
    """
    # Daemon thread: training must not keep the process alive once the
    # server (the foreground process) has exited.
    training_thread = threading.Thread(target=run_training, daemon=True)
    training_thread.start()

    # Server in foreground (keeps Space alive for health checks)
    return run_server()


if __name__ == "__main__":
    sys.exit(main())