Spaces:
Sleeping
Sleeping
deploy from github
Browse files- web/app.py +52 -0
web/app.py
CHANGED
|
@@ -92,10 +92,18 @@ def health_check():
|
|
| 92 |
def create_run(request: RunRequest):
|
| 93 |
"""Start a new simulation run."""
|
| 94 |
import traceback
|
|
|
|
| 95 |
|
| 96 |
sim = None
|
| 97 |
run_id = None
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
try:
|
| 100 |
sim = Simulation(
|
| 101 |
num_agents=request.num_agents,
|
|
@@ -108,6 +116,9 @@ def create_run(request: RunRequest):
|
|
| 108 |
|
| 109 |
metrics = sim.run()
|
| 110 |
|
|
|
|
|
|
|
|
|
|
| 111 |
# Get agent states
|
| 112 |
agent_data = []
|
| 113 |
for agent in sim.agents:
|
|
@@ -125,11 +136,27 @@ def create_run(request: RunRequest):
|
|
| 125 |
agents=agent_data
|
| 126 |
)
|
| 127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
except Exception as e:
|
| 129 |
error_msg = str(e)
|
| 130 |
print(f"[ERROR] Run failed: {error_msg}")
|
| 131 |
traceback.print_exc()
|
| 132 |
|
|
|
|
|
|
|
|
|
|
| 133 |
# Try to mark run as failed if we have a run_id
|
| 134 |
if run_id and supabase:
|
| 135 |
try:
|
|
@@ -167,6 +194,31 @@ def get_run_detail(run_id: int):
|
|
| 167 |
raise HTTPException(status_code=500, detail=str(e))
|
| 168 |
|
| 169 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
# ==================== Metrics Endpoints ====================
|
| 171 |
|
| 172 |
@app.get("/api/metrics/{run_id}")
|
|
|
|
| 92 |
def create_run(request: RunRequest):
|
| 93 |
"""Start a new simulation run."""
|
| 94 |
import traceback
|
| 95 |
+
import signal
|
| 96 |
|
| 97 |
sim = None
|
| 98 |
run_id = None
|
| 99 |
|
| 100 |
+
# Set timeout to prevent hangs (30 minutes)
|
| 101 |
+
def timeout_handler(signum, frame):
|
| 102 |
+
raise TimeoutError("Run timed out after 30 minutes")
|
| 103 |
+
|
| 104 |
+
signal.signal(signal.SIGALRM, timeout_handler)
|
| 105 |
+
signal.alarm(1800) # 30 minutes
|
| 106 |
+
|
| 107 |
try:
|
| 108 |
sim = Simulation(
|
| 109 |
num_agents=request.num_agents,
|
|
|
|
| 116 |
|
| 117 |
metrics = sim.run()
|
| 118 |
|
| 119 |
+
# Cancel timeout on success
|
| 120 |
+
signal.alarm(0)
|
| 121 |
+
|
| 122 |
# Get agent states
|
| 123 |
agent_data = []
|
| 124 |
for agent in sim.agents:
|
|
|
|
| 136 |
agents=agent_data
|
| 137 |
)
|
| 138 |
|
| 139 |
+
except TimeoutError as e:
|
| 140 |
+
error_msg = str(e)
|
| 141 |
+
print(f"[ERROR] Run timed out: {error_msg}")
|
| 142 |
+
|
| 143 |
+
if run_id and supabase:
|
| 144 |
+
try:
|
| 145 |
+
supabase.update_run_status(run_id, "timeout")
|
| 146 |
+
print(f"[ERROR] Marked run {run_id} as timeout")
|
| 147 |
+
except:
|
| 148 |
+
pass
|
| 149 |
+
|
| 150 |
+
raise HTTPException(status_code=504, detail=f"Run timed out: {error_msg}")
|
| 151 |
+
|
| 152 |
except Exception as e:
|
| 153 |
error_msg = str(e)
|
| 154 |
print(f"[ERROR] Run failed: {error_msg}")
|
| 155 |
traceback.print_exc()
|
| 156 |
|
| 157 |
+
# Cancel timeout on error
|
| 158 |
+
signal.alarm(0)
|
| 159 |
+
|
| 160 |
# Try to mark run as failed if we have a run_id
|
| 161 |
if run_id and supabase:
|
| 162 |
try:
|
|
|
|
| 194 |
raise HTTPException(status_code=500, detail=str(e))
|
| 195 |
|
| 196 |
|
| 197 |
+
@app.post("/api/admin/clear-stuck-runs")
def clear_stuck_runs():
    """Mark every run whose status is still 'running' as failed.

    Reads all runs through ``supabase.get_all_runs()``, selects the ones
    whose ``status`` field equals ``"running"``, and rewrites each of them
    in the ``runs`` table with status ``"failed"`` and an ``end_time`` taken
    from the current local clock at the moment of the update.

    Returns:
        A dict with ``cleared`` (number of runs updated) and a
        human-readable ``message``.

    Raises:
        HTTPException: 503 when the Supabase client is not configured;
            500 (with the underlying error text as detail) when listing
            or updating runs fails.
    """
    if not supabase:
        raise HTTPException(status_code=503, detail="Supabase not configured")

    try:
        from datetime import datetime

        # Collect the candidates up front so a malformed row aborts the
        # request before any update has been issued.
        every_run = supabase.get_all_runs()
        still_running = [row for row in every_run if row.get("status") == "running"]

        for row in still_running:
            runs_table = supabase.client.table("runs")
            # Timestamp is taken per row, right before its own update.
            patch = {
                "status": "failed",
                "end_time": datetime.now().isoformat(),
            }
            runs_table.update(patch).eq("id", row["id"]).execute()

        # Reaching this point means every selected row was updated.
        cleared_count = len(still_running)
        return {
            "cleared": cleared_count,
            "message": f"Marked {cleared_count} stuck runs as failed",
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
| 220 |
+
|
| 221 |
+
|
| 222 |
# ==================== Metrics Endpoints ====================
|
| 223 |
|
| 224 |
@app.get("/api/metrics/{run_id}")
|