nice-bill committed on
Commit
96fe5df
·
verified ·
1 Parent(s): b21c4bb

deploy from github

Browse files
Files changed (1) hide show
  1. web/app.py +52 -0
web/app.py CHANGED
@@ -92,10 +92,18 @@ def health_check():
92
  def create_run(request: RunRequest):
93
  """Start a new simulation run."""
94
  import traceback
 
95
 
96
  sim = None
97
  run_id = None
98
 
 
 
 
 
 
 
 
99
  try:
100
  sim = Simulation(
101
  num_agents=request.num_agents,
@@ -108,6 +116,9 @@ def create_run(request: RunRequest):
108
 
109
  metrics = sim.run()
110
 
 
 
 
111
  # Get agent states
112
  agent_data = []
113
  for agent in sim.agents:
@@ -125,11 +136,27 @@ def create_run(request: RunRequest):
125
  agents=agent_data
126
  )
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  except Exception as e:
129
  error_msg = str(e)
130
  print(f"[ERROR] Run failed: {error_msg}")
131
  traceback.print_exc()
132
 
 
 
 
133
  # Try to mark run as failed if we have a run_id
134
  if run_id and supabase:
135
  try:
@@ -167,6 +194,31 @@ def get_run_detail(run_id: int):
167
  raise HTTPException(status_code=500, detail=str(e))
168
 
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  # ==================== Metrics Endpoints ====================
171
 
172
  @app.get("/api/metrics/{run_id}")
 
92
  def create_run(request: RunRequest):
93
  """Start a new simulation run."""
94
  import traceback
95
+ import signal
96
 
97
  sim = None
98
  run_id = None
99
 
100
+ # Set timeout to prevent hangs (30 minutes)
101
+ def timeout_handler(signum, frame):
102
+ raise TimeoutError("Run timed out after 30 minutes")
103
+
104
+ signal.signal(signal.SIGALRM, timeout_handler)
105
+ signal.alarm(1800) # 30 minutes
106
+
107
  try:
108
  sim = Simulation(
109
  num_agents=request.num_agents,
 
116
 
117
  metrics = sim.run()
118
 
119
+ # Cancel timeout on success
120
+ signal.alarm(0)
121
+
122
  # Get agent states
123
  agent_data = []
124
  for agent in sim.agents:
 
136
  agents=agent_data
137
  )
138
 
139
+ except TimeoutError as e:
140
+ error_msg = str(e)
141
+ print(f"[ERROR] Run timed out: {error_msg}")
142
+
143
+ if run_id and supabase:
144
+ try:
145
+ supabase.update_run_status(run_id, "timeout")
146
+ print(f"[ERROR] Marked run {run_id} as timeout")
147
+ except:
148
+ pass
149
+
150
+ raise HTTPException(status_code=504, detail=f"Run timed out: {error_msg}")
151
+
152
  except Exception as e:
153
  error_msg = str(e)
154
  print(f"[ERROR] Run failed: {error_msg}")
155
  traceback.print_exc()
156
 
157
+ # Cancel timeout on error
158
+ signal.alarm(0)
159
+
160
  # Try to mark run as failed if we have a run_id
161
  if run_id and supabase:
162
  try:
 
194
  raise HTTPException(status_code=500, detail=str(e))
195
 
196
 
197
@app.post("/api/admin/clear-stuck-runs")
def clear_stuck_runs():
    """Mark every run whose status is still 'running' as failed.

    Fetches all runs via ``supabase.get_all_runs()``, keeps those whose
    ``status`` equals ``"running"``, and updates each matching row of the
    ``runs`` table to ``status="failed"`` with an ``end_time`` timestamp.

    Returns:
        A dict with ``cleared`` (the number of runs updated) and a
        human-readable ``message``.

    Raises:
        HTTPException: 503 when Supabase is not configured; 500 when
            fetching or updating the runs fails.
    """
    if not supabase:
        raise HTTPException(status_code=503, detail="Supabase not configured")

    from datetime import datetime, timezone

    try:
        # Treat a missing/None result as "no runs" instead of failing on it.
        all_runs = supabase.get_all_runs() or []
        stuck_runs = [run for run in all_runs if run.get("status") == "running"]

        # One timezone-aware UTC stamp for the whole batch, so the stored
        # value does not depend on the host's local time zone and every run
        # cleared by this call carries the same end_time.
        end_time = datetime.now(timezone.utc).isoformat()

        updated = 0
        for run in stuck_runs:
            supabase.client.table("runs").update({
                "status": "failed",
                "end_time": end_time,
            }).eq("id", run["id"]).execute()
            updated += 1

        return {"cleared": updated, "message": f"Marked {updated} stuck runs as failed"}
    except Exception as e:
        # Keep the original exception as the cause for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
220
+
221
+
222
  # ==================== Metrics Endpoints ====================
223
 
224
  @app.get("/api/metrics/{run_id}")