sachinchandrankallar committed on
Commit
733c0c5
·
1 Parent(s): cdea66b

1200 seconds from 600 secs timeout

Browse files
Dockerfile.hf-spaces-minimal CHANGED
@@ -48,5 +48,5 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
48
  CMD curl -f http://localhost:7860/health || exit 1
49
 
50
  # Start application with single worker for minimal memory footprint
51
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "600"]
52
 
 
48
  CMD curl -f http://localhost:7860/health || exit 1
49
 
50
  # Start application with single worker for minimal memory footprint
51
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1", "--timeout-keep-alive", "1200"]
52
 
docs/archive/COMPREHENSIVE_STREAMING_FIX.md CHANGED
@@ -31,7 +31,7 @@ is_gguf_mode = (data.get('generation_mode') == 'gguf' or
31
  ### **3. Extended Timeout Configuration**
32
  ```python
33
  # Extended timeout for GGUF operations
34
- max_wait_time = 600 # 10 minutes for GGUF operations
35
  heartbeat_interval = 5 # Every 5 seconds
36
  ```
37
 
@@ -54,7 +54,7 @@ heartbeat_interval = 5 # Every 5 seconds
54
  ### **5. Enhanced SSE Generator**
55
  ```python
56
  def sse_generator_extended(job_id):
57
- max_wait_time = 600 # 10 minutes for GGUF operations
58
  heartbeat_interval = 5 # Every 5 seconds
59
  # Enhanced logging and progress updates
60
  ```
 
31
  ### **3. Extended Timeout Configuration**
32
  ```python
33
  # Extended timeout for GGUF operations
34
+ max_wait_time = 1200 # 20 minutes for GGUF operations
35
  heartbeat_interval = 5 # Every 5 seconds
36
  ```
37
 
 
54
  ### **5. Enhanced SSE Generator**
55
  ```python
56
  def sse_generator_extended(job_id):
57
+ max_wait_time = 1200 # 20 minutes for GGUF operations
58
  heartbeat_interval = 5 # Every 5 seconds
59
  # Enhanced logging and progress updates
60
  ```
docs/archive/patient_summary_models_review.md CHANGED
@@ -160,7 +160,7 @@ elif model_type == "causal-openvino":
160
 
161
  #### Weaknesses
162
  - ⚠️ **Slight quality loss**: Q4 quantization may reduce quality slightly
163
- - ⚠️ **Longer timeouts**: Extended timeout needed (600s on HF Spaces)
164
  - ⚠️ **File path parsing**: Requires special handling for filename extraction
165
 
166
  #### Implementation Details
@@ -428,7 +428,7 @@ Based on HF Spaces configuration (`hf_spaces_config.py`):
428
  - ✅ **RAM**: ~3-4GB during inference
429
  - ✅ **Speed**: Very good on T4 (GGUF optimized)
430
  - ✅ **HF Spaces Config**: Primary GGUF model (line 33)
431
- - ✅ **Extended Timeout**: 600s configured for HF Spaces (routes_fastapi.py line 1075)
432
  - ✅ **Quantization**: Q4 reduces memory by ~75%
433
 
434
  #### Performance Estimates
@@ -449,7 +449,7 @@ Based on HF Spaces configuration (`hf_spaces_config.py`):
449
  #### Recommendations
450
  - **Best Choice** for cost-conscious deployment
451
  - Use when expecting high concurrent load
452
- - Extended timeout already configured (600s)
453
  - Cache-friendly for repeated requests
454
 
455
  ---
@@ -551,7 +551,7 @@ GGUF (Phi-3-Q4): ~2.0GB GPU (16% of usable)
551
 
552
  Based on `routes_fastapi.py`:
553
  - **Standard models**: 120-180s timeout
554
- - **GGUF models**: 600s extended timeout (line 1075)
555
  - **HF Spaces detection**: Automatic (line 1073-1074)
556
 
557
  ### Optimization Strategies for T4
@@ -619,7 +619,7 @@ Fallback Model: microsoft/Phi-3-mini-4k-instruct-gguf
619
  Emergency Fallback: google/flan-t5-large
620
  Max Concurrent: 5-6 requests (BART), 8-10 (GGUF)
621
  Memory Limit: 80% (12.8GB GPU, 24GB RAM)
622
- Timeout: 180s (standard), 600s (GGUF)
623
  ```
624
 
625
  ### 📊 **Expected Performance**
 
160
 
161
  #### Weaknesses
162
  - ⚠️ **Slight quality loss**: Q4 quantization may reduce quality slightly
163
+ - ⚠️ **Longer timeouts**: Extended timeout needed (1200s on HF Spaces)
164
  - ⚠️ **File path parsing**: Requires special handling for filename extraction
165
 
166
  #### Implementation Details
 
428
  - ✅ **RAM**: ~3-4GB during inference
429
  - ✅ **Speed**: Very good on T4 (GGUF optimized)
430
  - ✅ **HF Spaces Config**: Primary GGUF model (line 33)
431
+ - ✅ **Extended Timeout**: 1200s configured for HF Spaces (routes_fastapi.py line 1075)
432
  - ✅ **Quantization**: Q4 reduces memory by ~75%
433
 
434
  #### Performance Estimates
 
449
  #### Recommendations
450
  - **Best Choice** for cost-conscious deployment
451
  - Use when expecting high concurrent load
452
+ - Extended timeout already configured (1200s)
453
  - Cache-friendly for repeated requests
454
 
455
  ---
 
551
 
552
  Based on `routes_fastapi.py`:
553
  - **Standard models**: 120-180s timeout
554
+ - **GGUF models**: 1200s extended timeout (line 1075)
555
  - **HF Spaces detection**: Automatic (line 1073-1074)
556
 
557
  ### Optimization Strategies for T4
 
619
  Emergency Fallback: google/flan-t5-large
620
  Max Concurrent: 5-6 requests (BART), 8-10 (GGUF)
621
  Memory Limit: 80% (12.8GB GPU, 24GB RAM)
622
+ Timeout: 180s (standard), 1200s (GGUF)
623
  ```
624
 
625
  ### 📊 **Expected Performance**
docs/hf-spaces/FILES_CREATED.md CHANGED
@@ -125,7 +125,7 @@ python verify_cache.py
125
 
126
  ### 7. `MODEL_CACHING_SUMMARY.md` ⭐ START HERE
127
  **Purpose**: Overview and answer to your question
128
- **Size**: ~600 lines
129
  **Contents**:
130
  - Direct answer to your question
131
  - Performance comparison
@@ -183,7 +183,7 @@ python verify_cache.py
183
 
184
  ### 11. `README_HF_SPACES.md`
185
  **Purpose**: Main README for HF Spaces deployment
186
- **Size**: ~600 lines
187
  **Contents**:
188
  - Quick start (3 steps)
189
  - File structure
@@ -231,11 +231,11 @@ python verify_cache.py
231
  | `entrypoint.sh` | Script | ⭐ YES | 40 lines | Startup verification |
232
  | `verify_cache.py` | Tool | Recommended | 200 lines | Verify cache |
233
  | `health_endpoints.py` | Code | Recommended | +120 lines | Health endpoints |
234
- | `MODEL_CACHING_SUMMARY.md` | Docs | ⭐ START HERE | 600 lines | Overview |
235
  | `HF_SPACES_QUICKSTART.md` | Docs | Recommended | 400 lines | Quick start |
236
  | `HF_SPACES_DEPLOYMENT.md` | Docs | Reference | 800 lines | Full guide |
237
  | `DEPLOYMENT_CHECKLIST.md` | Docs | Helpful | 400 lines | Checklist |
238
- | `README_HF_SPACES.md` | Docs | Reference | 600 lines | Main README |
239
  | `COMPARISON_BEFORE_AFTER.md` | Docs | Helpful | 500 lines | Comparison |
240
  | `FILES_CREATED.md` | Docs | Reference | This file | Index |
241
 
 
125
 
126
  ### 7. `MODEL_CACHING_SUMMARY.md` ⭐ START HERE
127
  **Purpose**: Overview and answer to your question
128
+ **Size**: ~600 lines
129
  **Contents**:
130
  - Direct answer to your question
131
  - Performance comparison
 
183
 
184
  ### 11. `README_HF_SPACES.md`
185
  **Purpose**: Main README for HF Spaces deployment
186
+ **Size**: ~600 lines
187
  **Contents**:
188
  - Quick start (3 steps)
189
  - File structure
 
231
  | `entrypoint.sh` | Script | ⭐ YES | 40 lines | Startup verification |
232
  | `verify_cache.py` | Tool | Recommended | 200 lines | Verify cache |
233
  | `health_endpoints.py` | Code | Recommended | +120 lines | Health endpoints |
234
+ | `MODEL_CACHING_SUMMARY.md` | Docs | ⭐ START HERE | 600 lines | Overview |
235
  | `HF_SPACES_QUICKSTART.md` | Docs | Recommended | 400 lines | Quick start |
236
  | `HF_SPACES_DEPLOYMENT.md` | Docs | Reference | 800 lines | Full guide |
237
  | `DEPLOYMENT_CHECKLIST.md` | Docs | Helpful | 400 lines | Checklist |
238
+ | `README_HF_SPACES.md` | Docs | Reference | 600 lines | Main README |
239
  | `COMPARISON_BEFORE_AFTER.md` | Docs | Helpful | 500 lines | Comparison |
240
  | `FILES_CREATED.md` | Docs | Reference | This file | Index |
241
 
docs/hf-spaces/INDEX.md CHANGED
@@ -122,8 +122,8 @@ All documentation for deploying to Hugging Face Spaces with pre-cached models.
122
  | DEPLOYMENT_CHECKLIST.md | ~400 | Use while deploying | ⭐⭐ |
123
  | MODEL_UPDATE_SUMMARY.md | ~500 | 10 min | ⭐⭐ |
124
  | HF_SPACES_DEPLOYMENT.md | ~800 | 30 min | ⭐ |
125
- | MODEL_CACHING_SUMMARY.md | ~600 | 15 min | ⭐ |
126
- | README_HF_SPACES.md | ~600 | Reference | ⭐ |
127
  | COMPARISON_BEFORE_AFTER.md | ~500 | Reference | Optional |
128
  | FILES_CREATED.md | ~500 | Reference | Optional |
129
 
 
122
  | DEPLOYMENT_CHECKLIST.md | ~400 | Use while deploying | ⭐⭐ |
123
  | MODEL_UPDATE_SUMMARY.md | ~500 | 10 min | ⭐⭐ |
124
  | HF_SPACES_DEPLOYMENT.md | ~800 | 30 min | ⭐ |
125
+ | MODEL_CACHING_SUMMARY.md | ~600 | 15 min | ⭐ |
126
+ | README_HF_SPACES.md | ~600 | Reference | ⭐ |
127
  | COMPARISON_BEFORE_AFTER.md | ~500 | Reference | Optional |
128
  | FILES_CREATED.md | ~500 | Reference | Optional |
129
 
services/ai-service/DEPLOYMENT_FIX.md CHANGED
@@ -17,13 +17,13 @@ The deployment was failing with a "Scheduling failure: unable to schedule" error
17
  **Before:**
18
  ```dockerfile
19
  RUN pip install --no-cache-dir -r /app/requirements.txt gunicorn
20
- CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:7860", "--timeout", "600", "wsgi:app"]
21
  ```
22
 
23
  **After:**
24
  ```dockerfile
25
  RUN pip install --no-cache-dir -r /app/requirements.txt uvicorn[standard]
26
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "600", "--workers", "4"]
27
  ```
28
 
29
  ### Why This Works
@@ -66,12 +66,12 @@ If you need more production-grade deployment with multiple workers:
66
  #### Option A: Gunicorn with Uvicorn Workers (Recommended for Production)
67
  ```dockerfile
68
  RUN pip install --no-cache-dir -r /app/requirements.txt gunicorn uvicorn[standard]
69
- CMD ["gunicorn", "app:app", "--workers", "4", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860", "--timeout", "600"]
70
  ```
71
 
72
  #### Option B: Pure Uvicorn (Current, Good for Medium Load)
73
  ```dockerfile
74
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "600", "--workers", "4"]
75
  ```
76
 
77
  ### 3. Health Check Configuration
 
17
  **Before:**
18
  ```dockerfile
19
  RUN pip install --no-cache-dir -r /app/requirements.txt gunicorn
20
+ CMD ["gunicorn", "-w", "4", "-b", "0.0.0.0:7860", "--timeout", "600", "wsgi:app"]
21
  ```
22
 
23
  **After:**
24
  ```dockerfile
25
  RUN pip install --no-cache-dir -r /app/requirements.txt uvicorn[standard]
26
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "1200", "--workers", "4"]
27
  ```
28
 
29
  ### Why This Works
 
66
  #### Option A: Gunicorn with Uvicorn Workers (Recommended for Production)
67
  ```dockerfile
68
  RUN pip install --no-cache-dir -r /app/requirements.txt gunicorn uvicorn[standard]
69
+ CMD ["gunicorn", "app:app", "--workers", "4", "--worker-class", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860", "--timeout", "1200"]
70
  ```
71
 
72
  #### Option B: Pure Uvicorn (Current, Good for Medium Load)
73
  ```dockerfile
74
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "1200", "--workers", "4"]
75
  ```
76
 
77
  ### 3. Health Check Configuration
services/ai-service/Dockerfile.prod CHANGED
@@ -22,4 +22,4 @@ EXPOSE 7860
22
  ENV PRELOAD_SMALL_MODELS=false
23
 
24
  # Use uvicorn directly for FastAPI (ASGI) instead of gunicorn (WSGI)
25
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "600", "--workers", "4"]
 
22
  ENV PRELOAD_SMALL_MODELS=false
23
 
24
  # Use uvicorn directly for FastAPI (ASGI) instead of gunicorn (WSGI)
25
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860", "--timeout-keep-alive", "1200", "--workers", "4"]
services/ai-service/src/__main__.py CHANGED
@@ -12,4 +12,4 @@ initialize_agents(app)
12
 
13
  if __name__ == '__main__':
14
  import uvicorn
15
- uvicorn.run(app, host="0.0.0.0", port=7860, timeout_keep_alive=600)
 
12
 
13
  if __name__ == '__main__':
14
  import uvicorn
15
+ uvicorn.run(app, host="0.0.0.0", port=7860, timeout_keep_alive=1200)
services/ai-service/src/ai_med_extract/api/routes_fastapi.py CHANGED
@@ -635,7 +635,7 @@ def generate_rule_based_summary(baseline, delta_text, visits=None, patientid=Non
635
 
636
  # Clinical Overview: summarize baseline
637
  if baseline:
638
- baseline_snip = baseline[:600].replace("\n", " ")
639
  lines_assessment.append(f"- Baseline: {baseline_snip}")
640
  else:
641
  lines_assessment.append("- No baseline data available.")
@@ -1348,7 +1348,7 @@ async def async_patient_summary(data, job_id=None):
1348
  try:
1349
  # Use extended timeout for GGUF operations on HF Spaces
1350
  is_hf_spaces = os.environ.get('HF_SPACES', 'false').lower() == 'true'
1351
- timeout_value = timeout_config.get("gguf_extended_timeout" if is_hf_spaces else "gguf_timeout", 600)
1352
 
1353
  if cache_key not in GGUF_PIPELINE_CACHE:
1354
  if job_id:
@@ -1584,10 +1584,10 @@ async def async_patient_summary(data, job_id=None):
1584
  try:
1585
  raw_summary = await asyncio.wait_for(
1586
  generate_with_progress(),
1587
- timeout=timeout_config.get("generation_timeout", 600)
1588
  )
1589
  except asyncio.TimeoutError:
1590
- error_msg = f"Text generation timed out after {timeout_config.get('generation_timeout', 600)} seconds"
1591
  log_error_with_context(Exception(error_msg), "Text generation timeout", job_id)
1592
  update_job_with_error(job_id, error_msg, "generation_timeout")
1593
  raise Exception(error_msg)
@@ -1663,10 +1663,10 @@ async def async_patient_summary(data, job_id=None):
1663
  try:
1664
  result_sum = await asyncio.wait_for(
1665
  asyncio.to_thread(model.generate, context, config),
1666
- timeout=timeout_config.get("generation_timeout", 600)
1667
  )
1668
  except asyncio.TimeoutError:
1669
- error_msg = f"Summarization timed out after {timeout_config.get('generation_timeout', 600)} seconds"
1670
  log_error_with_context(Exception(error_msg), "Summarization timeout", job_id)
1671
  update_job_with_error(job_id, error_msg, "generation_timeout")
1672
  raise Exception(error_msg)
@@ -1777,7 +1777,7 @@ async def async_patient_summary(data, job_id=None):
1777
  temperature=0.1,
1778
  top_p=0.5,
1779
  ),
1780
- timeout=600
1781
  )
1782
  else:
1783
  config = create_generation_config(data, min_tokens=100, temperature=0.1, top_p=0.5)
@@ -1827,7 +1827,7 @@ async def async_patient_summary(data, job_id=None):
1827
  if "timeout" in error_str.lower():
1828
  error_category = "TIMEOUT"
1829
  # Enhanced timeout message with recommendations
1830
- user_message = f"""Summary generation timed out after {timeout_config.get('generation_timeout', 600)} seconds.
1831
 
1832
  This may be due to:
1833
  - Large patient dataset requiring more processing time
@@ -1952,7 +1952,7 @@ def process_patient_summary_background(data, job_id):
1952
  ehr_url,
1953
  json={"patientid": patientid},
1954
  headers=headers,
1955
- timeout=600
1956
  )
1957
  if response.status_code == 200:
1958
  sample_data = response.json()
@@ -2417,7 +2417,7 @@ async def home():
2417
  border-radius: 20px;
2418
  padding: 40px;
2419
  box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
2420
- max-width: 600px;
2421
  width: 100%;
2422
  animation: fadeIn 0.5s ease-in;
2423
  }
@@ -2433,7 +2433,7 @@ async def home():
2433
  padding: 8px 16px;
2434
  border-radius: 20px;
2435
  font-size: 14px;
2436
- font-weight: 600;
2437
  margin-bottom: 20px;
2438
  }
2439
  .status-dot {
@@ -2466,7 +2466,7 @@ async def home():
2466
  }
2467
  .info-title {
2468
  color: #374151;
2469
- font-weight: 600;
2470
  margin-bottom: 15px;
2471
  font-size: 18px;
2472
  }
@@ -2491,7 +2491,7 @@ async def home():
2491
  padding: 4px 8px;
2492
  border-radius: 4px;
2493
  font-size: 12px;
2494
- font-weight: 600;
2495
  margin-right: 10px;
2496
  min-width: 50px;
2497
  text-align: center;
@@ -2512,7 +2512,7 @@ async def home():
2512
  .link {
2513
  color: #667eea;
2514
  text-decoration: none;
2515
- font-weight: 600;
2516
  }
2517
  .link:hover {
2518
  text-decoration: underline;
@@ -2704,7 +2704,7 @@ async def generate_patient_summary_large_data(
2704
  """Wait for slot and then process."""
2705
  try:
2706
  # Wait for processing slot
2707
- if queue_manager.wait_for_slot(request_id, timeout=600):
2708
  # Update job status to show processing started
2709
  job_manager.update_job(job_id, JOB_STATUS["STARTED"], progress=5, data={'message': 'Processing slot acquired, starting generation...'})
2710
  # Start background task with optimized generation
@@ -2733,7 +2733,7 @@ async def generate_patient_summary_large_data(
2733
  'X-Content-Type-Options': 'nosniff',
2734
  'Access-Control-Allow-Origin': '*',
2735
  'Access-Control-Allow-Headers': 'Cache-Control, Connection',
2736
- 'Keep-Alive': 'timeout=3600',
2737
  # Force HTTP/1.1 to avoid HTTP/2 protocol errors
2738
  'X-Protocol': 'HTTP/1.1'
2739
  }
@@ -2790,7 +2790,7 @@ async def generate_patient_summary_streaming(
2790
  """Wait for slot and then process."""
2791
  try:
2792
  # Wait for processing slot
2793
- if queue_manager.wait_for_slot(request_id, timeout=600):
2794
  # Update job status to show processing started
2795
  job_manager.update_job(job_id, JOB_STATUS["STARTED"], progress=5, data={'message': 'Processing slot acquired, starting generation...'})
2796
  # Start background task with optimized generation
@@ -2819,7 +2819,7 @@ async def generate_patient_summary_streaming(
2819
  'X-Content-Type-Options': 'nosniff',
2820
  'Access-Control-Allow-Origin': '*',
2821
  'Access-Control-Allow-Headers': 'Cache-Control, Connection',
2822
- 'Keep-Alive': 'timeout=3600',
2823
  # Force HTTP/1.1 to avoid HTTP/2 protocol errors
2824
  'X-Protocol': 'HTTP/1.1'
2825
  }
@@ -2898,7 +2898,7 @@ async def generate_patient_summary(
2898
  """Wait for slot and then process."""
2899
  try:
2900
  # Wait for processing slot
2901
- if queue_manager.wait_for_slot(request_id, timeout=600):
2902
  # Update job status to show processing started
2903
  job_manager.update_job(job_id, JOB_STATUS["STARTED"], progress=5, data={'message': 'Processing slot acquired, starting generation...'})
2904
  # Start background task directly (not in separate thread to avoid nesting)
@@ -2928,7 +2928,7 @@ async def generate_patient_summary(
2928
  'X-Content-Type-Options': 'nosniff',
2929
  'Access-Control-Allow-Origin': '*',
2930
  'Access-Control-Allow-Headers': 'Cache-Control, Connection',
2931
- 'Keep-Alive': 'timeout=3600',
2932
  # Force HTTP/1.1 to avoid HTTP/2 protocol errors
2933
  'X-Protocol': 'HTTP/1.1'
2934
  }
 
635
 
636
  # Clinical Overview: summarize baseline
637
  if baseline:
638
+ baseline_snip = baseline[:600].replace("\n", " ")
639
  lines_assessment.append(f"- Baseline: {baseline_snip}")
640
  else:
641
  lines_assessment.append("- No baseline data available.")
 
1348
  try:
1349
  # Use extended timeout for GGUF operations on HF Spaces
1350
  is_hf_spaces = os.environ.get('HF_SPACES', 'false').lower() == 'true'
1351
+ timeout_value = timeout_config.get("gguf_extended_timeout" if is_hf_spaces else "gguf_timeout", 1200)
1352
 
1353
  if cache_key not in GGUF_PIPELINE_CACHE:
1354
  if job_id:
 
1584
  try:
1585
  raw_summary = await asyncio.wait_for(
1586
  generate_with_progress(),
1587
+ timeout=timeout_config.get("generation_timeout", 1200)
1588
  )
1589
  except asyncio.TimeoutError:
1590
+ error_msg = f"Text generation timed out after {timeout_config.get('generation_timeout', 1200)} seconds"
1591
  log_error_with_context(Exception(error_msg), "Text generation timeout", job_id)
1592
  update_job_with_error(job_id, error_msg, "generation_timeout")
1593
  raise Exception(error_msg)
 
1663
  try:
1664
  result_sum = await asyncio.wait_for(
1665
  asyncio.to_thread(model.generate, context, config),
1666
+ timeout=timeout_config.get("generation_timeout", 1200)
1667
  )
1668
  except asyncio.TimeoutError:
1669
+ error_msg = f"Summarization timed out after {timeout_config.get('generation_timeout', 1200)} seconds"
1670
  log_error_with_context(Exception(error_msg), "Summarization timeout", job_id)
1671
  update_job_with_error(job_id, error_msg, "generation_timeout")
1672
  raise Exception(error_msg)
 
1777
  temperature=0.1,
1778
  top_p=0.5,
1779
  ),
1780
+ timeout=1200
1781
  )
1782
  else:
1783
  config = create_generation_config(data, min_tokens=100, temperature=0.1, top_p=0.5)
 
1827
  if "timeout" in error_str.lower():
1828
  error_category = "TIMEOUT"
1829
  # Enhanced timeout message with recommendations
1830
+ user_message = f"""Summary generation timed out after {timeout_config.get('generation_timeout', 1200)} seconds.
1831
 
1832
  This may be due to:
1833
  - Large patient dataset requiring more processing time
 
1952
  ehr_url,
1953
  json={"patientid": patientid},
1954
  headers=headers,
1955
+ timeout=1200
1956
  )
1957
  if response.status_code == 200:
1958
  sample_data = response.json()
 
2417
  border-radius: 20px;
2418
  padding: 40px;
2419
  box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3);
2420
+ max-width: 600px;
2421
  width: 100%;
2422
  animation: fadeIn 0.5s ease-in;
2423
  }
 
2433
  padding: 8px 16px;
2434
  border-radius: 20px;
2435
  font-size: 14px;
2436
+ font-weight: 600;
2437
  margin-bottom: 20px;
2438
  }
2439
  .status-dot {
 
2466
  }
2467
  .info-title {
2468
  color: #374151;
2469
+ font-weight: 600;
2470
  margin-bottom: 15px;
2471
  font-size: 18px;
2472
  }
 
2491
  padding: 4px 8px;
2492
  border-radius: 4px;
2493
  font-size: 12px;
2494
+ font-weight: 600;
2495
  margin-right: 10px;
2496
  min-width: 50px;
2497
  text-align: center;
 
2512
  .link {
2513
  color: #667eea;
2514
  text-decoration: none;
2515
+ font-weight: 600;
2516
  }
2517
  .link:hover {
2518
  text-decoration: underline;
 
2704
  """Wait for slot and then process."""
2705
  try:
2706
  # Wait for processing slot
2707
+ if queue_manager.wait_for_slot(request_id, timeout=1200):
2708
  # Update job status to show processing started
2709
  job_manager.update_job(job_id, JOB_STATUS["STARTED"], progress=5, data={'message': 'Processing slot acquired, starting generation...'})
2710
  # Start background task with optimized generation
 
2733
  'X-Content-Type-Options': 'nosniff',
2734
  'Access-Control-Allow-Origin': '*',
2735
  'Access-Control-Allow-Headers': 'Cache-Control, Connection',
2736
+ 'Keep-Alive': 'timeout=3600',
2737
  # Force HTTP/1.1 to avoid HTTP/2 protocol errors
2738
  'X-Protocol': 'HTTP/1.1'
2739
  }
 
2790
  """Wait for slot and then process."""
2791
  try:
2792
  # Wait for processing slot
2793
+ if queue_manager.wait_for_slot(request_id, timeout=1200):
2794
  # Update job status to show processing started
2795
  job_manager.update_job(job_id, JOB_STATUS["STARTED"], progress=5, data={'message': 'Processing slot acquired, starting generation...'})
2796
  # Start background task with optimized generation
 
2819
  'X-Content-Type-Options': 'nosniff',
2820
  'Access-Control-Allow-Origin': '*',
2821
  'Access-Control-Allow-Headers': 'Cache-Control, Connection',
2822
+ 'Keep-Alive': 'timeout=3600',
2823
  # Force HTTP/1.1 to avoid HTTP/2 protocol errors
2824
  'X-Protocol': 'HTTP/1.1'
2825
  }
 
2898
  """Wait for slot and then process."""
2899
  try:
2900
  # Wait for processing slot
2901
+ if queue_manager.wait_for_slot(request_id, timeout=1200):
2902
  # Update job status to show processing started
2903
  job_manager.update_job(job_id, JOB_STATUS["STARTED"], progress=5, data={'message': 'Processing slot acquired, starting generation...'})
2904
  # Start background task directly (not in separate thread to avoid nesting)
 
2928
  'X-Content-Type-Options': 'nosniff',
2929
  'Access-Control-Allow-Origin': '*',
2930
  'Access-Control-Allow-Headers': 'Cache-Control, Connection',
2931
+ 'Keep-Alive': 'timeout=3600',
2932
  # Force HTTP/1.1 to avoid HTTP/2 protocol errors
2933
  'X-Protocol': 'HTTP/1.1'
2934
  }
services/ai-service/src/ai_med_extract/app.py CHANGED
@@ -764,7 +764,7 @@ def run_dev(host: str = "0.0.0.0", port: int = 7860, debug: bool = False):
764
  # Initialize agents in dev run (preload small models)
765
  initialize_agents(app, preload_small_models=True)
766
  print("Agents initialized, starting uvicorn")
767
- uvicorn.run(app, host=host, port=port, reload=debug, timeout_keep_alive=600)
768
 
769
 
770
  if __name__ == "__main__":
 
764
  # Initialize agents in dev run (preload small models)
765
  initialize_agents(app, preload_small_models=True)
766
  print("Agents initialized, starting uvicorn")
767
+ uvicorn.run(app, host=host, port=port, reload=debug, timeout_keep_alive=1200)
768
 
769
 
770
  if __name__ == "__main__":
services/ai-service/src/ai_med_extract/config/performance_config.py CHANGED
@@ -19,7 +19,7 @@ class PerformanceConfig:
19
 
20
  # Caching
21
  enable_caching: bool = True
22
- cache_ttl_seconds: int = 3600
23
  max_cache_size: int = 1000
24
  enable_multi_level_cache: bool = True
25
 
@@ -65,7 +65,7 @@ class PerformanceConfig:
65
 
66
  # Caching
67
  enable_caching=os.environ.get('ENABLE_CACHING', 'true').lower() == 'true',
68
- cache_ttl_seconds=int(os.environ.get('CACHE_TTL_SECONDS', '3600')),
69
  max_cache_size=int(os.environ.get('MAX_CACHE_SIZE', '1000')),
70
  enable_multi_level_cache=os.environ.get('ENABLE_MULTI_LEVEL_CACHE', 'true').lower() == 'true',
71
 
 
19
 
20
  # Caching
21
  enable_caching: bool = True
22
+ cache_ttl_seconds: int = 3600
23
  max_cache_size: int = 1000
24
  enable_multi_level_cache: bool = True
25
 
 
65
 
66
  # Caching
67
  enable_caching=os.environ.get('ENABLE_CACHING', 'true').lower() == 'true',
68
+ cache_ttl_seconds=int(os.environ.get('CACHE_TTL_SECONDS', '3600')),
69
  max_cache_size=int(os.environ.get('MAX_CACHE_SIZE', '1000')),
70
  enable_multi_level_cache=os.environ.get('ENABLE_MULTI_LEVEL_CACHE', 'true').lower() == 'true',
71
 
services/ai-service/src/ai_med_extract/enable_optimizations.py CHANGED
@@ -24,7 +24,7 @@ def enable_all_optimizations():
24
 
25
  # Caching
26
  'ENABLE_CACHING': 'true',
27
- 'CACHE_TTL_SECONDS': '3600',
28
  'MAX_CACHE_SIZE': '1000',
29
  'ENABLE_MULTI_LEVEL_CACHE': 'true',
30
 
@@ -85,7 +85,7 @@ def get_optimization_status() -> Dict[str, Any]:
85
  },
86
  "caching_optimizations": {
87
  "enabled": os.environ.get('ENABLE_CACHING', 'true'),
88
- "ttl_seconds": os.environ.get('CACHE_TTL_SECONDS', '3600'),
89
  "max_size": os.environ.get('MAX_CACHE_SIZE', '1000'),
90
  },
91
  "async_optimizations": {
 
24
 
25
  # Caching
26
  'ENABLE_CACHING': 'true',
27
+ 'CACHE_TTL_SECONDS': '3600',
28
  'MAX_CACHE_SIZE': '1000',
29
  'ENABLE_MULTI_LEVEL_CACHE': 'true',
30
 
 
85
  },
86
  "caching_optimizations": {
87
  "enabled": os.environ.get('ENABLE_CACHING', 'true'),
88
+ "ttl_seconds": os.environ.get('CACHE_TTL_SECONDS', '3600'),
89
  "max_size": os.environ.get('MAX_CACHE_SIZE', '1000'),
90
  },
91
  "async_optimizations": {
services/ai-service/src/ai_med_extract/inference_service.py CHANGED
@@ -140,7 +140,7 @@ class InferenceService:
140
  loop = asyncio.get_event_loop()
141
 
142
  # Optimize chunk size based on text length
143
- chunk_size = 8000 if len(text) > 16000 else 12000
144
 
145
  if len(text) > chunk_size:
146
  chunks = self._split_chunks(text, chunk_size)
 
140
  loop = asyncio.get_event_loop()
141
 
142
  # Optimize chunk size based on text length
143
+ chunk_size = 8000 if len(text) > 16000 else 12000
144
 
145
  if len(text) > chunk_size:
146
  chunks = self._split_chunks(text, chunk_size)
services/ai-service/src/ai_med_extract/phi_scrubber_service.py CHANGED
@@ -60,7 +60,7 @@ class PHIScrubberService:
60
  r = redis.from_url(settings.REDIS_URL, decode_responses=True)
61
  await r.hincrby(key, "events", 1)
62
  await r.hincrby(key, "found", len(m))
63
- await r.expire(key, 7*24*3600)
64
  except Exception:
65
  pass
66
  return {"original_length": len(text), "scrubbed_length": len(scrubbed), "total_phi_found": len(m), "phi_types": phi_types, "scrubbed_text": scrubbed}
 
60
  r = redis.from_url(settings.REDIS_URL, decode_responses=True)
61
  await r.hincrby(key, "events", 1)
62
  await r.hincrby(key, "found", len(m))
63
+ await r.expire(key, 7*24*3600)
64
  except Exception:
65
  pass
66
  return {"original_length": len(text), "scrubbed_length": len(scrubbed), "total_phi_found": len(m), "phi_types": phi_types, "scrubbed_text": scrubbed}
services/ai-service/src/ai_med_extract/services/job_manager.py CHANGED
@@ -29,7 +29,7 @@ class JobManager:
29
  """Initialize the job manager with in-memory storage."""
30
  self._jobs: Dict[str, Dict[str, Any]] = {}
31
  self._lock = threading.RLock() # Reentrant lock for nested calls
32
- self._cleanup_interval = 3600 # 1 hour
33
  self._max_job_age = 7200 # 2 hours
34
 
35
  def create_job(self, request_id: Optional[str] = None, initial_data: Optional[Dict] = None) -> str:
 
29
  """Initialize the job manager with in-memory storage."""
30
  self._jobs: Dict[str, Dict[str, Any]] = {}
31
  self._lock = threading.RLock() # Reentrant lock for nested calls
32
+ self._cleanup_interval = 3600 # 1 hour
33
  self._max_job_age = 7200 # 2 hours
34
 
35
  def create_job(self, request_id: Optional[str] = None, initial_data: Optional[Dict] = None) -> str:
services/ai-service/src/ai_med_extract/services/request_queue.py CHANGED
@@ -229,7 +229,7 @@ class RequestQueueManager:
229
  ]
230
  }
231
 
232
- def cleanup_old_requests(self, max_age: int = 3600) -> int:
233
  """
234
  Clean up old requests from tracking.
235
 
@@ -289,7 +289,7 @@ def get_queue_manager() -> RequestQueueManager:
289
  _queue_manager = RequestQueueManager(
290
  max_concurrent=6,
291
  max_queue_size=6,
292
- queue_timeout=600
293
  )
294
  logger.info("Initialized RequestQueueManager for Hugging Face Spaces (T4 medium)")
295
  else:
@@ -297,7 +297,7 @@ def get_queue_manager() -> RequestQueueManager:
297
  _queue_manager = RequestQueueManager(
298
  max_concurrent=4,
299
  max_queue_size=20,
300
- queue_timeout=600
301
  )
302
  logger.info("Initialized RequestQueueManager for local/development")
303
 
 
229
  ]
230
  }
231
 
232
+ def cleanup_old_requests(self, max_age: int = 3600) -> int:
233
  """
234
  Clean up old requests from tracking.
235
 
 
289
  _queue_manager = RequestQueueManager(
290
  max_concurrent=6,
291
  max_queue_size=6,
292
+ queue_timeout=1200
293
  )
294
  logger.info("Initialized RequestQueueManager for Hugging Face Spaces (T4 medium)")
295
  else:
 
297
  _queue_manager = RequestQueueManager(
298
  max_concurrent=4,
299
  max_queue_size=20,
300
+ queue_timeout=1200
301
  )
302
  logger.info("Initialized RequestQueueManager for local/development")
303
 
services/ai-service/src/ai_med_extract/utils/constants.py CHANGED
@@ -24,39 +24,39 @@ CHUNK_SIZE_DAYS = 90 # Days per chunk for date-based chunking
24
  # ========== TIMEOUT CONFIGURATION ==========
25
  TIMEOUT_CONFIG = {
26
  "fast": {
27
- "ehr_timeout": 600,
28
- "generation_timeout": 600,
29
- "gguf_timeout": 600,
30
- "gguf_extended_timeout": 600,
31
  "retry_attempts": 2
32
  },
33
  "normal": {
34
- "ehr_timeout": 600,
35
- "generation_timeout": 600,
36
- "gguf_timeout": 600,
37
- "gguf_extended_timeout": 600,
38
  "retry_attempts": 3
39
  },
40
  "extended": {
41
- "ehr_timeout": 600,
42
- "generation_timeout": 600,
43
- "gguf_timeout": 600,
44
- "gguf_extended_timeout": 600,
45
  "retry_attempts": 3
46
  },
47
  "large_data": {
48
- "ehr_timeout": 600,
49
- "generation_timeout": 600,
50
- "gguf_timeout": 600,
51
- "gguf_extended_timeout": 600,
52
  "retry_attempts": 2
53
  }
54
  }
55
 
56
  # ========== SSE STREAMING CONFIGURATION ==========
57
  SSE_CONFIG = {
58
- "max_wait_time": 3600, # 60 minutes max wait time for normal operations
59
- "extended_max_wait_time": 3600, # 60 minutes extended wait for GGUF/long operations
60
  "heartbeat_interval": 5, # Send heartbeat every 5 seconds
61
  "normal_heartbeat_interval": 10, # Normal heartbeat interval
62
  "poll_interval": 1, # Check job status every second
@@ -65,7 +65,7 @@ SSE_CONFIG = {
65
 
66
  # ========== CACHE CONFIGURATION ==========
67
  CACHE_CONFIG = {
68
- "ttl_seconds": 3600, # 1 hour
69
  "cache_dir": "/tmp/summary_cache",
70
  "max_cache_size": 100
71
  }
@@ -89,7 +89,7 @@ MEMORY_CONFIG = {
89
  "enable_quantization": True,
90
  "cache_models": True,
91
  "cleanup_interval": 300, # 5 minutes
92
- "max_memory_mb": 6000,
93
  "memory_pressure_threshold": 0.8,
94
  "aggressive_cleanup_threshold": 0.9
95
  }
 
24
  # ========== TIMEOUT CONFIGURATION ==========
25
  TIMEOUT_CONFIG = {
26
  "fast": {
27
+ "ehr_timeout": 1200,
28
+ "generation_timeout": 1200,
29
+ "gguf_timeout": 1200,
30
+ "gguf_extended_timeout": 1200,
31
  "retry_attempts": 2
32
  },
33
  "normal": {
34
+ "ehr_timeout": 1200,
35
+ "generation_timeout": 1200,
36
+ "gguf_timeout": 1200,
37
+ "gguf_extended_timeout": 1200,
38
  "retry_attempts": 3
39
  },
40
  "extended": {
41
+ "ehr_timeout": 1200,
42
+ "generation_timeout": 1200,
43
+ "gguf_timeout": 1200,
44
+ "gguf_extended_timeout": 1200,
45
  "retry_attempts": 3
46
  },
47
  "large_data": {
48
+ "ehr_timeout": 1200,
49
+ "generation_timeout": 1200,
50
+ "gguf_timeout": 1200,
51
+ "gguf_extended_timeout": 1200,
52
  "retry_attempts": 2
53
  }
54
  }
55
 
56
  # ========== SSE STREAMING CONFIGURATION ==========
57
  SSE_CONFIG = {
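58
+ "max_wait_time": 31200, # 31200 s = 520 minutes (~8.7 hours) max wait time for normal operations; NOTE(review): was 3600 (60 minutes) - confirm 31200 is intended and not a stray 600->1200 substitution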
59
+ "extended_max_wait_time": 31200, # 31200 s = 520 minutes (~8.7 hours) extended wait for GGUF/long operations; NOTE(review): was 3600 (60 minutes) - confirm 31200 is intended
60
  "heartbeat_interval": 5, # Send heartbeat every 5 seconds
61
  "normal_heartbeat_interval": 10, # Normal heartbeat interval
62
  "poll_interval": 1, # Check job status every second
 
65
 
66
  # ========== CACHE CONFIGURATION ==========
67
  CACHE_CONFIG = {
68
+ "ttl_seconds": 31200, # 31200 s (~8.7 hours); NOTE(review): was 3600 (1 hour) - confirm 31200 is intended
69
  "cache_dir": "/tmp/summary_cache",
70
  "max_cache_size": 100
71
  }
 
89
  "enable_quantization": True,
90
  "cache_models": True,
91
  "cleanup_interval": 300, # 5 minutes
92
+ "max_memory_mb": 12000,
93
  "memory_pressure_threshold": 0.8,
94
  "aggressive_cleanup_threshold": 0.9
95
  }
services/ai-service/src/ai_med_extract/utils/hf_spaces_config.py CHANGED
@@ -65,7 +65,7 @@ TIMEOUT_SETTINGS = {
65
  "model_loading_timeout": 300, # 5 minutes for model loading
66
  "inference_timeout": 120, # 2 minutes for inference
67
  "ehr_fetch_timeout": 30, # 30 seconds for EHR fetch
68
- "streaming_timeout": 600 # 10 minutes for streaming responses
69
  }
70
 
71
  def get_optimized_model(model_type: str) -> str:
 
65
  "model_loading_timeout": 300, # 5 minutes for model loading
66
  "inference_timeout": 120, # 2 minutes for inference
67
  "ehr_fetch_timeout": 30, # 30 seconds for EHR fetch
68
+ "streaming_timeout": 1200 # 20 minutes for streaming responses
69
  }
70
 
71
  def get_optimized_model(model_type: str) -> str:
services/ai-service/src/ai_med_extract/utils/openvino_summarizer_utils.py CHANGED
@@ -238,7 +238,7 @@ def delta_to_text(delta):
238
  from concurrent.futures import ThreadPoolExecutor, as_completed
239
  import threading
240
 
241
- def generate_section(pipeline, prompt, section_name, timeout=600):
242
  """Generate one section with timeout protection."""
243
  try:
244
  # If your pipeline supports timeout, pass it. Otherwise, wrap in future.
 
238
  from concurrent.futures import ThreadPoolExecutor, as_completed
239
  import threading
240
 
241
+ def generate_section(pipeline, prompt, section_name, timeout=1200):
242
  """Generate one section with timeout protection."""
243
  try:
244
  # If your pipeline supports timeout, pass it. Otherwise, wrap in future.
services/ai-service/src/ai_med_extract/utils/performance_monitor.py CHANGED
@@ -76,7 +76,7 @@ class PerformanceMonitor:
76
  class RobustParsingCache:
77
  """Intelligent caching system for robust JSON parsing operations."""
78
 
79
- def __init__(self, cache_dir: str = "/tmp/medical_ai_cache", ttl: int = 3600):
80
  self.cache_dir = cache_dir
81
  self.ttl = ttl # Time to live in seconds
82
  os.makedirs(cache_dir, exist_ok=True)
 
76
  class RobustParsingCache:
77
  """Intelligent caching system for robust JSON parsing operations."""
78
 
79
+ def __init__(self, cache_dir: str = "/tmp/medical_ai_cache", ttl: int = 31200):
80
  self.cache_dir = cache_dir
81
  self.ttl = ttl # Time to live in seconds
82
  os.makedirs(cache_dir, exist_ok=True)
services/ai-service/src/ai_med_extract/utils/unified_model_manager.py CHANGED
@@ -499,7 +499,7 @@ class UnifiedModelManager:
499
 
500
  for key, model in self._models.items():
501
  # Remove models not used in last hour
502
- if current_time - model._last_used > 3600:
503
  to_remove.append(key)
504
 
505
  for key in to_remove:
 
499
 
500
  for key, model in self._models.items():
501
  # Remove models not used in the last 31200 seconds (~8.7 hours); NOTE(review): threshold was 3600 (1 hour) - confirm 31200 is intended
502
+ if current_time - model._last_used > 31200:
503
  to_remove.append(key)
504
 
505
  for key in to_remove: