sachinchandrankallar commited on
Commit
6c585d3
Β·
1 Parent(s): 3de57f3

Revert "merge conflicts"

Browse files

This reverts commit 3de57f34b087996592583edd47a6e4a02b9dbc0a, reversing
changes made to 29a03e37a15cb4a97eceb9a4b71cd489c6f6c0a1.

requirements.txt CHANGED
@@ -2,10 +2,7 @@
2
  torch>=2.3.0
3
  torchvision>=0.18.0
4
  torchaudio>=2.3.0
5
- # Pin transformers to <4.50.0 to avoid DynamicCache.get_max_length compatibility issues
6
- # with OpenVINO/optimum-intel. Version 4.47.0+ works; 4.50.0+ removed get_max_length.
7
- # Also using use_cache=False workaround in code as safety measure.
8
- transformers>=4.47.0,<4.50.0
9
  tokenizers==0.21.4
10
  accelerate>=0.30.0
11
  safetensors==0.6.2
 
2
  torch>=2.3.0
3
  torchvision>=0.18.0
4
  torchaudio>=2.3.0
5
+ transformers>=4.42.0
 
 
 
6
  tokenizers==0.21.4
7
  accelerate>=0.30.0
8
  safetensors==0.6.2
services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc CHANGED
Binary files a/services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc differ
 
services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc CHANGED
Binary files a/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc differ
 
services/ai-service/src/ai_med_extract/api/routes_fastapi.py CHANGED
@@ -1,5 +1,7 @@
1
- # ========== IMPORTS ==========
2
- # Standard library
 
 
3
  import os
4
  import uuid
5
  import json
@@ -7,29 +9,10 @@ import asyncio
7
  import threading
8
  import time
9
  import logging
10
- import gc
11
- import hashlib
12
- import re
13
- from collections import defaultdict
14
- from datetime import datetime, timedelta
15
- from concurrent.futures import ThreadPoolExecutor, as_completed
16
- from typing import List, Optional, Dict
17
-
18
- # Third-party libraries
19
- import torch
20
- import psutil
21
- import requests
22
- from fastapi import APIRouter, Request, UploadFile, File, Form, HTTPException, BackgroundTasks, Query
23
- from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
24
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
25
-
26
- # Local imports
27
  from ..core_logger import log_with_memory, log_exception_with_memory
28
- from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
29
 
30
  # Set up logger
31
  logger = logging.getLogger(__name__)
32
- <<<<<<< HEAD
33
  from concurrent.futures import ThreadPoolExecutor, as_completed
34
  import torch
35
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
@@ -44,8 +27,6 @@ from datetime import datetime, timedelta
44
 
45
  from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
46
  from ..utils.unified_model_manager import unified_model_manager, GenerationConfig
47
- =======
48
- >>>>>>> FT-DEV-20/10/2025
49
 
50
  # ========== CONSTANTS AND CONFIGURATION ==========
51
  # Standardized timeout values for consistent behavior across all modes
@@ -209,50 +190,25 @@ async def retry_operation(operation, max_attempts: int, operation_name: str, job
209
  def cleanup_memory():
210
  """
211
  Clean up memory by forcing garbage collection and clearing caches.
212
- This helps prevent memory leaks in long-running processes with comprehensive monitoring.
213
  """
214
  try:
215
- # Get initial memory usage
216
- process = psutil.Process(os.getpid())
217
- initial_memory = process.memory_info().rss / 1024 / 1024 # MB
218
-
219
  # Force garbage collection
220
  gc.collect()
221
-
222
- # Clear CUDA cache if available
223
  if torch.cuda.is_available():
224
  torch.cuda.empty_cache()
225
- torch.cuda.synchronize()
226
-
227
- # Get GPU memory info
228
- allocated = torch.cuda.memory_allocated() / 1024**3 # GB
229
- cached = torch.cuda.memory_reserved() / 1024**3 # GB
230
- gpu_info = f", GPU: {allocated:.2f}GB allocated, {cached:.2f}GB cached"
231
- else:
232
- gpu_info = ""
233
-
234
  # Clean up global caches to prevent memory leaks
235
  cleanup_global_caches()
236
-
237
- # Get final memory usage
238
- final_memory = process.memory_info().rss / 1024 / 1024 # MB
239
- memory_delta = final_memory - initial_memory
240
- PERFORMANCE_METRICS["memory_usage_mb"] = final_memory
241
-
242
  # Log memory usage for monitoring
243
  memory_info = psutil.virtual_memory()
244
- available_gb = memory_info.available / 1024 / 1024 / 1024
245
- print(f"Memory cleanup completed. Current usage: {final_memory:.1f} MB (Ξ”{memory_delta:+.1f}MB), Available: {available_gb:.2f} GB{gpu_info}")
246
-
247
- # Log warning if memory usage is high
248
- if final_memory > 3000: # 3GB threshold
249
- print(f"⚠️ High memory usage detected: {final_memory:.1f} MB")
250
-
251
- logging.info(f"Memory cleanup completed. Available memory: {available_gb:.2f} GB{gpu_info}")
252
 
253
  except Exception as e:
254
  logging.warning(f"Memory cleanup failed: {str(e)}")
255
- print(f"Memory cleanup failed: {e}")
256
 
257
  def cleanup_global_caches():
258
  """
@@ -673,153 +629,41 @@ def generate_rule_based_summary(baseline, delta_text, visits=None, patientid=Non
673
  except Exception:
674
  return result
675
 
676
-
677
- # ========== HELPER FUNCTIONS FOR MODEL GENERATION ==========
678
- def safe_log(level: int, message: str, exc_info: bool = False) -> None:
679
- """
680
- Safely log a message, catching any logging exceptions to prevent them from breaking execution.
681
-
682
- Args:
683
- level: Logging level (e.g., logging.INFO, logging.WARNING)
684
- message: Message to log
685
- exc_info: Whether to include exception info
686
- """
687
- try:
688
- log_with_memory(level, message)
689
- except Exception:
690
- # Fallback to standard logging if log_with_memory fails
691
- try:
692
- logging.log(level, message, exc_info=exc_info)
693
- except Exception:
694
- pass # Silently fail to prevent logging errors from breaking execution
695
-
696
-
697
- def safe_log_exception(message: str, exception: Exception) -> None:
698
- """
699
- Safely log an exception, catching any logging exceptions to prevent them from breaking execution.
700
-
701
- Args:
702
- message: Context message for the exception
703
- exception: The exception that occurred
704
- """
705
  try:
706
- log_exception_with_memory(message, exception)
707
- except Exception:
708
- # Fallback to standard logging if log_exception_with_memory fails
709
- try:
710
- logging.exception(f"{message}: {exception}")
711
- except Exception:
712
- pass # Silently fail to prevent logging errors from breaking execution
713
-
714
- def prepare_generation_kwargs(
715
- inputs: Dict,
716
- max_new_tokens: int,
717
- pad_token_id: int,
718
- model_type: str,
719
- use_cache: Optional[bool] = None
720
- ) -> Dict:
721
- """
722
- Prepare generation kwargs with proper cache settings for OpenVINO models.
723
-
724
- Args:
725
- inputs: Tokenized input tensors
726
- max_new_tokens: Maximum number of new tokens to generate
727
- pad_token_id: Padding token ID
728
- model_type: Type of model (causal-openvino, text-generation, etc.)
729
- use_cache: Optional explicit cache setting (if None, auto-detect for OpenVINO)
730
-
731
- Returns:
732
- Dictionary of generation kwargs
733
- """
734
- generation_kwargs = {
735
- **inputs,
736
- "max_new_tokens": max_new_tokens,
737
- "do_sample": False,
738
- "pad_token_id": pad_token_id,
739
- }
740
-
741
- # Disable cache for OpenVINO models to avoid DynamicCache.get_max_length compatibility issues
742
- if use_cache is None:
743
- use_cache = not (model_type == "causal-openvino")
744
-
745
- if not use_cache:
746
- generation_kwargs["use_cache"] = False
747
-
748
- return generation_kwargs
749
-
750
-
751
- def extract_summary_from_text(text: str, patterns: Optional[List[str]] = None) -> str:
752
- """
753
- Extract summary from generated text by finding the first matching pattern.
754
-
755
- Args:
756
- text: Generated text from model
757
- patterns: List of patterns to search for (default: common summary patterns)
758
-
759
- Returns:
760
- Extracted summary text
761
- """
762
- if patterns is None:
763
- patterns = [
764
- "Now generate the complete clinical summary",
765
- "Now generate the complete clinical summary in markdown format:",
766
- "## Clinical",
767
- "# Clinical",
768
- "Clinical"
769
- ]
770
-
771
- for pattern in patterns:
772
- if pattern in text:
773
- return text.split(pattern)[-1].strip()
774
-
775
- return text.strip()
776
 
 
 
 
 
 
 
 
 
 
 
 
777
 
778
- async def generate_text_with_pipeline(
779
- pipeline,
780
- prompt: str,
781
- max_new_tokens: int,
782
- model_type: str,
783
- pad_token_id: Optional[int] = None
784
- ) -> str:
785
- """
786
- Generate text using a model pipeline with proper error handling and cache settings.
787
-
788
- Args:
789
- pipeline: Model pipeline with model and tokenizer attributes
790
- prompt: Input prompt text
791
- max_new_tokens: Maximum number of new tokens to generate
792
- model_type: Type of model (causal-openvino, text-generation, etc.)
793
- pad_token_id: Padding token ID (defaults to tokenizer.pad_token_id or eos_token_id)
794
-
795
- Returns:
796
- Generated text
797
- """
798
- # Tokenize input
799
- inputs = pipeline.tokenizer([prompt], return_tensors="pt")
800
-
801
- # Get pad_token_id if not provided
802
- if pad_token_id is None:
803
- pad_token_id = pipeline.tokenizer.pad_token_id or pipeline.tokenizer.eos_token_id or 0
804
-
805
- # Prepare generation kwargs
806
- generation_kwargs = prepare_generation_kwargs(
807
- inputs=inputs,
808
- max_new_tokens=max_new_tokens,
809
- pad_token_id=pad_token_id,
810
- model_type=model_type
811
- )
812
-
813
- # Generate text
814
- outputs = await asyncio.to_thread(
815
- pipeline.model.generate,
816
- **generation_kwargs,
817
- )
818
-
819
- # Decode output
820
- text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
821
- return text
822
 
 
 
 
 
 
 
 
 
823
 
824
  def monitor_memory_usage(operation_name: Optional[str] = None, job_id: Optional[str] = None):
825
  """Monitor and return current memory usage. Optionally log warnings.
@@ -1427,7 +1271,6 @@ DATA:
1427
  'patientName': ehr_data.get('result', {}).get('patientname', 'Unknown')
1428
  }
1429
  })
1430
- <<<<<<< HEAD
1431
  inputs = pipeline.tokenizer([prompt], return_tensors="pt")
1432
  from ..utils.unified_model_manager import unified_model_manager as _unified_manager
1433
 
@@ -1447,22 +1290,22 @@ DATA:
1447
  stream=False
1448
  )
1449
  raw_summary = await asyncio.to_thread(model.generate, prompt, config)
1450
- =======
1451
- # Generate text using helper function with proper cache settings
1452
- text = await generate_text_with_pipeline(
1453
- pipeline=pipeline,
1454
- prompt=prompt,
1455
- max_new_tokens=_effective_max_new_tokens(data.get("max_new_tokens"), default=1024),
1456
- model_type=model_type
1457
- )
1458
- >>>>>>> FT-DEV-20/10/2025
1459
  try:
1460
  log_with_memory(logging.INFO, f"[SUMMARY] text-gen generated request_id={request_id} chars={len(raw_summary)}")
1461
  except Exception:
1462
  pass
1463
 
1464
- # Extract summary using helper function
1465
- new_summary = extract_summary_from_text(text)
 
 
 
 
 
 
 
 
 
1466
 
1467
  markdown_summary = summary_to_markdown(new_summary)
1468
  markdown_summary = ensure_four_sections(markdown_summary)
@@ -3650,7 +3493,7 @@ async def clear_performance_cache():
3650
  torch.set_num_threads(2)
3651
  logging.info(f"βš™οΈ [STEP 3.1] Set torch threads to 2")
3652
 
3653
- # Tokenize and generate using helper function
3654
  tokenize_start = time.time()
3655
  logging.info(f"πŸ”€ [STEP 3.2] Tokenizing prompt (length: {len(prompt)} chars)...")
3656
  inputs = pipeline.tokenizer([prompt], return_tensors="pt")
@@ -3661,30 +3504,49 @@ async def clear_performance_cache():
3661
  # Generate response
3662
  generate_start = time.time()
3663
  logging.info(f"🎯 [STEP 3.3] Generating response (max_new_tokens=8192)...")
3664
- text = await generate_text_with_pipeline(
3665
- pipeline=pipeline,
3666
- prompt=prompt,
3667
- max_new_tokens=8192,
3668
- model_type=model_type,
3669
- pad_token_id=pipeline.tokenizer.eos_token_id or 32000
3670
- )
3671
  generate_time = time.time() - generate_start
3672
  logging.info(f"βœ… [STEP 3.3] Generated in {generate_time:.3f}s")
3673
- logging.info(f"πŸ“ [STEP 3.3] Generated text length: {len(text)} characters")
3674
- logging.info(f"πŸ“ [STEP 3.3] Generated text preview: {text[:200]}...")
 
 
 
 
 
 
 
 
3675
 
3676
  generation_time = time.time() - generation_start
3677
  logging.info(f"🎯 [STEP 3] Model generation completed in {generation_time:.3f}s total")
3678
- logging.info(f"πŸ“Š [STEP 3] Generation breakdown: tokenize={tokenize_time:.3f}s, generate={generate_time:.3f}s")
3679
 
3680
  # Extract and process summary
3681
  logging.info(f"πŸ“„ [STEP 4] Starting summary extraction and processing...")
3682
  extraction_start = time.time()
3683
 
3684
- # Extract summary using helper function
 
 
 
 
 
 
3685
  logging.info(f"πŸ” [STEP 4.1] Extracting summary from generated text...")
3686
- new_summary = extract_summary_from_text(text)
3687
- logging.info(f"βœ… [STEP 4.1] Summary extracted")
 
 
 
 
 
 
 
 
 
 
 
3688
 
3689
  extraction_time = time.time() - extraction_start
3690
  logging.info(f"βœ… [STEP 4.1] Summary extracted in {extraction_time:.3f}s")
 
1
+ from collections import defaultdict
2
+ from fastapi import APIRouter, Request, UploadFile, File, Form, HTTPException, BackgroundTasks, Query
3
+ from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
4
+ from typing import List, Optional, Dict
5
  import os
6
  import uuid
7
  import json
 
9
  import threading
10
  import time
11
  import logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  from ..core_logger import log_with_memory, log_exception_with_memory
 
13
 
14
  # Set up logger
15
  logger = logging.getLogger(__name__)
 
16
  from concurrent.futures import ThreadPoolExecutor, as_completed
17
  import torch
18
  from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
 
27
 
28
  from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
29
  from ..utils.unified_model_manager import unified_model_manager, GenerationConfig
 
 
30
 
31
  # ========== CONSTANTS AND CONFIGURATION ==========
32
  # Standardized timeout values for consistent behavior across all modes
 
190
  def cleanup_memory():
191
  """
192
  Clean up memory by forcing garbage collection and clearing caches.
193
+ This helps prevent memory leaks in long-running processes.
194
  """
195
  try:
 
 
 
 
196
  # Force garbage collection
197
  gc.collect()
198
+
199
+ # Clear PyTorch cache if available
200
  if torch.cuda.is_available():
201
  torch.cuda.empty_cache()
202
+
 
 
 
 
 
 
 
 
203
  # Clean up global caches to prevent memory leaks
204
  cleanup_global_caches()
205
+
 
 
 
 
 
206
  # Log memory usage for monitoring
207
  memory_info = psutil.virtual_memory()
208
+ logging.info(f"Memory cleanup completed. Available memory: {memory_info.available / 1024 / 1024 / 1024:.2f} GB")
 
 
 
 
 
 
 
209
 
210
  except Exception as e:
211
  logging.warning(f"Memory cleanup failed: {str(e)}")
 
212
 
213
  def cleanup_global_caches():
214
  """
 
629
  except Exception:
630
  return result
631
 
632
+ def cleanup_memory():
633
+ """Clean up memory after model operations for HF Spaces with comprehensive monitoring"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
634
  try:
635
+ # Get initial memory usage
636
+ process = psutil.Process(os.getpid())
637
+ initial_memory = process.memory_info().rss / 1024 / 1024
638
+
639
+ # Force garbage collection
640
+ gc.collect()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
 
642
+ # Clear CUDA cache if available
643
+ if torch.cuda.is_available():
644
+ torch.cuda.empty_cache()
645
+ torch.cuda.synchronize()
646
+
647
+ # Get GPU memory info
648
+ allocated = torch.cuda.memory_allocated() / 1024**3 # GB
649
+ cached = torch.cuda.memory_reserved() / 1024**3 # GB
650
+ gpu_info = f", GPU: {allocated:.2f}GB allocated, {cached:.2f}GB cached"
651
+ else:
652
+ gpu_info = ""
653
 
654
+ # Get final memory usage
655
+ final_memory = process.memory_info().rss / 1024 / 1024
656
+ memory_delta = final_memory - initial_memory
657
+ PERFORMANCE_METRICS["memory_usage_mb"] = final_memory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
658
 
659
+ print(f"Memory cleanup completed. Current usage: {final_memory:.1f} MB (Ξ”{memory_delta:+.1f}MB){gpu_info}")
660
+
661
+ # Log warning if memory usage is high
662
+ if final_memory > 3000: # 3GB threshold
663
+ print(f"⚠️ High memory usage detected: {final_memory:.1f} MB")
664
+
665
+ except Exception as e:
666
+ print(f"Memory cleanup failed: {e}")
667
 
668
  def monitor_memory_usage(operation_name: Optional[str] = None, job_id: Optional[str] = None):
669
  """Monitor and return current memory usage. Optionally log warnings.
 
1271
  'patientName': ehr_data.get('result', {}).get('patientname', 'Unknown')
1272
  }
1273
  })
 
1274
  inputs = pipeline.tokenizer([prompt], return_tensors="pt")
1275
  from ..utils.unified_model_manager import unified_model_manager as _unified_manager
1276
 
 
1290
  stream=False
1291
  )
1292
  raw_summary = await asyncio.to_thread(model.generate, prompt, config)
 
 
 
 
 
 
 
 
 
1293
  try:
1294
  log_with_memory(logging.INFO, f"[SUMMARY] text-gen generated request_id={request_id} chars={len(raw_summary)}")
1295
  except Exception:
1296
  pass
1297
 
1298
+ summary_start_patterns = [
1299
+ "Now generate the complete clinical summary",
1300
+ "## Clinical",
1301
+ "# Clinical",
1302
+ "Clinical"
1303
+ ]
1304
+ new_summary = text
1305
+ for pattern in summary_start_patterns:
1306
+ if pattern in text:
1307
+ new_summary = text.split(pattern)[-1].strip()
1308
+ break
1309
 
1310
  markdown_summary = summary_to_markdown(new_summary)
1311
  markdown_summary = ensure_four_sections(markdown_summary)
 
3493
  torch.set_num_threads(2)
3494
  logging.info(f"βš™οΈ [STEP 3.1] Set torch threads to 2")
3495
 
3496
+ # Tokenize input
3497
  tokenize_start = time.time()
3498
  logging.info(f"πŸ”€ [STEP 3.2] Tokenizing prompt (length: {len(prompt)} chars)...")
3499
  inputs = pipeline.tokenizer([prompt], return_tensors="pt")
 
3504
  # Generate response
3505
  generate_start = time.time()
3506
  logging.info(f"🎯 [STEP 3.3] Generating response (max_new_tokens=8192)...")
3507
+ outputs = pipeline.model.generate(**inputs, max_new_tokens=8192, do_sample=False, pad_token_id=pipeline.tokenizer.eos_token_id or 32000)
 
 
 
 
 
 
3508
  generate_time = time.time() - generate_start
3509
  logging.info(f"βœ… [STEP 3.3] Generated in {generate_time:.3f}s")
3510
+ logging.info(f"πŸ“Š [STEP 3.3] Output tensor shape: {outputs.shape}")
3511
+
3512
+ # Decode response
3513
+ decode_start = time.time()
3514
+ logging.info(f"πŸ“ [STEP 3.4] Decoding generated tokens...")
3515
+ text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
3516
+ decode_time = time.time() - decode_start
3517
+ logging.info(f"βœ… [STEP 3.4] Decoded in {decode_time:.3f}s")
3518
+ logging.info(f"πŸ“ [STEP 3.4] Generated text length: {len(text)} characters")
3519
+ logging.info(f"πŸ“ [STEP 3.4] Generated text preview: {text[:200]}...")
3520
 
3521
  generation_time = time.time() - generation_start
3522
  logging.info(f"🎯 [STEP 3] Model generation completed in {generation_time:.3f}s total")
3523
+ logging.info(f"πŸ“Š [STEP 3] Generation breakdown: tokenize={tokenize_time:.3f}s, generate={generate_time:.3f}s, decode={decode_time:.3f}s")
3524
 
3525
  # Extract and process summary
3526
  logging.info(f"πŸ“„ [STEP 4] Starting summary extraction and processing...")
3527
  extraction_start = time.time()
3528
 
3529
+ summary_start_patterns = [
3530
+ "Now generate the complete clinical summary in markdown format:",
3531
+ "## Clinical",
3532
+ "# Clinical",
3533
+ "Clinical"
3534
+ ]
3535
+
3536
  logging.info(f"πŸ” [STEP 4.1] Extracting summary from generated text...")
3537
+ logging.info(f"πŸ“Š [STEP 4.1] Looking for patterns: {summary_start_patterns}")
3538
+
3539
+ new_summary = text
3540
+ pattern_found = None
3541
+ for pattern in summary_start_patterns:
3542
+ if pattern in text:
3543
+ new_summary = text.split(pattern)[-1].strip()
3544
+ pattern_found = pattern
3545
+ logging.info(f"βœ… [STEP 4.1] Found pattern: '{pattern}'")
3546
+ break
3547
+
3548
+ if not pattern_found:
3549
+ logging.warning(f"⚠️ [STEP 4.1] No extraction pattern found, using full text")
3550
 
3551
  extraction_time = time.time() - extraction_start
3552
  logging.info(f"βœ… [STEP 4.1] Summary extracted in {extraction_time:.3f}s")