Spaces:

salvinjose
/

HNTAI

Paused

sachinchandrankallar committed on Nov 3, 2025

Commit

6c585d3

1 Parent(s): 3de57f3

Revert "merge conflicts"

This reverts commit 3de57f34b087996592583edd47a6e4a02b9dbc0a, reversing
changes made to 29a03e37a15cb4a97eceb9a4b71cd489c6f6c0a1.

Files changed (4) hide show

requirements.txt +1 -4
services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc +0 -0
services/ai-service/src/ai_med_extract/api/routes_fastapi.py +85 -223

requirements.txt CHANGED Viewed

@@ -2,10 +2,7 @@
 torch>=2.3.0
 torchvision>=0.18.0
 torchaudio>=2.3.0
-# Pin transformers to <4.50.0 to avoid DynamicCache.get_max_length compatibility issues
-# with OpenVINO/optimum-intel. Version 4.47.0+ works; 4.50.0+ removed get_max_length.
-# Also using use_cache=False workaround in code as safety measure.
-transformers>=4.47.0,<4.50.0
 tokenizers==0.21.4
 accelerate>=0.30.0
 safetensors==0.6.2

 torch>=2.3.0
 torchvision>=0.18.0
 torchaudio>=2.3.0
+transformers>=4.42.0
 tokenizers==0.21.4
 accelerate>=0.30.0
 safetensors==0.6.2

services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc CHANGED Viewed

Binary files a/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc differ

services/ai-service/src/ai_med_extract/api/routes_fastapi.py CHANGED Viewed

@@ -1,5 +1,7 @@
-# ========== IMPORTS ==========
-# Standard library
 import os
 import uuid
 import json
@@ -7,29 +9,10 @@ import asyncio
 import threading
 import time
 import logging
-import gc
-import hashlib
-import re
-from collections import defaultdict
-from datetime import datetime, timedelta
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Optional, Dict
-# Third-party libraries
-import torch
-import psutil
-import requests
-from fastapi import APIRouter, Request, UploadFile, File, Form, HTTPException, BackgroundTasks, Query
-from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
-# Local imports
 from ..core_logger import log_with_memory, log_exception_with_memory
-from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
 # Set up logger
 logger = logging.getLogger(__name__)
-<<<<<<< HEAD
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
@@ -44,8 +27,6 @@ from datetime import datetime, timedelta
 from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
 from ..utils.unified_model_manager import unified_model_manager, GenerationConfig
-=======
->>>>>>> FT-DEV-20/10/2025
 # ========== CONSTANTS AND CONFIGURATION ==========
 # Standardized timeout values for consistent behavior across all modes
@@ -209,50 +190,25 @@ async def retry_operation(operation, max_attempts: int, operation_name: str, job
 def cleanup_memory():
     """
     Clean up memory by forcing garbage collection and clearing caches.
-    This helps prevent memory leaks in long-running processes with comprehensive monitoring.
     """
     try:
-        # Get initial memory usage
-        process = psutil.Process(os.getpid())
-        initial_memory = process.memory_info().rss / 1024 / 1024  # MB
         # Force garbage collection
         gc.collect()
-        # Clear CUDA cache if available
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-            # Get GPU memory info
-            allocated = torch.cuda.memory_allocated() / 1024**3  # GB
-            cached = torch.cuda.memory_reserved() / 1024**3  # GB
-            gpu_info = f", GPU: {allocated:.2f}GB allocated, {cached:.2f}GB cached"
-        else:
-            gpu_info = ""
         # Clean up global caches to prevent memory leaks
         cleanup_global_caches()
-        # Get final memory usage
-        final_memory = process.memory_info().rss / 1024 / 1024  # MB
-        memory_delta = final_memory - initial_memory
-        PERFORMANCE_METRICS["memory_usage_mb"] = final_memory
         # Log memory usage for monitoring
         memory_info = psutil.virtual_memory()
-        available_gb = memory_info.available / 1024 / 1024 / 1024
-        print(f"Memory cleanup completed. Current usage: {final_memory:.1f} MB (Δ{memory_delta:+.1f}MB), Available: {available_gb:.2f} GB{gpu_info}")
-        # Log warning if memory usage is high
-        if final_memory > 3000:  # 3GB threshold
-            print(f"⚠️  High memory usage detected: {final_memory:.1f} MB")
-        logging.info(f"Memory cleanup completed. Available memory: {available_gb:.2f} GB{gpu_info}")
     except Exception as e:
         logging.warning(f"Memory cleanup failed: {str(e)}")
-        print(f"Memory cleanup failed: {e}")
 def cleanup_global_caches():
     """
@@ -673,153 +629,41 @@ def generate_rule_based_summary(baseline, delta_text, visits=None, patientid=Non
     except Exception:
         return result
-# ========== HELPER FUNCTIONS FOR MODEL GENERATION ==========
-def safe_log(level: int, message: str, exc_info: bool = False) -> None:
-    """
-    Safely log a message, catching any logging exceptions to prevent them from breaking execution.
-    Args:
-        level: Logging level (e.g., logging.INFO, logging.WARNING)
-        message: Message to log
-        exc_info: Whether to include exception info
-    """
-    try:
-        log_with_memory(level, message)
-    except Exception:
-        # Fallback to standard logging if log_with_memory fails
-        try:
-            logging.log(level, message, exc_info=exc_info)
-        except Exception:
-            pass  # Silently fail to prevent logging errors from breaking execution
-def safe_log_exception(message: str, exception: Exception) -> None:
-    """
-    Safely log an exception, catching any logging exceptions to prevent them from breaking execution.
-    Args:
-        message: Context message for the exception
-        exception: The exception that occurred
-    """
     try:
-        log_exception_with_memory(message, exception)
-    except Exception:
-        # Fallback to standard logging if log_exception_with_memory fails
-        try:
-            logging.exception(f"{message}: {exception}")
-        except Exception:
-            pass  # Silently fail to prevent logging errors from breaking execution
-def prepare_generation_kwargs(
-    inputs: Dict,
-    max_new_tokens: int,
-    pad_token_id: int,
-    model_type: str,
-    use_cache: Optional[bool] = None
-) -> Dict:
-    """
-    Prepare generation kwargs with proper cache settings for OpenVINO models.
-    Args:
-        inputs: Tokenized input tensors
-        max_new_tokens: Maximum number of new tokens to generate
-        pad_token_id: Padding token ID
-        model_type: Type of model (causal-openvino, text-generation, etc.)
-        use_cache: Optional explicit cache setting (if None, auto-detect for OpenVINO)
-    Returns:
-        Dictionary of generation kwargs
-    """
-    generation_kwargs = {
-        **inputs,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": False,
-        "pad_token_id": pad_token_id,
-    }
-    # Disable cache for OpenVINO models to avoid DynamicCache.get_max_length compatibility issues
-    if use_cache is None:
-        use_cache = not (model_type == "causal-openvino")
-    if not use_cache:
-        generation_kwargs["use_cache"] = False
-    return generation_kwargs
-def extract_summary_from_text(text: str, patterns: Optional[List[str]] = None) -> str:
-    """
-    Extract summary from generated text by finding the first matching pattern.
-    Args:
-        text: Generated text from model
-        patterns: List of patterns to search for (default: common summary patterns)
-    Returns:
-        Extracted summary text
-    """
-    if patterns is None:
-        patterns = [
-            "Now generate the complete clinical summary",
-            "Now generate the complete clinical summary in markdown format:",
-            "## Clinical",
-            "# Clinical",
-            "Clinical"
-        ]
-    for pattern in patterns:
-        if pattern in text:
-            return text.split(pattern)[-1].strip()
-    return text.strip()
-async def generate_text_with_pipeline(
-    pipeline,
-    prompt: str,
-    max_new_tokens: int,
-    model_type: str,
-    pad_token_id: Optional[int] = None
-) -> str:
-    """
-    Generate text using a model pipeline with proper error handling and cache settings.
-    Args:
-        pipeline: Model pipeline with model and tokenizer attributes
-        prompt: Input prompt text
-        max_new_tokens: Maximum number of new tokens to generate
-        model_type: Type of model (causal-openvino, text-generation, etc.)
-        pad_token_id: Padding token ID (defaults to tokenizer.pad_token_id or eos_token_id)
-    Returns:
-        Generated text
-    """
-    # Tokenize input
-    inputs = pipeline.tokenizer([prompt], return_tensors="pt")
-    # Get pad_token_id if not provided
-    if pad_token_id is None:
-        pad_token_id = pipeline.tokenizer.pad_token_id or pipeline.tokenizer.eos_token_id or 0
-    # Prepare generation kwargs
-    generation_kwargs = prepare_generation_kwargs(
-        inputs=inputs,
-        max_new_tokens=max_new_tokens,
-        pad_token_id=pad_token_id,
-        model_type=model_type
-    )
-    # Generate text
-    outputs = await asyncio.to_thread(
-        pipeline.model.generate,
-        **generation_kwargs,
-    )
-    # Decode output
-    text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return text
 def monitor_memory_usage(operation_name: Optional[str] = None, job_id: Optional[str] = None):
     """Monitor and return current memory usage. Optionally log warnings.
@@ -1427,7 +1271,6 @@ DATA:
                         'patientName': ehr_data.get('result', {}).get('patientname', 'Unknown')
                     }
                 })
-<<<<<<< HEAD
             inputs = pipeline.tokenizer([prompt], return_tensors="pt")
             from ..utils.unified_model_manager import unified_model_manager as _unified_manager
@@ -1447,22 +1290,22 @@ DATA:
                 stream=False
                 )
             raw_summary = await asyncio.to_thread(model.generate, prompt, config)
-=======
-            # Generate text using helper function with proper cache settings
-            text = await generate_text_with_pipeline(
-                pipeline=pipeline,
-                prompt=prompt,
-                max_new_tokens=_effective_max_new_tokens(data.get("max_new_tokens"), default=1024),
-                model_type=model_type
-            )
->>>>>>> FT-DEV-20/10/2025
             try:
                 log_with_memory(logging.INFO, f"[SUMMARY] text-gen generated request_id={request_id} chars={len(raw_summary)}")
             except Exception:
                 pass
-            # Extract summary using helper function
-            new_summary = extract_summary_from_text(text)
             markdown_summary = summary_to_markdown(new_summary)
             markdown_summary = ensure_four_sections(markdown_summary)
@@ -3650,7 +3493,7 @@ async def clear_performance_cache():
             torch.set_num_threads(2)
             logging.info(f"⚙️ [STEP 3.1] Set torch threads to 2")
-            # Tokenize and generate using helper function
             tokenize_start = time.time()
             logging.info(f"🔤 [STEP 3.2] Tokenizing prompt (length: {len(prompt)} chars)...")
             inputs = pipeline.tokenizer([prompt], return_tensors="pt")
@@ -3661,30 +3504,49 @@ async def clear_performance_cache():
             # Generate response
             generate_start = time.time()
             logging.info(f"🎯 [STEP 3.3] Generating response (max_new_tokens=8192)...")
-            text = await generate_text_with_pipeline(
-                pipeline=pipeline,
-                prompt=prompt,
-                max_new_tokens=8192,
-                model_type=model_type,
-                pad_token_id=pipeline.tokenizer.eos_token_id or 32000
-            )
             generate_time = time.time() - generate_start
             logging.info(f"✅ [STEP 3.3] Generated in {generate_time:.3f}s")
-            logging.info(f"📝 [STEP 3.3] Generated text length: {len(text)} characters")
-            logging.info(f"📝 [STEP 3.3] Generated text preview: {text[:200]}...")
             generation_time = time.time() - generation_start
             logging.info(f"🎯 [STEP 3] Model generation completed in {generation_time:.3f}s total")
-            logging.info(f"📊 [STEP 3] Generation breakdown: tokenize={tokenize_time:.3f}s, generate={generate_time:.3f}s")
             # Extract and process summary
             logging.info(f"📄 [STEP 4] Starting summary extraction and processing...")
             extraction_start = time.time()
-            # Extract summary using helper function
             logging.info(f"🔍 [STEP 4.1] Extracting summary from generated text...")
-            new_summary = extract_summary_from_text(text)
-            logging.info(f"✅ [STEP 4.1] Summary extracted")
             extraction_time = time.time() - extraction_start
             logging.info(f"✅ [STEP 4.1] Summary extracted in {extraction_time:.3f}s")

+from collections import defaultdict
+from fastapi import APIRouter, Request, UploadFile, File, Form, HTTPException, BackgroundTasks, Query
+from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
+from typing import List, Optional, Dict
 import os
 import uuid
 import json
 import threading
 import time
 import logging
 from ..core_logger import log_with_memory, log_exception_with_memory
 # Set up logger
 logger = logging.getLogger(__name__)
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
 from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
 from ..utils.unified_model_manager import unified_model_manager, GenerationConfig
 # ========== CONSTANTS AND CONFIGURATION ==========
 # Standardized timeout values for consistent behavior across all modes
 def cleanup_memory():
     """
     Clean up memory by forcing garbage collection and clearing caches.
+    This helps prevent memory leaks in long-running processes.
     """
     try:
         # Force garbage collection
         gc.collect()
+        # Clear PyTorch cache if available
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         # Clean up global caches to prevent memory leaks
         cleanup_global_caches()
         # Log memory usage for monitoring
         memory_info = psutil.virtual_memory()
+        logging.info(f"Memory cleanup completed. Available memory: {memory_info.available / 1024 / 1024 / 1024:.2f} GB")
     except Exception as e:
         logging.warning(f"Memory cleanup failed: {str(e)}")
 def cleanup_global_caches():
     """
     except Exception:
         return result
+def cleanup_memory():
+    """Clean up memory after model operations for HF Spaces with comprehensive monitoring"""
     try:
+        # Get initial memory usage
+        process = psutil.Process(os.getpid())
+        initial_memory = process.memory_info().rss / 1024 / 1024
+        # Force garbage collection
+        gc.collect()
+        # Clear CUDA cache if available
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            # Get GPU memory info
+            allocated = torch.cuda.memory_allocated() / 1024**3  # GB
+            cached = torch.cuda.memory_reserved() / 1024**3  # GB
+            gpu_info = f", GPU: {allocated:.2f}GB allocated, {cached:.2f}GB cached"
+        else:
+            gpu_info = ""
+        # Get final memory usage
+        final_memory = process.memory_info().rss / 1024 / 1024
+        memory_delta = final_memory - initial_memory
+        PERFORMANCE_METRICS["memory_usage_mb"] = final_memory
+        print(f"Memory cleanup completed. Current usage: {final_memory:.1f} MB (Δ{memory_delta:+.1f}MB){gpu_info}")
+        # Log warning if memory usage is high
+        if final_memory > 3000:  # 3GB threshold
+            print(f"⚠️  High memory usage detected: {final_memory:.1f} MB")
+    except Exception as e:
+        print(f"Memory cleanup failed: {e}")
 def monitor_memory_usage(operation_name: Optional[str] = None, job_id: Optional[str] = None):
     """Monitor and return current memory usage. Optionally log warnings.
                         'patientName': ehr_data.get('result', {}).get('patientname', 'Unknown')
                     }
                 })
             inputs = pipeline.tokenizer([prompt], return_tensors="pt")
             from ..utils.unified_model_manager import unified_model_manager as _unified_manager
                 stream=False
                 )
             raw_summary = await asyncio.to_thread(model.generate, prompt, config)
             try:
                 log_with_memory(logging.INFO, f"[SUMMARY] text-gen generated request_id={request_id} chars={len(raw_summary)}")
             except Exception:
                 pass
+            summary_start_patterns = [
+                "Now generate the complete clinical summary",
+                "## Clinical",
+                "# Clinical",
+                "Clinical"
+            ]
+            new_summary = text
+            for pattern in summary_start_patterns:
+                if pattern in text:
+                    new_summary = text.split(pattern)[-1].strip()
+                    break
             markdown_summary = summary_to_markdown(new_summary)
             markdown_summary = ensure_four_sections(markdown_summary)
             torch.set_num_threads(2)
             logging.info(f"⚙️ [STEP 3.1] Set torch threads to 2")
+            # Tokenize input
             tokenize_start = time.time()
             logging.info(f"🔤 [STEP 3.2] Tokenizing prompt (length: {len(prompt)} chars)...")
             inputs = pipeline.tokenizer([prompt], return_tensors="pt")
             # Generate response
             generate_start = time.time()
             logging.info(f"🎯 [STEP 3.3] Generating response (max_new_tokens=8192)...")
+            outputs = pipeline.model.generate(**inputs, max_new_tokens=8192, do_sample=False, pad_token_id=pipeline.tokenizer.eos_token_id or 32000)
             generate_time = time.time() - generate_start
             logging.info(f"✅ [STEP 3.3] Generated in {generate_time:.3f}s")
+            logging.info(f"📊 [STEP 3.3] Output tensor shape: {outputs.shape}")
+            # Decode response
+            decode_start = time.time()
+            logging.info(f"📝 [STEP 3.4] Decoding generated tokens...")
+            text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            decode_time = time.time() - decode_start
+            logging.info(f"✅ [STEP 3.4] Decoded in {decode_time:.3f}s")
+            logging.info(f"📝 [STEP 3.4] Generated text length: {len(text)} characters")
+            logging.info(f"📝 [STEP 3.4] Generated text preview: {text[:200]}...")
             generation_time = time.time() - generation_start
             logging.info(f"🎯 [STEP 3] Model generation completed in {generation_time:.3f}s total")
+            logging.info(f"📊 [STEP 3] Generation breakdown: tokenize={tokenize_time:.3f}s, generate={generate_time:.3f}s, decode={decode_time:.3f}s")
             # Extract and process summary
             logging.info(f"📄 [STEP 4] Starting summary extraction and processing...")
             extraction_start = time.time()
+            summary_start_patterns = [
+                "Now generate the complete clinical summary in markdown format:",
+                "## Clinical",
+                "# Clinical",
+                "Clinical"
+            ]
             logging.info(f"🔍 [STEP 4.1] Extracting summary from generated text...")
+            logging.info(f"📊 [STEP 4.1] Looking for patterns: {summary_start_patterns}")
+            new_summary = text
+            pattern_found = None
+            for pattern in summary_start_patterns:
+                if pattern in text:
+                    new_summary = text.split(pattern)[-1].strip()
+                    pattern_found = pattern
+                    logging.info(f"✅ [STEP 4.1] Found pattern: '{pattern}'")
+                    break
+            if not pattern_found:
+                logging.warning(f"⚠️ [STEP 4.1] No extraction pattern found, using full text")
             extraction_time = time.time() - extraction_start
             logging.info(f"✅ [STEP 4.1] Summary extracted in {extraction_time:.3f}s")