Spaces:
Paused
Paused
Commit Β·
6c585d3
1
Parent(s): 3de57f3
Revert "merge conflicts"
Browse filesThis reverts commit 3de57f34b087996592583edd47a6e4a02b9dbc0a, reversing
changes made to 29a03e37a15cb4a97eceb9a4b71cd489c6f6c0a1.
requirements.txt
CHANGED
|
@@ -2,10 +2,7 @@
|
|
| 2 |
torch>=2.3.0
|
| 3 |
torchvision>=0.18.0
|
| 4 |
torchaudio>=2.3.0
|
| 5 |
-
|
| 6 |
-
# with OpenVINO/optimum-intel. Version 4.47.0+ works; 4.50.0+ removed get_max_length.
|
| 7 |
-
# Also using use_cache=False workaround in code as safety measure.
|
| 8 |
-
transformers>=4.47.0,<4.50.0
|
| 9 |
tokenizers==0.21.4
|
| 10 |
accelerate>=0.30.0
|
| 11 |
safetensors==0.6.2
|
|
|
|
| 2 |
torch>=2.3.0
|
| 3 |
torchvision>=0.18.0
|
| 4 |
torchaudio>=2.3.0
|
| 5 |
+
transformers>=4.42.0
|
|
|
|
|
|
|
|
|
|
| 6 |
tokenizers==0.21.4
|
| 7 |
accelerate>=0.30.0
|
| 8 |
safetensors==0.6.2
|
services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc
CHANGED
|
Binary files a/services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/__pycache__/app.cpython-311.pyc differ
|
|
|
services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc
CHANGED
|
Binary files a/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc and b/services/ai-service/src/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc differ
|
|
|
services/ai-service/src/ai_med_extract/api/routes_fastapi.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
import os
|
| 4 |
import uuid
|
| 5 |
import json
|
|
@@ -7,29 +9,10 @@ import asyncio
|
|
| 7 |
import threading
|
| 8 |
import time
|
| 9 |
import logging
|
| 10 |
-
import gc
|
| 11 |
-
import hashlib
|
| 12 |
-
import re
|
| 13 |
-
from collections import defaultdict
|
| 14 |
-
from datetime import datetime, timedelta
|
| 15 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 16 |
-
from typing import List, Optional, Dict
|
| 17 |
-
|
| 18 |
-
# Third-party libraries
|
| 19 |
-
import torch
|
| 20 |
-
import psutil
|
| 21 |
-
import requests
|
| 22 |
-
from fastapi import APIRouter, Request, UploadFile, File, Form, HTTPException, BackgroundTasks, Query
|
| 23 |
-
from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
|
| 24 |
-
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
|
| 25 |
-
|
| 26 |
-
# Local imports
|
| 27 |
from ..core_logger import log_with_memory, log_exception_with_memory
|
| 28 |
-
from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
|
| 29 |
|
| 30 |
# Set up logger
|
| 31 |
logger = logging.getLogger(__name__)
|
| 32 |
-
<<<<<<< HEAD
|
| 33 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 34 |
import torch
|
| 35 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
|
|
@@ -44,8 +27,6 @@ from datetime import datetime, timedelta
|
|
| 44 |
|
| 45 |
from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
|
| 46 |
from ..utils.unified_model_manager import unified_model_manager, GenerationConfig
|
| 47 |
-
=======
|
| 48 |
-
>>>>>>> FT-DEV-20/10/2025
|
| 49 |
|
| 50 |
# ========== CONSTANTS AND CONFIGURATION ==========
|
| 51 |
# Standardized timeout values for consistent behavior across all modes
|
|
@@ -209,50 +190,25 @@ async def retry_operation(operation, max_attempts: int, operation_name: str, job
|
|
| 209 |
def cleanup_memory():
|
| 210 |
"""
|
| 211 |
Clean up memory by forcing garbage collection and clearing caches.
|
| 212 |
-
This helps prevent memory leaks in long-running processes
|
| 213 |
"""
|
| 214 |
try:
|
| 215 |
-
# Get initial memory usage
|
| 216 |
-
process = psutil.Process(os.getpid())
|
| 217 |
-
initial_memory = process.memory_info().rss / 1024 / 1024 # MB
|
| 218 |
-
|
| 219 |
# Force garbage collection
|
| 220 |
gc.collect()
|
| 221 |
-
|
| 222 |
-
# Clear
|
| 223 |
if torch.cuda.is_available():
|
| 224 |
torch.cuda.empty_cache()
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
# Get GPU memory info
|
| 228 |
-
allocated = torch.cuda.memory_allocated() / 1024**3 # GB
|
| 229 |
-
cached = torch.cuda.memory_reserved() / 1024**3 # GB
|
| 230 |
-
gpu_info = f", GPU: {allocated:.2f}GB allocated, {cached:.2f}GB cached"
|
| 231 |
-
else:
|
| 232 |
-
gpu_info = ""
|
| 233 |
-
|
| 234 |
# Clean up global caches to prevent memory leaks
|
| 235 |
cleanup_global_caches()
|
| 236 |
-
|
| 237 |
-
# Get final memory usage
|
| 238 |
-
final_memory = process.memory_info().rss / 1024 / 1024 # MB
|
| 239 |
-
memory_delta = final_memory - initial_memory
|
| 240 |
-
PERFORMANCE_METRICS["memory_usage_mb"] = final_memory
|
| 241 |
-
|
| 242 |
# Log memory usage for monitoring
|
| 243 |
memory_info = psutil.virtual_memory()
|
| 244 |
-
|
| 245 |
-
print(f"Memory cleanup completed. Current usage: {final_memory:.1f} MB (Ξ{memory_delta:+.1f}MB), Available: {available_gb:.2f} GB{gpu_info}")
|
| 246 |
-
|
| 247 |
-
# Log warning if memory usage is high
|
| 248 |
-
if final_memory > 3000: # 3GB threshold
|
| 249 |
-
print(f"β οΈ High memory usage detected: {final_memory:.1f} MB")
|
| 250 |
-
|
| 251 |
-
logging.info(f"Memory cleanup completed. Available memory: {available_gb:.2f} GB{gpu_info}")
|
| 252 |
|
| 253 |
except Exception as e:
|
| 254 |
logging.warning(f"Memory cleanup failed: {str(e)}")
|
| 255 |
-
print(f"Memory cleanup failed: {e}")
|
| 256 |
|
| 257 |
def cleanup_global_caches():
|
| 258 |
"""
|
|
@@ -673,153 +629,41 @@ def generate_rule_based_summary(baseline, delta_text, visits=None, patientid=Non
|
|
| 673 |
except Exception:
|
| 674 |
return result
|
| 675 |
|
| 676 |
-
|
| 677 |
-
|
| 678 |
-
def safe_log(level: int, message: str, exc_info: bool = False) -> None:
|
| 679 |
-
"""
|
| 680 |
-
Safely log a message, catching any logging exceptions to prevent them from breaking execution.
|
| 681 |
-
|
| 682 |
-
Args:
|
| 683 |
-
level: Logging level (e.g., logging.INFO, logging.WARNING)
|
| 684 |
-
message: Message to log
|
| 685 |
-
exc_info: Whether to include exception info
|
| 686 |
-
"""
|
| 687 |
-
try:
|
| 688 |
-
log_with_memory(level, message)
|
| 689 |
-
except Exception:
|
| 690 |
-
# Fallback to standard logging if log_with_memory fails
|
| 691 |
-
try:
|
| 692 |
-
logging.log(level, message, exc_info=exc_info)
|
| 693 |
-
except Exception:
|
| 694 |
-
pass # Silently fail to prevent logging errors from breaking execution
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
def safe_log_exception(message: str, exception: Exception) -> None:
|
| 698 |
-
"""
|
| 699 |
-
Safely log an exception, catching any logging exceptions to prevent them from breaking execution.
|
| 700 |
-
|
| 701 |
-
Args:
|
| 702 |
-
message: Context message for the exception
|
| 703 |
-
exception: The exception that occurred
|
| 704 |
-
"""
|
| 705 |
try:
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
pass # Silently fail to prevent logging errors from breaking execution
|
| 713 |
-
|
| 714 |
-
def prepare_generation_kwargs(
|
| 715 |
-
inputs: Dict,
|
| 716 |
-
max_new_tokens: int,
|
| 717 |
-
pad_token_id: int,
|
| 718 |
-
model_type: str,
|
| 719 |
-
use_cache: Optional[bool] = None
|
| 720 |
-
) -> Dict:
|
| 721 |
-
"""
|
| 722 |
-
Prepare generation kwargs with proper cache settings for OpenVINO models.
|
| 723 |
-
|
| 724 |
-
Args:
|
| 725 |
-
inputs: Tokenized input tensors
|
| 726 |
-
max_new_tokens: Maximum number of new tokens to generate
|
| 727 |
-
pad_token_id: Padding token ID
|
| 728 |
-
model_type: Type of model (causal-openvino, text-generation, etc.)
|
| 729 |
-
use_cache: Optional explicit cache setting (if None, auto-detect for OpenVINO)
|
| 730 |
-
|
| 731 |
-
Returns:
|
| 732 |
-
Dictionary of generation kwargs
|
| 733 |
-
"""
|
| 734 |
-
generation_kwargs = {
|
| 735 |
-
**inputs,
|
| 736 |
-
"max_new_tokens": max_new_tokens,
|
| 737 |
-
"do_sample": False,
|
| 738 |
-
"pad_token_id": pad_token_id,
|
| 739 |
-
}
|
| 740 |
-
|
| 741 |
-
# Disable cache for OpenVINO models to avoid DynamicCache.get_max_length compatibility issues
|
| 742 |
-
if use_cache is None:
|
| 743 |
-
use_cache = not (model_type == "causal-openvino")
|
| 744 |
-
|
| 745 |
-
if not use_cache:
|
| 746 |
-
generation_kwargs["use_cache"] = False
|
| 747 |
-
|
| 748 |
-
return generation_kwargs
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
def extract_summary_from_text(text: str, patterns: Optional[List[str]] = None) -> str:
|
| 752 |
-
"""
|
| 753 |
-
Extract summary from generated text by finding the first matching pattern.
|
| 754 |
-
|
| 755 |
-
Args:
|
| 756 |
-
text: Generated text from model
|
| 757 |
-
patterns: List of patterns to search for (default: common summary patterns)
|
| 758 |
-
|
| 759 |
-
Returns:
|
| 760 |
-
Extracted summary text
|
| 761 |
-
"""
|
| 762 |
-
if patterns is None:
|
| 763 |
-
patterns = [
|
| 764 |
-
"Now generate the complete clinical summary",
|
| 765 |
-
"Now generate the complete clinical summary in markdown format:",
|
| 766 |
-
"## Clinical",
|
| 767 |
-
"# Clinical",
|
| 768 |
-
"Clinical"
|
| 769 |
-
]
|
| 770 |
-
|
| 771 |
-
for pattern in patterns:
|
| 772 |
-
if pattern in text:
|
| 773 |
-
return text.split(pattern)[-1].strip()
|
| 774 |
-
|
| 775 |
-
return text.strip()
|
| 776 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 777 |
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
model_type: str,
|
| 783 |
-
pad_token_id: Optional[int] = None
|
| 784 |
-
) -> str:
|
| 785 |
-
"""
|
| 786 |
-
Generate text using a model pipeline with proper error handling and cache settings.
|
| 787 |
-
|
| 788 |
-
Args:
|
| 789 |
-
pipeline: Model pipeline with model and tokenizer attributes
|
| 790 |
-
prompt: Input prompt text
|
| 791 |
-
max_new_tokens: Maximum number of new tokens to generate
|
| 792 |
-
model_type: Type of model (causal-openvino, text-generation, etc.)
|
| 793 |
-
pad_token_id: Padding token ID (defaults to tokenizer.pad_token_id or eos_token_id)
|
| 794 |
-
|
| 795 |
-
Returns:
|
| 796 |
-
Generated text
|
| 797 |
-
"""
|
| 798 |
-
# Tokenize input
|
| 799 |
-
inputs = pipeline.tokenizer([prompt], return_tensors="pt")
|
| 800 |
-
|
| 801 |
-
# Get pad_token_id if not provided
|
| 802 |
-
if pad_token_id is None:
|
| 803 |
-
pad_token_id = pipeline.tokenizer.pad_token_id or pipeline.tokenizer.eos_token_id or 0
|
| 804 |
-
|
| 805 |
-
# Prepare generation kwargs
|
| 806 |
-
generation_kwargs = prepare_generation_kwargs(
|
| 807 |
-
inputs=inputs,
|
| 808 |
-
max_new_tokens=max_new_tokens,
|
| 809 |
-
pad_token_id=pad_token_id,
|
| 810 |
-
model_type=model_type
|
| 811 |
-
)
|
| 812 |
-
|
| 813 |
-
# Generate text
|
| 814 |
-
outputs = await asyncio.to_thread(
|
| 815 |
-
pipeline.model.generate,
|
| 816 |
-
**generation_kwargs,
|
| 817 |
-
)
|
| 818 |
-
|
| 819 |
-
# Decode output
|
| 820 |
-
text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 821 |
-
return text
|
| 822 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 823 |
|
| 824 |
def monitor_memory_usage(operation_name: Optional[str] = None, job_id: Optional[str] = None):
|
| 825 |
"""Monitor and return current memory usage. Optionally log warnings.
|
|
@@ -1427,7 +1271,6 @@ DATA:
|
|
| 1427 |
'patientName': ehr_data.get('result', {}).get('patientname', 'Unknown')
|
| 1428 |
}
|
| 1429 |
})
|
| 1430 |
-
<<<<<<< HEAD
|
| 1431 |
inputs = pipeline.tokenizer([prompt], return_tensors="pt")
|
| 1432 |
from ..utils.unified_model_manager import unified_model_manager as _unified_manager
|
| 1433 |
|
|
@@ -1447,22 +1290,22 @@ DATA:
|
|
| 1447 |
stream=False
|
| 1448 |
)
|
| 1449 |
raw_summary = await asyncio.to_thread(model.generate, prompt, config)
|
| 1450 |
-
=======
|
| 1451 |
-
# Generate text using helper function with proper cache settings
|
| 1452 |
-
text = await generate_text_with_pipeline(
|
| 1453 |
-
pipeline=pipeline,
|
| 1454 |
-
prompt=prompt,
|
| 1455 |
-
max_new_tokens=_effective_max_new_tokens(data.get("max_new_tokens"), default=1024),
|
| 1456 |
-
model_type=model_type
|
| 1457 |
-
)
|
| 1458 |
-
>>>>>>> FT-DEV-20/10/2025
|
| 1459 |
try:
|
| 1460 |
log_with_memory(logging.INFO, f"[SUMMARY] text-gen generated request_id={request_id} chars={len(raw_summary)}")
|
| 1461 |
except Exception:
|
| 1462 |
pass
|
| 1463 |
|
| 1464 |
-
|
| 1465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1466 |
|
| 1467 |
markdown_summary = summary_to_markdown(new_summary)
|
| 1468 |
markdown_summary = ensure_four_sections(markdown_summary)
|
|
@@ -3650,7 +3493,7 @@ async def clear_performance_cache():
|
|
| 3650 |
torch.set_num_threads(2)
|
| 3651 |
logging.info(f"βοΈ [STEP 3.1] Set torch threads to 2")
|
| 3652 |
|
| 3653 |
-
# Tokenize
|
| 3654 |
tokenize_start = time.time()
|
| 3655 |
logging.info(f"π€ [STEP 3.2] Tokenizing prompt (length: {len(prompt)} chars)...")
|
| 3656 |
inputs = pipeline.tokenizer([prompt], return_tensors="pt")
|
|
@@ -3661,30 +3504,49 @@ async def clear_performance_cache():
|
|
| 3661 |
# Generate response
|
| 3662 |
generate_start = time.time()
|
| 3663 |
logging.info(f"π― [STEP 3.3] Generating response (max_new_tokens=8192)...")
|
| 3664 |
-
|
| 3665 |
-
pipeline=pipeline,
|
| 3666 |
-
prompt=prompt,
|
| 3667 |
-
max_new_tokens=8192,
|
| 3668 |
-
model_type=model_type,
|
| 3669 |
-
pad_token_id=pipeline.tokenizer.eos_token_id or 32000
|
| 3670 |
-
)
|
| 3671 |
generate_time = time.time() - generate_start
|
| 3672 |
logging.info(f"β
[STEP 3.3] Generated in {generate_time:.3f}s")
|
| 3673 |
-
logging.info(f"
|
| 3674 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3675 |
|
| 3676 |
generation_time = time.time() - generation_start
|
| 3677 |
logging.info(f"π― [STEP 3] Model generation completed in {generation_time:.3f}s total")
|
| 3678 |
-
logging.info(f"π [STEP 3] Generation breakdown: tokenize={tokenize_time:.3f}s, generate={generate_time:.3f}s")
|
| 3679 |
|
| 3680 |
# Extract and process summary
|
| 3681 |
logging.info(f"π [STEP 4] Starting summary extraction and processing...")
|
| 3682 |
extraction_start = time.time()
|
| 3683 |
|
| 3684 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3685 |
logging.info(f"π [STEP 4.1] Extracting summary from generated text...")
|
| 3686 |
-
|
| 3687 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3688 |
|
| 3689 |
extraction_time = time.time() - extraction_start
|
| 3690 |
logging.info(f"β
[STEP 4.1] Summary extracted in {extraction_time:.3f}s")
|
|
|
|
| 1 |
+
from collections import defaultdict
|
| 2 |
+
from fastapi import APIRouter, Request, UploadFile, File, Form, HTTPException, BackgroundTasks, Query
|
| 3 |
+
from fastapi.responses import JSONResponse, StreamingResponse, HTMLResponse
|
| 4 |
+
from typing import List, Optional, Dict
|
| 5 |
import os
|
| 6 |
import uuid
|
| 7 |
import json
|
|
|
|
| 9 |
import threading
|
| 10 |
import time
|
| 11 |
import logging
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from ..core_logger import log_with_memory, log_exception_with_memory
|
|
|
|
| 13 |
|
| 14 |
# Set up logger
|
| 15 |
logger = logging.getLogger(__name__)
|
|
|
|
| 16 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 17 |
import torch
|
| 18 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline as transformers_pipeline
|
|
|
|
| 27 |
|
| 28 |
from ..utils.file_utils import allowed_file, check_file_size, get_data_from_storage, save_data_to_storage
|
| 29 |
from ..utils.unified_model_manager import unified_model_manager, GenerationConfig
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# ========== CONSTANTS AND CONFIGURATION ==========
|
| 32 |
# Standardized timeout values for consistent behavior across all modes
|
|
|
|
| 190 |
def cleanup_memory():
|
| 191 |
"""
|
| 192 |
Clean up memory by forcing garbage collection and clearing caches.
|
| 193 |
+
This helps prevent memory leaks in long-running processes.
|
| 194 |
"""
|
| 195 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
# Force garbage collection
|
| 197 |
gc.collect()
|
| 198 |
+
|
| 199 |
+
# Clear PyTorch cache if available
|
| 200 |
if torch.cuda.is_available():
|
| 201 |
torch.cuda.empty_cache()
|
| 202 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
# Clean up global caches to prevent memory leaks
|
| 204 |
cleanup_global_caches()
|
| 205 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
# Log memory usage for monitoring
|
| 207 |
memory_info = psutil.virtual_memory()
|
| 208 |
+
logging.info(f"Memory cleanup completed. Available memory: {memory_info.available / 1024 / 1024 / 1024:.2f} GB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
except Exception as e:
|
| 211 |
logging.warning(f"Memory cleanup failed: {str(e)}")
|
|
|
|
| 212 |
|
| 213 |
def cleanup_global_caches():
|
| 214 |
"""
|
|
|
|
| 629 |
except Exception:
|
| 630 |
return result
|
| 631 |
|
| 632 |
+
def cleanup_memory():
|
| 633 |
+
"""Clean up memory after model operations for HF Spaces with comprehensive monitoring"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 634 |
try:
|
| 635 |
+
# Get initial memory usage
|
| 636 |
+
process = psutil.Process(os.getpid())
|
| 637 |
+
initial_memory = process.memory_info().rss / 1024 / 1024
|
| 638 |
+
|
| 639 |
+
# Force garbage collection
|
| 640 |
+
gc.collect()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
|
| 642 |
+
# Clear CUDA cache if available
|
| 643 |
+
if torch.cuda.is_available():
|
| 644 |
+
torch.cuda.empty_cache()
|
| 645 |
+
torch.cuda.synchronize()
|
| 646 |
+
|
| 647 |
+
# Get GPU memory info
|
| 648 |
+
allocated = torch.cuda.memory_allocated() / 1024**3 # GB
|
| 649 |
+
cached = torch.cuda.memory_reserved() / 1024**3 # GB
|
| 650 |
+
gpu_info = f", GPU: {allocated:.2f}GB allocated, {cached:.2f}GB cached"
|
| 651 |
+
else:
|
| 652 |
+
gpu_info = ""
|
| 653 |
|
| 654 |
+
# Get final memory usage
|
| 655 |
+
final_memory = process.memory_info().rss / 1024 / 1024
|
| 656 |
+
memory_delta = final_memory - initial_memory
|
| 657 |
+
PERFORMANCE_METRICS["memory_usage_mb"] = final_memory
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
|
| 659 |
+
print(f"Memory cleanup completed. Current usage: {final_memory:.1f} MB (Ξ{memory_delta:+.1f}MB){gpu_info}")
|
| 660 |
+
|
| 661 |
+
# Log warning if memory usage is high
|
| 662 |
+
if final_memory > 3000: # 3GB threshold
|
| 663 |
+
print(f"β οΈ High memory usage detected: {final_memory:.1f} MB")
|
| 664 |
+
|
| 665 |
+
except Exception as e:
|
| 666 |
+
print(f"Memory cleanup failed: {e}")
|
| 667 |
|
| 668 |
def monitor_memory_usage(operation_name: Optional[str] = None, job_id: Optional[str] = None):
|
| 669 |
"""Monitor and return current memory usage. Optionally log warnings.
|
|
|
|
| 1271 |
'patientName': ehr_data.get('result', {}).get('patientname', 'Unknown')
|
| 1272 |
}
|
| 1273 |
})
|
|
|
|
| 1274 |
inputs = pipeline.tokenizer([prompt], return_tensors="pt")
|
| 1275 |
from ..utils.unified_model_manager import unified_model_manager as _unified_manager
|
| 1276 |
|
|
|
|
| 1290 |
stream=False
|
| 1291 |
)
|
| 1292 |
raw_summary = await asyncio.to_thread(model.generate, prompt, config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1293 |
try:
|
| 1294 |
log_with_memory(logging.INFO, f"[SUMMARY] text-gen generated request_id={request_id} chars={len(raw_summary)}")
|
| 1295 |
except Exception:
|
| 1296 |
pass
|
| 1297 |
|
| 1298 |
+
summary_start_patterns = [
|
| 1299 |
+
"Now generate the complete clinical summary",
|
| 1300 |
+
"## Clinical",
|
| 1301 |
+
"# Clinical",
|
| 1302 |
+
"Clinical"
|
| 1303 |
+
]
|
| 1304 |
+
new_summary = text
|
| 1305 |
+
for pattern in summary_start_patterns:
|
| 1306 |
+
if pattern in text:
|
| 1307 |
+
new_summary = text.split(pattern)[-1].strip()
|
| 1308 |
+
break
|
| 1309 |
|
| 1310 |
markdown_summary = summary_to_markdown(new_summary)
|
| 1311 |
markdown_summary = ensure_four_sections(markdown_summary)
|
|
|
|
| 3493 |
torch.set_num_threads(2)
|
| 3494 |
logging.info(f"βοΈ [STEP 3.1] Set torch threads to 2")
|
| 3495 |
|
| 3496 |
+
# Tokenize input
|
| 3497 |
tokenize_start = time.time()
|
| 3498 |
logging.info(f"π€ [STEP 3.2] Tokenizing prompt (length: {len(prompt)} chars)...")
|
| 3499 |
inputs = pipeline.tokenizer([prompt], return_tensors="pt")
|
|
|
|
| 3504 |
# Generate response
|
| 3505 |
generate_start = time.time()
|
| 3506 |
logging.info(f"π― [STEP 3.3] Generating response (max_new_tokens=8192)...")
|
| 3507 |
+
outputs = pipeline.model.generate(**inputs, max_new_tokens=8192, do_sample=False, pad_token_id=pipeline.tokenizer.eos_token_id or 32000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3508 |
generate_time = time.time() - generate_start
|
| 3509 |
logging.info(f"β
[STEP 3.3] Generated in {generate_time:.3f}s")
|
| 3510 |
+
logging.info(f"π [STEP 3.3] Output tensor shape: {outputs.shape}")
|
| 3511 |
+
|
| 3512 |
+
# Decode response
|
| 3513 |
+
decode_start = time.time()
|
| 3514 |
+
logging.info(f"π [STEP 3.4] Decoding generated tokens...")
|
| 3515 |
+
text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 3516 |
+
decode_time = time.time() - decode_start
|
| 3517 |
+
logging.info(f"β
[STEP 3.4] Decoded in {decode_time:.3f}s")
|
| 3518 |
+
logging.info(f"π [STEP 3.4] Generated text length: {len(text)} characters")
|
| 3519 |
+
logging.info(f"π [STEP 3.4] Generated text preview: {text[:200]}...")
|
| 3520 |
|
| 3521 |
generation_time = time.time() - generation_start
|
| 3522 |
logging.info(f"π― [STEP 3] Model generation completed in {generation_time:.3f}s total")
|
| 3523 |
+
logging.info(f"π [STEP 3] Generation breakdown: tokenize={tokenize_time:.3f}s, generate={generate_time:.3f}s, decode={decode_time:.3f}s")
|
| 3524 |
|
| 3525 |
# Extract and process summary
|
| 3526 |
logging.info(f"π [STEP 4] Starting summary extraction and processing...")
|
| 3527 |
extraction_start = time.time()
|
| 3528 |
|
| 3529 |
+
summary_start_patterns = [
|
| 3530 |
+
"Now generate the complete clinical summary in markdown format:",
|
| 3531 |
+
"## Clinical",
|
| 3532 |
+
"# Clinical",
|
| 3533 |
+
"Clinical"
|
| 3534 |
+
]
|
| 3535 |
+
|
| 3536 |
logging.info(f"π [STEP 4.1] Extracting summary from generated text...")
|
| 3537 |
+
logging.info(f"π [STEP 4.1] Looking for patterns: {summary_start_patterns}")
|
| 3538 |
+
|
| 3539 |
+
new_summary = text
|
| 3540 |
+
pattern_found = None
|
| 3541 |
+
for pattern in summary_start_patterns:
|
| 3542 |
+
if pattern in text:
|
| 3543 |
+
new_summary = text.split(pattern)[-1].strip()
|
| 3544 |
+
pattern_found = pattern
|
| 3545 |
+
logging.info(f"β
[STEP 4.1] Found pattern: '{pattern}'")
|
| 3546 |
+
break
|
| 3547 |
+
|
| 3548 |
+
if not pattern_found:
|
| 3549 |
+
logging.warning(f"β οΈ [STEP 4.1] No extraction pattern found, using full text")
|
| 3550 |
|
| 3551 |
extraction_time = time.time() - extraction_start
|
| 3552 |
logging.info(f"β
[STEP 4.1] Summary extracted in {extraction_time:.3f}s")
|