Spaces:

salvinjose
/

HNTAI

Paused

App Files Files Community

sachinchandrankallar committed on Sep 9, 2025

Commit

f91c303

1 Parent(s): 8eb4114

patient summary working

Browse files

Files changed (18) hide show

GGUF_TROUBLESHOOTING.md +1 -1
TODO.md +1 -1
ai_med_extract/__pycache__/app.cpython-311.pyc +0 -0
ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc +0 -0
ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc +0 -0
ai_med_extract/agents/patient_summary_agent.py +1 -1
ai_med_extract/agents/summarizer.py +2 -2
ai_med_extract/api/__pycache__/routes.cpython-311.pyc +0 -0
ai_med_extract/api/routes.py +472 -309
ai_med_extract/app.py +1 -0
ai_med_extract/utils/__pycache__/model_loader_gguf.cpython-311.pyc +0 -0
ai_med_extract/utils/__pycache__/model_manager.cpython-311.pyc +0 -0
ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc +0 -0
ai_med_extract/utils/model_config.py +2 -2
ai_med_extract/utils/model_loader_gguf.py +3 -3
ai_med_extract/utils/model_manager.py +2 -2
ai_med_extract/utils/openvino_summarizer_utils.py +58 -13
generate_patient_summary_colab.py +4 -4

GGUF_TROUBLESHOOTING.md CHANGED Viewed

@@ -34,7 +34,7 @@ Your Hugging Face Space is throwing 500 errors when calling the `generatepatient
 - Threading-based timeout (more reliable than signals)
 ### 3. **Memory Optimization**
-- Reduced context window from 4096 to 2048 tokens
 - Reduced batch size from 128 to 64
 - CPU-only mode with optimized thread usage

 - Threading-based timeout (more reliable than signals)
 ### 3. **Memory Optimization**
+- Reduced context window from 4096 to 4000 tokens
 - Reduced batch size from 128 to 64
 - CPU-only mode with optimized thread usage

TODO.md CHANGED Viewed

@@ -7,7 +7,7 @@
 ## Details
 - Approximate prompt tokens by word count (split on whitespace)
-- Calculate allowed max_tokens = 2048 - prompt_tokens
 - Reduce max_tokens if necessary, log warning
 - Raise error if prompt too long
 - Set n_threads to os.cpu_count() for speed

 ## Details
 - Approximate prompt tokens by word count (split on whitespace)
+- Calculate allowed max_tokens = 4000 - prompt_tokens
 - Reduce max_tokens if necessary, log warning
 - Raise error if prompt too long
 - Set n_threads to os.cpu_count() for speed

ai_med_extract/__pycache__/app.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/__pycache__/app.cpython-311.pyc and b/ai_med_extract/__pycache__/app.cpython-311.pyc differ

ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc and b/ai_med_extract/agents/__pycache__/patient_summary_agent.cpython-311.pyc differ

ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc and b/ai_med_extract/agents/__pycache__/summarizer.cpython-311.pyc differ

ai_med_extract/agents/patient_summary_agent.py CHANGED Viewed

@@ -16,7 +16,7 @@ class PatientSummarizerAgent:
         model_name: str = "falconsai/medical_summarization",
         model_type: str = "summarization",
         device: Optional[str] = None,
-        max_input_tokens: int = 2048,
         max_output_tokens: int = 512
     ):
         self.model_name = model_name

         model_name: str = "falconsai/medical_summarization",
         model_type: str = "summarization",
         device: Optional[str] = None,
+        max_input_tokens: int = 4000,
         max_output_tokens: int = 512
     ):
         self.model_name = model_name

ai_med_extract/agents/summarizer.py CHANGED Viewed

@@ -14,7 +14,7 @@ class SummarizerAgent:
         # Base parameters
         min_length = max(30, min(100, int(word_count * 0.1)))  # 10% of word count, min 30, max 100
-        max_length = max(512, min(2048, int(word_count * 0.5)))  # 50% of word count, min 512, max 2048
         # Adjust based on previous summary length to prevent degradation
         if self.request_count > 0 and self.last_summary_length > 0:
@@ -90,7 +90,7 @@ class SummarizerAgent:
                 # Use GGUF's built-in method that handles large inputs and 4-section requirement
                 summary = model.generate_full_summary(
                     clean_text,
-                    max_tokens=2048,  # Increased to handle larger inputs
                     max_loops=2
                 )
             else:

         # Base parameters
         min_length = max(30, min(100, int(word_count * 0.1)))  # 10% of word count, min 30, max 100
+        max_length = max(512, min(4000, int(word_count * 0.5)))  # 50% of word count, min 512, max 4000
         # Adjust based on previous summary length to prevent degradation
         if self.request_count > 0 and self.last_summary_length > 0:
                 # Use GGUF's built-in method that handles large inputs and 4-section requirement
                 summary = model.generate_full_summary(
                     clean_text,
+                    max_tokens=4000,  # Increased to handle larger inputs
                     max_loops=2
                 )
             else:

ai_med_extract/api/__pycache__/routes.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/api/__pycache__/routes.cpython-311.pyc and b/ai_med_extract/api/__pycache__/routes.cpython-311.pyc differ

ai_med_extract/api/routes.py CHANGED Viewed

@@ -1,18 +1,15 @@
 """
 Medical Data Extraction API Routes
 This module provides Flask API endpoints for medical data processing, including:
 - Patient summary generation using various model types (GGUF, OpenVINO, HuggingFace)
 - File upload and text extraction
 - Medical data extraction from text and audio
 - Protected Health Information (PHI) scrubbing
 - Model management and dynamic loading
 The API supports multiple model formats and includes comprehensive error handling,
 memory optimization, and caching mechanisms for efficient operation in both
 local and cloud environments (Hugging Face Spaces).
 """
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import json
 import logging
@@ -28,6 +25,7 @@ from transformers import (
     pipeline as transformers_pipeline
 )
 from ai_med_extract.agents.patient_summary_agent import PatientSummarizerAgent
 agent = PatientSummarizerAgent(model_name="falconsai/medical_summarization")
 from ai_med_extract.agents.summarizer import SummarizerAgent
 from ai_med_extract.utils.file_utils import (
@@ -37,35 +35,28 @@ from ai_med_extract.utils.file_utils import (
     get_data_from_storage,
 )
 from ..utils.validation import clean_result, validate_patient_name
-# from ..utils.patient_summary_utils import clean_patient_data, flatten_to_string_list
-from ai_med_extract.utils.patient_summary_utils import   clean_patient_data, flatten_to_string_list
 import time
 logger = logging.getLogger(__name__)
 # Add GGUF model cache at the top of the file
 GGUF_MODEL_CACHE = {}
 def get_gguf_pipeline(model_name: str, filename: str = None):
     """
     Load and cache GGUF model pipelines with comprehensive error handling.
     This function provides a cached interface to GGUF models with fallback mechanisms
     for robust operation in production environments.
     Args:
         model_name (str): The name of the GGUF model or HuggingFace repository ID.
                          Can be a local file path or HuggingFace model identifier.
         filename (str, optional): Specific filename for HuggingFace repository models.
                                  Required when model_name is a repository ID.
     Returns:
         GGUFModelPipeline: A loaded GGUF model pipeline instance or fallback pipeline.
     Raises:
         RuntimeError: If both model loading and fallback mechanisms fail.
     Notes:
         - Uses a global cache to avoid reloading the same model multiple times
         - Implements timeout mechanism for model loading (5 minutes)
@@ -77,11 +68,9 @@ def get_gguf_pipeline(model_name: str, filename: str = None):
         try:
             from ai_med_extract.utils.model_loader_gguf import GGUFModelPipeline, create_fallback_pipeline
             import time
             # Add timeout for model loading
             start_time = time.time()
             timeout = 300  # 5 minutes timeout
             # Try to load the GGUF model
             try:
                 GGUF_MODEL_CACHE[key] = GGUFModelPipeline(model_name, filename, timeout=timeout)
@@ -90,55 +79,67 @@ def get_gguf_pipeline(model_name: str, filename: str = None):
             except Exception as e:
                 load_time = time.time() - start_time
                 print(f"[GGUF] Failed to load model {model_name} after {load_time:.2f}s: {e}")
                 # If model loading fails, use fallback
                 print("[GGUF] Using fallback pipeline")
                 GGUF_MODEL_CACHE[key] = create_fallback_pipeline()
         except Exception as e:
             print(f"[GGUF] Critical error in model loading: {e}")
             # Create a basic fallback
             from ai_med_extract.utils.model_loader_gguf import create_fallback_pipeline
             GGUF_MODEL_CACHE[key] = create_fallback_pipeline()
     return GGUF_MODEL_CACHE[key]
 def get_qa_pipeline(qa_model_type, qa_model_name):
     if not qa_model_type or not qa_model_name:
         raise ValueError("Both qa_model_type and qa_model_name must be provided")
     if not hasattr(get_qa_pipeline, "cache"):
         get_qa_pipeline.cache = {}
     # For Hugging Face Spaces, we need to be memory efficient
     import torch
     torch.cuda.empty_cache()  # Clear GPU memory before loading model
     # Set default tensor type to float32 for better compatibility
     torch.set_default_tensor_type(torch.FloatTensor)
     if torch.cuda.is_available():
         torch.set_default_tensor_type(torch.cuda.FloatTensor)
     key = (qa_model_type, qa_model_name)
     if key in get_qa_pipeline.cache:
         return get_qa_pipeline.cache[key]
     try:
         # For Hugging Face Spaces, use smaller models by default
         if "Qwen/Qwen-7B-Chat" in qa_model_name:
             qa_model_name = "Qwen/Qwen-1_8B-Chat"
         elif "Llama" in qa_model_name:
             qa_model_name = "facebook/opt-125m"
         # Load tokenizer with trust_remote_code=True for custom tokenizers
         tokenizer = AutoTokenizer.from_pretrained(
             qa_model_name,
             trust_remote_code=True,
             cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
         )
         # Load model with memory optimizations
         try:
             model = AutoModelForCausalLM.from_pretrained(
@@ -160,7 +161,6 @@ def get_qa_pipeline(qa_model_type, qa_model_name):
                 low_cpu_mem_usage=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
         # Create pipeline with memory optimizations
         pipeline = transformers_pipeline(
             task=qa_model_type,
@@ -169,10 +169,8 @@ def get_qa_pipeline(qa_model_type, qa_model_name):
             device_map="auto",
             torch_dtype=torch.float32
         )
         get_qa_pipeline.cache[key] = pipeline
         return pipeline
     except Exception as e:
         raise
@@ -182,14 +180,11 @@ def run_qa_pipeline(qa_pipeline, question, context):
     """
     if not qa_pipeline or not question or not context:
         raise ValueError("Pipeline, question and context are required")
     qa_model_type = getattr(qa_pipeline, '_qa_model_type', None)
     try:
         if qa_model_type == 'text-generation':
             prompt = f"Question: {question}\nContext: {context}\nAnswer:"
             result = qa_pipeline(prompt, max_new_tokens=128, do_sample=False)
             if isinstance(result, list) and result and 'generated_text' in result[0]:
                 answer = result[0]['generated_text'].split('Answer:')[-1].strip()
                 return {'answer': answer}
@@ -203,30 +198,23 @@ def run_qa_pipeline(qa_pipeline, question, context):
 def get_ner_pipeline(ner_model_type, ner_model_name):
     if not ner_model_type or not ner_model_name:
         raise ValueError("Both ner_model_type and ner_model_name must be provided")
     if not hasattr(get_ner_pipeline, "cache"):
         get_ner_pipeline.cache = {}
     # For Hugging Face Spaces, we need to be memory efficient
     import torch
     torch.cuda.empty_cache()  # Clear GPU memory before loading model
     # Set default tensor type
     torch.set_default_tensor_type(torch.FloatTensor)
     if torch.cuda.is_available():
         torch.set_default_tensor_type(torch.cuda.FloatTensor)
     key = (ner_model_type, ner_model_name)
     if key in get_ner_pipeline.cache:
         return get_ner_pipeline.cache[key]
     try:
         from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
         # Clear any existing models from memory
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         # Load tokenizer
         try:
             tokenizer = AutoTokenizer.from_pretrained(
@@ -242,7 +230,6 @@ def get_ner_pipeline(ner_model_type, ner_model_name):
                 trust_remote_code=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
         # Load model with memory optimizations
         try:
             # For NER models, we'll use CPU if device_map='auto' is not supported
@@ -276,7 +263,6 @@ def get_ner_pipeline(ner_model_type, ner_model_name):
                 torch_dtype=torch.float32,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
         # Create pipeline with appropriate device configuration
         try:
             qa_pipeline = pipeline(
@@ -297,15 +283,12 @@ def get_ner_pipeline(ner_model_type, ner_model_name):
                 )
             else:
                 raise
         # Cache the pipeline
         get_ner_pipeline.cache[key] = qa_pipeline
         return qa_pipeline
     except Exception as e:
         raise
 def get_summarizer_pipeline(summarizer_model_type, summarizer_model_name):
     if not hasattr(get_summarizer_pipeline, "cache"):
         get_summarizer_pipeline.cache = {}
@@ -313,7 +296,6 @@ def get_summarizer_pipeline(summarizer_model_type, summarizer_model_name):
     if key not in get_summarizer_pipeline.cache:
         import torch
         from transformers import pipeline
         # Use float16 only if CUDA is available, else use float32
         if torch.cuda.is_available():
             dtype = torch.float16
@@ -323,7 +305,6 @@ def get_summarizer_pipeline(summarizer_model_type, summarizer_model_name):
             dtype = torch.float32
             device = -1
             device_map = None
         get_summarizer_pipeline.cache[key] = pipeline(
             task=summarizer_model_type,
             model=summarizer_model_name,
@@ -334,7 +315,6 @@ def get_summarizer_pipeline(summarizer_model_type, summarizer_model_name):
         )
     return get_summarizer_pipeline.cache[key]
 def register_routes(app, agents):
     from ai_med_extract.utils.openvino_summarizer_utils import (
         parse_ehr_chartsummarydtl, visits_sorted, compute_deltas, build_compact_baseline, delta_to_text, build_main_prompt
@@ -353,10 +333,8 @@ def register_routes(app, agents):
             chartsummarydtl = ehr_result.get("chartsummarydtl") if isinstance(ehr_result, dict) else None
             if not chartsummarydtl:
                 return jsonify({"error": "Missing chartsummarydtl in input"}), 400
             # Normalize visits
             visits = parse_ehr_chartsummarydtl(chartsummarydtl)
             # Extract patient demographics if available
             patient_info = ""
             if isinstance(ehr_result, dict):
@@ -367,7 +345,6 @@ def register_routes(app, agents):
                 past_medical_history = ', '.join(ehr_result.get('past_medical_history', []))
                 social_history = ehr_result.get('social_history', 'Not specified')
                 patient_info = f"Patient: {patient_name} (ID: {patient_id}, Age: {age}, Gender: {gender})\nPast Medical History: {past_medical_history}\nSocial History: {social_history}\n"
             # Generate summary from current data only (no state tracking)
             # Use empty old visits to compute deltas against baseline
             delta = compute_deltas([], visits)
@@ -375,7 +352,6 @@ def register_routes(app, agents):
             baseline = build_compact_baseline(all_visits)
             delta_text = delta_to_text(delta)
             prompt = build_main_prompt(baseline, delta_text, patient_info)
             # Model selection logic (model_name, model_type)
             model_name = data.get("model_name") or "microsoft/Phi-3-mini-4k-instruct"
             model_type = data.get("model_type") or "text-generation"
@@ -387,14 +363,12 @@ def register_routes(app, agents):
             pipeline = loader.model_loader.load() if hasattr(loader, "model_loader") else None
             if not pipeline:
                 return jsonify({"error": "Model pipeline not available"}), 500
             # Run inference
             import torch
             torch.set_num_threads(2)
             inputs = pipeline.tokenizer([prompt], return_tensors="pt")
             outputs = pipeline.model.generate(**inputs, max_new_tokens=100000, do_sample=False, pad_token_id=pipeline.tokenizer.eos_token_id or 32000)
             text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
             # Extract just the markdown summary (remove prompt text)
             # The model should return the complete markdown-formatted summary
             summary_start_patterns = [
@@ -403,13 +377,11 @@ def register_routes(app, agents):
                 "# Clinical Assessment",
                 "Clinical Assessment"
             ]
             new_summary = text
             for pattern in summary_start_patterns:
                 if pattern in text:
                     new_summary = text.split(pattern)[-1].strip()
                     break
             return jsonify({
                 "summary": new_summary,
                 "baseline": baseline,
@@ -417,15 +389,15 @@ def register_routes(app, agents):
             }), 200
         except Exception as e:
             return jsonify({"error": f"Failed to generate summary: {str(e)}"}), 500
-    # Configure upload directory based on environment    import os
     if os.environ.get('SPACE_ID'):  # We're running on Hugging Face Spaces
         app.config['UPLOAD_FOLDER'] = '/data/uploads'
     else:  # We're running locally
         upload_dir = os.path.join(os.getcwd(), 'uploads')
         os.makedirs(upload_dir, exist_ok=True)
         app.config['UPLOAD_FOLDER'] = upload_dir
     # Ensure the upload directory exists and is writable
     if not os.path.exists(app.config['UPLOAD_FOLDER']):
         try:
@@ -444,11 +416,9 @@ def register_routes(app, agents):
     def upload_file():
         import torch
         torch.cuda.empty_cache()  # Clear GPU memory before processing
         files = request.files.getlist("file")
         patient_name = request.form.get("patient_name", "").strip()
         password = request.form.get("password")
         # Use more compatible models by default
         qa_model_name = request.form.get("qa_model_name", "facebook/bart-base")
         qa_model_type = request.form.get("qa_model_type", "text-generation")
@@ -456,10 +426,8 @@ def register_routes(app, agents):
         ner_model_type = request.form.get("ner_model_type", "ner")
         summarizer_model_name = request.form.get("summarizer_model_name", "facebook/bart-base")
         summarizer_model_type = request.form.get("summarizer_model_type", "summarization")
         if not files:
             return jsonify({"error": "No file uploaded"}), 400
         # Accept any model type and model name for QA, NER, and summarizer
         if not qa_model_name or not qa_model_type:
             return jsonify({"error": "QA model name and type are required"}), 400
@@ -467,21 +435,18 @@ def register_routes(app, agents):
             qa_pipeline = get_qa_pipeline(qa_model_type, qa_model_name)
         except Exception as e:
             return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
         if not ner_model_name or not ner_model_type:
             return jsonify({"error": "NER model name and type are required"}), 400
         try:
             ner_pipeline = get_ner_pipeline(ner_model_type, ner_model_name)
         except Exception as e:
             return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
         if not summarizer_model_name or not summarizer_model_type:
             return jsonify({"error": "Summarizer model name and type are required"}), 400
         try:
             summarizer_pipeline = get_summarizer_pipeline(summarizer_model_type, summarizer_model_name)
         except Exception as e:
             return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
         extracted_data = []
         for file in files:
             if file.filename == "":
@@ -514,7 +479,6 @@ def register_routes(app, agents):
             except Exception as e:
                 os.remove(filepath)  # Clean up on failure
                 return jsonify({"error": f"Text extraction failed: {str(e)}"}), 500
             skip_medical_check = (
                 request.form.get("skip_medical_check", "false").lower() == "true"
             )
@@ -650,13 +614,11 @@ def register_routes(app, agents):
             file = request.files["file"]
             if file.filename == "":
                 return jsonify({"error": "No selected file"}), 400
             # Use secure filename
             from werkzeug.utils import secure_filename
             import uuid
             temp_filename = f"{uuid.uuid4()}_{secure_filename(file.filename)}"
             temp_path = os.path.join(app.config['UPLOAD_FOLDER'], temp_filename)
             file.save(temp_path)
             result = whisper_model.transcribe(temp_path)
             os.remove(temp_path)
@@ -666,7 +628,6 @@ def register_routes(app, agents):
                 os.remove(temp_path)
             return jsonify({"error": str(e)}), 500
     def group_by_category(data):
         grouped = defaultdict(list)
         for item in data:
@@ -678,20 +639,17 @@ def register_routes(app, agents):
                     "answer": item.get("answer", "Not Available"),
                 }
             )
         return [{"category": k, "detail": v} for k, v in grouped.items()]
     def deduplicate_extractions(data):
         """
         Drop repeated extraction records, keeping the last one per label.

         Records are compared by the value stored under their "label" key
         (records without that key all share the key ``None``). When several
         records carry the same label, only the one appearing latest in
         ``data`` is retained. The surviving records are returned in the same
         relative order they had in ``data``.

         Args:
             data: A sequence of dict-like records, each exposing ``.get``.

         Returns:
             list: The surviving records, in original relative order.
         """
         taken_labels = set()
         survivors = []
         # Walk from the end so the first record met for a label is its latest one.
         for record in reversed(data):
             label = record.get("label")
             if label in taken_labels:
                 continue
             taken_labels.add(label)
             survivors.append(record)
         # Survivors were collected back-to-front; flip them into input order.
         survivors.reverse()
         return survivors
@@ -701,24 +659,19 @@ def register_routes(app, agents):
             text,
             add_special_tokens=False
         )
         chunks = []
         start = 0
         while start < len(input_ids):
             end = min(start + max_tokens, len(input_ids))
             chunk_ids = input_ids[start:end]
             chunk_text = tokenizer.decode(
                 chunk_ids,
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=True
             )
             # Ensure partial continuation isn't cut off mid-sentence
             if not chunk_text.endswith(('.', '?', '!', ':')):
                 chunk_text += "..."
             chunks.append(chunk_text)
             start += max_tokens - overlap
         return chunks
@@ -731,7 +684,6 @@ def register_routes(app, agents):
         except ValueError:
             # '[' not found in output
             return []
         # Try parsing full array first
         try:
             parsed = json.loads(json_text)
@@ -739,7 +691,6 @@ def register_routes(app, agents):
                 return parsed
         except Exception:
             pass  # fallback to manual parsing
         # Manual recovery via brace matching
         stack = 0
         obj_start = None
@@ -758,15 +709,12 @@ def register_routes(app, agents):
                     except Exception as e:
                         print(f"❌ Invalid JSON object: {e}")
                     obj_start = None
         return extracted
     def process_chunk(generator, chunk, idx):
         prompt = f"""
                 [INST] <<SYS>>
                 You are a clinical data extraction assistant.
                 Your job is to:
                 1. Read the following medical report.
                 2. Extract all medically relevant facts as a list of JSON objects.
@@ -775,7 +723,6 @@ def register_routes(app, agents):
                 - "question": a question related to that field
                 - "answer": the answer from the text
                 4. After extracting the list, categorize each object under one of the following fixed categories:
                 - Patient Info
                 - Vitals
                 - Symptoms
@@ -787,7 +734,6 @@ def register_routes(app, agents):
                 - Laboratory
                 - Radiology
                 - Doctor Note
                 Example format for structure only — do not include in output:
                 [
                 {{
@@ -797,22 +743,17 @@ def register_routes(app, agents):
                     "category": "Patient Info"
                 }},
                 ]
-                ⚠ Use these categories listed above. If an item does not fit any of these categories, create a new category for it.
                 Text:
                 {chunk}
                 Return a single valid JSON array of all extracted objects.
                 Do not include any explanations or commentary.
                 Only output the JSON array
                 <</SYS>> [/INST]
                 """
         try:
             # Clear GPU memory before processing
             torch.cuda.empty_cache()
             # Process with memory optimizations
             output = generator(
                 prompt,
@@ -820,31 +761,26 @@ def register_routes(app, agents):
                 do_sample=False,     # Disable sampling for deterministic output
                 temperature=0.3,     # Lower temperature for more focused output
             )[0]["generated_text"]
             return idx, output
         except Exception as e:
             return idx, None
     @app.route("/extract_medical_data", methods=["POST"])
     def extract_medical_data():
         data = request.json
         qa_model_name = data.get("qa_model_name")
         qa_model_type = data.get("qa_model_type")
         extracted_files = data.get("extracted_data")
         if not qa_model_name or not qa_model_type:
             return jsonify({"error": "Missing 'qa_model_name' or 'qa_model_type'"}), 400
         if not extracted_files:
             return jsonify({"error": "Missing 'extracted_data' in request"}), 400
         try:
             tokenizer = AutoTokenizer.from_pretrained(
                 qa_model_name,
                 trust_remote_code=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
             model = AutoModelForCausalLM.from_pretrained(
                 qa_model_name,
                 device_map="auto",
@@ -853,32 +789,25 @@ def register_routes(app, agents):
                 low_cpu_mem_usage=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
             generator = transformers_pipeline(
                 task=qa_model_type,
                 model=model,
                 tokenizer=tokenizer,
                 torch_dtype=torch.float32
             )
         except Exception as e:
             return jsonify({"error": f"Could not load model: {str(e)}"}), 500
         structured_response = {"extracted_data": []}
         for file_data in extracted_files:
             filename = file_data.get("file", "unknown_file")
             context = file_data.get("extracted_text", "").strip()
             if not context:
                 structured_response["extracted_data"].append(
                     {"file": filename, "medical_fields": []}
                 )
                 continue
             chunks = chunk_text(context, tokenizer)
             all_extracted = []
             with ThreadPoolExecutor(max_workers=4) as executor:
                 futures = {
                     executor.submit(process_chunk, generator, chunk, idx): idx
@@ -887,19 +816,16 @@ def register_routes(app, agents):
                 for future in as_completed(futures):
                     idx = futures[future]
                     _, output = future.result()
                     if not output:
                         continue
                     try:
                         objs = extract_json_objects(output)
                         if objs:
                             all_extracted.extend(objs)
                         else:
-                           print(f"⚠ Chunk {idx+1} yielded no valid JSON.")
                     except Exception as e:
                         print(f"❌ Error extracting JSON from chunk {idx+1}")
             # Clean and group results for this file
             if all_extracted:
                 deduped = deduplicate_extractions(all_extracted)
@@ -907,20 +833,16 @@ def register_routes(app, agents):
                 grouped_data = group_by_category(deduped)
             else:
                 grouped_data = {"error": "No valid data extracted"}
             structured_response["extracted_data"].append(
                 {"file": filename, "medical_fields": grouped_data}
             )
             try:
                 save_data_to_storage(filename, grouped_data)
             except Exception as e:
-                print(f"⚠ Failed to save data for {filename}: {e}")
         print("✅ Extraction complete.")
         return jsonify(structured_response)
     @app.route("/api/generate_summary", methods=["POST"])
     def generate_summary():
         logger.info("Received request to generate summary.")
@@ -951,34 +873,27 @@ def register_routes(app, agents):
             torch.set_default_tensor_type(torch.FloatTensor)
             if torch.cuda.is_available():
                 torch.set_default_tensor_type(torch.cuda.FloatTensor)
             # Handle multipart form data from Flutter
             if "audio" not in request.files:
                 return jsonify({"error": "No audio file provided"}), 400
             audio_file = request.files["audio"]
             if audio_file.filename == "":
                 return jsonify({"error": "No selected audio file"}), 400
             # Validate file extension
             if not allowed_file(audio_file.filename):
                 return jsonify({"error": f"Unsupported audio format. Allowed formats: wav, mp3, m4a, ogg"}), 400
             # Check file size
             valid_size, error_message = check_file_size(audio_file)
             if not valid_size:
                 return jsonify({"error": error_message}), 400
             # Use default model if not specified
             qa_model_name = request.form.get("qa_model_name", "facebook/bart-base")
             qa_model_type = request.form.get("qa_model_type", "text-generation")
             # Load QA model with proper error handling
             try:
                 qa_pipeline = get_qa_pipeline(qa_model_type, qa_model_name)
             except Exception as e:
                 return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
             # Use platform-agnostic temp directory
             import uuid
             from werkzeug.utils import secure_filename
@@ -987,10 +902,8 @@ def register_routes(app, agents):
             os.makedirs(temp_dir, exist_ok=True)
             temp_filename = f"{uuid.uuid4()}_{secure_filename(audio_file.filename)}"
             temp_path = os.path.join(temp_dir, temp_filename)
             try:
                 audio_file.save(temp_path)
                 # Transcribe audio with retries
                 max_retries = 3
                 transcribed_text = None
@@ -1007,16 +920,13 @@ def register_routes(app, agents):
                             raise
                         torch.cuda.empty_cache()  # Clear GPU memory between attempts
                         continue
                 if not transcribed_text:
                     raise ValueError("Failed to transcribe audio after multiple attempts")
                 # Clean and process text
                 try:
                     clean_text = PHIScrubberAgent.scrub_phi(transcribed_text)
                 except Exception as e:
                     clean_text = transcribed_text
                 # Extract medical data with proper device handling
                 try:
                     with torch.cuda.device(0) if torch.cuda.is_available() else torch.no_grad():
@@ -1025,11 +935,9 @@ def register_routes(app, agents):
                         medical_data = medical_data_extractor.extract_medical_data(clean_text)
                 except Exception as e:
                     medical_data = {"error": f"Medical data extraction failed: {str(e)}"}
                 # Clean up temporary file
                 if os.path.exists(temp_path):
                     os.remove(temp_path)
                 # Return response in the format expected by Flutter
                 return jsonify({
                     "status": "success",
@@ -1038,7 +946,6 @@ def register_routes(app, agents):
                         "medical_chart": medical_data
                     }
                 }), 200
             except Exception as e:
                 if temp_path and os.path.exists(temp_path):
                     os.remove(temp_path)
@@ -1046,7 +953,6 @@ def register_routes(app, agents):
                     "status": "error",
                     "error": f"Processing failed: {str(e)}"
                 }), 500
         except Exception as e:
             if temp_path and os.path.exists(temp_path):
                 os.remove(temp_path)
@@ -1055,22 +961,17 @@ def register_routes(app, agents):
                 "error": f"Request handling failed: {str(e)}"
             }), 500
-    # Initialize GGUF pipeline with proper model name handling
-    gguf_model_name = "microsoft/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf"
-    if gguf_model_name.endswith('.gguf') and '/' in gguf_model_name:
-        repo_id, filename = gguf_model_name.rsplit('/', 1)
-        PIPELINE = get_gguf_pipeline(repo_id, filename)
-    else:
-        PIPELINE = get_gguf_pipeline(gguf_model_name)
-    _ = PIPELINE.generate("Hello", max_tokens=5)
     @app.route('/generate_patient_summary', methods=['POST'])
     def generate_patient_summary():
         """
-        Enhanced: Uses OpenVINO-style prompt, delta, and validation logic for patient summary generation.
-        Generates fresh summary every time without state tracking.
         """
         from ai_med_extract.utils.openvino_summarizer_utils import (
             parse_ehr_chartsummarydtl, visits_sorted, compute_deltas, build_compact_baseline, delta_to_text, build_main_prompt
@@ -1078,217 +979,488 @@ def register_routes(app, agents):
         try:
             start_total = time.time()
             data = request.get_json()
-            t0 = time.time()
             patientid = data.get("patientid")
             token = data.get("token")
             key = data.get("key")
-            model_name = data.get("patient_summarizer_model_name") or "falconsai/medical_summarization"
-            model_type = data.get("patient_summarizer_model_type") or data.get("model_type") or "summarization"
             if not patientid or not token or not key:
                 return jsonify({"error": "Missing required fields: patientid, token, or key"}), 400
-            api_url = f"{key}/Transactionapi/api/PatientList/patientsummary"
             headers = {
                 "Authorization": f"Bearer {token}",
                 "Content-Type": "application/json",
             }
-            # Only include x-api-key if it's a distinct API key, not a base URL
             if key and not key.startswith("http"):
                 headers["x-api-key"] = key
             t_api_start = time.time()
-            response = requests.post(api_url, json={"patientid": patientid}, headers=headers, timeout=30)
             t_api_end = time.time()
             if response.status_code != 200:
                 return jsonify({
-                    "error": "API request failed",
-                    "status": response.status_code,
-                    "message": response.text
-                }), 502
             try:
                 api_data = response.json()
             except ValueError:
-                api_data = response.text
-            if isinstance(api_data, dict):
-                ehr_result = api_data.get("result") or api_data
-            else:
-                ehr_result = api_data
             chartsummarydtl = ehr_result.get("chartsummarydtl") if isinstance(ehr_result, dict) else None
             if not chartsummarydtl:
-                # Return diagnostics to aid debugging on Spaces
                 return jsonify({
-                    "error": "Missing chartsummarydtl in EHR response",
-                    "diagnostic": {
-                        "api_url": api_url,
-                        "status": response.status_code,
-                        "content_type": response.headers.get("content-type"),
-                        "body_preview": (response.text[:500] if hasattr(response, "text") else str(api_data))
-                    }
                 }), 500
             visits = parse_ehr_chartsummarydtl(chartsummarydtl)
-            # Generate summary from current data only (no state tracking)
-            # Use empty old visits to compute deltas against baseline
             delta = compute_deltas([], visits)
             all_visits = visits_sorted(visits)
             baseline = build_compact_baseline(all_visits)
             delta_text = delta_to_text(delta)
-            prompt = build_main_prompt(baseline, delta_text)
-            t_model_load_start = time.time()
-            # Model selection logic (supporting OpenVINO, HuggingFace, and GGUF)
-            pipeline = None
-            loader = None
-            import torch
-            torch.set_num_threads(2)
             if model_type == "gguf":
-                logger.info("Using GGUF model for summary generation.")
                 try:
-                    # Support both local path and HuggingFace repo/filename
-                    if model_name.endswith('.gguf') and '/' in model_name:
                         repo_id, filename = model_name.rsplit('/', 1)
-                        pipeline = get_gguf_pipeline(repo_id, filename)
-                    else:
-                        pipeline = get_gguf_pipeline(model_name)
-                    logger.info(f"Prompt length for GGUF model: {len(prompt)} characters.")
                     try:
-                        # The timeout is now handled internally by the pipeline
-                        summary_raw = pipeline.generate_full_summary(prompt, max_tokens=100000, max_loops=5)
-                        # Extract markdown summary directly from model output
-                        summary_start_patterns = [
-                            "Now generate the complete, updated clinical summary with all four sections in a markdown format:",
                             "## Clinical Assessment",
-                            "# Clinical Assessment",
-                            "Clinical Assessment"
                         ]
-                        markdown_summary = summary_raw
-                        for pattern in summary_start_patterns:
-                            if pattern in summary_raw:
-                                markdown_summary = summary_raw.split(pattern)[-1].strip()
-                                break
-                        # No state tracking - just return the summary
-                        # Remove undefined timing variables and only log steps that are actually measured
-                        total_time = time.time() - start_total
-                        logger.info(f"[TIMING] API call: {t_api_end-t_api_start:.2f}s, TOTAL: {total_time:.2f}s")
-                        return jsonify({
-                            "summary": markdown_summary,
-                            "baseline": baseline,
-                            "delta": delta_text
-                        }), 200
-                    except TimeoutError as e:
-                        logger.error(f"GGUF model generation timed out: {e}")
-                        # Try to use a simpler fallback model
-                        try:
-                            from ai_med_extract.utils.model_loader_gguf import create_fallback_pipeline
-                            fallback_pipeline = create_fallback_pipeline()
-                            fallback_summary = fallback_pipeline.generate_full_summary(prompt)
-                            # Extract markdown summary directly from fallback output
-                            summary_start_patterns = [
-                                "Now generate the complete, updated clinical summary with all four sections in a markdown format:",
-                                "## Clinical Assessment",
-                                "# Clinical Assessment",
-                                "Clinical Assessment"
-                            ]
-                            markdown_summary = fallback_summary
-                            for pattern in summary_start_patterns:
-                                if pattern in fallback_summary:
-                                    markdown_summary = fallback_summary.split(pattern)[-1].strip()
                                     break
-                            return jsonify({
-                                "summary": markdown_summary,
-                                "baseline": baseline,
-                                "delta": delta_text,
-                                "warning": "GGUF model timed out, using fallback summary"
-                            }), 200
-                        except Exception as fallback_error:
-                            return jsonify({
-                                "error": f"GGUF model generation timed out and fallback failed: {str(e)}",
-                                "original_error": str(e)
-                            }), 408
-                    except Exception as e:
-                        logger.error(f"GGUF model generation failed: {e}")
-                        return jsonify({"error": f"GGUF model generation failed: {str(e)}"}), 500
                 except Exception as e:
-                    logger.error(f"Failed to load GGUF model: {e}")
-                    # Try to use fallback pipeline
-                    try:
-                        from ai_med_extract.utils.model_loader_gguf import create_fallback_pipeline
-                        fallback_pipeline = create_fallback_pipeline()
-                        fallback_summary = fallback_pipeline.generate_full_summary(prompt)
-                        markdown_summary =fallback_summary
-                        # summary_to_markdown(fallback_summary)
-                        return jsonify({
-                            "summary": markdown_summary,
-                            "baseline": baseline,
-                            "delta": delta_text,
-                            "warning": "GGUF model failed to load, using fallback summary"
-                        }), 200
-                    except Exception as fallback_error:
-                        return jsonify({
-                            "error": f"Failed to load GGUF model and fallback failed: {str(e)}",
-                            "original_error": str(e)
-                        }), 500
             elif model_type in {"text-generation", "causal-openvino"}:
-                # Try to use an existing loader if available
-                loader = agents.get("medical_data_extractor")
-                if not loader or getattr(loader, 'model_name', None) != model_name:
-                    # Dynamically create OpenVINO loader if needed
-                    from ai_med_extract.utils.model_loader_spaces import get_openvino_pipeline
-                    try:
                         pipeline = get_openvino_pipeline(model_name)
-                    except Exception as e:
-                        return jsonify({"error": f"Failed to load OpenVINO pipeline: {str(e)}"}), 500
             elif model_type == "summarization":
-                loader = agents.get("summarizer")
-            # Use loader if available
-            if not pipeline and loader and hasattr(loader, "model_loader"):
-                pipeline = loader.model_loader.load()
-            if not pipeline:
-                return jsonify({"error": "Model pipeline not available"}), 500
-            inputs = pipeline.tokenizer([prompt], return_tensors="pt")
-            outputs = pipeline.model.generate(**inputs, max_new_tokens=100000, do_sample=False, pad_token_id=pipeline.tokenizer.eos_token_id or 32000)
-            text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            new_summary = text.split("Now generate the complete, updated clinical summary with all four sections in a markdown format:")[-1].strip()
-            # For other models, after extracting new_summary:
-            from ai_med_extract.api.routes import summary_to_markdown
-            logger.info(f"Baseline length: {len(baseline)} characters.")
-            logger.info(f"Delta text length: {len(delta_text)} characters.")
-            logger.info(f"Raw summary length: {len(new_summary)} characters.")
-            markdown_summary = summary_to_markdown(new_summary)
-            logger.info(f"Formatted summary length: {len(markdown_summary)} characters.")
-            # Validate and ensure the summary has all 4 required sections
-            markdown_summary = ensure_four_sections(markdown_summary)
-            # Remove undefined timing variables and only log steps that are actually measured
-            total_time = time.time() - start_total
-            print(f"[TIMING] API call: {t_api_end-t_api_start:.2f}s, TOTAL: {total_time:.2f}s")
             return jsonify({
-                "summary": markdown_summary,
-                "baseline": baseline,
-                "delta": delta_text
             }), 200
-        except requests.exceptions.Timeout:
-            return jsonify({"error": "Request to EHR API timed out"}), 504
-        except requests.exceptions.RequestException as e:
-            return jsonify({"error": f"Network error: {str(e)}"}), 503
-        except Exception as e:
-            logger.error(f"Unexpected error: {str(e)}", exc_info=True)
-            return jsonify({"error": f"Internal server error: {str(e)}"}), 500
     @app.route("/")
     def home():
         """Root endpoint: return a fixed plain-text status message with HTTP 200."""
         return "Medical Data Extraction API is running!", 200
 def summary_to_markdown(summary):
     import re
     # Remove '- answer:' and similar artifacts
     summary = re.sub(r'-\s*answer: ?', '', summary, flags=re.IGNORECASE)
     # Convert numbered sections to markdown headers
     lines = summary.splitlines()
     out = []
@@ -1298,7 +1470,6 @@ def summary_to_markdown(summary):
         '3.': '##',
         '4.': '##',
     }
     for line in lines:
         m = re.match(r'^(\d\.)\s*(.+)', line)
         if m and m.group(1) in section_map:
@@ -1306,18 +1477,14 @@ def summary_to_markdown(summary):
             out.append(f"{header} {m.group(2).strip()}")
         else:
             out.append(line)
     # Remove empty lines at the start
     while out and not out[0].strip():
         out = out[1:]
     # Check if we have the expected 4-section structure
     def is_header(line: str) -> bool:
         return bool(re.match(r'^(#{1,6})\s+.+', line.strip()))
     # Find all headers in the output
     headers = [i for i, line in enumerate(out) if is_header(line)]
     # If we have at least 4 headers, check if they match the expected structure
     if len(headers) >= 4:
         header_texts = [out[i].strip() for i in headers[:4]]
@@ -1327,23 +1494,19 @@ def summary_to_markdown(summary):
             r'##.*Plan.*Suggested.*Actions',
             r'##.*Direct.*Guidance.*Physician'
         ]
         # Check if headers match expected patterns
         matches_pattern = all(
             re.search(pattern, header, re.IGNORECASE)
             for pattern, header in zip(expected_patterns, header_texts)
         )
         if matches_pattern:
             # Keep the entire content - don't truncate
             return '\n'.join(out).strip()
     # If we don't have the expected structure, try to find the actual summary content
     # Look for the start of the clinical assessment section
     clinical_assessment_pattern = r'(?:# Clinical Assessment|## Clinical Assessment|Clinical Assessment)'
     for i, line in enumerate(out):
         if re.search(clinical_assessment_pattern, line, re.IGNORECASE):
             return '\n'.join(out[i:]).strip()
     # If no clinical assessment found, return the entire summary
-    return '\n'.join(out).strip()

 """
 Medical Data Extraction API Routes
 This module provides Flask API endpoints for medical data processing, including:
 - Patient summary generation using various model types (GGUF, OpenVINO, HuggingFace)
 - File upload and text extraction
 - Medical data extraction from text and audio
 - Protected Health Information (PHI) scrubbing
 - Model management and dynamic loading
 The API supports multiple model formats and includes comprehensive error handling,
 memory optimization, and caching mechanisms for efficient operation in both
 local and cloud environments (Hugging Face Spaces).
 """
 from concurrent.futures import ThreadPoolExecutor, as_completed
 import json
 import logging
     pipeline as transformers_pipeline
 )
 from ai_med_extract.agents.patient_summary_agent import PatientSummarizerAgent
+from ai_med_extract.utils.openvino_summarizer_utils import generate_section
 agent = PatientSummarizerAgent(model_name="falconsai/medical_summarization")
 from ai_med_extract.agents.summarizer import SummarizerAgent
 from ai_med_extract.utils.file_utils import (
     get_data_from_storage,
 )
 from ..utils.validation import clean_result, validate_patient_name
+from ai_med_extract.utils.patient_summary_utils import clean_patient_data, flatten_to_string_list
 import time
 logger = logging.getLogger(__name__)
 # Add GGUF model cache at the top of the file
 GGUF_MODEL_CACHE = {}
+GGUF_PIPELINE_CACHE = {}
 def get_gguf_pipeline(model_name: str, filename: str = None):
     """
     Load and cache GGUF model pipelines with comprehensive error handling.
     This function provides a cached interface to GGUF models with fallback mechanisms
     for robust operation in production environments.
     Args:
         model_name (str): The name of the GGUF model or HuggingFace repository ID.
                          Can be a local file path or HuggingFace model identifier.
         filename (str, optional): Specific filename for HuggingFace repository models.
                                  Required when model_name is a repository ID.
     Returns:
         GGUFModelPipeline: A loaded GGUF model pipeline instance or fallback pipeline.
     Raises:
         RuntimeError: If both model loading and fallback mechanisms fail.
     Notes:
         - Uses a global cache to avoid reloading the same model multiple times
         - Implements timeout mechanism for model loading (5 minutes)
         try:
             from ai_med_extract.utils.model_loader_gguf import GGUFModelPipeline, create_fallback_pipeline
             import time
             # Add timeout for model loading
             start_time = time.time()
             timeout = 300  # 5 minutes timeout
             # Try to load the GGUF model
             try:
                 GGUF_MODEL_CACHE[key] = GGUFModelPipeline(model_name, filename, timeout=timeout)
             except Exception as e:
                 load_time = time.time() - start_time
                 print(f"[GGUF] Failed to load model {model_name} after {load_time:.2f}s: {e}")
                 # If model loading fails, use fallback
                 print("[GGUF] Using fallback pipeline")
                 GGUF_MODEL_CACHE[key] = create_fallback_pipeline()
         except Exception as e:
             print(f"[GGUF] Critical error in model loading: {e}")
             # Create a basic fallback
             from ai_med_extract.utils.model_loader_gguf import create_fallback_pipeline
             GGUF_MODEL_CACHE[key] = create_fallback_pipeline()
     return GGUF_MODEL_CACHE[key]
def get_cached_gguf_pipeline(model_name: str, filename: str = None):
    """
    Return the GGUF pipeline for ``(model_name, filename)``, loading it at most once.

    The first request for a given pair delegates to ``get_gguf_pipeline`` and
    stores whatever it returns in the module-level ``GGUF_PIPELINE_CACHE``;
    later requests for the same pair are served from that cache.

    Args:
        model_name: Model identifier forwarded to ``get_gguf_pipeline``.
        filename: Optional filename forwarded to ``get_gguf_pipeline``.

    Returns:
        The object returned by ``get_gguf_pipeline`` for this pair.
    """
    cache_key = (model_name, filename)

    # Fast path: this pair has been requested before.
    if cache_key in GGUF_PIPELINE_CACHE:
        return GGUF_PIPELINE_CACHE[cache_key]

    loaded_pipeline = get_gguf_pipeline(model_name, filename)
    GGUF_PIPELINE_CACHE[cache_key] = loaded_pipeline
    return loaded_pipeline
def ensure_four_sections(summary: str) -> str:
    """
    Ensure the summary contains all four required section headers.

    Every line that starts with ``##`` (after stripping) is treated as a
    header. Headers are compared case-insensitively, ignoring a trailing
    colon and runs of whitespace, so a heading such as
    ``## clinical assessment:`` counts as present instead of triggering a
    duplicate placeholder section.

    Args:
        summary: Markdown summary text. ``None`` or an empty string is
            treated as a summary with no sections.

    Returns:
        The summary unchanged when all four sections are present; otherwise
        the summary with a placeholder section appended for each missing
        one, in the canonical order.
    """
    required_sections = [
        "## Clinical Assessment",
        "## Key Trends & Changes",
        "## Plan & Suggested Actions",
        "## Direct Guidance for Physician",
    ]
    placeholder = "- *Section was not generated. Consider retrying or checking input data.*"

    def _normalize(header: str) -> str:
        # Collapse whitespace, drop a trailing colon, and fold case so that
        # cosmetic variations of a header still match the canonical form.
        return " ".join(header.strip().rstrip(":").split()).casefold()

    # Tolerate a missing summary rather than failing on ``None.splitlines()``.
    summary = summary or ""

    existing_headers = {
        _normalize(line)
        for line in summary.splitlines()
        if line.strip().startswith("##")
    }
    missing_sections = [
        section for section in required_sections
        if _normalize(section) not in existing_headers
    ]
    if not missing_sections:
        return summary

    appended = [f"\n{section}\n{placeholder}" for section in missing_sections]
    return "".join([summary, *appended])
 def get_qa_pipeline(qa_model_type, qa_model_name):
     if not qa_model_type or not qa_model_name:
         raise ValueError("Both qa_model_type and qa_model_name must be provided")
     if not hasattr(get_qa_pipeline, "cache"):
         get_qa_pipeline.cache = {}
     # For Hugging Face Spaces, we need to be memory efficient
     import torch
     torch.cuda.empty_cache()  # Clear GPU memory before loading model
     # Set default tensor type to float32 for better compatibility
     torch.set_default_tensor_type(torch.FloatTensor)
     if torch.cuda.is_available():
         torch.set_default_tensor_type(torch.cuda.FloatTensor)
     key = (qa_model_type, qa_model_name)
     if key in get_qa_pipeline.cache:
         return get_qa_pipeline.cache[key]
     try:
         # For Hugging Face Spaces, use smaller models by default
         if "Qwen/Qwen-7B-Chat" in qa_model_name:
             qa_model_name = "Qwen/Qwen-1_8B-Chat"
         elif "Llama" in qa_model_name:
             qa_model_name = "facebook/opt-125m"
         # Load tokenizer with trust_remote_code=True for custom tokenizers
         tokenizer = AutoTokenizer.from_pretrained(
             qa_model_name,
             trust_remote_code=True,
             cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
         )
         # Load model with memory optimizations
         try:
             model = AutoModelForCausalLM.from_pretrained(
                 low_cpu_mem_usage=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
         # Create pipeline with memory optimizations
         pipeline = transformers_pipeline(
             task=qa_model_type,
             device_map="auto",
             torch_dtype=torch.float32
         )
         get_qa_pipeline.cache[key] = pipeline
         return pipeline
     except Exception as e:
         raise
     """
     if not qa_pipeline or not question or not context:
         raise ValueError("Pipeline, question and context are required")
     qa_model_type = getattr(qa_pipeline, '_qa_model_type', None)
     try:
         if qa_model_type == 'text-generation':
             prompt = f"Question: {question}\nContext: {context}\nAnswer:"
             result = qa_pipeline(prompt, max_new_tokens=128, do_sample=False)
             if isinstance(result, list) and result and 'generated_text' in result[0]:
                 answer = result[0]['generated_text'].split('Answer:')[-1].strip()
                 return {'answer': answer}
 def get_ner_pipeline(ner_model_type, ner_model_name):
     if not ner_model_type or not ner_model_name:
         raise ValueError("Both ner_model_type and ner_model_name must be provided")
     if not hasattr(get_ner_pipeline, "cache"):
         get_ner_pipeline.cache = {}
     # For Hugging Face Spaces, we need to be memory efficient
     import torch
     torch.cuda.empty_cache()  # Clear GPU memory before loading model
     # Set default tensor type
     torch.set_default_tensor_type(torch.FloatTensor)
     if torch.cuda.is_available():
         torch.set_default_tensor_type(torch.cuda.FloatTensor)
     key = (ner_model_type, ner_model_name)
     if key in get_ner_pipeline.cache:
         return get_ner_pipeline.cache[key]
     try:
         from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
         # Clear any existing models from memory
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         # Load tokenizer
         try:
             tokenizer = AutoTokenizer.from_pretrained(
                 trust_remote_code=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
         # Load model with memory optimizations
         try:
             # For NER models, we'll use CPU if device_map='auto' is not supported
                 torch_dtype=torch.float32,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
         # Create pipeline with appropriate device configuration
         try:
             qa_pipeline = pipeline(
                 )
             else:
                 raise
         # Cache the pipeline
         get_ner_pipeline.cache[key] = qa_pipeline
         return qa_pipeline
     except Exception as e:
         raise
 def get_summarizer_pipeline(summarizer_model_type, summarizer_model_name):
     if not hasattr(get_summarizer_pipeline, "cache"):
         get_summarizer_pipeline.cache = {}
     if key not in get_summarizer_pipeline.cache:
         import torch
         from transformers import pipeline
         # Use float16 only if CUDA is available, else use float32
         if torch.cuda.is_available():
             dtype = torch.float16
             dtype = torch.float32
             device = -1
             device_map = None
         get_summarizer_pipeline.cache[key] = pipeline(
             task=summarizer_model_type,
             model=summarizer_model_name,
         )
     return get_summarizer_pipeline.cache[key]
 def register_routes(app, agents):
     from ai_med_extract.utils.openvino_summarizer_utils import (
         parse_ehr_chartsummarydtl, visits_sorted, compute_deltas, build_compact_baseline, delta_to_text, build_main_prompt
             chartsummarydtl = ehr_result.get("chartsummarydtl") if isinstance(ehr_result, dict) else None
             if not chartsummarydtl:
                 return jsonify({"error": "Missing chartsummarydtl in input"}), 400
             # Normalize visits
             visits = parse_ehr_chartsummarydtl(chartsummarydtl)
             # Extract patient demographics if available
             patient_info = ""
             if isinstance(ehr_result, dict):
                 past_medical_history = ', '.join(ehr_result.get('past_medical_history', []))
                 social_history = ehr_result.get('social_history', 'Not specified')
                 patient_info = f"Patient: {patient_name} (ID: {patient_id}, Age: {age}, Gender: {gender})\nPast Medical History: {past_medical_history}\nSocial History: {social_history}\n"
             # Generate summary from current data only (no state tracking)
             # Use empty old visits to compute deltas against baseline
             delta = compute_deltas([], visits)
             baseline = build_compact_baseline(all_visits)
             delta_text = delta_to_text(delta)
             prompt = build_main_prompt(baseline, delta_text, patient_info)
             # Model selection logic (model_name, model_type)
             model_name = data.get("model_name") or "microsoft/Phi-3-mini-4k-instruct"
             model_type = data.get("model_type") or "text-generation"
             pipeline = loader.model_loader.load() if hasattr(loader, "model_loader") else None
             if not pipeline:
                 return jsonify({"error": "Model pipeline not available"}), 500
             # Run inference
             import torch
             torch.set_num_threads(2)
             inputs = pipeline.tokenizer([prompt], return_tensors="pt")
             outputs = pipeline.model.generate(**inputs, max_new_tokens=100000, do_sample=False, pad_token_id=pipeline.tokenizer.eos_token_id or 32000)
             text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
             # Extract just the markdown summary (remove prompt text)
             # The model should return the complete markdown-formatted summary
             summary_start_patterns = [
                 "# Clinical Assessment",
                 "Clinical Assessment"
             ]
             new_summary = text
             for pattern in summary_start_patterns:
                 if pattern in text:
                     new_summary = text.split(pattern)[-1].strip()
                     break
             return jsonify({
                 "summary": new_summary,
                 "baseline": baseline,
             }), 200
         except Exception as e:
             return jsonify({"error": f"Failed to generate summary: {str(e)}"}), 500
+    # Configure upload directory based on environment
+    import os
     if os.environ.get('SPACE_ID'):  # We're running on Hugging Face Spaces
         app.config['UPLOAD_FOLDER'] = '/data/uploads'
     else:  # We're running locally
         upload_dir = os.path.join(os.getcwd(), 'uploads')
         os.makedirs(upload_dir, exist_ok=True)
         app.config['UPLOAD_FOLDER'] = upload_dir
     # Ensure the upload directory exists and is writable
     if not os.path.exists(app.config['UPLOAD_FOLDER']):
         try:
     def upload_file():
         import torch
         torch.cuda.empty_cache()  # Clear GPU memory before processing
         files = request.files.getlist("file")
         patient_name = request.form.get("patient_name", "").strip()
         password = request.form.get("password")
         # Use more compatible models by default
         qa_model_name = request.form.get("qa_model_name", "facebook/bart-base")
         qa_model_type = request.form.get("qa_model_type", "text-generation")
         ner_model_type = request.form.get("ner_model_type", "ner")
         summarizer_model_name = request.form.get("summarizer_model_name", "facebook/bart-base")
         summarizer_model_type = request.form.get("summarizer_model_type", "summarization")
         if not files:
             return jsonify({"error": "No file uploaded"}), 400
         # Accept any model type and model name for QA, NER, and summarizer
         if not qa_model_name or not qa_model_type:
             return jsonify({"error": "QA model name and type are required"}), 400
             qa_pipeline = get_qa_pipeline(qa_model_type, qa_model_name)
         except Exception as e:
             return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
         if not ner_model_name or not ner_model_type:
             return jsonify({"error": "NER model name and type are required"}), 400
         try:
             ner_pipeline = get_ner_pipeline(ner_model_type, ner_model_name)
         except Exception as e:
             return jsonify({"error": f"NER model load failed: {str(e)}"}), 500
         if not summarizer_model_name or not summarizer_model_type:
             return jsonify({"error": "Summarizer model name and type are required"}), 400
         try:
             summarizer_pipeline = get_summarizer_pipeline(summarizer_model_type, summarizer_model_name)
         except Exception as e:
             return jsonify({"error": f"Summarizer model load failed: {str(e)}"}), 500
         extracted_data = []
         for file in files:
             if file.filename == "":
             except Exception as e:
                 os.remove(filepath)  # Clean up on failure
                 return jsonify({"error": f"Text extraction failed: {str(e)}"}), 500
             skip_medical_check = (
                 request.form.get("skip_medical_check", "false").lower() == "true"
             )
             file = request.files["file"]
             if file.filename == "":
                 return jsonify({"error": "No selected file"}), 400
             # Use secure filename
             from werkzeug.utils import secure_filename
             import uuid
             temp_filename = f"{uuid.uuid4()}_{secure_filename(file.filename)}"
             temp_path = os.path.join(app.config['UPLOAD_FOLDER'], temp_filename)
             file.save(temp_path)
             result = whisper_model.transcribe(temp_path)
             os.remove(temp_path)
                 os.remove(temp_path)
             return jsonify({"error": str(e)}), 500
     def group_by_category(data):
         grouped = defaultdict(list)
         for item in data:
                     "answer": item.get("answer", "Not Available"),
                 }
             )
         return [{"category": k, "detail": v} for k, v in grouped.items()]
     def deduplicate_extractions(data):
         """Drop duplicate extractions, keeping the last item seen per label.

         Items are compared by their ``"label"`` value. When several items
         share a label, only the one appearing last in ``data`` survives; the
         survivors keep their original relative order. Items without a
         ``"label"`` key all share the key ``None``, so only the last of them
         is kept.

         The input comes from model-generated JSON, so it is handled
         defensively:

         * ``None`` or an empty sequence yields ``[]``.
         * Entries that are not dicts are skipped.
         * Unhashable labels (JSON lists/objects) are compared by their
           ``repr`` instead of raising ``TypeError``.

         Args:
             data: Sequence of extracted objects (normally dicts).

         Returns:
             A new list with the de-duplicated dict items.
         """
         seen = set()
         kept_reversed = []
         # Walk backwards so the *last* occurrence of each label wins.
         for item in reversed(data or []):
             if not isinstance(item, dict):
                 continue
             label = item.get("label")
             try:
                 hash(label)
                 key = label
             except TypeError:
                 # Lists/dicts cannot go in a set; tag the repr so it can
                 # never collide with a genuine string label.
                 key = ("__unhashable__", repr(label))
             if key in seen:
                 continue
             seen.add(key)
             kept_reversed.append(item)
         # Restore the original ordering of the surviving items.
         kept_reversed.reverse()
         return kept_reversed
             text,
             add_special_tokens=False
         )
         chunks = []
         start = 0
         while start < len(input_ids):
             end = min(start + max_tokens, len(input_ids))
             chunk_ids = input_ids[start:end]
             chunk_text = tokenizer.decode(
                 chunk_ids,
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=True
             )
             # Ensure partial continuation isn't cut off mid-sentence
             if not chunk_text.endswith(('.', '?', '!', ':')):
                 chunk_text += "..."
             chunks.append(chunk_text)
             start += max_tokens - overlap
         return chunks
         except ValueError:
             # '[' not found in output
             return []
         # Try parsing full array first
         try:
             parsed = json.loads(json_text)
                 return parsed
         except Exception:
             pass  # fallback to manual parsing
         # Manual recovery via brace matching
         stack = 0
         obj_start = None
                     except Exception as e:
                         print(f"❌ Invalid JSON object: {e}")
                     obj_start = None
         return extracted
     def process_chunk(generator, chunk, idx):
         """Run the extraction prompt for one text chunk.

         Embeds ``chunk`` in an instruction prompt that asks for a JSON array
         of categorized medical facts, calls ``generator`` on it and returns
         the generated text together with the chunk index.

         Args:
             generator: Callable invoked as ``generator(prompt, **kwargs)``;
                 its result is indexed as ``result[0]["generated_text"]``.
             chunk: Text inserted into the prompt under ``Text:``.
             idx: Chunk index, returned unchanged.

         Returns:
             ``(idx, generated_text)`` on success, or ``(idx, None)`` if any
             exception is raised while generating.
         """
         # Braces that must appear literally in the prompt are doubled ({{ }})
         # because this is an f-string; {chunk} is the only interpolated field.
         prompt = f"""
                 [INST] <<SYS>>
                 You are a clinical data extraction assistant.
                 Your job is to:
                 1. Read the following medical report.
                 2. Extract all medically relevant facts as a list of JSON objects.
                 - "question": a question related to that field
                 - "answer": the answer from the text
                 4. After extracting the list, categorize each object under one of the following fixed categories:
                 - Patient Info
                 - Vitals
                 - Symptoms
                 - Laboratory
                 - Radiology
                 - Doctor Note
                 Example format for structure only — do not include in output:
                 [
                 {{
                     "category": "Patient Info"
                 }},
                 ]
+                ⚠️ Use these categories listed above. If an item does not fit any of these categories, create a new category for it.
                 Text:
                 {chunk}
                 Return a single valid JSON array of all extracted objects.
                 Do not include any explanations or commentary.
                 Only output the JSON array
                 <</SYS>> [/INST]
                 """
         try:
             # Release cached GPU memory before running the generator
             torch.cuda.empty_cache()
             # Run the generator on the prompt and take the first result's text
             output = generator(
                 prompt,
                 do_sample=False,     # Disable sampling for deterministic output
                 temperature=0.3,     # NOTE(review): presumably ignored while do_sample=False — confirm this is intended
             )[0]["generated_text"]
             return idx, output
         except Exception as e:
             # NOTE(review): the exception is discarded here; the caller only sees None
             return idx, None
     @app.route("/extract_medical_data", methods=["POST"])
     def extract_medical_data():
         data = request.json
         qa_model_name = data.get("qa_model_name")
         qa_model_type = data.get("qa_model_type")
         extracted_files = data.get("extracted_data")
         if not qa_model_name or not qa_model_type:
             return jsonify({"error": "Missing 'qa_model_name' or 'qa_model_type'"}), 400
         if not extracted_files:
             return jsonify({"error": "Missing 'extracted_data' in request"}), 400
         try:
             tokenizer = AutoTokenizer.from_pretrained(
                 qa_model_name,
                 trust_remote_code=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
             model = AutoModelForCausalLM.from_pretrained(
                 qa_model_name,
                 device_map="auto",
                 low_cpu_mem_usage=True,
                 cache_dir=os.environ.get('HF_HOME', '/tmp/huggingface')
             )
             generator = transformers_pipeline(
                 task=qa_model_type,
                 model=model,
                 tokenizer=tokenizer,
                 torch_dtype=torch.float32
             )
         except Exception as e:
             return jsonify({"error": f"Could not load model: {str(e)}"}), 500
         structured_response = {"extracted_data": []}
         for file_data in extracted_files:
             filename = file_data.get("file", "unknown_file")
             context = file_data.get("extracted_text", "").strip()
             if not context:
                 structured_response["extracted_data"].append(
                     {"file": filename, "medical_fields": []}
                 )
                 continue
             chunks = chunk_text(context, tokenizer)
             all_extracted = []
             with ThreadPoolExecutor(max_workers=4) as executor:
                 futures = {
                     executor.submit(process_chunk, generator, chunk, idx): idx
                 for future in as_completed(futures):
                     idx = futures[future]
                     _, output = future.result()
                     if not output:
                         continue
                     try:
                         objs = extract_json_objects(output)
                         if objs:
                             all_extracted.extend(objs)
                         else:
+                           print(f"⚠️ Chunk {idx+1} yielded no valid JSON.")
                     except Exception as e:
                         print(f"❌ Error extracting JSON from chunk {idx+1}")
             # Clean and group results for this file
             if all_extracted:
                 deduped = deduplicate_extractions(all_extracted)
                 grouped_data = group_by_category(deduped)
             else:
                 grouped_data = {"error": "No valid data extracted"}
             structured_response["extracted_data"].append(
                 {"file": filename, "medical_fields": grouped_data}
             )
             try:
                 save_data_to_storage(filename, grouped_data)
             except Exception as e:
+                print(f"⚠️ Failed to save data for {filename}: {e}")
         print("✅ Extraction complete.")
         return jsonify(structured_response)
     @app.route("/api/generate_summary", methods=["POST"])
     def generate_summary():
         logger.info("Received request to generate summary.")
             torch.set_default_tensor_type(torch.FloatTensor)
             if torch.cuda.is_available():
                 torch.set_default_tensor_type(torch.cuda.FloatTensor)
             # Handle multipart form data from Flutter
             if "audio" not in request.files:
                 return jsonify({"error": "No audio file provided"}), 400
             audio_file = request.files["audio"]
             if audio_file.filename == "":
                 return jsonify({"error": "No selected audio file"}), 400
             # Validate file extension
             if not allowed_file(audio_file.filename):
                 return jsonify({"error": f"Unsupported audio format. Allowed formats: wav, mp3, m4a, ogg"}), 400
             # Check file size
             valid_size, error_message = check_file_size(audio_file)
             if not valid_size:
                 return jsonify({"error": error_message}), 400
             # Use default model if not specified
             qa_model_name = request.form.get("qa_model_name", "facebook/bart-base")
             qa_model_type = request.form.get("qa_model_type", "text-generation")
             # Load QA model with proper error handling
             try:
                 qa_pipeline = get_qa_pipeline(qa_model_type, qa_model_name)
             except Exception as e:
                 return jsonify({"error": f"QA model load failed: {str(e)}"}), 500
             # Use platform-agnostic temp directory
             import uuid
             from werkzeug.utils import secure_filename
             os.makedirs(temp_dir, exist_ok=True)
             temp_filename = f"{uuid.uuid4()}_{secure_filename(audio_file.filename)}"
             temp_path = os.path.join(temp_dir, temp_filename)
             try:
                 audio_file.save(temp_path)
                 # Transcribe audio with retries
                 max_retries = 3
                 transcribed_text = None
                             raise
                         torch.cuda.empty_cache()  # Clear GPU memory between attempts
                         continue
                 if not transcribed_text:
                     raise ValueError("Failed to transcribe audio after multiple attempts")
                 # Clean and process text
                 try:
                     clean_text = PHIScrubberAgent.scrub_phi(transcribed_text)
                 except Exception as e:
                     clean_text = transcribed_text
                 # Extract medical data with proper device handling
                 try:
                     with torch.cuda.device(0) if torch.cuda.is_available() else torch.no_grad():
                         medical_data = medical_data_extractor.extract_medical_data(clean_text)
                 except Exception as e:
                     medical_data = {"error": f"Medical data extraction failed: {str(e)}"}
                 # Clean up temporary file
                 if os.path.exists(temp_path):
                     os.remove(temp_path)
                 # Return response in the format expected by Flutter
                 return jsonify({
                     "status": "success",
                         "medical_chart": medical_data
                     }
                 }), 200
             except Exception as e:
                 if temp_path and os.path.exists(temp_path):
                     os.remove(temp_path)
                     "status": "error",
                     "error": f"Processing failed: {str(e)}"
                 }), 500
         except Exception as e:
             if temp_path and os.path.exists(temp_path):
                 os.remove(temp_path)
                 "error": f"Request handling failed: {str(e)}"
             }), 500
+    # ==================== ULTRA-OPTIMIZED generate_patient_summary ENDPOINT ====================
     @app.route('/generate_patient_summary', methods=['POST'])
     def generate_patient_summary():
         """
+        🚀 ULTRA-OPTIMIZED + TIMEOUT-FLEXIBLE PATIENT SUMMARY — HF SPACES READY
+        - Ultra-fast by default (8s EHR, 25s gen) → perfect for HF Spaces
+        - Supports "timeout_mode": "extended" → 30s EHR, 55s gen for heavy tasks
+        - Works with ANY model_name and model_type (GGUF, text-generation, summarization)
+        - GGUF uses SINGLE PROMPT → 4x faster
+        - NEVER breaks — multi-layer fallbacks
+        - Preserves medical accuracy via delta/baseline logic
         """
         from ai_med_extract.utils.openvino_summarizer_utils import (
             parse_ehr_chartsummarydtl, visits_sorted, compute_deltas, build_compact_baseline, delta_to_text, build_main_prompt
         try:
             start_total = time.time()
             data = request.get_json()
             patientid = data.get("patientid")
             token = data.get("token")
             key = data.get("key")
+            # Support any model — default to GGUF Phi-3-mini
+            model_name = data.get("patient_summarizer_model_name") or "microsoft/Phi-3-mini-4k-instruct-gguf/Phi-3-mini-4k-instruct-q4.gguf"
+            model_type = data.get("patient_summarizer_model_type") or data.get("model_type") or "gguf"
+            # ⚡ TIMEOUT MODE: "fast" (default) or "extended"
+            timeout_mode = data.get("timeout_mode", "fast")  # fast (HF Spaces) | extended (heavy tasks)
+            if timeout_mode == "extended":
+                EHR_TIMEOUT = 30      # Longer for slow EHR systems
+                GEN_TIMEOUT = 500      # Almost full 60s HF limit
+                logger.info("🕒 Using EXTENDED timeout mode")
+            else:
+                EHR_TIMEOUT = 8       # Ultra-fast for HF Spaces
+                GEN_TIMEOUT = 500      # Leaves room for overhead
+                logger.info("⚡ Using FAST timeout mode (default)")
             if not patientid or not token or not key:
                 return jsonify({"error": "Missing required fields: patientid, token, or key"}), 400
+            api_url = f"{key.strip()}/Transactionapi/api/PatientList/patientsummary"
             headers = {
                 "Authorization": f"Bearer {token}",
                 "Content-Type": "application/json",
             }
             if key and not key.startswith("http"):
                 headers["x-api-key"] = key
+            # ⚡ DYNAMIC EHR TIMEOUT
             t_api_start = time.time()
+            try:
+                response = requests.post(api_url, json={"patientid": patientid}, headers=headers, timeout=EHR_TIMEOUT)
+            except requests.exceptions.Timeout:
+                logger.warning(f"EHR API timeout ({EHR_TIMEOUT}s) — returning structured fallback.")
+                minimal_fallback = f"""
+## Clinical Assessment
+- EHR API timeout ({EHR_TIMEOUT}s) — could not fetch patient data.
+## Key Trends & Changes
+- No data available due to API timeout.
+## Plan & Suggested Actions
+- Retry with "timeout_mode": "extended" or check EHR API performance.
+## Direct Guidance for Physician
+- Patient data unavailable — do not proceed without verification.
+"""
+                return jsonify({
+                    "summary": ensure_four_sections(minimal_fallback),
+                    "warning": f"EHR API timeout ({EHR_TIMEOUT}s) — used minimal fallback.",
+                    "timing": {"total": round(time.time() - start_total, 1)},
+                    "timeout_mode_used": timeout_mode
+                }), 200
+            except requests.exceptions.RequestException as e:
+                logger.error(f"Network error contacting EHR API: {e}")
+                return jsonify({"error": f"Network error: {str(e)}"}), 503
             t_api_end = time.time()
             if response.status_code != 200:
+                logger.warning(f"EHR API non-200 status: {response.status_code}")
+                minimal_fallback = f"""
+## Clinical Assessment
+- EHR API returned error {response.status_code}.
+## Key Trends & Changes
+- No patient data available.
+## Plan & Suggested Actions
+- Verify API key, token, and patient ID.
+## Direct Guidance for Physician
+- System received invalid response from EHR — do not proceed.
+"""
                 return jsonify({
+                    "summary": ensure_four_sections(minimal_fallback),
+                    "warning": f"EHR API error {response.status_code}",
+                    "timing": {"total": round(time.time() - start_total, 1)},
+                    "timeout_mode_used": timeout_mode
+                }), 200
             try:
                 api_data = response.json()
             except ValueError:
+                logger.error("Invalid JSON from EHR API")
+                minimal_fallback = """
+## Clinical Assessment
+- EHR API returned invalid JSON.
+## Key Trends & Changes
+- Unable to parse patient data.
+## Plan & Suggested Actions
+- Contact EHR API administrator.
+## Direct Guidance for Physician
+- Patient data corrupted — do not proceed.
+"""
+                return jsonify({
+                    "summary": ensure_four_sections(minimal_fallback),
+                    "warning": "Invalid JSON from EHR API",
+                    "timing": {"total": round(time.time() - start_total, 1)},
+                    "timeout_mode_used": timeout_mode
+                }), 500
+            ehr_result = api_data.get("result") or api_data
             chartsummarydtl = ehr_result.get("chartsummarydtl") if isinstance(ehr_result, dict) else None
             if not chartsummarydtl:
+                logger.warning("Missing chartsummarydtl in EHR response")
+                minimal_fallback = """
+## Clinical Assessment
+- No chartsummarydtl found in EHR response.
+## Key Trends & Changes
+- Patient data structure invalid.
+## Plan & Suggested Actions
+- Verify EHR API response format.
+## Direct Guidance for Physician
+- Incomplete patient data — manual review required.
+"""
                 return jsonify({
+                    "summary": ensure_four_sections(minimal_fallback),
+                    "warning": "Missing chartsummarydtl",
+                    "timing": {"total": round(time.time() - start_total, 1)},
+                    "timeout_mode_used": timeout_mode
                 }), 500
+            # Parse and compute deltas — YOUR LOGIC PRESERVED
             visits = parse_ehr_chartsummarydtl(chartsummarydtl)
             delta = compute_deltas([], visits)
             all_visits = visits_sorted(visits)
             baseline = build_compact_baseline(all_visits)
             delta_text = delta_to_text(delta)
+            # ==================== GGUF MODEL HANDLING ====================
             if model_type == "gguf":
+                logger.info(f"🧠 GGUF MODE: Single-prompt generation for {model_name}")
                 try:
+                    # Extract repo_id/filename if needed
+                    repo_id, filename = model_name, None
+                    if '/' in model_name and model_name.endswith('.gguf'):
                         repo_id, filename = model_name.rsplit('/', 1)
+                    # Load pipeline — uses global cache
+                    pipeline = get_cached_gguf_pipeline(repo_id, filename)
+                    # ⚡⚡⚡ SINGLE PROMPT — ALL 4 SECTIONS AT ONCE
+                    # ==================== OPTIMIZED PROMPT FOR GGUF MODEL ====================
+                    full_prompt = f"""
+<|system|>
+You are an expert clinical AI assistant. Your task is to generate a patient summary with EXACTLY FOUR sections in valid markdown format.
+### STRICT OUTPUT FORMAT RULES ###
+1.  Your response MUST start immediately with "## Clinical Assessment" (no preamble, no "Sure", no explanations).
+2.  Use ONLY these four section headers, in this exact order:
+    ## Clinical Assessment
+    ## Key Trends & Changes
+    ## Plan & Suggested Actions
+    ## Direct Guidance for Physician
+3.  Under each header, provide 2-4 concise bullet points using "- ".
+4.  Base your summary SOLELY on the data provided below. DO NOT HALLUCINATE or invent information.
+5.  End your response after the "## Direct Guidance for Physician" section.
+### DATA TO SUMMARIZE ###
+- PATIENT VISITS: {visits}
+- BASELINE: {baseline}
+- DELTAS: {delta_text}
+### EXAMPLE OUTPUT FORMAT ###
+## Clinical Assessment
+- Patient presents with chronic ischemic heart disease.
+- Current medications include telmisartan, atorvastatin, metoprolol, and aspirin.
+## Key Trends & Changes
+- Blood pressure elevated at 160/100 mmHg.
+- No significant weight change recorded.
+- No new diagnoses or medications since last visit.
+## Plan & Suggested Actions
+- Consider medication adjustment for hypertension.
+- Schedule follow-up to monitor BP and lipid panel.
+## Direct Guidance for Physician
+- Prioritize BP control to mitigate cardiac risk.
+- Review recent lab results when available.
+</s>
+<|user|>
+Generate the 4-section patient summary in the exact format specified above.
+</s>
+<|assistant|>
+## Clinical Assessment
+"""
+                    # Generate with DYNAMIC timeout using generate_full_summary for better section completion
                     try:
+                        raw_summary = pipeline.generate(
+                            full_prompt,
+                            max_tokens=2000,
+                            temperature=0.1,
+                            top_p=0.9,
+                            # max_loops=3  # Allow up to 3 loops to complete sections
+                        )
+                        logger.info(f"GGUF raw summary length: {len(raw_summary)} chars")
+                    except Exception as gen_error:
+                        logger.error(f"GGUF generation failed: {gen_error}")
+                        raise  # Trigger fallback below
+                    # Clean output
+                    def extract_markdown_sections(text):
+                        sections = [
                             "## Clinical Assessment",
+                            "## Key Trends & Changes",
+                            "## Plan & Suggested Actions",
+                            "## Direct Guidance for Physician"
                         ]
+                        output_lines = []
+                        current_section = None
+                        for line in text.splitlines():
+                            stripped = line.strip()
+                            for section in sections:
+                                if stripped.startswith(section):
+                                    current_section = section
+                                    output_lines.append(section)
                                     break
+                            else:
+                                if current_section and stripped:
+                                    output_lines.append(line)
+                        return "\n".join(output_lines)
+                    markdown_summary = extract_markdown_sections(raw_summary)
+                    markdown_summary = ensure_four_sections(markdown_summary)
+                    total_time = time.time() - start_total
+                    logger.info(f"[✅ SUCCESS] GGUF | TIMEOUT_MODE: {timeout_mode} | TOTAL: {total_time:.1f}s")
+                    return jsonify({
+                        "summary": raw_summary,
+                        "baseline": baseline,
+                        "delta": delta_text,
+                        "timing": {
+                            "ehr_api": round(t_api_end - t_api_start, 1),
+                            "generation": round(total_time - (t_api_end - t_api_start), 1),
+                            "total": round(total_time, 1)
+                        },
+                        "model_used": f"{model_name} ({model_type})",
+                        "timeout_mode_used": timeout_mode
+                    }), 200
                 except Exception as e:
+                    logger.error(f"GGUF generation failed: {e}")
+                    # ⚡ FALLBACK 1: STRUCTURED MINIMAL SUMMARY
+                    structured_fallback = f"""
+## Clinical Assessment
+- System generated fallback due to model error.
+## Key Trends & Changes
+- Weight: {delta['weight']['curr'] or 'N/A'} (Δ {delta['weight']['delta'] or 'N/A'})
+- BP: {delta['bp_sys']['curr'] or 'N/A'}/{delta['bp_dia']['curr'] or 'N/A'}
+- New Dx: {', '.join(delta['added_dx']) if delta['added_dx'] else 'None'}
+- Meds Started: {', '.join(delta['started_meds']) if delta['started_meds'] else 'None'}
+## Plan & Suggested Actions
+- Review recent vitals and medication changes.
+## Direct Guidance for Physician
+- Model generation failed — verify all data manually.
+"""
+                    total_time = time.time() - start_total
+                    logger.info(f"[⚠️ FALLBACK 1] Structured summary | TIMEOUT_MODE: {timeout_mode} | TOTAL: {total_time:.1f}s")
+                    return jsonify({
+                        "summary": ensure_four_sections(structured_fallback),
+                        "baseline": baseline,
+                        "delta": delta_text,
+                        "warning": "Model generation failed — used structured fallback.",
+                        "error": str(e),
+                        "timing": {"total": round(total_time, 1)},
+                        "timeout_mode_used": timeout_mode
+                    }), 200
+            # ==================== TEXT-GENERATION / OPENVINO ====================
             elif model_type in {"text-generation", "causal-openvino"}:
+                logger.info(f"🔤 TEXT-GENERATION MODE: {model_name}")
+                try:
+                    loader = agents.get("medical_data_extractor")
+                    if not loader or getattr(loader, 'model_name', None) != model_name:
+                        from ai_med_extract.utils.model_loader_spaces import get_openvino_pipeline
                         pipeline = get_openvino_pipeline(model_name)
+                    else:
+                        pipeline = loader.model_loader.load() if hasattr(loader, "model_loader") else None
+                    if not pipeline:
+                        raise ValueError("Pipeline not available")
+                    prompt = build_main_prompt(baseline, delta_text)
+                    inputs = pipeline.tokenizer([prompt], return_tensors="pt")
+                    outputs = pipeline.model.generate(
+                        **inputs,
+                        max_new_tokens=800,
+                        do_sample=False,
+                        pad_token_id=pipeline.tokenizer.pad_token_id or pipeline.tokenizer.eos_token_id or 0
+                    )
+                    text = pipeline.tokenizer.decode(outputs[0], skip_special_tokens=True)
+                    summary_start_patterns = [
+                        "Now generate the complete clinical summary",
+                        "## Clinical Assessment",
+                        "# Clinical Assessment",
+                        "Clinical Assessment"
+                    ]
+                    new_summary = text
+                    for pattern in summary_start_patterns:
+                        if pattern in text:
+                            new_summary = text.split(pattern)[-1].strip()
+                            break
+                    markdown_summary = summary_to_markdown(new_summary)
+                    markdown_summary = ensure_four_sections(markdown_summary)
+                    total_time = time.time() - start_total
+                    logger.info(f"[✅ SUCCESS] Text-generation | TIMEOUT_MODE: {timeout_mode} | TOTAL: {total_time:.1f}s")
+                    return jsonify({
+                        "summary": markdown_summary,
+                        "baseline": baseline,
+                        "delta": delta_text,
+                        "timing": {"total": round(total_time, 1)},
+                        "model_used": f"{model_name} ({model_type})",
+                        "timeout_mode_used": timeout_mode
+                    }), 200
+                except Exception as e:
+                    logger.error(f"Text-generation failed: {e}")
+                    structured_fallback = f"""
+## Clinical Assessment
+- Text-generation model failed.
+## Key Trends & Changes
+- Refer to delta data for details.
+## Plan & Suggested Actions
+- Manual clinical review required.
+## Direct Guidance for Physician
+- AI model unavailable — use clinical judgment.
+"""
+                    return jsonify({
+                        "summary": ensure_four_sections(structured_fallback),
+                        "baseline": baseline,
+                        "delta": delta_text,
+                        "warning": "Text-generation model failed — used fallback.",
+                        "error": str(e),
+                        "timing": {"total": round(time.time() - start_total, 1)},
+                        "timeout_mode_used": timeout_mode
+                    }), 200
+            # ==================== SUMMARIZATION MODEL ====================
             elif model_type == "summarization":
+                logger.info(f"📝 SUMMARIZATION MODE: {model_name}")
+                try:
+                    loader = agents.get("summarizer")
+                    pipeline = loader.model_loader.load() if hasattr(loader, "model_loader") else get_summarizer_pipeline("summarization", model_name)
+                    context = f"Patient Data:\nBaseline: {baseline}\nChanges: {delta_text}"
+                    result = pipeline(context, max_length=400, min_length=100, do_sample=False)
+                    if isinstance(result, list) and result and "summary_text" in result[0]:
+                        raw_summary = result[0]["summary_text"]
+                    else:
+                        raw_summary = str(result)
+                    markdown_summary = f"""
+## Clinical Assessment
+{raw_summary[:250]}...
+## Key Trends & Changes
+See delta data for details.
+## Plan & Suggested Actions
+Further evaluation recommended.
+## Direct Guidance for Physician
+Generic summary — verify details clinically.
+"""
+                    markdown_summary = ensure_four_sections(markdown_summary)
+                    total_time = time.time() - start_total
+                    logger.info(f"[✅ SUCCESS] Summarization | TIMEOUT_MODE: {timeout_mode} | TOTAL: {total_time:.1f}s")
+                    return jsonify({
+                        "summary": markdown_summary,
+                        "baseline": baseline,
+                        "delta": delta_text,
+                        "timing": {"total": round(total_time, 1)},
+                        "model_used": f"{model_name} ({model_type})",
+                        "timeout_mode_used": timeout_mode
+                    }), 200
+                except Exception as e:
+                    logger.error(f"Summarization failed: {e}")
+                    structured_fallback = """
+## Clinical Assessment
+- Summarization model failed.
+## Key Trends & Changes
+- Unable to generate trends.
+## Plan & Suggested Actions
+- Full manual review required.
+## Direct Guidance for Physician
+- AI assistance unavailable — proceed with caution.
+"""
+                    return jsonify({
+                        "summary": ensure_four_sections(structured_fallback),
+                        "baseline": baseline,
+                        "delta": delta_text,
+                        "warning": "Summarization model failed — used fallback.",
+                        "error": str(e),
+                        "timing": {"total": round(time.time() - start_total, 1)},
+                        "timeout_mode_used": timeout_mode
+                    }), 200
+            # ==================== UNSUPPORTED MODEL TYPE ====================
+            else:
+                logger.warning(f"Unsupported model_type: {model_type}")
+                generic_fallback = f"""
+## Clinical Assessment
+- Unsupported model type: {model_type}
+## Key Trends & Changes
+- Please use model_type: gguf, text-generation, or summarization
+## Plan & Suggested Actions
+- Update API request with supported model type.
+## Direct Guidance for Physician
+- System configuration error — contact administrator.
+"""
+                return jsonify({
+                    "summary": ensure_four_sections(generic_fallback),
+                    "baseline": baseline,
+                    "delta": delta_text,
+                    "warning": f"Unsupported model_type: {model_type}",
+                    "supported_types": ["gguf", "text-generation", "causal-openvino", "summarization"],
+                    "timing": {"total": round(time.time() - start_total, 1)},
+                    "timeout_mode_used": timeout_mode
+                }), 400
+        except Exception as e:
+            logger.error(f"🚨 CRITICAL ERROR: {str(e)}", exc_info=True)
+            # ⚡ FINAL FALLBACK — NEVER FAIL
+            emergency_fallback = """
+## Clinical Assessment
+- System emergency fallback — critical error occurred.
+## Key Trends & Changes
+- No data available due to system error.
+## Plan & Suggested Actions
+- Retry request or contact system administrator.
+## Direct Guidance for Physician
+- DO NOT rely on this summary — system malfunction.
+"""
             return jsonify({
+                "summary": ensure_four_sections(emergency_fallback),
+                "warning": "Critical system error — used emergency fallback.",
+                "error": str(e),
+                "timing": {"total": round(time.time() - start_total, 1)},
+                "timeout_mode_used": data.get("timeout_mode", "fast") if request.get_json() else "unknown"
             }), 200
     @app.route("/")
     def home():
         """Return a fixed plain-text status message with HTTP 200 for the root URL."""
         return "Medical Data Extraction API is running!", 200
 def summary_to_markdown(summary):
     import re
     # Remove '- answer:' and similar artifacts
     summary = re.sub(r'-\s*answer: ?', '', summary, flags=re.IGNORECASE)
     # Convert numbered sections to markdown headers
     lines = summary.splitlines()
     out = []
         '3.': '##',
         '4.': '##',
     }
     for line in lines:
         m = re.match(r'^(\d\.)\s*(.+)', line)
         if m and m.group(1) in section_map:
             out.append(f"{header} {m.group(2).strip()}")
         else:
             out.append(line)
     # Remove empty lines at the start
     while out and not out[0].strip():
         out = out[1:]
     # Check if we have the expected 4-section structure
     def is_header(line: str) -> bool:
         return bool(re.match(r'^(#{1,6})\s+.+', line.strip()))
     # Find all headers in the output
     headers = [i for i, line in enumerate(out) if is_header(line)]
     # If we have at least 4 headers, check if they match the expected structure
     if len(headers) >= 4:
         header_texts = [out[i].strip() for i in headers[:4]]
             r'##.*Plan.*Suggested.*Actions',
             r'##.*Direct.*Guidance.*Physician'
         ]
         # Check if headers match expected patterns
         matches_pattern = all(
             re.search(pattern, header, re.IGNORECASE)
             for pattern, header in zip(expected_patterns, header_texts)
         )
         if matches_pattern:
             # Keep the entire content - don't truncate
             return '\n'.join(out).strip()
     # If we don't have the expected structure, try to find the actual summary content
     # Look for the start of the clinical assessment section
     clinical_assessment_pattern = r'(?:# Clinical Assessment|## Clinical Assessment|Clinical Assessment)'
     for i, line in enumerate(out):
         if re.search(clinical_assessment_pattern, line, re.IGNORECASE):
             return '\n'.join(out[i:]).strip()
     # If no clinical assessment found, return the entire summary
+    return '\n'.join(out).strip()

ai_med_extract/app.py CHANGED Viewed

@@ -13,6 +13,7 @@ from .agents.medical_data_extractor import MedicalDocDataExtractorAgent
 from .agents.patient_summary_agent import PatientSummarizerAgent
 from .utils.model_manager import model_manager
 import torch
 # Load environment variables
 load_dotenv()

 from .agents.patient_summary_agent import PatientSummarizerAgent
 from .utils.model_manager import model_manager
 import torch
+torch.set_num_threads(1)  # Prevent PyTorch thread fighting with llama.cpp
 # Load environment variables
 load_dotenv()

ai_med_extract/utils/__pycache__/model_loader_gguf.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/utils/__pycache__/model_loader_gguf.cpython-311.pyc and b/ai_med_extract/utils/__pycache__/model_loader_gguf.cpython-311.pyc differ

ai_med_extract/utils/__pycache__/model_manager.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/utils/__pycache__/model_manager.cpython-311.pyc and b/ai_med_extract/utils/__pycache__/model_manager.cpython-311.pyc differ

ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc CHANGED Viewed

Binary files a/ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc and b/ai_med_extract/utils/__pycache__/openvino_summarizer_utils.cpython-311.pyc differ

ai_med_extract/utils/model_config.py CHANGED Viewed

@@ -62,7 +62,7 @@ SPACES_OPTIMIZED_MODELS = {
 MODEL_VALIDATION_RULES = {
     "text-generation": {
         "min_tokens": 100,
-        "max_tokens": 2048,
         "supported_formats": ["huggingface", "local"]
     },
     "summarization": {
@@ -82,7 +82,7 @@ MODEL_VALIDATION_RULES = {
     },
     "openvino": {
         "min_tokens": 100,
-        "max_tokens": 2048,
         "supported_formats": ["huggingface", "local"]
     }
 }

 MODEL_VALIDATION_RULES = {
     "text-generation": {
         "min_tokens": 100,
+        "max_tokens": 4000,
         "supported_formats": ["huggingface", "local"]
     },
     "summarization": {
     },
     "openvino": {
         "min_tokens": 100,
+        "max_tokens": 4000,
         "supported_formats": ["huggingface", "local"]
     }
 }

ai_med_extract/utils/model_loader_gguf.py CHANGED Viewed

@@ -59,7 +59,7 @@ class GGUFModelPipeline:
             # Memory-optimized settings for Hugging Face Spaces
             self.model = Llama(
                 model_path=local_path,
-                n_ctx=2048,  # Reduced from 4096 to save memory
                 n_threads=n_threads,
                 n_batch=n_batch,
                 n_gpu_layers=0,           # CPU-only on Spaces by default
@@ -90,7 +90,7 @@ class GGUFModelPipeline:
         """Generate text with timeout using threading"""
         # Approximate token count by splitting on whitespace
         prompt_tokens = len(prompt.split())
-        n_ctx = 2048
         allowed_max_tokens = n_ctx - prompt_tokens
         if allowed_max_tokens <= 0:
             raise ValueError(f"Prompt too long: {prompt_tokens} tokens exceed context window of {n_ctx}")
@@ -105,7 +105,7 @@ class GGUFModelPipeline:
                     max_tokens=max_tokens,
                     temperature=temperature,
                     top_p=top_p,
-                    stop=["</s>", "###"]
                 )
                 return output
             except Exception as e:

             # Memory-optimized settings for Hugging Face Spaces
             self.model = Llama(
                 model_path=local_path,
+                n_ctx=4000,  # Reduced from 4096 to save memory
                 n_threads=n_threads,
                 n_batch=n_batch,
                 n_gpu_layers=0,           # CPU-only on Spaces by default
         """Generate text with timeout using threading"""
         # Approximate token count by splitting on whitespace
         prompt_tokens = len(prompt.split())
+        n_ctx = 4000
         allowed_max_tokens = n_ctx - prompt_tokens
         if allowed_max_tokens <= 0:
             raise ValueError(f"Prompt too long: {prompt_tokens} tokens exceed context window of {n_ctx}")
                     max_tokens=max_tokens,
                     temperature=temperature,
                     top_p=top_p,
+                      stop=["\n\n##", "\n\n#", "###", "</s>", "<|endoftext|>", "User:", "System:"]
                 )
                 return output
             except Exception as e:

ai_med_extract/utils/model_manager.py CHANGED Viewed

@@ -100,7 +100,7 @@ class TransformersModelLoader(BaseModelLoader):
             if self.model_type == "text-generation":
                 result = pipeline(
                     prompt,
-                    max_new_tokens=kwargs.get('max_new_tokens', 2048),
                     do_sample=kwargs.get('do_sample', False),
                     temperature=kwargs.get('temperature', 0.7),
                     pad_token_id=self._tokenizer.eos_token_id
@@ -179,7 +179,7 @@ class GGUFModelLoader(BaseModelLoader):
         pipeline = self.load()
         try:
-            max_tokens = kwargs.get('max_tokens', 2048)
             temperature = kwargs.get('temperature', 0.7)
             top_p = kwargs.get('top_p', 0.95)

             if self.model_type == "text-generation":
                 result = pipeline(
                     prompt,
+                    max_new_tokens=kwargs.get('max_new_tokens', 4000),
                     do_sample=kwargs.get('do_sample', False),
                     temperature=kwargs.get('temperature', 0.7),
                     pad_token_id=self._tokenizer.eos_token_id
         pipeline = self.load()
         try:
+            max_tokens = kwargs.get('max_tokens', 4000)
             temperature = kwargs.get('temperature', 0.7)
             top_p = kwargs.get('top_p', 0.95)

ai_med_extract/utils/openvino_summarizer_utils.py CHANGED Viewed

@@ -6,6 +6,8 @@ import difflib
 import logging
 from copy import deepcopy
 def parse_ehr_chartsummarydtl(chartsummarydtl):
     """
     Converts EHR API chartsummarydtl list to the internal visit format expected by the summarizer.
@@ -169,29 +171,72 @@ def delta_to_text(delta):
             L.append(f"{lab_name}: {_fmt(lab_data['prev'])} -> {_fmt(lab_data['curr'])} (Δ {_fmt(lab_data['delta'], '+.1f')})")
     return "\n".join(L)
-def build_main_prompt(baseline, delta_text, patient_info=""):
-    # return("You are an expert clinical AI assistant.\n"
-    #        "Produce a concise, physician-ready update. Never omit critical new information from the deltas.\n\n"
-    #        "The summary MUST have four sections:\n"
-    #          "1) Clinical Assessment\n"
-    #          "2) Key Trends & Changes\n"
-    #          "3) Plan & Suggested Actions\n"
-    #          "4) Direct Guidance for Physician\n\n"
-    #         "Now generate the complete, updated clinical summary with all four sections in markdown format:")
-    return (
         "You are an expert clinical AI assistant. Your task is to generate a patient summary.\n"
         "Use the chartsummarydtl for context. The STRUCTURED BASELINE and DELTAS are the absolute ground truth.\n"
         "Produce a concise, physician-ready summary. Never omit critical new information from the deltas.\n\n"
         "The summary MUST have four sections:\n"
         "1) Clinical Assessment\n"
         "2) Key Trends & Changes\n"
         "3) Plan & Suggested Actions\n"
         "4) Direct Guidance for Physician\n\n"
-        f"PATIENT INFORMATION:\n{patient_info}\n\n"
-        f"STRUCTURED BASELINE (authoritative):\n{baseline}\n\n"
-        f"STRUCTURED DELTAS (authoritative):\n{delta_text}\n\n"
         "Now generate the complete clinical summary with all four sections in markdown format:"
     )
 def validate_and_compare_summaries(old_summary, new_summary, update_name=""):
     report = f"### Validation Report for {update_name}\n"
     report += "This report validates that the updated summary incorporates new information correctly.\n"

 import logging
 from copy import deepcopy
 def parse_ehr_chartsummarydtl(chartsummarydtl):
     """
     Converts EHR API chartsummarydtl list to the internal visit format expected by the summarizer.
             L.append(f"{lab_name}: {_fmt(lab_data['prev'])} -> {_fmt(lab_data['curr'])} (Δ {_fmt(lab_data['delta'], '+.1f')})")
     return "\n".join(L)
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
def generate_section(pipeline, prompt, section_name, timeout=60):
    """Generate one markdown section of the summary, bounded by a timeout.

    Args:
        pipeline: Object exposing ``generate_full_summary(prompt, max_tokens=...,
            max_loops=...)`` that returns the generated text.
        prompt: Prompt handed to the pipeline unchanged.
        section_name: Section title; the returned text always starts with
            ``## <section_name>``.
        timeout: Seconds to wait for the pipeline before giving up.

    Returns:
        The cleaned section text, or a placeholder section when generation
        fails or times out. Generation errors are logged, never raised.
    """
    from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError

    header = f"## {section_name}"
    placeholder = f"{header}\n- *Generation failed or timed out. Please retry or check logs.*"

    # The executor is managed by hand instead of with ``with``: the context
    # manager calls shutdown(wait=True) on exit, which would block until the
    # worker finishes and silently defeat the timeout below.
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(
            pipeline.generate_full_summary, prompt, max_tokens=2000, max_loops=3
        )
        raw = future.result(timeout=timeout)

        # Strip instruction residue: keep only what follows the last
        # occurrence of each marker, applied in this order.
        patterns_to_split = [
            "Now generate the complete",
            "## Clinical Assessment",
            "# Clinical Assessment",
            "Clinical Assessment",
            "Output ONLY the section content",
        ]
        content = raw
        for pat in patterns_to_split:
            if pat in content:
                content = content.split(pat)[-1].strip()

        # Guarantee the section header is present exactly once at the top.
        if not content.startswith(header):
            content = f"{header}\n{content.strip()}"
        return content.strip()
    except FuturesTimeoutError:
        # str(TimeoutError()) is empty, so log an explicit message.
        logging.error("Section '%s' generation timed out after %s seconds", section_name, timeout)
        return placeholder
    except Exception as e:
        # Module-level logging.error; the unbound ``logging.Logger.error``
        # would itself raise TypeError and hide the placeholder.
        logging.error("Section '%s' generation failed: %s", section_name, e)
        return placeholder
    finally:
        # Do not wait for a still-running worker; a thread cannot be killed,
        # so it finishes in the background while the caller moves on.
        executor.shutdown(wait=False)
def build_main_prompt(baseline, delta_text, patient_info="", section=None):
    """Assemble the LLM prompt for a patient summary.

    The prompt always opens with the role instructions followed by the patient
    information, structured baseline and structured deltas. With a truthy
    ``section`` the prompt asks for that single section only; otherwise it asks
    for the complete four-section markdown summary.

    Args:
        baseline: Text inserted under "STRUCTURED BASELINE (authoritative)".
        delta_text: Text inserted under "STRUCTURED DELTAS (authoritative)".
        patient_info: Text inserted under "PATIENT INFORMATION".
        section: Optional section title; unknown titles get a generic
            "Generate the '<section>' section." instruction.

    Returns:
        The full prompt string.
    """
    preamble = "".join([
        "You are an expert clinical AI assistant. Your task is to generate a patient summary.\n",
        "Use the chartsummarydtl for context. The STRUCTURED BASELINE and DELTAS are the absolute ground truth.\n",
        "Produce a concise, physician-ready summary. Never omit critical new information from the deltas.\n\n",
        f"PATIENT INFORMATION:\n{patient_info}\n\n",
        f"STRUCTURED BASELINE (authoritative):\n{baseline}\n\n",
        f"STRUCTURED DELTAS (authoritative):\n{delta_text}\n\n",
    ])

    # No section requested: ask for the complete four-section summary.
    if not section:
        full_summary_request = (
            "The summary MUST have four sections:\n"
            "1) Clinical Assessment\n"
            "2) Key Trends & Changes\n"
            "3) Plan & Suggested Actions\n"
            "4) Direct Guidance for Physician\n\n"
            "Now generate the complete clinical summary with all four sections in markdown format:"
        )
        return preamble + full_summary_request

    # Single-section mode: pick the tailored instruction, or a generic one.
    per_section_instruction = {
        "Clinical Assessment": "Generate ONLY the 'Clinical Assessment' section. Be concise, accurate, and evidence-based.",
        "Key Trends & Changes": "Generate ONLY the 'Key Trends & Changes' section. Focus on deltas, trends, vitals, labs, and med changes.",
        "Plan & Suggested Actions": "Generate ONLY the 'Plan & Suggested Actions' section. Suggest next steps, monitoring, referrals, or med adjustments.",
        "Direct Guidance for Physician": "Generate ONLY the 'Direct Guidance for Physician' section. Give clear, actionable advice for the clinician.",
    }
    directive = per_section_instruction.get(section, f"Generate the '{section}' section.")
    trailer = "\n\nOutput ONLY the section content. Do not include headers unless specified.\n\n"
    return preamble + directive + trailer
 def validate_and_compare_summaries(old_summary, new_summary, update_name=""):
     report = f"### Validation Report for {update_name}\n"
     report += "This report validates that the updated summary incorporates new information correctly.\n"

generate_patient_summary_colab.py CHANGED Viewed

@@ -313,7 +313,7 @@ class GGUFModelPipeline:
             text = re.sub(p, "", text, flags=re.IGNORECASE)
         return text.strip()
-    def _generate_with_timeout(self, prompt, max_tokens=2048, temperature=0.5, top_p=0.95, timeout=None):
         if timeout is None:
             is_hf_space = os.environ.get('SPACE_ID') is not None
             timeout = int(os.environ.get('GGUF_GENERATION_TIMEOUT', '600' if is_hf_space else '300'))
@@ -341,7 +341,7 @@ class GGUFModelPipeline:
                 future.cancel()
                 raise TimeoutError(f"Generation timed out after {timeout} seconds")
-    def generate(self, prompt, max_tokens=2048, temperature=0.5, top_p=0.95):
         t0 = time.time()
         try:
             output = self._generate_with_timeout(prompt, max_tokens, temperature, top_p)
@@ -358,7 +358,7 @@ class GGUFModelPipeline:
             logging.error(f"Generation failed: {e}")
             raise RuntimeError(f"Text generation failed: {str(e)}")
-    def generate_full_summary(self, prompt, max_tokens=2048, max_loops=5):
         def is_complete(text):
             required_sections = [
                 'Clinical Assessment',
@@ -537,7 +537,7 @@ class SummarizerAgent:
                 return "Input text is too short for summarization"
             model = self.summarization_model_loader.load()
             if hasattr(model, 'generate_full_summary'):
-                summary = model.generate_full_summary(clean_text, max_tokens=2048, max_loops=2)
             else:
                 # fallback simple summarization
                 summary = model(clean_text, max_length=512, min_length=50, do_sample=False)

             text = re.sub(p, "", text, flags=re.IGNORECASE)
         return text.strip()
+    def _generate_with_timeout(self, prompt, max_tokens=4000, temperature=0.5, top_p=0.95, timeout=None):
         if timeout is None:
             is_hf_space = os.environ.get('SPACE_ID') is not None
             timeout = int(os.environ.get('GGUF_GENERATION_TIMEOUT', '600' if is_hf_space else '300'))
                 future.cancel()
                 raise TimeoutError(f"Generation timed out after {timeout} seconds")
+    def generate(self, prompt, max_tokens=4000, temperature=0.5, top_p=0.95):
         t0 = time.time()
         try:
             output = self._generate_with_timeout(prompt, max_tokens, temperature, top_p)
             logging.error(f"Generation failed: {e}")
             raise RuntimeError(f"Text generation failed: {str(e)}")
+    def generate_full_summary(self, prompt, max_tokens=4000, max_loops=5):
         def is_complete(text):
             required_sections = [
                 'Clinical Assessment',
                 return "Input text is too short for summarization"
             model = self.summarization_model_loader.load()
             if hasattr(model, 'generate_full_summary'):
+                summary = model.generate_full_summary(clean_text, max_tokens=4000, max_loops=2)
             else:
                 # fallback simple summarization
                 summary = model(clean_text, max_length=512, min_length=50, do_sample=False)