Spaces:

salvinjose
/

HNTAI

Paused

App Files Files Community

Joyna-Joy committed on Jun 12, 2025

Commit

a9612bb

1 Parent(s): 206c5f7

new_updated_code

Browse files

Former-commit-id: 301deea94ad0073a344b4f8ce0904b594c9e15b4

Files changed (2) hide show

ai_med_extract/api/routes.py.REMOVED.git-id +1 -1
ai_med_extract/utils/validation.py +239 -0

ai_med_extract/api/routes.py.REMOVED.git-id CHANGED Viewed

	@@ -1 +1 @@
1	- ~~ff540d5471cce91e425947ea7e6397c986f9a7fb~~


1	+ 053c0d73058268dec33b161e1067d37c3fbe1855

ai_med_extract/utils/validation.py CHANGED Viewed

@@ -1,8 +1,43 @@
 import re
 from flask import jsonify
 import logging
 import os
 def clean_result(value):
     value = re.sub(r"\s+", " ", value)
     value = re.sub(r"[-_:]+", " ", value)
@@ -138,3 +173,207 @@ def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):

+from collections import defaultdict
+import functools
+import json
 import re
+import time
 from flask import jsonify
 import logging
 import os
+# -------------------- Logging Config -------------------- #
# Configure the root logger when this module is imported: INFO and above is
# written both to "app.log" (relative path) and to a stream handler.
# Per the logging docs, basicConfig() does nothing if the root logger already
# has handlers configured.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
)

# Module-level logger used by the helpers defined below.
logger = logging.getLogger(__name__)
+# -------------------- Execution Time Decorator -------------------- #
def log_execution_time(level=logging.INFO):
    """Build a decorator that logs how long each call to a function takes.

    Args:
        level: Logging level used for the "executed in" message emitted after
            a successful call. Failures are always reported through
            ``logger.exception`` (ERROR level, with traceback).

    Returns:
        A decorator. The wrapped function returns whatever the original
        returns and re-raises, unchanged, whatever the original raises.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # perf_counter() is monotonic, so the measured duration cannot be
            # skewed (or go negative) when the wall clock is adjusted.
            start_time = time.perf_counter()
            try:
                result = func(*args, **kwargs)
            except Exception as e:
                duration = time.perf_counter() - start_time
                logger.exception(
                    "❌ Exception in %s after %.6f seconds: %s",
                    func.__name__,
                    duration,
                    e,
                )
                raise
            duration = time.perf_counter() - start_time
            logger.log(
                level,
                "⏱️ %s executed in %.6f seconds",
                func.__name__,
                duration,
            )
            return result
        return wrapper
    return decorator
 def clean_result(value):
     value = re.sub(r"\s+", " ", value)
     value = re.sub(r"[-_:]+", " ", value)
+# ------------------ CLEAN FUNCTION  ------------------ #
def _clean_value(value):
    """Recursively clean one value; undecorated so recursion is not logged."""
    if isinstance(value, str):
        value = re.sub(r"[-_:]+", " ", value)
        value = re.sub(r"[^\x00-\x7F]+", " ", value)
        # Remove commas in numbers like 250,000
        value = re.sub(r"(?<=\d),(?=\d)", "", value)
        # Collapse whitespace last: the substitutions above insert spaces, so
        # collapsing first would leave runs such as "BP  120" in the result.
        value = re.sub(r"\s+", " ", value).strip()
        return value or "Not Available"
    if isinstance(value, list):
        cleaned = [_clean_value(v) for v in value if v is not None]
        return cleaned or ["Not Available"]
    if isinstance(value, dict):
        return {k: _clean_value(v) for k, v in value.items()}
    return value


@log_execution_time()
def clean_result(value):
    """Normalise an extracted value for display.

    Strings: runs of ``-``, ``_`` and ``:`` and runs of non-ASCII characters
    are replaced by a space, commas between two digits are removed, whitespace
    is collapsed to single spaces and the result is stripped. A string that
    ends up empty becomes ``"Not Available"``.

    Lists: ``None`` elements are dropped and the rest cleaned recursively; an
    empty result becomes ``["Not Available"]``.

    Dicts: every value is cleaned recursively, keys are left untouched.

    Any other type is returned unchanged.

    Args:
        value: The value to clean (str, list, dict, or anything else).

    Returns:
        The cleaned value, of the same container kind as the input.
    """
    logger.debug("Cleaning value: %s", value)
    return _clean_value(value)
+# ------------------Group by Category ------------------ #
@log_execution_time()
def group_by_category(data):
    """Group extracted items by their ``"category"`` key.

    Args:
        data: Iterable of dicts. Each may carry ``"category"``, ``"question"``,
            ``"label"`` and ``"answer"``; missing keys fall back to
            ``"General"``, ``"Not Created"``, ``"Unknown"`` and
            ``"Not Available"`` respectively. ``None`` is treated as empty,
            and elements that are not dicts are skipped with a warning.

    Returns:
        A list of ``{"category": <name>, "detail": [<items>]}`` dicts, in
        order of first appearance of each category.
    """
    logger.info("Grouping extracted items by category")
    grouped = defaultdict(list)
    category_times = defaultdict(float)
    if data is None:
        data = ()
    for item in data:
        if not isinstance(item, dict):
            # Parsed model output is not guaranteed to contain only objects;
            # log the type only, so record contents do not leak into the log.
            logger.warning(
                "Skipping non-dict item of type %s while grouping",
                type(item).__name__,
            )
            continue
        cat = item.get("category", "General")
        start_time = time.perf_counter()
        grouped[cat].append(
            {
                "question": item.get("question", "Not Created"),
                "label": item.get("label", "Unknown"),
                "answer": item.get("answer", "Not Available"),
            }
        )
        category_times[cat] += time.perf_counter() - start_time
    for cat, details in grouped.items():
        logger.info(
            "📂 Category '%s': %d items, time taken: %.4fs",
            cat,
            len(details),
            category_times[cat],
        )
    return [{"category": k, "detail": v} for k, v in grouped.items()]
+# ------------------detect duplicate to keep latest ------------------ #
@log_execution_time()
def deduplicate_extractions(data):
    """Drop items whose ``"label"`` repeats, keeping the last occurrence.

    Items that cannot be compared by label (not a dict, no label, a ``None``
    label, or an unhashable label) are never treated as duplicates of each
    other and are always kept; previously all unlabeled items collapsed into
    a single one.

    Args:
        data: Sequence of extracted items (normally dicts). ``None`` or an
            empty sequence yields an empty list.

    Returns:
        A new list with the surviving items in their original relative order.
    """
    logger.info("Deduplicating extracted data (keep last duplicates)")
    if not data:
        return []
    seen = set()
    kept_reversed = []
    # Walk backwards so the first time a label is met is its *last* occurrence.
    for item in reversed(data):
        label = item.get("label") if isinstance(item, dict) else None
        if label is not None:
            try:
                if label in seen:
                    continue
                seen.add(label)
            except TypeError:
                # Unhashable label (e.g. a list): cannot be deduplicated,
                # so the item is kept as-is.
                pass
        kept_reversed.append(item)
    # Restore the original ordering of the kept items.
    kept_reversed.reverse()
    return kept_reversed
+# -----------------------------Split text into overlapping chunks---------------#
@log_execution_time()
def chunk_text(text, tokenizer, max_tokens=512, overlap=50):
    """Split text into overlapping token-based chunks without using NLTK.

    Args:
        text (str): Raw input text.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, skip_special_tokens=True)`` (e.g. a Hugging Face
            tokenizer).
        max_tokens (int): Max tokens per chunk; must be positive.
        overlap (int): Number of tokens shared by consecutive chunks; must
            satisfy ``0 <= overlap < max_tokens``.

    Returns:
        List[str]: Decoded text chunks. A chunk that does not end in
        ``.``, ``?``, ``!`` or ``:`` gets ``"..."`` appended.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. An
            overlap of at least ``max_tokens`` would otherwise never advance
            the window and loop forever.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be positive, got {max_tokens}")
    if not 0 <= overlap < max_tokens:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_tokens, got {overlap}"
        )

    logger.info("Splitting text into chunks")
    input_ids = tokenizer.encode(text, add_special_tokens=False)
    total = len(input_ids)
    step = max_tokens - overlap
    chunks = []
    start = 0
    while start < total:
        end = start + max_tokens
        piece = tokenizer.decode(input_ids[start:end], skip_special_tokens=True)
        # Mark chunks that stop without sentence-ending punctuation.
        if not piece.endswith(('.', '?', '!', ':')):
            piece += "..."
        chunks.append(piece)
        if end >= total:
            # This window already reached the last token; another step would
            # only emit a chunk made of tokens that were just covered.
            break
        start += step
    logger.info("Created %d chunks", len(chunks))
    return chunks
+    # ------------------ PARSE JSON OBJECTS FROM OUTPUT ------------------ #
@log_execution_time()
def extract_json_objects(text):
    """Recover JSON objects from free-form model output.

    The text from the first ``[`` onward is first parsed as a whole; if that
    yields a list it is returned as-is. Otherwise every top-level ``{...}``
    span is located by brace matching and parsed individually, and spans that
    are not valid JSON are logged and skipped.

    Args:
        text: Raw output text. Anything that is not a string (e.g. ``None``)
            yields an empty list.

    Returns:
        list: The parsed array, or the individually recovered objects, or
        ``[]`` when nothing could be extracted.
    """
    logger.info("Extracting JSON objects from text")
    if not isinstance(text, str):
        logger.warning("⚠ Expected text output, got %s", type(text).__name__)
        return []

    json_start = text.find('[')
    if json_start == -1:
        logger.warning("⚠ '[' not found in output")
        return []
    json_text = text[json_start:]

    # Try parsing full array first
    try:
        parsed = json.loads(json_text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; fall back to manual parsing.
        parsed = None
    if isinstance(parsed, list):
        return parsed

    # Manual recovery via brace matching
    extracted = []
    depth = 0
    obj_start = None
    in_string = False
    escaped = False
    for i, char in enumerate(json_text):
        if in_string:
            # Braces inside a string value must not affect the nesting depth.
            if char == '\n':
                # A raw newline is not valid inside a JSON string; treat it as
                # the end of an unterminated string so later objects can still
                # be recovered.
                in_string = False
                escaped = False
            elif escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue
        if char == '"':
            # Quotes in surrounding commentary (depth 0) are ignored.
            in_string = depth > 0
        elif char == '{':
            if depth == 0:
                obj_start = i
            depth += 1
        elif char == '}':
            if depth == 0:
                # Stray closing brace: skip it rather than letting the depth
                # go negative, which would block every later object.
                continue
            depth -= 1
            if depth == 0:
                obj_str = json_text[obj_start:i + 1]
                try:
                    extracted.append(json.loads(obj_str))
                except ValueError as e:
                    logger.error("❌ Invalid JSON object: %s", e)
                obj_start = None
    return extracted
+# ------------------ PROCESS A SINGLE CHUNK ------------------ #
@log_execution_time()
def process_chunk(generator, chunk, idx):
    """Run the extraction prompt over one text chunk.

    Args:
        generator: Callable invoked as ``generator(prompt, max_new_tokens=...,
            do_sample=..., temperature=...)`` whose result is indexed as
            ``[0]["generated_text"]`` (presumably a Hugging Face
            text-generation pipeline; verify against the caller).
        chunk (str): Text of the chunk, embedded verbatim in the prompt.
        idx (int): Zero-based index of the chunk; log lines show ``idx + 1``.

    Returns:
        tuple: ``(idx, output)`` with the generated text, or ``(idx, None)``
        if generation raised.
    """
    chunk_no = idx + 1  # human-friendly, 1-based number used in every log line
    logger.info("Processing chunk %d", chunk_no)
    prompt = f"""
            [INST] <<SYS>>
            You are a clinical data extraction assistant.
            Your job is to:
            1. Read the following medical report.
            2. Extract all medically relevant facts as a list of JSON objects.
            3. Each object must include:
            - "label": a short field name (e.g., "blood pressure", "diagnosis")
            - "question": a question related to that field
            - "answer": the answer from the text
            4. After extracting the list, categorize each object under one of the following fixed categories:
            - Patient Info
            - Vitals
            - Symptoms
            - Allergies
            - Habits
            - Comorbidities
            - Diagnosis
            - Medication
            - Laboratory
            - Radiology
            - Doctor Note
             Example format for structure only — do not include in output:
            [
            {{
                "label": "patient name",
                "question": "What is the patient's name?",
                "answer": "Marry John",
                "category": "Patient Info"
            }},
            ]
            ⚠ Use these categories listed above.If an item does not fit any of these categories, create a new category for it.
            Text:
            {chunk}
            Return a single valid JSON array of all extracted objects.
            Do not include any explanations or commentary.
            Only output the JSON array
            <</SYS>> [/INST]
            """
    try:
        # NOTE(review): if `generator` is a transformers text-generation
        # pipeline, "generated_text" includes the prompt by default; confirm
        # the caller strips it (or that return_full_text=False is configured)
        # before the output is parsed.
        output = generator(
            prompt,
            max_new_tokens=1024,
            do_sample=True,
            temperature=0.3
        )[0]["generated_text"]
        logger.debug("----------------------------------")
        logger.info("📤 Output from chunk %d: %s...", chunk_no, output)
        return idx, output
    except Exception:
        # Best-effort: one failed chunk must not abort the whole document, but
        # keep the traceback so the failure can be diagnosed.
        logger.exception("Error processing chunk %d", chunk_no)
        return idx, None