Spaces:

salvinjose
/

HNTAI

Paused

Joyna-Joy committed on Jun 12, 2025

Commit

97ea1d0

1 Parent(s): a9612bb

Revert "new_updated_code"

This reverts commit a9612bbc791186d89d3ea3f508bec036f435975d [formerly 301deea94ad0073a344b4f8ce0904b594c9e15b4].

Former-commit-id: f31832c58ce096d831af0ca39e6de25d7a3419b1

Files changed (2) hide show

ai_med_extract/api/routes.py.REMOVED.git-id +1 -1
ai_med_extract/utils/validation.py +0 -239

ai_med_extract/api/routes.py.REMOVED.git-id CHANGED Viewed

	@@ -1 +1 @@
1	- ~~053c0d73058268dec33b161e1067d37c3fbe1855~~


1	+ ff540d5471cce91e425947ea7e6397c986f9a7fb

ai_med_extract/utils/validation.py CHANGED Viewed

@@ -1,43 +1,8 @@
-from collections import defaultdict
-import functools
-import json
 import re
-import time
 from flask import jsonify
 import logging
 import os
-# -------------------- Logging Config -------------------- #
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s - %(levelname)s - %(message)s",
-    handlers=[
-        logging.FileHandler("app.log"),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger(__name__)
-# -------------------- Execution Time Decorator -------------------- #
-def log_execution_time(level=logging.INFO):
-    def decorator(func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            start_time = time.time()
-            try:
-                result = func(*args, **kwargs)
-                duration = time.time() - start_time
-                logger.log(level, f"⏱️ {func.__name__} executed in {duration:.6f} seconds")
-                return result
-            except Exception as e:
-                duration = time.time() - start_time
-                logger.exception(f"❌ Exception in {func.__name__} after {duration:.6f} seconds: {e}")
-                raise
-        return wrapper
-    return decorator
 def clean_result(value):
     value = re.sub(r"\s+", " ", value)
     value = re.sub(r"[-_:]+", " ", value)
@@ -173,207 +138,3 @@ def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
-# ------------------ CLEAN FUNCTION  ------------------ #
-@log_execution_time()
-def clean_result(value):
-    logger.debug("Cleaning value: %s", value)
-    if isinstance(value, str):
-        value = re.sub(r"\s+", " ", value)
-        value = re.sub(r"[-_:]+", " ", value)
-        value = re.sub(r"[^\x00-\x7F]+", " ", value)
-        value = re.sub(
-            r"(?<=\d),(?=\d)", "", value
-        )  # Remove commas in numbers like 250,000
-        return value.strip() if value.strip() else "Not Available"
-    elif isinstance(value, list):
-        cleaned = [clean_result(v) for v in value if v is not None]
-        return cleaned if cleaned else ["Not Available"]
-    elif isinstance(value, dict):
-        return {k: clean_result(v) for k, v in value.items()}
-    return value
-# ------------------Group by Category ------------------ #
-@log_execution_time()
-def group_by_category(data):
-    logger.info("Grouping extracted items by category")
-    grouped = defaultdict(list)
-    category_times = {}
-    for item in data:
-        cat = item.get("category", "General")
-        start_time = time.time()
-        grouped[cat].append(
-            {
-                "question": item.get("question", "Not Created"),
-                "label": item.get("label", "Unknown"),
-                "answer": item.get("answer", "Not Available"),
-            }
-        )
-        elapsed = time.time() - start_time
-        category_times[cat] = category_times.get(cat, 0) + elapsed
-    for cat, details in grouped.items():
-        logger.info(f"📂 Category '{cat}': {len(details)} items, time taken: {category_times[cat]:.4f}s")
-    return [{"category": k, "detail": v} for k, v in grouped.items()]
-# ------------------detect duplicate to keep latest ------------------ #
-@log_execution_time()
-def deduplicate_extractions(data):
-    logger.info("Deduplicating extracted data (keep last duplicates)")
-    seen = set()
-    reversed_unique = []
-    # Loop in reverse to keep the *last* occurrence
-    for item in reversed(data):
-        key = (item.get("label"))
-        if key not in seen:
-            seen.add(key)
-            reversed_unique.append(item)
-    # Reverse back to preserve original order (latest kept, first dropped)
-    return list(reversed(reversed_unique))
-# -----------------------------Split text into overlapping chunks---------------#
-@log_execution_time()
-def chunk_text(text, tokenizer, max_tokens=512, overlap=50):
-    """
-    Splits text into overlapping token-based chunks without using NLTK.
-    Args:
-        text (str): Raw input text.
-        tokenizer (transformers tokenizer): Hugging Face tokenizer instance.
-        max_tokens (int): Max tokens per chunk.
-        overlap (int): Number of overlapping tokens between chunks.
-    Returns:
-        List[str]: List of decoded text chunks.
-    """
-    # Tokenize the full text
-    logger.info("Splitting text into chunks")
-    input_ids = tokenizer.encode(text, add_special_tokens=False)
-    chunks = []
-    start = 0
-    while start < len(input_ids):
-        end = start + max_tokens
-        chunk_ids = input_ids[start:end]
-        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=True)
-        # Ensure partial continuation isn't cut off mid-sentence
-        if not chunk_text.endswith(('.', '?', '!', ':')):
-            chunk_text += "..."
-        chunks.append(chunk_text)
-        start += max_tokens - overlap
-    logger.info("Created %d chunks", len(chunks))
-    return chunks
-    # ------------------ PARSE JSON OBJECTS FROM OUTPUT ------------------ #
-@log_execution_time()
-def extract_json_objects(text):
-    logger.info("Extracting JSON objects from text")
-    extracted = []
-    try:
-        json_start = text.index('[')
-        json_text = text[json_start:]
-    except ValueError:
-        logger.warning("⚠ '[' not found in output")
-        return []
-    # Try parsing full array first
-    try:
-        parsed = json.loads(json_text)
-        if isinstance(parsed, list):
-            return parsed
-    except Exception:
-        pass  # fallback to manual parsing
-    # Manual recovery via brace matching
-    stack = 0
-    obj_start = None
-    for i, char in enumerate(json_text):
-        if char == '{':
-            if stack == 0:
-                obj_start = i
-            stack += 1
-        elif char == '}':
-            stack -= 1
-            if stack == 0 and obj_start is not None:
-                obj_str = json_text[obj_start:i+1]
-                try:
-                    obj = json.loads(obj_str)
-                    extracted.append(obj)
-                except Exception as e:
-                    logger.error(f"❌ Invalid JSON object: {e}")
-                obj_start = None
-    return extracted
-# ------------------ PROCESS A SINGLE CHUNK ------------------ #
-@log_execution_time()
-def process_chunk(generator, chunk, idx):
-    logger.info("Processing chunk %d", idx + 1)
-    prompt = f"""
-            [INST] <<SYS>>
-            You are a clinical data extraction assistant.
-            Your job is to:
-            1. Read the following medical report.
-            2. Extract all medically relevant facts as a list of JSON objects.
-            3. Each object must include:
-            - "label": a short field name (e.g., "blood pressure", "diagnosis")
-            - "question": a question related to that field
-            - "answer": the answer from the text
-            4. After extracting the list, categorize each object under one of the following fixed categories:
-            - Patient Info
-            - Vitals
-            - Symptoms
-            - Allergies
-            - Habits
-            - Comorbidities
-            - Diagnosis
-            - Medication
-            - Laboratory
-            - Radiology
-            - Doctor Note
-             Example format for structure only — do not include in output:
-            [
-            {{
-                "label": "patient name",
-                "question": "What is the patient's name?",
-                "answer": "Marry John",
-                "category": "Patient Info"
-            }},
-            ]
-            ⚠ Use these categories listed above.If an item does not fit any of these categories, create a new category for it.
-            Text:
-            {chunk}
-            Return a single valid JSON array of all extracted objects.
-            Do not include any explanations or commentary.
-            Only output the JSON array
-            <</SYS>> [/INST]
-            """
-    try:
-        output = generator(
-            prompt,
-            max_new_tokens=1024,
-            do_sample=True,
-            temperature=0.3
-        )[0]["generated_text"]
-        print("----------------------------------")
-        logger.info(f"📤 Output from chunk {idx}: {output}...")
-        return idx, output
-    except Exception as e:
-        logger.error("Error processing chunk %d: %s", idx, e)
-        return idx, None

 import re
 from flask import jsonify
 import logging
 import os
 def clean_result(value):
     value = re.sub(r"\s+", " ", value)
     value = re.sub(r"[-_:]+", " ", value)