Spaces:

hoshikrana
/

MedSightAI

Running

App Files Files Community

hoshikrana committed on May 5

Commit

b1406c1

1 Parent(s): 7807e33

feat: NLP module, multimodal fusion, and RAG chatbot

Browse files

Files changed (14) hide show

backend/api/v1/routers/analyze.py +30 -2
backend/api/v1/routers/chat.py +59 -0
backend/ml/fusion/medclip.py +74 -0
backend/ml/nlp/classifier.py +81 -0
backend/ml/nlp/ner.py +134 -0
backend/ml/nlp/whisper.py +70 -0
backend/ml/rag/generator.py +53 -0
backend/ml/rag/retriever.py +53 -0
backend/ml/rag/vectorstore.py +117 -0
frontend/components/analysis/VoiceInput.jsx +138 -0
frontend/components/chat/ChatInterface.jsx +98 -0
training/scripts/finetune_ner.py +103 -0
training/scripts/ingest_pubmed.py +77 -0
training/scripts/prepare_ner_data.py +83 -0

backend/api/v1/routers/analyze.py CHANGED Viewed

@@ -1,14 +1,16 @@
 from fastapi import APIRouter, Depends, Form, File, UploadFile, HTTPException, Request
 from sqlalchemy.ext.asyncio import AsyncSession
 from pathlib import Path
 from backend.db.session import get_db
 from backend.db.models import AnalysisSession, AnalysisTask
-from backend.core.dependencies import get_current_user, get_pagination
-from backend.core.exceptions import InvalidFileTypeError, FileTooLargeError
 from backend.utils.validators import ImageValidator, sanitize_symptoms_text, validate_patient_id, safe_temp_path
 from backend.api.v1.schemas.analysis import TaskSubmitResponse, TaskStatusResponse, AnalysisResult
 from backend.orchestration.queue import task_queue
 router = APIRouter()
@@ -77,3 +79,29 @@ async def get_result(task_id: str, db: AsyncSession = Depends(get_db), current_u
 async def cancel_task(task_id: str, current_user = Depends(get_current_user)):
     success = await task_queue.cancel(task_id, current_user.id)
     return {"cancelled": success, "message": "Task cancelled" if success else "Cannot cancel task"}

+import asyncio
 from fastapi import APIRouter, Depends, Form, File, UploadFile, HTTPException, Request
 from sqlalchemy.ext.asyncio import AsyncSession
 from pathlib import Path
 from backend.db.session import get_db
 from backend.db.models import AnalysisSession, AnalysisTask
+from backend.core.dependencies import get_current_user, get_pagination, get_model_registry
+from backend.core.exceptions import InvalidFileTypeError, FileTooLargeError, ModelNotLoadedError
 from backend.utils.validators import ImageValidator, sanitize_symptoms_text, validate_patient_id, safe_temp_path
 from backend.api.v1.schemas.analysis import TaskSubmitResponse, TaskStatusResponse, AnalysisResult
 from backend.orchestration.queue import task_queue
+from backend.ml.nlp.whisper import WhisperTranscriber
 router = APIRouter()
 async def cancel_task(task_id: str, current_user = Depends(get_current_user)):
     success = await task_queue.cancel(task_id, current_user.id)
     return {"cancelled": success, "message": "Task cancelled" if success else "Cannot cancel task"}
@router.post("/transcribe")
async def transcribe_audio(
    audio: UploadFile = File(...),
    registry = Depends(get_model_registry),
    current_user = Depends(get_current_user)
):
    """Transcribe an uploaded audio clip with the ``whisper_tiny`` model.

    The upload is validated (media type, 25MB size cap), spooled to a
    temporary file, handed to ``WhisperTranscriber.transcribe`` on a worker
    thread, and the temporary file is always removed afterwards.

    Returns:
        Whatever ``WhisperTranscriber.transcribe`` returns.

    Raises:
        InvalidFileTypeError: the media type is not WAV, MP3, WebM or OGG.
        FileTooLargeError: the payload exceeds 25MB.
        ModelNotLoadedError: the registry reports the model as unavailable.
    """
    allowed_types = {"audio/wav", "audio/mpeg", "audio/webm", "audio/ogg"}
    # Clients may append parameters (e.g. "audio/webm;codecs=opus") or omit
    # the header entirely, so compare only the bare, lower-cased media type.
    media_type = (audio.content_type or "").split(";", 1)[0].strip().lower()
    if media_type not in allowed_types:
        raise InvalidFileTypeError("Audio must be WAV, MP3, WebM, or OGG")

    content = await audio.read()
    if len(content) > 25 * 1024 * 1024:
        raise FileTooLargeError("Audio file too large (max 25MB)")

    # Check the model before touching the disk: nothing to clean up if it is down.
    whisper_state = await registry.get("whisper_tiny")
    if not whisper_state.is_available:
        raise ModelNotLoadedError("Voice transcription unavailable")

    temp_path = safe_temp_path(audio.filename or "audio.webm")
    try:
        # The write sits inside the try so a partially written file is removed
        # too, and runs on a worker thread to keep the event loop responsive.
        await asyncio.to_thread(temp_path.write_bytes, content)
        return await asyncio.to_thread(
            WhisperTranscriber.transcribe, temp_path, whisper_state.model
        )
    finally:
        temp_path.unlink(missing_ok=True)

backend/api/v1/routers/chat.py CHANGED Viewed

	@@ -0,0 +1,59 @@

+import json
+import asyncio
+from fastapi import APIRouter, Depends
+from fastapi.responses import StreamingResponse
+from sqlalchemy.ext.asyncio import AsyncSession
+from backend.api.v1.schemas.chat import ChatRequest
+from backend.core.dependencies import get_session_or_404, get_model_registry, get_current_user
+from backend.db.session import get_db
+# Note: Assume session_manager is built in db/utils.py to manage chat history records
+# from backend.db.utils import session_manager
+from backend.ml.rag.retriever import MedicalRAG
+from backend.ml.rag.generator import ChatGenerator
+router = APIRouter()
@router.post("/")
async def chat(
    body: ChatRequest,
    session = Depends(get_session_or_404),
    registry = Depends(get_model_registry),
    current_user = Depends(get_current_user),
    db: AsyncSession = Depends(get_db)
):
    """Answer a chat message as a server-sent event stream.

    Events are emitted in this order: one ``sources`` event (the first three
    retrieved chunks), zero or more ``token`` events, an optional ``token``
    event carrying the safety warning, and a final ``done`` event.
    """
    # NOTE(review): an "unsafe" query is still answered by the model; only the
    # warning text is appended afterwards. Confirm this is the intended policy.
    _, warning = MedicalRAG.is_safe_query(body.message)
    # Placeholder for actual DB history fetch
    history_dicts = []
    session_result = getattr(session, "result_json", {})
    # Retrieval is synchronous, so run it off the event loop.
    chunks = await asyncio.to_thread(
        MedicalRAG.retrieve, body.message, session_result, n_results=5
    )
    prompt = MedicalRAG.build_prompt(body.message, chunks, history_dicts, session_result)
    biogpt_state = await registry.get("biogpt_base")
    # Collected so the complete answer is available once streaming ends
    # (nothing in this handler persists it yet).
    full_response = []

    def sse_event(payload: dict) -> str:
        """Serialize one payload as a ``data:`` server-sent event frame."""
        return f"data: {json.dumps(payload)}\n\n"

    async def stream_generator():
        yield sse_event({'type': 'sources', 'sources': chunks[:3]})
        if not biogpt_state.is_available:
            fallback = "I cannot provide a detailed answer right now as the AI system is unavailable."
            yield sse_event({'type': 'token', 'token': fallback})
        else:
            token_iter = ChatGenerator.generate_stream(
                prompt, biogpt_state.model, biogpt_state.tokenizer
            )
            exhausted = object()
            # Pull one token at a time on a worker thread so each token is
            # forwarded as soon as it exists instead of after generation ends.
            while True:
                token = await asyncio.to_thread(next, token_iter, exhausted)
                if token is exhausted:
                    break
                full_response.append(token)
                yield sse_event({'type': 'token', 'token': token})
        if warning:
            yield sse_event({'type': 'token', 'token': warning})
        yield sse_event({'type': 'done'})

    return StreamingResponse(
        stream_generator(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "Connection": "keep-alive", "X-Accel-Buffering": "no"}
    )

backend/ml/fusion/medclip.py CHANGED Viewed

	@@ -0,0 +1,74 @@

+import torch
+import torch.nn.functional as F
+from pathlib import Path
+from PIL import Image
+from torchvision import transforms
+import logging
+logger = logging.getLogger(__name__)
class MultimodalFusion:
    """Computes image-text alignment using BiomedVLP."""

    IMAGE_SIZE = 224
    # Cut-offs applied to the similarity after it is rescaled to [0, 1].
    HIGH_THRESHOLD = 0.7
    MEDIUM_THRESHOLD = 0.4

    @staticmethod
    def get_image_transform():
        """Return the resize / to-tensor / normalize pipeline applied to images."""
        side = MultimodalFusion.IMAGE_SIZE
        return transforms.Compose([
            transforms.Resize((side, side)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    @staticmethod
    def get_image_embedding(image_path: Path, model, device: str) -> torch.Tensor:
        """Embed the image at ``image_path`` and L2-normalize the result.

        ``model`` must expose ``get_image_embeddings(tensor)``.
        """
        # Close the file handle deterministically; ``convert`` returns an
        # independent in-memory copy, so the source can be released at once.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
        transform = MultimodalFusion.get_image_transform()
        image_tensor = transform(image).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model.get_image_embeddings(image_tensor)
        return F.normalize(embedding, p=2, dim=-1)

    @staticmethod
    def get_text_embedding(text: str, model, tokenizer, device: str) -> torch.Tensor:
        """Embed ``text`` (truncated/padded to 256 tokens) and L2-normalize it.

        ``model`` must expose ``get_text_embeddings(**tokenizer_output)``.
        """
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, max_length=256, padding="max_length"
        ).to(device)
        with torch.no_grad():
            embedding = model.get_text_embeddings(**inputs)
        return F.normalize(embedding, p=2, dim=-1)

    @staticmethod
    def compute_similarity(image_path: Path, text: str, model, tokenizer, device: str) -> tuple[float, str]:
        """Score how well ``text`` matches the image.

        Returns:
            ``(similarity, alignment)`` where similarity is the cosine
            similarity rescaled to [0, 1] and rounded to three decimals, and
            alignment is "HIGH", "MEDIUM" or "LOW". Any failure is logged and
            reported as ``(0.5, "UNKNOWN")`` rather than raised.
        """
        try:
            img_emb = MultimodalFusion.get_image_embedding(image_path, model, device)
            txt_emb = MultimodalFusion.get_text_embedding(text, model, tokenizer, device)
            cosine = float(torch.cosine_similarity(img_emb, txt_emb).item())
        except Exception as e:
            # Deliberately best-effort: fusion must never fail the analysis.
            logger.warning("Fusion similarity computation failed: %s", e)
            return 0.5, "UNKNOWN"

        # Shift [-1, 1] to [0, 1]; clamp because floating-point error can push
        # the cosine of unit vectors marginally outside its nominal range.
        similarity = min(1.0, max(0.0, (cosine + 1) / 2))
        if similarity >= MultimodalFusion.HIGH_THRESHOLD:
            alignment = "HIGH"
        elif similarity >= MultimodalFusion.MEDIUM_THRESHOLD:
            alignment = "MEDIUM"
        else:
            alignment = "LOW"
        return round(similarity, 3), alignment

    @staticmethod
    def get_fused_embedding(image_path: Path, text: str, model, tokenizer, device: str) -> torch.Tensor:
        """Concatenate the normalized image and text embeddings along the last axis."""
        img_emb = MultimodalFusion.get_image_embedding(image_path, model, device)
        txt_emb = MultimodalFusion.get_text_embedding(text, model, tokenizer, device)
        return torch.cat([img_emb, txt_emb], dim=-1)
class FallbackFusion:
    """Keyword heuristic standing in for ``MultimodalFusion``."""

    CHEST_KEYWORDS = (
        "chest", "lung", "cardiac", "pleural", "pneumo",
        "infiltrate", "opacity", "nodule", "effusion",
    )

    @staticmethod
    def compute_similarity(image_path: Path, text: str) -> tuple[float, str]:
        """Simple keyword-based fallback when BiomedVLP unavailable due to RAM constraints.

        ``image_path`` is accepted for signature parity but not read. The
        score starts at 0.3, gains 0.1 per chest keyword found in ``text``
        (substring match, case-insensitive) and is capped at 0.9.

        Returns:
            ``(score, alignment)`` with alignment "HIGH" above 0.6, "MEDIUM"
            above 0.4 and "LOW" otherwise.
        """
        text_lower = (text or "").lower()
        matches = sum(kw in text_lower for kw in FallbackFusion.CHEST_KEYWORDS)
        # Round before comparing: 0.3 + 3 * 0.1 evaluates to 0.6000000000000001,
        # which would otherwise leak float noise into the score and tip three
        # matches over the strict "> 0.6" boundary.
        score = round(min(0.9, 0.3 + matches * 0.1), 3)
        if score > 0.6:
            alignment = "HIGH"
        elif score > 0.4:
            alignment = "MEDIUM"
        else:
            alignment = "LOW"
        return score, alignment

backend/ml/nlp/classifier.py CHANGED Viewed

	@@ -0,0 +1,81 @@

+import threading
+from backend.api.v1.schemas.analysis import NERResult
# Candidate labels offered to the zero-shot classifier.
CHEST_CONDITIONS = [
    "Pneumonia", "Pleural Effusion", "Cardiomegaly", "Atelectasis",
    "Pneumothorax", "Pulmonary Edema", "Tuberculosis", "Lung Cancer",
    "COVID-19", "Chronic Obstructive Pulmonary Disease", "Asthma",
    "Pulmonary Fibrosis", "Bronchitis", "Emphysema", "Heart Failure",
    "Aortic Aneurysm", "Pulmonary Embolism", "Sarcoidosis",
    "No significant finding", "Other condition"
]


class DiseaseClassifier:
    """Zero-shot classifier. No fine-tuning needed. Always runs on CPU."""

    _pipeline = None
    _pipeline_lock = threading.Lock()

    @classmethod
    def _get_pipeline(cls):
        """Build the zero-shot pipeline on first use and cache it on the class."""
        if cls._pipeline is None:
            with cls._pipeline_lock:
                # Re-check under the lock so concurrent first callers load once.
                if cls._pipeline is None:
                    from transformers import pipeline
                    # Note: For faster inference use "valhalla/distilbart-mnli-12-1"
                    # We stick to bart-large-mnli as instructed, but it takes ~2-4s on CPU
                    cls._pipeline = pipeline(
                        "zero-shot-classification",
                        model="facebook/bart-large-mnli",
                        device=-1
                    )
        return cls._pipeline

    @staticmethod
    def classify(text: str, entities: "NERResult", top_k: int = 3) -> dict:
        """Rank ``CHEST_CONDITIONS`` against the clinical text.

        Args:
            text: Free-text clinical note.
            entities: NER output used to enrich the text; may be ``None``.
            top_k: Total number of labels reported, counting the primary one,
                so the differential holds at most ``top_k - 1`` entries.
                Values below 1 give an empty differential.

        Returns:
            ``{"primary", "confidence", "differential"}``. Blank text yields
            the "Insufficient information" placeholder with confidence 0.0.
        """
        if not text or not text.strip():
            return {"primary": "Insufficient information", "confidence": 0.0, "differential": []}

        enriched_text = DiseaseClassifier._build_enriched_text(text, entities)
        pipe = DiseaseClassifier._get_pipeline()
        result = pipe(
            enriched_text,
            candidate_labels=CHEST_CONDITIONS,
            multi_label=False
        )
        ranked = sorted(
            zip(result["labels"], result["scores"]),
            key=lambda pair: pair[1],
            reverse=True,
        )
        primary, primary_confidence = ranked[0]
        # Clamp so a negative top_k cannot become a "[1:-1]" slice that
        # returns nearly every label.
        n_alternatives = max(0, top_k - 1)
        differential = [
            {"disease": label, "confidence": round(score, 3)}
            for label, score in ranked[1:1 + n_alternatives]
        ]
        return {
            "primary": primary,
            "confidence": round(primary_confidence, 3),
            "differential": differential
        }

    @staticmethod
    def _build_enriched_text(original: str, entities: "NERResult") -> str:
        """Append extracted entities to the note, capped at 1024 characters.

        Missing or empty entity groups (or ``entities`` being ``None``) are
        skipped.
        """
        parts = [original]
        labelled_groups = (
            ("Diagnosed conditions", "diseases"),
            ("Presenting symptoms", "symptoms"),
            ("Current medications", "medications"),
        )
        for heading, attribute in labelled_groups:
            values = getattr(entities, attribute, None)
            if values:
                parts.append(f"{heading}: {', '.join(values)}")
        return ". ".join(parts)[:1024]
if __name__ == "__main__":
    # Manual smoke test: classify one hard-coded note and print the result.
    sample_entities = NERResult(
        diseases=[],
        symptoms=["chest pain", "shortness of breath"],
        medications=[],
        anatomy=[],
        raw_entities=[],
    )
    sample_note = "Patient complains of severe chest pain and shortness of breath."
    print("Classification result:", DiseaseClassifier.classify(sample_note, sample_entities))

backend/ml/nlp/ner.py CHANGED Viewed

	@@ -0,0 +1,134 @@

+import torch
+from transformers import AutoModelForTokenClassification, AutoTokenizer
+from backend.api.v1.schemas.analysis import NERResult
class NERExtractor:
    """Token-classification NER over clinical text, processed in overlapping chunks."""

    # Label suffix (after the B-/I- prefix) -> NERResult field name.
    ENTITY_MAP = {
        "DISEASE": "diseases",
        "SYMPTOM": "symptoms",
        "MEDICATION": "medications",
        "ANATOMY": "anatomy"
    }
    # Entities whose first-token probability is not above this are discarded.
    MIN_CONFIDENCE = 0.7

    @staticmethod
    def extract(text: str, model: "AutoModelForTokenClassification", tokenizer: "AutoTokenizer", device: str = "cuda") -> "NERResult":
        """Extract entities from ``text`` and group them by type.

        Entities are de-duplicated on ``(lower-cased text, label)``, so only
        the first mention of a repeated entity is kept in ``raw_entities``.
        Blank input returns an empty ``NERResult``.
        """
        if not text or not text.strip():
            return NERResult(diseases=[], symptoms=[], medications=[], anatomy=[], raw_entities=[])

        all_entities = []
        seen_spans = set()
        for chunk_text, char_offset in NERExtractor._chunk_text(text, tokenizer, max_length=400, overlap=50):
            for entity in NERExtractor._extract_chunk(chunk_text, model, tokenizer, device, char_offset):
                span_key = (entity["text"].lower(), entity["entity_type"])
                if span_key in seen_spans:
                    continue
                seen_spans.add(span_key)
                all_entities.append(entity)

        grouped = {"diseases": [], "symptoms": [], "medications": [], "anatomy": []}
        for entity in all_entities:
            group_key = NERExtractor.ENTITY_MAP.get(entity["entity_type"].split("-")[-1])
            entity_text = entity["text"].strip()
            if group_key and entity_text and entity_text not in grouped[group_key]:
                grouped[group_key].append(entity_text)

        return NERResult(
            diseases=grouped["diseases"], symptoms=grouped["symptoms"],
            medications=grouped["medications"], anatomy=grouped["anatomy"],
            raw_entities=all_entities
        )

    @staticmethod
    def _extract_chunk(text: str, model, tokenizer, device: str, char_offset: int = 0) -> list[dict]:
        """Run the model on one chunk and merge B-/I- tagged tokens into entities.

        Args:
            char_offset: Added to the chunk-local character offsets so that
                ``start``/``end`` are relative to the caller's full text.

        Returns:
            Entity dicts (``text``, ``entity_type``, ``start``, ``end``,
            ``confidence``) whose confidence exceeds ``MIN_CONFIDENCE``. The
            label and confidence come from the entity's first token.
        """
        encoded = tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512,
            return_offsets_mapping=True, padding=False
        )
        offset_mapping = encoded.pop("offset_mapping")[0]
        inputs = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            logits = model(**inputs).logits[0]
        # One softmax for the whole sequence; its argmax equals the logits' argmax.
        confidences, pred_ids = torch.softmax(logits, dim=-1).max(dim=-1)
        id2label = model.config.id2label

        entities = []
        current_entity = None
        current_start = 0  # chunk-local start of the entity being built
        for pred_id, confidence, (start, end) in zip(
            pred_ids.tolist(), confidences.tolist(), offset_mapping.tolist()
        ):
            label = id2label[pred_id]
            is_special = start == 0 and end == 0  # special tokens map to (0, 0)
            if not is_special and label.startswith("B-"):
                if current_entity:
                    entities.append(current_entity)
                current_start = start
                current_entity = {
                    "text": text[start:end], "entity_type": label,
                    "start": start + char_offset, "end": end + char_offset,
                    "confidence": confidence
                }
            elif not is_special and label.startswith("I-") and current_entity:
                current_entity["end"] = end + char_offset
                # Re-slice the source text instead of concatenating token
                # strings: this keeps the space in multi-word entities
                # ("chest pain") while sub-word pieces stay contiguous.
                current_entity["text"] = " ".join(text[current_start:end].split())
            elif current_entity:
                entities.append(current_entity)
                current_entity = None
        if current_entity:
            entities.append(current_entity)
        return [e for e in entities if e["confidence"] > NERExtractor.MIN_CONFIDENCE]

    @staticmethod
    def _chunk_text(text: str, tokenizer, max_length: int = 400, overlap: int = 50) -> list[tuple[str, int]]:
        """Split ``text`` into word-aligned chunks of at most ``max_length`` tokens.

        Consecutive chunks share roughly ``overlap / 4`` trailing words. Each
        chunk is an exact slice of ``text``, so the returned offset satisfies
        ``text[offset:offset + len(chunk)] == chunk`` even when the input
        contains runs of spaces or newlines.

        Returns:
            ``[(chunk_text, char_offset), ...]``; ``[(text, 0)]`` when the
            text contains no words.
        """
        import re

        spans = [(m.start(), m.end()) for m in re.finditer(r"\S+", text)]
        if not spans:
            return [(text, 0)]
        # Tokenize every word once; the counts are reused for the carry-over.
        token_counts = [
            len(tokenizer(text[s:e], add_special_tokens=False)["input_ids"])
            for s, e in spans
        ]
        # Ceiling division, so overlap=50 carries 13 words like "-overlap//4".
        carry_words = max(0, -(-overlap // 4))

        chunks = []
        first = 0           # index of the first word in the current chunk
        current_length = 0  # token count of the current chunk
        for idx, n_tokens in enumerate(token_counts):
            if current_length + n_tokens > max_length and idx > first:
                chunks.append((text[spans[first][0]:spans[idx - 1][1]], spans[first][0]))
                # Always drop at least one word so chunks keep advancing.
                first = max(first + 1, idx - carry_words)
                current_length = sum(token_counts[first:idx])
            current_length += n_tokens
        chunks.append((text[spans[first][0]:spans[-1][1]], spans[first][0]))
        return chunks
def highlight_entities(text: str, entities: list[dict]) -> str:
    """Render ``text`` as HTML with each entity wrapped in a coloured ``<mark>``.

    Args:
        text: The source text the entity offsets refer to.
        entities: Dicts with ``start``, ``end``, ``text``, ``entity_type`` and
            ``confidence`` keys.

    Returns:
        An HTML fragment. All text taken from the input is HTML-escaped, so
        markup inside a clinical note cannot be injected into the page. An
        entity that overlaps an earlier one is skipped rather than allowed to
        corrupt the surrounding markup.
    """
    import html

    COLORS = {
        "DISEASE": "#FF6B6B", "SYMPTOM": "#FFD93D",
        "MEDICATION": "#6BCB77", "ANATOMY": "#4D96FF"
    }
    pieces = []
    cursor = 0  # index in ``text`` up to which output has been emitted
    for entity in sorted(entities, key=lambda e: e["start"]):
        start, end = entity["start"], entity["end"]
        if start < cursor:
            continue  # overlaps the previous highlight
        entity_type = entity["entity_type"].replace("B-", "").replace("I-", "")
        color = COLORS.get(entity_type, "#cccccc")
        pieces.append(html.escape(text[cursor:start], quote=False))
        pieces.append(
            f'<mark style="background:{color};padding:2px 4px;border-radius:3px;'
            f'font-size:0.85em" title="{html.escape(entity_type)} ({entity["confidence"]:.0%})">'
            f'{html.escape(entity["text"], quote=False)}</mark>'
        )
        cursor = max(start, end)
    pieces.append(html.escape(text[cursor:], quote=False))
    return "".join(pieces)

backend/ml/nlp/whisper.py CHANGED Viewed

	@@ -0,0 +1,70 @@

+import subprocess
+import json
+import numpy as np
+from pathlib import Path
+from backend.core.exceptions import InvalidFileError, ValidationError, InferenceError
class WhisperTranscriber:
    """Transcribes short audio clips with a Whisper model, converting via ffmpeg."""

    SUPPORTED_INPUT_FORMATS = {".wav", ".mp3", ".webm", ".ogg", ".m4a"}
    MAX_DURATION_SECONDS = 60

    @staticmethod
    def transcribe(audio_file_path: Path, model, language: str = "en") -> dict:
        """Transcribe an audio file.

        Non-WAV input is first converted to a temporary WAV next to the
        source file; that temporary file is always removed.

        Returns:
            Dict with ``text``, ``language``, ``confidence`` (exp of the mean
            segment ``avg_logprob``, clamped to [0, 1]), ``duration_seconds``
            and ``segments``.

        Raises:
            InvalidFileError: the file does not exist.
            ValidationError: the clip is longer than ``MAX_DURATION_SECONDS``.
            InferenceError: the ffmpeg conversion failed.
        """
        audio_path = Path(audio_file_path)
        if not audio_path.exists():
            raise InvalidFileError(f"Audio file not found: {audio_path}")

        wav_path = None
        try:
            process_path = audio_path
            if audio_path.suffix.lower() != ".wav":
                wav_path = audio_path.with_suffix(".converted.wav")
                WhisperTranscriber._convert_to_wav(audio_path, wav_path)
                process_path = wav_path

            duration = WhisperTranscriber._get_duration(process_path)
            limit = WhisperTranscriber.MAX_DURATION_SECONDS
            if duration > limit:
                raise ValidationError(f"Audio too long ({duration:.0f}s). Max {limit}s.")

            result = model.transcribe(
                str(process_path), language=language, verbose=False,
                word_timestamps=True, fp16=False, condition_on_previous_text=False,
                no_speech_threshold=0.6, logprob_threshold=-1.0
            )
            segments = result["segments"]
            avg_logprob = np.mean([s["avg_logprob"] for s in segments]) if segments else -1
            confidence = float(min(1.0, max(0.0, np.exp(avg_logprob))))
            return {
                "text": result["text"].strip(),
                "language": result["language"],
                "confidence": round(confidence, 3),
                "duration_seconds": round(duration, 1),
                "segments": [{"start": s["start"], "end": s["end"], "text": s["text"]} for s in segments]
            }
        finally:
            if wav_path and wav_path.exists():
                wav_path.unlink()

    @staticmethod
    def _convert_to_wav(input_path: Path, output_path: Path):
        """Convert ``input_path`` to 16 kHz mono 16-bit PCM WAV using ffmpeg.

        Raises:
            InferenceError: ffmpeg is missing, timed out (30s) or exited non-zero.
        """
        cmd = [
            "ffmpeg", "-i", str(input_path), "-acodec", "pcm_s16le",
            "-ac", "1", "-ar", "16000", "-y", str(output_path)
        ]
        try:
            # stdin is detached so ffmpeg can never block on the server's stdin.
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=30,
                stdin=subprocess.DEVNULL,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # A missing binary or a hang is reported like any other conversion
            # failure instead of escaping as a raw OSError / TimeoutExpired.
            raise InferenceError(f"Audio conversion failed: {exc}") from exc
        if result.returncode != 0:
            raise InferenceError(f"Audio conversion failed: {result.stderr[:200]}")

    @staticmethod
    def _get_duration(wav_path: Path) -> float:
        """Return the clip duration in seconds as reported by ffprobe.

        Best-effort: returns 0.0 when ffprobe is unavailable, times out, exits
        non-zero, or reports no usable duration. NOTE(review): a 0.0 result
        means the ``MAX_DURATION_SECONDS`` check in ``transcribe`` passes.
        """
        cmd = [
            "ffprobe", "-v", "quiet", "-print_format", "json",
            "-show_format", str(wav_path)
        ]
        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=10,
                stdin=subprocess.DEVNULL,
            )
            if result.returncode != 0:
                return 0.0
            data = json.loads(result.stdout)
            return float(data.get("format", {}).get("duration", 0))
        except (OSError, subprocess.TimeoutExpired, ValueError, TypeError, AttributeError):
            return 0.0

backend/ml/rag/generator.py CHANGED Viewed

	@@ -0,0 +1,53 @@

+import torch
+from threading import Thread
+from transformers import TextIteratorStreamer
class ReportGenerator:
    """Builds a Markdown diagnostic report from the per-modality results."""

    @staticmethod
    def generate(vision, nlp, fusion, model, tokenizer) -> str:
        """Generate the narrative part of the report and wrap it in Markdown.

        Args:
            vision: Object with ``risk_level`` and ``anomaly_score``, or falsy.
            nlp: Object with ``primary_diagnosis`` and ``diagnosis_confidence``, or falsy.
            fusion: Object with ``alignment``, or falsy.
            model: Causal language model exposing ``generate``.
            tokenizer: Tokenizer matching ``model``.

        Returns:
            The formatted report from ``_format_report``.
        """
        parts = ["Generate a medical radiology report based on these findings:"]
        if vision:
            parts.append(f"Imaging: {vision.risk_level} risk, anomaly score {vision.anomaly_score}/100")
        if nlp:
            parts.append(f"Clinical: {nlp.primary_diagnosis}, confidence {nlp.diagnosis_confidence:.0%}")
        if fusion:
            parts.append(f"Image-text alignment: {fusion.alignment}")
        parts.append("Report:")
        prompt = " ".join(parts)

        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        # Inputs must live on the model's device or generate() fails for a
        # model that is not on the CPU.
        device = getattr(model, "device", None)
        if device is not None:
            inputs = inputs.to(device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs, max_new_tokens=300, do_sample=False, num_beams=4,
                early_stopping=True, pad_token_id=tokenizer.eos_token_id, repetition_penalty=1.3
            )
        # Decode only the continuation, skipping the echoed prompt tokens.
        prompt_length = inputs["input_ids"].shape[1]
        generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return ReportGenerator._format_report(generated, vision, nlp)

    @staticmethod
    def _format_report(raw_text: str, vision, nlp) -> str:
        """Assemble the Markdown report; sections for falsy inputs are omitted."""
        sections = ["## AI Diagnostic Report\n"]
        if vision:
            if vision.risk_level == "HIGH":
                risk_emoji = "🔴"
            elif vision.risk_level == "MEDIUM":
                risk_emoji = "🟡"
            else:
                risk_emoji = "🟢"
            sections.append(f"### Imaging Findings\n{risk_emoji} **Risk Level:** {vision.risk_level}  \n**Anomaly Score:** {vision.anomaly_score}/100\n")
        if nlp:
            sections.append(f"### Clinical Assessment\n**Primary Impression:** {nlp.primary_diagnosis}\n")
        sections.append(f"### AI Analysis\n{raw_text.strip()}\n\n### Recommendation\nPlease consult a licensed physician.")
        return "\n".join(sections)
class ChatGenerator:
    """Streams sampled text from a causal language model."""

    @staticmethod
    def generate_stream(prompt: str, model, tokenizer, max_new_tokens: int = 200):
        """Yield decoded text pieces while the model generates in a background thread.

        Args:
            prompt: Prompt text; truncated to 512 tokens.
            model: Model exposing ``generate``.
            tokenizer: Tokenizer matching ``model``.
            max_new_tokens: Upper bound on generated tokens.

        Yields:
            Text fragments as the streamer releases them.

        Raises:
            Exception: any error raised by ``model.generate`` is re-raised
            here once the stream has drained.
        """
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        # Inputs must live on the model's device or generate() fails for a
        # model that is not on the CPU.
        device = getattr(model, "device", None)
        if device is not None:
            inputs = inputs.to(device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = {
            **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
            "do_sample": True, "temperature": 0.7, "top_p": 0.9,
            "repetition_penalty": 1.2, "pad_token_id": tokenizer.eos_token_id
        }
        failures = []

        def run_generation():
            try:
                model.generate(**generation_kwargs)
            except Exception as exc:
                failures.append(exc)
                # Without this the streamer never receives its stop signal and
                # the consuming loop below would block forever.
                streamer.end()

        thread = Thread(target=run_generation, daemon=True)
        thread.start()
        for token in streamer:
            yield token
        thread.join()
        if failures:
            raise failures[0]

backend/ml/rag/retriever.py CHANGED Viewed

	@@ -0,0 +1,53 @@

+import re
+from backend.ml.rag.vectorstore import vector_store
+class MedicalRAG:
+    """Retrieval-Augmented Generation for medical Q&A."""
+    MAX_CONTEXT_TOKENS = 800
+    MAX_HISTORY_TURNS = 6
+    SAFETY_DISCLAIMER = "\n\n*Note: This is AI-generated information for educational purposes only. Please consult a licensed physician for medical advice.*"
+    DRUG_DOSAGE_PATTERNS = [r"\b\d+\s*mg\b", r"how much.*take", r"dosage", r"dose of"]
+    @staticmethod
+    def retrieve(query: str, session_result: dict | None, n_results: int = 5) -> list[dict]:
+        enriched_query = query
+        if session_result:
+            primary_dx = session_result.get("nlp", {}).get("primary_diagnosis", "")
+            symptoms = session_result.get("nlp", {}).get("entities", {}).get("symptoms", [])
+            if primary_dx:
+                enriched_query = f"{query} {primary_dx} {' '.join(symptoms[:3])}"
+        return vector_store.search(enriched_query, n_results=n_results)
+    @staticmethod
+    def build_prompt(query: str, retrieved_chunks: list[dict], chat_history: list[dict], session_result: dict | None) -> str:
+        parts = []
+        if session_result:
+            vision, nlp = session_result.get("vision", {}), session_result.get("nlp", {})
+            if vision or nlp:
+                parts.append("Patient analysis context:")
+                if vision:
+                    parts.append(f"- Imaging: {vision.get('risk_level', 'unknown')} risk (anomaly score: {vision.get('anomaly_score', 'N/A')}/100)")
+                if nlp:
+                    parts.append(f"- Primary impression: {nlp.get('primary_diagnosis', 'unknown')}")
+                parts.append("")
+        if retrieved_chunks:
+            parts.append("Relevant medical information:")
+            for chunk in retrieved_chunks[:3]:
+                parts.append(f"- {chunk['text'][:200]}...")
+            parts.append("")
+        recent_history = chat_history[-MedicalRAG.MAX_HISTORY_TURNS:]
+        for msg in recent_history:
+            role = "Patient" if msg["role"] == "user" else "Assistant"
+            parts.append(f"{role}: {msg['content'][:100]}")
+        parts.append(f"Patient: {query}\nAssistant:")
+        return "\n".join(parts)
+    @staticmethod
+    def is_safe_query(query: str) -> tuple[bool, str | None]:
+        for pattern in MedicalRAG.DRUG_DOSAGE_PATTERNS:
+            if re.search(pattern, query.lower()):
+                return False, "Please consult a physician for specific dosage information."
+        return True, None

backend/ml/rag/vectorstore.py CHANGED Viewed

	@@ -0,0 +1,117 @@

+import json
+import asyncio
+import logging
+from pathlib import Path
+from backend.core.config import settings
+logger = logging.getLogger(__name__)
class MedicalVectorStore:
    """ChromaDB-backed store of PubMed abstract chunks with MiniLM embeddings."""

    COLLECTION_NAME = "pubmed_medical"
    CHUNK_SIZE = 256      # words per chunk
    CHUNK_OVERLAP = 32    # words shared by consecutive chunks
    BATCH_SIZE = 100      # chunks embedded and inserted per call

    def __init__(self):
        self._client = None
        self._collection = None
        self._embedder = None

    async def initialize(self):
        """Create the Chroma client and collection, ingesting PubMed data if empty.

        Production uses an in-memory (ephemeral) client; other environments
        persist under ``data/chromadb``.
        """
        import chromadb

        if settings.ENVIRONMENT == "production":
            self._client = chromadb.EphemeralClient()
        else:
            if hasattr(settings, 'BASE_DIR'):
                persist_dir = settings.BASE_DIR / "data" / "chromadb"
            else:
                persist_dir = Path("data/chromadb")
            self._client = chromadb.PersistentClient(path=str(persist_dir))
        self._collection = self._client.get_or_create_collection(
            name=self.COLLECTION_NAME, metadata={"hnsw:space": "cosine"}
        )
        doc_count = self._collection.count()
        logger.info(f"ChromaDB initialized. Documents: {doc_count}")
        if doc_count == 0:
            logger.info("ChromaDB is empty. Starting PubMed ingestion...")
            # Ingestion is blocking, so keep it off the event loop.
            await asyncio.to_thread(self.ingest_from_json, Path("data/pubmed_raw.json"))

    def ingest_from_json(self, json_path: Path):
        """Chunk, embed and store the PubMed records found in ``json_path``.

        Records without an abstract or a ``pmid`` are skipped, as are chunk
        IDs that were already queued (duplicate PMIDs in the input). A missing
        file is logged and ignored.
        """
        if not json_path.exists():
            logger.warning(f"PubMed data file not found: {json_path}. RAG will be limited.")
            return
        records = json.loads(json_path.read_text(encoding="utf-8"))
        logger.info(f"Ingesting {len(records)} PubMed records into ChromaDB...")

        all_chunks, all_ids, all_metadatas = [], [], []
        queued_ids = set()
        for record in records:
            pmid = record.get("pmid")
            if not record.get("abstract") or pmid is None:
                continue
            title = record.get("title") or ""
            text = f"{title}. {record['abstract']}"
            for i, chunk in enumerate(self._chunk_text(text, self.CHUNK_SIZE, self.CHUNK_OVERLAP)):
                chunk_id = f"{pmid}_chunk_{i}"
                if chunk_id in queued_ids:
                    continue
                queued_ids.add(chunk_id)
                all_chunks.append(chunk)
                all_ids.append(chunk_id)
                all_metadatas.append({
                    "pmid": str(pmid),
                    "title": title[:200],
                    "disease_category": record.get("category", "general"),
                    "year": str(record.get("year", ""))
                })

        embedder = self._get_embedder()
        for i in range(0, len(all_chunks), self.BATCH_SIZE):
            batch = slice(i, i + self.BATCH_SIZE)
            embeddings = embedder.encode(all_chunks[batch], show_progress_bar=False).tolist()
            self._collection.add(
                documents=all_chunks[batch], embeddings=embeddings,
                ids=all_ids[batch], metadatas=all_metadatas[batch]
            )
        logger.info(f"Ingestion complete. Total documents: {self._collection.count()}")

    def search(self, query: str, n_results: int = 5, disease_filter: str = None) -> list[dict]:
        """Return up to ``n_results`` chunks most similar to ``query``.

        Args:
            disease_filter: When given, restricts results to that
                ``disease_category`` metadata value.

        Returns:
            Dicts with ``text``, ``pmid``, ``title``, ``disease_category``,
            ``year`` and ``relevance_score`` (1 - distance, floored at 0).
            Empty when the store is uninitialised or holds no documents.
        """
        if self._collection is None:
            return []
        # Never request more neighbours than there are stored documents.
        limit = min(n_results, self._collection.count())
        if limit <= 0:
            return []
        embedder = self._get_embedder()
        query_embedding = embedder.encode([query])[0].tolist()
        where_filter = {"disease_category": disease_filter} if disease_filter else None
        results = self._collection.query(
            query_embeddings=[query_embedding], n_results=limit,
            where=where_filter, include=["documents", "metadatas", "distances"]
        )
        output = []
        if results["documents"]:
            for doc, meta, dist in zip(results["documents"][0], results["metadatas"][0], results["distances"][0]):
                relevance = float(1 - dist)
                output.append({
                    "text": doc, "pmid": meta.get("pmid", ""), "title": meta.get("title", ""),
                    "disease_category": meta.get("disease_category", ""), "year": meta.get("year", ""),
                    "relevance_score": round(max(0, relevance), 3)
                })
        return output

    def _chunk_text(self, text: str, chunk_size: int, overlap: int) -> list[str]:
        """Split ``text`` into windows of ``chunk_size`` words sharing ``overlap`` words.

        The walk stops as soon as a window reaches the last word. Stepping
        back by ``overlap`` after the final window, as the previous loop did,
        never moves past the end of the text and so never terminates.
        """
        words = text.split()
        chunk_size = max(1, chunk_size)
        # Guarantee forward progress even if overlap >= chunk_size.
        step = max(1, chunk_size - overlap)
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + chunk_size]))
            if start + chunk_size >= len(words):
                break
        return chunks

    def _get_embedder(self):
        """Load the MiniLM sentence-transformer on first use and cache it."""
        if self._embedder is None:
            from sentence_transformers import SentenceTransformer
            cache_dir = str(settings.MODEL_CACHE_DIR / "minilm")
            self._embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", cache_folder=cache_dir)
        return self._embedder


vector_store = MedicalVectorStore()

frontend/components/analysis/VoiceInput.jsx ADDED Viewed

	@@ -0,0 +1,138 @@

+'use client'
+import { useState, useRef } from 'react'
+import { apiClient } from '@/lib/api/client'
+export default function VoiceInput({ onTranscribed }) {
+    const [isRecording, setIsRecording] = useState(false)
+    const [recordingDuration, setRecordingDuration] = useState(0)
+    const [isTranscribing, setIsTranscribing] = useState(false)
+    const [transcribedText, setTranscribedText] = useState("")
+    const [confidence, setConfidence] = useState(null)
+    const [error, setError] = useState(null)
+    const [audioBlob, setAudioBlob] = useState(null)
+    const mediaRecorderRef = useRef(null)
+    const chunksRef = useRef([])
+    const timerRef = useRef(null)
+    const startRecording = async () => {
+        try {
+            const stream = await navigator.mediaDevices.getUserMedia({ audio: true })
+            const mediaRecorder = new MediaRecorder(stream, { mimeType: "audio/webm" })
+            mediaRecorderRef.current = mediaRecorder
+            chunksRef.current = []
+            mediaRecorder.ondataavailable = e => chunksRef.current.push(e.data)
+            mediaRecorder.onstop = () => {
+                const blob = new Blob(chunksRef.current, { type: "audio/webm" })
+                setAudioBlob(blob)
+                sendForTranscription(blob)
+                stream.getTracks().forEach(track => track.stop())
+            }
+            mediaRecorder.start(1000)
+            setIsRecording(true)
+            setRecordingDuration(0)
+            setError(null)
+            timerRef.current = setInterval(() => {
+                setRecordingDuration(prev => {
+                    if (prev >= 59) {
+                        stopRecording()
+                        return 60
+                    }
+                    return prev + 1
+                })
+            }, 1000)
+        } catch (err) {
+            setError("Microphone access denied or unavailable.")
+        }
+    }
+    const stopRecording = () => {
+        if (mediaRecorderRef.current && isRecording) {
+            mediaRecorderRef.current.stop()
+            setIsRecording(false)
+            clearInterval(timerRef.current)
+        }
+    }
+    const sendForTranscription = async (blob) => {
+        setIsTranscribing(true)
+        const formData = new FormData()
+        formData.append("audio", blob, "recording.webm")
+        try {
+            const response = await apiClient.post('/api/v1/analyze/transcribe', formData, {
+                headers: { 'Content-Type': 'multipart/form-data' }
+            })
+            setTranscribedText(response.data.text)
+            setConfidence(response.data.confidence)
+        } catch (err) {
+            setError("Transcription failed. Please try again.")
+        } finally {
+            setIsTranscribing(false)
+        }
+    }
+    const handleReset = () => {
+        setTranscribedText("")
+        setConfidence(null)
+        setAudioBlob(null)
+        setRecordingDuration(0)
+    }
+    return (
+        <div className="p-4 border rounded-lg bg-gray-50">
+            {!transcribedText && !isTranscribing && (
+                <div className="flex flex-col items-center">
+                    <button
+                        onClick={isRecording ? stopRecording : startRecording}
+                        className={`w-16 h-16 rounded-full flex items-center justify-center transition-all ${
+                            isRecording ? "bg-red-500 animate-pulse" : "bg-teal-500 hover:bg-teal-600"
+                        }`}
+                    >
+                        <svg className="w-8 h-8 text-white" fill="none" stroke="currentColor" viewBox="0 0 24 24">
+                            <path strokeLinecap="round" strokeLinejoin="round" strokeWidth="2" d="M19 11a7 7 0 01-7 7m0 0a7 7 0 01-7-7m7 7v4m0 0H8m4 0h4m-4-8a3 3 0 01-3-3V5a3 3 0 116 0v6a3 3 0 01-3 3z"></path>
+                        </svg>
+                    </button>
+                    <p className={`mt-2 font-mono ${recordingDuration > 50 ? 'text-red-500' : 'text-gray-600'}`}>
+                        {Math.floor(recordingDuration / 60)}:{(recordingDuration % 60).toString().padStart(2, '0')} / 1:00
+                    </p>
+                    {error && <p className="mt-2 text-sm text-red-500">{error}</p>}
+                </div>
+            )}
+            {isTranscribing && (
+                <div className="flex flex-col items-center py-4">
+                    <div className="w-6 h-6 border-2 border-teal-500 border-t-transparent rounded-full animate-spin" />
+                    <p className="mt-2 text-sm text-gray-600">Transcribing audio...</p>
+                </div>
+            )}
+            {transcribedText && !isTranscribing && (
+                <div className="space-y-4">
+                    <div className="p-3 bg-white border rounded">
+                        <p className="text-gray-800 animate-[typewriter_0.5s_steps(40,end)] overflow-hidden break-words">
+                            "{transcribedText}"
+                        </p>
+                        <div className="flex items-center mt-2 text-xs">
+                            <span className={confidence > 0.8 ? "text-green-600" : "text-yellow-600"}>
+                                {confidence > 0.8 ? "✅" : "⚠️"} Confidence: {(confidence * 100).toFixed(0)}%
+                            </span>
+                        </div>
+                    </div>
+                    <div className="flex space-x-2">
+                        <button onClick={() => onTranscribed(transcribedText)} className="px-4 py-2 text-white bg-teal-600 rounded hover:bg-teal-700">
+                            Use This Text
+                        </button>
+                        <button onClick={handleReset} className="px-4 py-2 text-gray-700 bg-gray-200 rounded hover:bg-gray-300">
+                            Re-record
+                        </button>
+                    </div>
+                </div>
+            )}
+        </div>
+    )
+}

frontend/components/chat/ChatInterface.jsx ADDED Viewed

	@@ -0,0 +1,98 @@

+'use client'
+import { useState } from 'react'
+import { v4 as uuid } from 'uuid'
+import { useAuth } from '@/lib/auth/AuthContext'
+export default function ChatInterface({ sessionId }) {
+    const { accessToken } = useAuth()
+    const [messages, setMessages] = useState([])
+    const [inputText, setInputText] = useState("")
+    const [streamingMessageId, setStreamingMessageId] = useState(null)
+    const sendMessage = async (e) => {
+        e.preventDefault()
+        if(!inputText.trim() || streamingMessageId) return
+        const text = inputText
+        setInputText("")
+        const userMsg = { id: uuid(), role: "user", content: text }
+        setMessages(prev => [...prev, userMsg])
+        const assistantMsgId = uuid()
+        const assistantMsg = { id: assistantMsgId, role: "assistant", content: "", isStreaming: true, sources: [] }
+        setMessages(prev => [...prev, assistantMsg])
+        setStreamingMessageId(assistantMsgId)
+        const API_URL = process.env.NEXT_PUBLIC_API_URL || 'http://localhost:8000'
+        try {
+            const response = await fetch(`${API_URL}/api/v1/chat`, {
+                method: "POST",
+                headers: { "Content-Type": "application/json", "Authorization": `Bearer ${accessToken}` },
+                body: JSON.stringify({ session_id: sessionId, message: text })
+            })
+            const reader = response.body.getReader()
+            const decoder = new TextDecoder()
+            while (true) {
+                const { done, value } = await reader.read()
+                if (done) break
+                const chunk = decoder.decode(value, { stream: true })
+                const lines = chunk.split("\n\n").filter(l => l.startsWith("data: "))
+                for (const line of lines) {
+                    const data = JSON.parse(line.replace("data: ", ""))
+                    if (data.type === "token") {
+                        setMessages(prev => prev.map(m => m.id === assistantMsgId ? { ...m, content: m.content + data.token } : m))
+                    } else if (data.type === "sources") {
+                        setMessages(prev => prev.map(m => m.id === assistantMsgId ? { ...m, sources: data.sources } : m))
+                    } else if (data.type === "done") {
+                        setMessages(prev => prev.map(m => m.id === assistantMsgId ? { ...m, isStreaming: false } : m))
+                        setStreamingMessageId(null)
+                    }
+                }
+            }
+        } catch (error) {
+            setMessages(prev => prev.map(m => m.id === assistantMsgId ? { ...m, content: "Network error occurred.", isStreaming: false } : m))
+            setStreamingMessageId(null)
+        }
+    }
+    return (
+        <div className="flex flex-col h-[600px] border rounded-lg bg-white">
+            <div className="flex-1 p-4 overflow-y-auto space-y-4">
+                {messages.map(m => (
+                    <div key={m.id} className={`flex ${m.role === 'user' ? 'justify-end' : 'justify-start'}`}>
+                        <div className={`max-w-[80%] p-3 rounded-lg ${m.role === 'user' ? 'bg-teal-600 text-white' : 'bg-gray-100 text-gray-800'}`}>
+                            <p>{m.content}</p>
+                            {m.isStreaming && <span className="inline-block w-2 h-4 ml-1 bg-gray-400 animate-pulse" />}
+                            {m.sources?.length > 0 && (
+                                <div className="mt-2 pt-2 border-t border-gray-300 text-xs">
+                                    <p className="font-semibold mb-1">Sources:</p>
+                                    {m.sources.map((s, i) => <p key={i}>• {s.title.substring(0, 50)}...</p>)}
+                                </div>
+                            )}
+                        </div>
+                    </div>
+                ))}
+            </div>
+            <form onSubmit={sendMessage} className="p-3 border-t bg-gray-50 flex items-center">
+                <input
+                    type="text"
+                    value={inputText}
+                    onChange={e => setInputText(e.target.value)}
+                    disabled={!!streamingMessageId}
+                    className="flex-1 p-2 border rounded-l-md focus:outline-none focus:ring-2 focus:ring-teal-500 disabled:opacity-50"
+                    placeholder="Ask about the results..."
+                />
+                <button type="submit" disabled={!!streamingMessageId} className="px-4 py-2 bg-teal-600 text-white rounded-r-md hover:bg-teal-700 disabled:opacity-50">
+                    Send
+                </button>
+            </form>
+        </div>
+    )
+}

training/scripts/finetune_ner.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import logging
+import numpy as np
+from pathlib import Path
+from transformers import (
+    AutoModelForTokenClassification, AutoTokenizer,
+    TrainingArguments, Trainer, DataCollatorForTokenClassification
+)
+from datasets import load_from_disk
+from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Class id -> BIO tag name for the nine token classes used by this script.
+# Each B- tag sits at an odd id with its matching I- tag at the next id.
+id2label = {
+    0: "O", 1: "B-DISEASE", 2: "I-DISEASE",
+    3: "B-SYMPTOM", 4: "I-SYMPTOM",
+    5: "B-MEDICATION", 6: "I-MEDICATION",
+    7: "B-ANATOMY", 8: "I-ANATOMY"
+}
+# Inverse lookup: tag name -> class id.
+label2id = {v: k for k, v in id2label.items()}
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=2)
+    true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
+    true_predictions = [
+        [id2label[p] for p, l in zip(pred, label) if l != -100]
+        for pred, label in zip(predictions, labels)
+    ]
+    return {
+        "precision": precision_score(true_labels, true_predictions),
+        "recall": recall_score(true_labels, true_predictions),
+        "f1": f1_score(true_labels, true_predictions),
+    }
+def main():
+    dataset = load_from_disk("data/processed/ner_dataset")
+    model_id = "dmis-lab/biobert-base-cased-v1.2"
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForTokenClassification.from_pretrained(
+        model_id, num_labels=9, id2label=id2label, label2id=label2id
+    )
+    training_args = TrainingArguments(
+        output_dir="models/biobert_ner",
+        per_device_train_batch_size=8,
+        per_device_eval_batch_size=8,
+        gradient_accumulation_steps=4,
+        fp16=True,
+        gradient_checkpointing=True,
+        dataloader_num_workers=0,  # Windows fix
+        dataloader_pin_memory=False, # Windows fix
+        num_train_epochs=10,
+        warmup_ratio=0.1,
+        learning_rate=2e-5,
+        weight_decay=0.01,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        metric_for_best_model="f1",
+        greater_is_better=True,
+        save_total_limit=3,
+        logging_steps=50,
+        report_to="none",
+    )
+    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset["validation"],
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics
+    )
+    # Auto-resume logic
+    checkpoint_dir = Path("models/biobert_ner")
+    latest_checkpoint = None
+    if checkpoint_dir.exists():
+        checkpoints = list(checkpoint_dir.glob("checkpoint-*"))
+        if checkpoints:
+            latest_checkpoint = str(max(checkpoints, key=lambda p: int(p.name.split("-")[1])))
+            logger.info(f"📂 Resuming from {latest_checkpoint}")
+    trainer.train(resume_from_checkpoint=latest_checkpoint)
+    logger.info("Evaluating on test set...")
+    test_results = trainer.predict(dataset["test"])
+    print(classification_report(*Trainer._get_labels_and_preds(test_results.predictions, test_results.label_ids)))
+    final_dir = "models/biobert_ner_finetuned"
+    trainer.save_model(final_dir)
+    tokenizer.save_pretrained(final_dir)
+    logger.info(f"✅ Model saved to {final_dir}")
+if __name__ == "__main__":
+    main()

training/scripts/ingest_pubmed.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import time
+import json
+import logging
+import requests
+import xml.etree.ElementTree as ET
+from pathlib import Path
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Category key -> PubMed search term. main() fetches each entry and stores the
+# key in every resulting record's "category" field.
+SEARCH_QUERIES = {
+    "pneumonia": "chest X-ray pneumonia findings radiology",
+    "pleural_effusion": "pleural effusion chest radiograph diagnosis",
+    "cardiomegaly": "cardiomegaly cardiac enlargement chest X-ray",
+    "tuberculosis": "tuberculosis pulmonary chest radiograph findings",
+    "normal_chest": "normal chest radiograph no significant finding"
+}
+def fetch_pubmed_abstracts(query: str, category: str, max_results=200) -> list[dict]:
+    logger.info(f"Searching PubMed for: {category}")
+    search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+    params = {"db": "pubmed", "term": query, "retmax": max_results, "retmode": "json", "usehistory": "y"}
+    res = requests.get(search_url, params=params, timeout=30)
+    res.raise_for_status()
+    pmids = res.json().get("esearchresult", {}).get("idlist", [])
+    results = []
+    fetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+    for i in range(0, len(pmids), 20):
+        batch = pmids[i:i+20]
+        f_params = {"db": "pubmed", "id": ",".join(batch), "rettype": "abstract", "retmode": "xml"}
+        f_res = requests.get(fetch_url, params=f_params, timeout=30)
+        try:
+            tree = ET.fromstring(f_res.content)
+            for article in tree.findall(".//PubmedArticle"):
+                pmid = article.findtext(".//PMID")
+                title = article.findtext(".//ArticleTitle")
+                abstract = article.findtext(".//AbstractText")
+                year = article.findtext(".//PubDate/Year")
+                if abstract and title:
+                    results.append({
+                        "pmid": pmid, "title": title, "abstract": abstract,
+                        "year": year, "category": category
+                    })
+        except ET.ParseError:
+            logger.error("XML parse error on a batch, skipping.")
+        time.sleep(0.4) # Respect 3 req/sec rate limit
+    return results
+def main():
+    out_file = Path("data/pubmed_raw.json")
+    out_file.parent.mkdir(parents=True, exist_ok=True)
+    all_data = []
+    completed_cats = set()
+    if out_file.exists():
+        all_data = json.loads(out_file.read_text())
+        completed_cats = {d["category"] for d in all_data}
+        logger.info(f"Resuming... found {len(all_data)} existing records.")
+    for cat, query in SEARCH_QUERIES.items():
+        if cat in completed_cats:
+            continue
+        data = fetch_pubmed_abstracts(query, cat)
+        all_data.extend(data)
+        out_file.write_text(json.dumps(all_data, indent=2))
+        logger.info(f"Saved {len(data)} abstracts for {cat}.")
+    logger.info(f"✅ PubMed ingestion complete. Total records: {len(all_data)}")
+if __name__ == "__main__":
+    main()

training/scripts/prepare_ner_data.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import json
+import logging
+from pathlib import Path
+from datasets import load_dataset
+from transformers import AutoTokenizer
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+SYMPTOMS_LIST = [
+    "fever", "cough", "pain", "fatigue", "nausea", "vomiting",
+    "headache", "dizziness", "shortness of breath", "chest pain",
+    "dyspnea", "tachycardia", "edema", "rash"
+]
+def add_symptom_labels(example):
+    tokens = example["tokens"]
+    tags = example["ner_tags"]
+    for i, token in enumerate(tokens):
+        if tags[i] == 0:  # If currently 'O'
+            if token.lower() in SYMPTOMS_LIST:
+                # 3 is B-SYMPTOM, 4 is I-SYMPTOM in our mapping
+                tags[i] = 3
+    return {"tokens": tokens, "ner_tags": tags}
+def tokenize_and_align_labels(examples, tokenizer):
+    tokenized = tokenizer(
+        examples["tokens"],
+        truncation=True,
+        max_length=512,
+        is_split_into_words=True,  # CRITICAL: Tells tokenizer input is already word-split
+        padding=False
+    )
+    all_labels = []
+    for i, labels in enumerate(examples["ner_tags"]):
+        word_ids = tokenized.word_ids(batch_index=i)
+        aligned = []
+        prev_word_id = None
+        for word_id in word_ids:
+            if word_id is None:
+                aligned.append(-100)  # Special tokens (CLS, SEP) ignored in loss
+            elif word_id != prev_word_id:
+                aligned.append(labels[word_id])  # First subword gets the actual label
+            else:
+                label = labels[word_id]
+                # If B- label (odd numbers in NCBI), convert to I- label (even) for subwords
+                if label % 2 == 1:
+                    label += 1
+                aligned.append(label)
+            prev_word_id = word_id
+        all_labels.append(aligned)
+    tokenized["labels"] = all_labels
+    return tokenized
+def main():
+    out_dir = Path("data/processed/ner_dataset")
+    out_dir.parent.mkdir(parents=True, exist_ok=True)
+    logger.info("Loading NCBI Disease dataset...")
+    dataset = load_dataset("ncbi_disease")
+    logger.info("Augmenting symptom labels...")
+    dataset = dataset.map(add_symptom_labels)
+    logger.info("Tokenizing and aligning BIO tags...")
+    tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
+    tokenized_dataset = dataset.map(
+        lambda x: tokenize_and_align_labels(x, tokenizer),
+        batched=True,
+        remove_columns=dataset["train"].column_names
+    )
+    tokenized_dataset.save_to_disk(str(out_dir))
+    logger.info(f"✅ NER dataset prepared and saved to {out_dir}")
+if __name__ == "__main__":
+    main()