import re
from typing import Any, Dict, Iterable, List, Optional, Tuple

"""
entity_extraction.py

What it does:
Extracts medical entities (Drugs, Dosages, Diseases, Symptoms) and relationships from text chunks.

How it works:
It uses rule-based pattern matching (Regex) and predefined dictionaries to identify key medical terms
in the text. For a production system, this would be augmented with an NLP library like spaCy (en_core_sci_sm)
or an LLM-based extraction. For Version 1, we use robust regex patterns and keyword matching.

Key algorithms/formulas:
- Regex for Dosages (case-insensitive): r'\\b(\\d+(?:\\.\\d+)?\\s*(?:mg|g|mcg|ml)(?:/(?:m2|kg|ml))?)\\b'
- Relationship heuristic: Co-occurrence within the same chunk implies a potential relationship.

Example input:
- "Pemetrexed 500mg/m2 IV is indicated for NSCLC. It may cause nausea."

Example output:
- Entities (the administration route "IV" is not captured by the dosage regex):
  [{"name": "Pemetrexed", "type": "drug"}, {"name": "500mg/m2", "type": "dosage"}, ...]
- Relationships:
  [{"source_name": "Pemetrexed", "target_name": "NSCLC", "type": "treats", "confidence": 0.8}, ...]
"""

# Demonstration vocabularies (a production system would load these from a medical ontology).
# Every term is stored in lowercase because lookups run against lowercased text.
KNOWN_DRUGS = {
    "pemetrexed",
    "cisplatin",
    "carboplatin",
    "pembrolizumab",
    "nivolumab",
    "docetaxel",
}
KNOWN_DISEASES = {
    "nsclc",
    "non-small cell lung cancer",
    "lung cancer",
    "squamous cell carcinoma",
}
KNOWN_SYMPTOMS = {
    "nausea",
    "fatigue",
    "vomiting",
    "anemia",
    "neutropenia",
    "rash",
}

# Matches dosages such as "500 mg", "500mg/m2", "10 mg/kg". Compiled once at import time
# instead of on every call.
_DOSAGE_PATTERN = re.compile(
    r'\b(\d+(?:\.\d+)?\s*(?:mg|g|mcg|ml)(?:/(?:m2|kg|ml))?)\b', re.IGNORECASE
)


def _match_terms(text_lower: str, vocabulary: Iterable[str]) -> List[str]:
    """
    Returns the vocabulary terms that occur in `text_lower` as whole words, sorted alphabetically.
    """
    found = []
    # Sorting makes the result independent of set iteration order, which varies between
    # interpreter runs because of string hash randomization.
    for term in sorted({t.lower() for t in vocabulary if t}):
        # Look-arounds instead of a bare substring test so "rash" does not fire inside "crash";
        # the optional "s"/"es" keeps simple plurals ("rashes", "lung cancers") matching.
        pattern = rf"(?<!\w){re.escape(term)}(?:es|s)?(?!\w)"
        if re.search(pattern, text_lower):
            found.append(term)
    return found


def extract_entities_from_text(
    text: str,
    *,
    drugs: Optional[Iterable[str]] = None,
    diseases: Optional[Iterable[str]] = None,
    symptoms: Optional[Iterable[str]] = None,
) -> List[Dict[str, Any]]:
    """
    Extracts entities from a given text chunk.

    Args:
        text: The chunk to scan. Empty or None input yields an empty list.
        drugs: Optional drug vocabulary; defaults to KNOWN_DRUGS.
        diseases: Optional disease vocabulary; defaults to KNOWN_DISEASES.
        symptoms: Optional symptom vocabulary; defaults to KNOWN_SYMPTOMS.

    Returns:
        A list of {"name": ..., "type": ...} dicts ordered as drugs, diseases, symptoms
        (each group alphabetical by term) followed by dosages in order of first appearance.
        Each vocabulary term and each distinct dosage string is reported at most once.
    """
    if not text:
        return []

    text_lower = text.lower()
    drug_terms = KNOWN_DRUGS if drugs is None else drugs
    disease_terms = KNOWN_DISEASES if diseases is None else diseases
    symptom_terms = KNOWN_SYMPTOMS if symptoms is None else symptoms

    entities: List[Dict[str, Any]] = []

    # 1. Extract Drugs
    for term in _match_terms(text_lower, drug_terms):
        entities.append({"name": term.title(), "type": "drug"})

    # 2. Extract Diseases (the acronym "nsclc" is upper-cased rather than title-cased)
    for term in _match_terms(text_lower, disease_terms):
        name = term.upper() if term == "nsclc" else term.title()
        entities.append({"name": name, "type": "disease"})

    # 3. Extract Symptoms
    for term in _match_terms(text_lower, symptom_terms):
        entities.append({"name": term.title(), "type": "symptom"})

    # 4. Extract Dosages (Regex)
    # dict.fromkeys drops repeated mentions while keeping first-appearance order, so a dosage
    # stated twice does not produce duplicate entities (and duplicate relationships downstream).
    for dosage in dict.fromkeys(_DOSAGE_PATTERN.findall(text)):
        entities.append({"name": dosage, "type": "dosage"})

    return entities

def extract_relationships(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Derives relationships from entities that co-occur in the same context (chunk).

    Three heuristics are applied, in this order:
      1. every drug "treats" every disease (confidence 0.8);
      2. every drug "has_side_effect" every symptom (confidence 0.7);
      3. every dosage "applies_to" the drug, only when exactly one drug is present
         (confidence 0.9).

    Each relationship is a dict with keys "source_name", "target_name", "type", "confidence".
    """
    def of_type(kind: str) -> List[Dict[str, Any]]:
        return [item for item in entities if item["type"] == kind]

    drug_list = of_type("drug")
    disease_list = of_type("disease")
    symptom_list = of_type("symptom")
    dosage_list = of_type("dosage")

    # Heuristic 1: Drug treats Disease
    links = [
        {
            "source_name": med["name"],
            "target_name": illness["name"],
            "type": "treats",
            "confidence": 0.8,  # Default heuristic confidence
        }
        for med in drug_list
        for illness in disease_list
    ]

    # Heuristic 2: Drug has Side Effect Symptom
    links += [
        {
            "source_name": med["name"],
            "target_name": effect["name"],
            "type": "has_side_effect",
            "confidence": 0.7,
        }
        for med in drug_list
        for effect in symptom_list
    ]

    # Heuristic 3: Dosage applies to Drug (only unambiguous with a single drug in the chunk)
    if len(drug_list) == 1:
        links.extend(
            {
                "source_name": dose["name"],
                "target_name": drug_list[0]["name"],
                "type": "applies_to",
                "confidence": 0.9,
            }
            for dose in dosage_list
        )

    return links

def process_chunk_for_graph(chunk_text: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Runs entity extraction on a chunk, then derives relationships from those entities.

    Returns a tuple of (entities, relationships).
    """
    found = extract_entities_from_text(chunk_text)
    return found, extract_relationships(found)

if __name__ == "__main__":
    # Demo run: process one sample sentence and print both result lists.
    sample = "Pemetrexed 500mg/m2 is used for the treatment of NSCLC. It frequently causes nausea."
    for label, items in zip(("Entities:", "Relationships:"), process_chunk_for_graph(sample)):
        print(label, items)