r"""
entity_extraction.py

What it does:
    Extracts medical entities (drugs, dosages, diseases, symptoms) and
    relationships from text chunks.

How it works:
    It uses rule-based pattern matching (regex) and predefined dictionaries
    to identify key medical terms in the text. For a production system, this
    would be augmented with an NLP library like spaCy (en_core_sci_sm) or an
    LLM-based extraction. For Version 1, we use regex patterns and keyword
    matching.

Key algorithms/formulas:
    - Dictionary terms are matched case-insensitively and must begin at a
      word start, so "crashed" does not yield the symptom "rash" while
      inflected forms such as "rashes" or "fatigued" still match.
    - Regex for dosages (case-insensitive):
      r'\b(\d+(?:\.\d+)?\s*(?:mg|g|mcg|ml)(?:/(?:m2|kg|ml))?)\b'
      Administration routes (PO/IV/SC) are not captured.
    - Relationship heuristic: co-occurrence within the same chunk implies a
      potential relationship.

Example input:
    "Pemetrexed 500mg/m2 IV is indicated for NSCLC. It may cause nausea."

Example output:
    Entities:
        [{"name": "Pemetrexed", "type": "drug"},
         {"name": "NSCLC", "type": "disease"},
         {"name": "Nausea", "type": "symptom"},
         {"name": "500mg/m2", "type": "dosage"}]
    Relationships:
        [{"source_name": "Pemetrexed", "target_name": "NSCLC",
          "type": "treats", "confidence": 0.8}, ...]
"""

import re
from typing import Any, Dict, Iterable, List, Tuple

# Pre-defined dictionaries for demonstration (would be loaded from a medical
# ontology in production). Terms must be lowercase because they are matched
# against lowercased text.
KNOWN_DRUGS = {
    "pemetrexed",
    "cisplatin",
    "carboplatin",
    "pembrolizumab",
    "nivolumab",
    "docetaxel",
}
KNOWN_DISEASES = {
    "nsclc",
    "non-small cell lung cancer",
    "lung cancer",
    "squamous cell carcinoma",
}
KNOWN_SYMPTOMS = {"nausea", "fatigue", "vomiting", "anemia", "neutropenia", "rash"}

# Matches patterns like "500 mg", "500mg/m2", "10 mg/kg". Compiled once at
# import time instead of on every extraction call.
_DOSAGE_PATTERN = re.compile(
    r"\b(\d+(?:\.\d+)?\s*(?:mg|g|mcg|ml)(?:/(?:m2|kg|ml))?)\b",
    re.IGNORECASE,
)


def _find_terms(text_lower: str, vocabulary: Iterable[str]) -> List[str]:
    """Return the vocabulary terms found in ``text_lower``, in sorted order.

    A term only counts when it starts at a word start (no word character
    directly before it). The end is left open on purpose so that inflected
    forms ("rashes", "fatigued") are still recognised.

    The vocabulary is read on every call, so terms added to the module-level
    sets at runtime are picked up. Sorting makes the result independent of
    set iteration order, which varies between interpreter runs.
    """
    return [
        term
        for term in sorted(vocabulary)
        if re.search(r"(?<!\w)" + re.escape(term), text_lower)
    ]


def extract_entities_from_text(text: str) -> List[Dict[str, Any]]:
    """
    Extracts entities from a given text chunk.

    Args:
        text: Raw chunk text. Empty or ``None`` input yields an empty list.

    Returns:
        A list of ``{"name": ..., "type": ...}`` dicts, grouped in the order
        drug, disease, symptom, dosage. Dictionary-based entities are sorted
        alphabetically within their group and title-cased ("NSCLC" is
        upper-cased); dosages keep the text's original spelling, appear in
        first-seen order, and are reported once each.
    """
    if not text:
        return []

    text_lower = text.lower()
    entities: List[Dict[str, Any]] = []

    # 1. Extract Drugs
    entities.extend(
        {"name": drug.title(), "type": "drug"}
        for drug in _find_terms(text_lower, KNOWN_DRUGS)
    )

    # 2. Extract Diseases (the acronym is upper-cased rather than title-cased)
    entities.extend(
        {
            "name": disease.upper() if disease == "nsclc" else disease.title(),
            "type": "disease",
        }
        for disease in _find_terms(text_lower, KNOWN_DISEASES)
    )

    # 3. Extract Symptoms
    entities.extend(
        {"name": symptom.title(), "type": "symptom"}
        for symptom in _find_terms(text_lower, KNOWN_SYMPTOMS)
    )

    # 4. Extract Dosages (regex). dict.fromkeys drops repeated mentions while
    # preserving first-seen order, so a dosage stated twice does not produce
    # duplicate entities (and duplicate relationships downstream).
    entities.extend(
        {"name": dosage, "type": "dosage"}
        for dosage in dict.fromkeys(_DOSAGE_PATTERN.findall(text))
    )

    return entities


def extract_relationships(entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Infers relationships based on extracted entities in the same context (chunk).

    Args:
        entities: Entity dicts carrying at least "name" and "type" keys, as
            produced by ``extract_entities_from_text``.

    Returns:
        A list of relationship dicts with keys "source_name", "target_name",
        "type" and "confidence", ordered as: all "treats", then all
        "has_side_effect", then all "applies_to".
    """
    drugs = [e for e in entities if e["type"] == "drug"]
    diseases = [e for e in entities if e["type"] == "disease"]
    symptoms = [e for e in entities if e["type"] == "symptom"]
    dosages = [e for e in entities if e["type"] == "dosage"]

    relationships: List[Dict[str, Any]] = []

    # Heuristic 1: Drug treats Disease
    for drug in drugs:
        for disease in diseases:
            relationships.append({
                "source_name": drug["name"],
                "target_name": disease["name"],
                "type": "treats",
                "confidence": 0.8,  # Default heuristic confidence
            })

    # Heuristic 2: Drug has Side Effect Symptom
    for drug in drugs:
        for symptom in symptoms:
            relationships.append({
                "source_name": drug["name"],
                "target_name": symptom["name"],
                "type": "has_side_effect",
                "confidence": 0.7,
            })

    # Heuristic 3: Dosage applies to Drug. Only asserted when the chunk names
    # exactly one drug; with several drugs the pairing would be a guess.
    if len(drugs) == 1:
        only_drug = drugs[0]
        for dosage in dosages:
            relationships.append({
                "source_name": dosage["name"],
                "target_name": only_drug["name"],
                "type": "applies_to",
                "confidence": 0.9,
            })

    return relationships


def process_chunk_for_graph(chunk_text: str) -> Tuple[List[Dict], List[Dict]]:
    """
    Processes a chunk to return both entities and relationships.

    Args:
        chunk_text: Raw chunk text.

    Returns:
        A ``(entities, relationships)`` tuple, where the relationships are
        inferred from the entities found in this same chunk.
    """
    entities = extract_entities_from_text(chunk_text)
    relationships = extract_relationships(entities)
    return entities, relationships


if __name__ == "__main__":
    sample = "Pemetrexed 500mg/m2 is used for the treatment of NSCLC. It frequently causes nausea."
    ents, rels = process_chunk_for_graph(sample)
    print("Entities:", ents)
    print("Relationships:", rels)