"""Helpers for checking that an uploaded document belongs to a given patient."""

import logging
import os
import re

from flask import jsonify

# Questions put to the QA model, tried in order until one yields an answer.
_NAME_QUESTIONS = (
    "What is the patient's name?",
    "Who is the patient?",
    "What is the name of the person this document is about?",
    "What is the full name of the patient?",
    "What is the name of the person in this medical document?",
)

# Titles and label words that carry no identity information inside a name.
_NAME_NOISE_RE = re.compile(
    r"\b(mr|mrs|ms|dr|prof|sir|madam|patient|name|of)\b\s*", re.IGNORECASE
)

# Leading title in a filename stem (applied after separators become spaces).
_FILENAME_TITLE_RE = re.compile(r"^(mr|mrs|ms|dr|prof|sir|madam)\b\s*", re.IGNORECASE)

# Labelled-name patterns, MOST specific first.  Every specific label ends in
# "Patient:" or "Name:", so trying the generic labels first would shadow them
# and could latch onto an unrelated "... Name:" earlier in the text.
# The captured value is limited to letters and horizontal whitespace so it
# cannot run across a line break into the next label.
_NAME_LABEL_PATTERNS = tuple(
    re.compile(label + r":\s*([A-Za-z][A-Za-z \t]*)", re.IGNORECASE)
    for label in (
        r"Patient Name is",
        r"Name of the Patient",
        r"Name of Patient",
        r"Patient's Name",
        r"Patient Name",
        r"Patient",
        r"Name",
    )
)


def clean_result(value):
    """Return *value* reduced to tidy ASCII text, or "Not Available" if blank.

    Runs of ``-``, ``_`` and ``:`` and runs of non-ASCII characters are each
    replaced by a space; whitespace is then collapsed and trimmed.

    Args:
        value: The string to clean. ``None`` and ``""`` are accepted.

    Returns:
        The cleaned string, or ``"Not Available"`` when nothing is left.
    """
    if not value:
        return "Not Available"
    value = re.sub(r"[-_:]+", " ", value)
    value = re.sub(r"[^\x00-\x7F]+", " ", value)
    # Collapse whitespace last: the two substitutions above introduce spaces.
    value = re.sub(r"\s+", " ", value).strip()
    return value or "Not Available"


def extract_patient_name(text, qa_pipeline):
    """Ask a question-answering callable for the patient's name.

    Args:
        text: Document text used as the QA context.
        qa_pipeline: Callable invoked as ``qa_pipeline(question=..., context=...)``
            and expected to return a mapping with an ``"answer"`` entry.

    Returns:
        The first stripped answer longer than one character, or ``None`` when
        either argument is falsy, no question yields such an answer, or the
        pipeline raises (the error is logged, not propagated).
    """
    if not text or not qa_pipeline:
        return None

    try:
        for question in _NAME_QUESTIONS:
            result = qa_pipeline(question=question, context=text)
            # Tolerate an explicit None answer instead of failing on .strip().
            answer = (result.get("answer") or "").strip()
            if len(answer) > 1:  # single characters are not meaningful names
                return answer
    except Exception as e:  # best-effort: callers fall back to other sources
        logging.error("Error extracting patient name: %s", e)
    return None


def normalize_name(name):
    """Clean and normalize a name for comparison.

    Lowercases, removes titles and label words (mr, dr, patient, name, of, ...)
    wherever they appear as whole words, drops punctuation and collapses
    whitespace.

    Args:
        name: Raw name; may be ``None`` or empty.

    Returns:
        The normalized name, or ``""`` if nothing usable remains.
    """
    if not name:
        return ""
    name = name.lower().strip()
    name = _NAME_NOISE_RE.sub("", name)
    name = re.sub(r"[^\w\s]", "", name)
    return re.sub(r"\s+", " ", name).strip()


def extract_name_from_filename(filename):
    """Extract a potential patient name from a filename.

    Directory components and the extension are dropped, every run of
    non-alphanumeric characters (including underscores) becomes a single
    space, and one leading title such as "mr" or "dr" is removed.

    Args:
        filename: File name or path; may be ``None`` or empty.

    Returns:
        The candidate name, or ``None`` if nothing usable remains.
    """
    if not filename:
        return None
    stem = os.path.splitext(os.path.basename(filename))[0]
    # "_" counts as a word character in regexes, so it must be listed
    # explicitly or "john_doe" stays a single token and "mr_" is never seen
    # as a title.
    stem = re.sub(r"[\W_]+", " ", stem).strip()
    stem = _FILENAME_TITLE_RE.sub("", stem)
    return stem.strip() or None


def _candidate_names(extracted_text, filename, qa_pipeline):
    """Yield raw name candidates lazily: QA answer, labelled text, filename."""
    yield extract_patient_name(extracted_text, qa_pipeline)
    for pattern in _NAME_LABEL_PATTERNS:
        match = pattern.search(extracted_text)
        if match:
            yield match.group(1).strip()
    yield extract_name_from_filename(filename)


def validate_patient_name(extracted_text, patient_name, filename, qa_pipeline):
    """Validate that the name found in a document matches the registered patient.

    The document's name is taken from the first source that produces a
    non-empty normalized name: the QA pipeline, then labelled patterns in the
    text ("Patient Name: ..."), then the filename.

    Args:
        extracted_text: Text extracted from the document.
        patient_name: Registered patient name to validate against.
        filename: Name of the uploaded file (fallback source, and used in
            error messages).
        qa_pipeline: Question-answering callable passed to
            ``extract_patient_name``.

    Returns:
        ``None`` when the names match; otherwise a ``(response, 400)`` tuple
        whose response is ``jsonify({"error": ...})``.
    """
    if not extracted_text or not patient_name:
        return (
            jsonify({"error": "Missing required information for validation"}),
            400,
        )

    normalized_patient_name = normalize_name(patient_name)
    if not normalized_patient_name:
        # A registered name made only of titles/punctuation would otherwise
        # match every document, because "" is a substring of any string.
        return (
            jsonify({"error": "Missing required information for validation"}),
            400,
        )

    detected_name = None
    normalized_detected_name = ""
    for candidate in _candidate_names(extracted_text, filename, qa_pipeline):
        normalized_candidate = normalize_name(candidate)
        # Skip candidates that normalize to nothing (e.g. an answer of
        # "Patient Name"); accepting them would make every check pass.
        if normalized_candidate:
            detected_name = candidate
            normalized_detected_name = normalized_candidate
            break

    if not normalized_detected_name:
        return (
            jsonify({"error": f"Could not determine patient name from {filename}"}),
            400,
        )

    # Character-level containment, so a partial name on either side matches.
    if (
        normalized_detected_name in normalized_patient_name
        or normalized_patient_name in normalized_detected_name
    ):
        return None

    # Otherwise accept when at least half of the shorter name's words are shared.
    detected_parts = set(normalized_detected_name.split())
    patient_parts = set(normalized_patient_name.split())
    shared = detected_parts & patient_parts
    if len(shared) >= min(len(detected_parts), len(patient_parts)) * 0.5:
        return None

    return (
        jsonify(
            {
                "error": f"Document '{filename}' does not belong to {patient_name}. Found: {detected_name}"
            }
        ),
        400,
    )