# Omnibimol-Worker / drug_repurposing_engine.py
# GitHub Actions Deployer
# Automated Worker deployment from GitHub commit b227394289e876f4810cbd73a0669c28442d2304
# 83157b1
# Raw
# History Blame Contribute Delete
# 39.2 kB
"""
Drug Repurposing Engine
Identifies novel therapeutic uses for existing drugs using biological network analysis.
Core Concept:
- Model biomedical ecosystem as network graph: Drugs → Proteins → Pathways → Diseases
- Use network proximity and shortest path analysis to discover repurposing opportunities
"""
import networkx as nx
import pandas as pd
import numpy as np
from typing import Dict, List, Tuple, Optional, Set
import asyncio
import httpx
from collections import defaultdict
import streamlit as st
class DrugRepurposingEngine:
"""
Graph-based drug repurposing engine that analyzes drug-protein-disease networks
to identify novel therapeutic indications for existing drugs.
"""
def __init__(self, api_client, cache_manager):
    """
    Wire up collaborators and start with an empty network and empty lookups.

    Args:
        api_client: Client used elsewhere in this class for UniProt, STRING
            PPI and KEGG pathway lookups.
        cache_manager: Object offering ``get(key)`` / ``set(key, value)``,
            used to memoize remote and curated lookups.
    """
    self.api_client = api_client
    self.cache = cache_manager
    # Undirected network holding drug, protein, pathway and disease nodes.
    self.graph = nx.Graph()
    # Plain lookup tables; keys and values are strings / lists of strings.
    self.drug_to_proteins = dict()  # drug_name -> [uniprot_ids]
    self.protein_to_diseases = dict()  # uniprot_id -> [disease_names]
    self.protein_to_pathways = dict()  # uniprot_id -> [pathway_names]
    self.disease_to_proteins = defaultdict(list)  # disease_name -> [uniprot_ids]
async def fetch_drug_targets(self, drug_name: str, drugbank_id: Optional[str] = None) -> Dict:
    """
    Fetch protein targets for a given drug, using ChEMBL with a curated fallback.

    Sources are tried in this order: the cache, ChEMBL mechanism-of-action
    records, ChEMBL activity records, and finally the built-in curated table.
    DrugBank is not queried (its API requires authentication), so
    ``drugbank_id`` only contributes to the cache key and the returned record.

    Args:
        drug_name: Name of the drug
        drugbank_id: Optional DrugBank ID
    Returns:
        Dictionary with ``name``, ``drugbank_id`` and ``targets``. Each target
        is a dict with ``uniprot_id``, ``target_name`` and ``action_type``;
        ChEMBL-derived targets also carry ``chembl_target_id``.
    """
    cache_key = f"drug_targets_{drug_name.lower()}_{drugbank_id or ''}"
    cached = self.cache.get(cache_key)
    if cached:
        return cached

    drug_info = {
        "name": drug_name,
        "drugbank_id": drugbank_id,
        "targets": [],
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            targets = await self._fetch_chembl_targets(client, drug_name)
        # Fallback: curated drug-target table for common drugs.
        if not targets:
            targets = self._get_curated_drug_targets(drug_name)
        drug_info["targets"] = targets
        self.cache.set(cache_key, drug_info)
        return drug_info
    except Exception as e:
        # Deliberate best-effort boundary: any network/parsing failure is
        # reported to the UI and answered from the curated table (uncached,
        # so a later call retries the remote lookup).
        st.warning(f"Error fetching drug targets: {str(e)}")
        drug_info["targets"] = self._get_curated_drug_targets(drug_name)
        return drug_info


async def _fetch_chembl_targets(self, client, drug_name: str) -> List[Dict]:
    """Look the drug up in ChEMBL and return its UniProt-mapped targets (possibly empty)."""
    search_response = await client.get(
        "https://www.ebi.ac.uk/chembl/api/data/molecule/search.json",
        params={
            "q": drug_name,
            "max_phase": 4,  # approved drugs only
            "limit": 5,  # a few candidates so a name match can be preferred
        },
    )
    if search_response.status_code != 200:
        return []

    molecules = search_response.json().get("molecules") or []
    molecule = self._pick_chembl_molecule(molecules, drug_name)
    if not molecule:
        return []
    chembl_id = molecule.get("molecule_chembl_id")

    targets: List[Dict] = []

    # Primary evidence: curated mechanisms of action.
    mechanism_response = await client.get(
        "https://www.ebi.ac.uk/chembl/api/data/mechanism.json",
        params={"molecule_chembl_id": chembl_id, "format": "json"},
    )
    if mechanism_response.status_code == 200:
        fetched = set()
        for mechanism in mechanism_response.json().get("mechanisms") or []:
            target_chembl_id = mechanism.get("target_chembl_id")
            # Skip records without a target and targets already resolved;
            # re-fetching them could not add any new accession.
            if not target_chembl_id or target_chembl_id in fetched:
                continue
            fetched.add(target_chembl_id)
            await self._append_chembl_target(
                client,
                target_chembl_id,
                mechanism.get("action_type") or "N/A",
                targets,
            )

    # Secondary evidence: measured bioactivities against human targets.
    if not targets:
        activity_response = await client.get(
            "https://www.ebi.ac.uk/chembl/api/data/activity.json",
            params={
                "molecule_chembl_id": chembl_id,
                "target_organism": "Homo sapiens",
                "format": "json",
                "limit": 10,
            },
        )
        if activity_response.status_code == 200:
            fetched = set()
            for activity in activity_response.json().get("activities") or []:
                target_chembl_id = activity.get("target_chembl_id")
                if not target_chembl_id or target_chembl_id in fetched:
                    continue
                fetched.add(target_chembl_id)
                await self._append_chembl_target(
                    client, target_chembl_id, "Activity", targets
                )

    return targets


async def _append_chembl_target(self, client, target_chembl_id: str,
                                action_type: str, targets: List[Dict]) -> None:
    """Fetch one ChEMBL target and append its not-yet-listed UniProt accessions to ``targets``."""
    response = await client.get(
        f"https://www.ebi.ac.uk/chembl/api/data/target/{target_chembl_id}.json"
    )
    if response.status_code != 200:
        return
    detail = response.json()
    known = {entry["uniprot_id"] for entry in targets}
    for component in detail.get("target_components") or []:
        for accession in self._iter_uniprot_accessions(component):
            if accession in known:
                continue  # first occurrence wins, including its action type
            known.add(accession)
            targets.append({
                "uniprot_id": accession,
                "target_name": detail.get("pref_name") or "Unknown",
                "action_type": action_type,
                "chembl_target_id": target_chembl_id,
            })


@staticmethod
def _pick_chembl_molecule(molecules: List[Dict], drug_name: str) -> Optional[Dict]:
    """Return the first molecule whose name or synonym matches ``drug_name``, else the first hit."""
    query = drug_name.strip().lower()
    for mol in molecules:
        # ``pref_name`` can be present but null, so ``.get(key, "")`` is not enough.
        pref_name = (mol.get("pref_name") or "").lower()

        # Accept both a plain list of strings under ``synonyms`` and ChEMBL's
        # ``molecule_synonyms`` list of dicts keyed by ``molecule_synonym``.
        synonyms = set()
        for entry in (mol.get("synonyms") or []):
            if isinstance(entry, str):
                synonyms.add(entry.lower())
        for entry in (mol.get("molecule_synonyms") or []):
            if isinstance(entry, dict):
                entry = entry.get("molecule_synonym")
            if isinstance(entry, str):
                synonyms.add(entry.lower())

        # Substring tests are only meaningful for non-empty strings:
        # ``"" in anything`` is always true and would match every molecule.
        name_matches = bool(query) and bool(pref_name) and (
            query in pref_name or pref_name in query
        )
        if name_matches or query in synonyms:
            return mol
    return molecules[0] if molecules else None
def _get_curated_drug_targets(self, drug_name: str) -> List[Dict]:
    """
    Curated drug-target associations for common FDA-approved drugs.
    Used as fallback when API data is unavailable.

    Matching is case-insensitive and ignores surrounding whitespace. An exact
    key match wins; otherwise the first key contained in the name (e.g.
    "aspirin 81mg") or containing the name (e.g. "statin") is used. Blank
    names and fragments shorter than four characters never partial-match,
    because an empty or tiny string is a substring of almost every key.

    Args:
        drug_name: Name of the drug
    Returns:
        List of target dicts (``uniprot_id``, ``target_name``,
        ``action_type``); empty when the drug is not in the table.
    """
    min_fragment_length = 4
    drug_name_lower = (drug_name or "").strip().lower()
    if not drug_name_lower:
        return []

    curated_targets = {
        "metformin": [
            {"uniprot_id": "Q9Y478", "target_name": "AMPK", "action_type": "Activator"},
            {"uniprot_id": "P42345", "target_name": "mTOR", "action_type": "Inhibitor"},
        ],
        "aspirin": [
            {"uniprot_id": "P23219", "target_name": "PTGS1 (COX-1)", "action_type": "Inhibitor"},
            {"uniprot_id": "P35354", "target_name": "PTGS2 (COX-2)", "action_type": "Inhibitor"},
        ],
        "ibuprofen": [
            {"uniprot_id": "P23219", "target_name": "PTGS1 (COX-1)", "action_type": "Inhibitor"},
            {"uniprot_id": "P35354", "target_name": "PTGS2 (COX-2)", "action_type": "Inhibitor"},
        ],
        "erlotinib": [
            {"uniprot_id": "P00533", "target_name": "EGFR", "action_type": "Inhibitor"},
        ],
        "gefitinib": [
            {"uniprot_id": "P00533", "target_name": "EGFR", "action_type": "Inhibitor"},
        ],
        "cetuximab": [
            {"uniprot_id": "P00533", "target_name": "EGFR", "action_type": "Antibody"},
        ],
        # NOTE(review): olaparib is a PARP inhibitor; BRCA1/BRCA2 are presumably
        # listed here as the synthetic-lethality context rather than as binding
        # targets - confirm with the data owner before relying on these rows.
        "olaparib": [
            {"uniprot_id": "P38398", "target_name": "BRCA1", "action_type": "PARP Inhibitor"},
            {"uniprot_id": "P51587", "target_name": "BRCA2", "action_type": "PARP Inhibitor"},
        ],
        "imatinib": [
            {"uniprot_id": "P00519", "target_name": "ABL1", "action_type": "Inhibitor"},
            {"uniprot_id": "P16234", "target_name": "PDGFR", "action_type": "Inhibitor"},
        ],
        "atorvastatin": [
            {"uniprot_id": "P04035", "target_name": "HMGCR", "action_type": "Inhibitor"},
        ],
        "simvastatin": [
            {"uniprot_id": "P04035", "target_name": "HMGCR", "action_type": "Inhibitor"},
        ],
    }

    # Try exact match first
    if drug_name_lower in curated_targets:
        return curated_targets[drug_name_lower]

    # Try partial match (either direction), first table entry wins
    if len(drug_name_lower) >= min_fragment_length:
        for drug_key, targets_list in curated_targets.items():
            if drug_key in drug_name_lower or drug_name_lower in drug_key:
                return targets_list
    return []
@staticmethod
def _iter_uniprot_accessions(component: Dict) -> List[str]:
    """
    Normalize ChEMBL target component accessions to UniProt accessions.

    Args:
        component: One entry of a ChEMBL target's ``target_components``;
            its ``accession`` value may be a string, a list, or missing.
    Returns:
        The accession strings (whitespace-stripped, input order kept) that
        are well-formed UniProt accessions. Anything else - other database
        identifiers, ``None``, a non-dict component - is dropped.
    """
    import re  # function-scope import keeps this block self-contained

    if not isinstance(component, dict):
        return []
    raw_accessions = component.get("accession")
    if raw_accessions is None:
        return []
    if isinstance(raw_accessions, str):
        raw_accessions = [raw_accessions]

    # Official UniProt accession format: 6-character O/P/Q accessions plus the
    # 6- and 10-character accessions starting with any other letter.
    accession_pattern = re.compile(
        r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
    )

    accessions = []
    for accession in raw_accessions:
        accession_text = str(accession).strip()
        if accession_pattern.fullmatch(accession_text):
            accessions.append(accession_text)
    return accessions
async def fetch_disease_protein_associations(self, uniprot_ids: List[str]) -> Dict:
    """
    Fetch disease-protein associations for the given proteins.

    Only the built-in curated table is consulted at present; DisGeNET /
    OpenTargets integration is not implemented, so no network request is made.

    Args:
        uniprot_ids: List of UniProt IDs
    Returns:
        Dictionary mapping uniprot_id -> list of disease dicts
        (``disease_name``, ``score``, ``evidence``). Proteins without curated
        data are absent from the result.
    """
    # Build the key from the IDs themselves. The built-in hash() of a string
    # is salted per interpreter process, so a hash-based key would not be
    # found again by a cache that outlives the process, and could collide.
    key_ids = sorted({uid for uid in uniprot_ids if uid})
    cache_key = "disease_proteins_" + "_".join(key_ids)
    cached = self.cache.get(cache_key)
    if cached:
        return cached

    result = {}
    try:
        curated = self._get_curated_disease_associations_detailed(uniprot_ids)
        # Copy each list so callers cannot mutate the curated rows in place.
        result = {uniprot_id: list(diseases) for uniprot_id, diseases in curated.items()}
    except Exception as e:
        st.warning(f"Error fetching disease associations: {str(e)}")

    self.cache.set(cache_key, result)
    return result
def _get_curated_disease_associations_detailed(self, uniprot_ids: List[str]) -> Dict:
    """
    Look up curated disease associations, with confidence scores, per protein.

    Args:
        uniprot_ids: UniProt IDs to look up.
    Returns:
        Dictionary mapping each requested ID that has curated data to its
        list of ``{"disease_name", "score", "evidence"}`` dicts. IDs without
        curated data are omitted; key order follows first appearance in
        ``uniprot_ids``.
    """
    known_associations = {
        # EGFR (epidermal growth factor receptor)
        "P00533": [
            {"disease_name": "Non-small cell lung cancer", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Colorectal cancer", "score": 0.85, "evidence": "Strong"},
            {"disease_name": "Head and neck cancer", "score": 0.80, "evidence": "Moderate"},
            {"disease_name": "Glioblastoma", "score": 0.75, "evidence": "Moderate"},
            {"disease_name": "Breast cancer", "score": 0.70, "evidence": "Moderate"},
        ],
        # TP53 (tumor protein p53)
        "P04637": [
            {"disease_name": "Li-Fraumeni syndrome", "score": 0.98, "evidence": "Strong"},
            {"disease_name": "Ovarian cancer", "score": 0.90, "evidence": "Strong"},
            {"disease_name": "Colorectal cancer", "score": 0.88, "evidence": "Strong"},
            {"disease_name": "Breast cancer", "score": 0.85, "evidence": "Strong"},
            {"disease_name": "Lung cancer", "score": 0.82, "evidence": "Moderate"},
            {"disease_name": "Pancreatic cancer", "score": 0.80, "evidence": "Moderate"},
        ],
        # BRCA1
        "P38398": [
            {"disease_name": "Hereditary breast and ovarian cancer", "score": 0.98, "evidence": "Strong"},
            {"disease_name": "Breast cancer", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Ovarian cancer", "score": 0.92, "evidence": "Strong"},
            {"disease_name": "Prostate cancer", "score": 0.70, "evidence": "Moderate"},
        ],
        # BRCA2
        "P51587": [
            {"disease_name": "Hereditary breast and ovarian cancer", "score": 0.98, "evidence": "Strong"},
            {"disease_name": "Breast cancer", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Ovarian cancer", "score": 0.92, "evidence": "Strong"},
            {"disease_name": "Pancreatic cancer", "score": 0.75, "evidence": "Moderate"},
        ],
        # INS (insulin)
        "P01308": [
            {"disease_name": "Type 1 diabetes", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Type 2 diabetes", "score": 0.90, "evidence": "Strong"},
            {"disease_name": "Diabetes mellitus", "score": 0.88, "evidence": "Strong"},
            {"disease_name": "Metabolic syndrome", "score": 0.70, "evidence": "Moderate"},
        ],
        # ALB (albumin)
        "P02768": [
            {"disease_name": "Hypoalbuminemia", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Nephrotic syndrome", "score": 0.85, "evidence": "Strong"},
            {"disease_name": "Liver disease", "score": 0.75, "evidence": "Moderate"},
            {"disease_name": "Malnutrition", "score": 0.70, "evidence": "Moderate"},
        ],
        # ABCB1 (P-glycoprotein, MDR1)
        "P08183": [
            {"disease_name": "Drug resistance", "score": 0.90, "evidence": "Strong"},
            {"disease_name": "Cancer", "score": 0.75, "evidence": "Moderate"},
            {"disease_name": "Epilepsy", "score": 0.65, "evidence": "Moderate"},
        ],
        # PTGS2 (COX-2)
        "P35354": [
            {"disease_name": "Inflammation", "score": 0.90, "evidence": "Strong"},
            {"disease_name": "Pain", "score": 0.85, "evidence": "Strong"},
            {"disease_name": "Arthritis", "score": 0.80, "evidence": "Strong"},
            {"disease_name": "Colorectal cancer", "score": 0.70, "evidence": "Moderate"},
        ],
        # PTGS1 (COX-1)
        "P23219": [
            {"disease_name": "Inflammation", "score": 0.88, "evidence": "Strong"},
            {"disease_name": "Pain", "score": 0.85, "evidence": "Strong"},
            {"disease_name": "Cardiovascular disease", "score": 0.75, "evidence": "Moderate"},
        ],
        # APP (amyloid beta precursor protein)
        "P05067": [
            {"disease_name": "Alzheimer's Disease", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Dementia", "score": 0.85, "evidence": "Moderate"},
        ],
        # SNCA (alpha-synuclein)
        "P37840": [
            {"disease_name": "Parkinson's Disease", "score": 0.95, "evidence": "Strong"},
            {"disease_name": "Dementia", "score": 0.80, "evidence": "Moderate"},
        ],
        # HTT (huntingtin)
        "P42858": [
            {"disease_name": "Huntington's Disease", "score": 0.98, "evidence": "Strong"},
        ],
        # CFTR (cystic fibrosis transmembrane conductance regulator)
        "P13569": [
            {"disease_name": "Cystic fibrosis", "score": 0.98, "evidence": "Strong"},
        ],
    }
    # Keep only the requested proteins that the table actually covers.
    return {
        protein_id: known_associations[protein_id]
        for protein_id in uniprot_ids
        if protein_id in known_associations
    }
def build_network_graph(self, drug_name: str, drug_targets: List[Dict],
                        ppi_data: Dict, disease_associations: Dict,
                        pathway_data: Dict) -> nx.Graph:
    """
    Build a network graph connecting drugs, proteins, pathways, and diseases.

    Node attribute ``node_type`` is one of "drug", "protein", "pathway" or
    "disease"; protein and pathway nodes also carry a display ``name`` that
    is never ``None`` (it falls back to the node id). Edge attribute
    ``edge_type`` is "drug_target", "ppi", "protein_pathway" or
    "protein_disease".

    Args:
        drug_name: Name of the drug
        drug_targets: List of target proteins (dicts with ``uniprot_id``)
        ppi_data: Protein-protein interaction data (``available``,
            ``interactions``); may be empty or None
        disease_associations: Mapping uniprot_id -> list of disease dicts
        pathway_data: Pathway data (``available``, ``pathways``); may be
            empty or None
    Returns:
        NetworkX graph
    """
    G = nx.Graph()

    # Add drug node
    G.add_node(drug_name, node_type="drug")

    # Add direct target proteins. The ordered list drives edge creation and
    # the set gives constant-time "is this a direct target?" checks.
    direct_target_ids = []
    direct_target_set = set()
    for target in drug_targets or []:
        uniprot_id = target.get("uniprot_id")
        if not uniprot_id:
            continue
        G.add_node(uniprot_id, node_type="protein",
                   name=target.get("target_name") or uniprot_id)
        G.add_edge(drug_name, uniprot_id,
                   edge_type="drug_target",
                   action=target.get("action_type", "unknown"))
        if uniprot_id not in direct_target_set:
            direct_target_set.add(uniprot_id)
            direct_target_ids.append(uniprot_id)

    # Add PPI network (indirect targets)
    if ppi_data and ppi_data.get("available"):
        for interaction in ppi_data.get("interactions") or []:
            partner_id = interaction.get("partner_id")
            score = interaction.get("combined_score") or 0
            # Skip partners without an id (None is not a valid graph node),
            # low-confidence interactions, and partners that are themselves
            # direct targets.
            # NOTE(review): 400 assumes combined_score is on STRING's 0-1000
            # scale - confirm against the API client.
            if not partner_id or score < 400 or partner_id in direct_target_set:
                continue
            G.add_node(partner_id, node_type="protein",
                       name=interaction.get("partner_name") or partner_id)
            # The interaction record does not say which target it belongs
            # to, so the partner is linked to every direct target.
            for target_id in direct_target_ids:
                G.add_edge(target_id, partner_id,
                           edge_type="ppi",
                           score=score,
                           confidence=interaction.get("confidence", "Medium"))

    # Add pathway nodes and connections
    if pathway_data and pathway_data.get("available"):
        for pathway in (pathway_data.get("pathways") or [])[:10]:  # Limit pathways
            pathway_id = pathway.get("pathway_id", "")
            if not pathway_id:
                continue
            G.add_node(pathway_id, node_type="pathway",
                       name=pathway.get("pathway_name") or pathway_id)
            # Connect proteins to pathways
            for target_id in direct_target_ids:
                G.add_edge(target_id, pathway_id, edge_type="protein_pathway")

    # Add disease nodes and connections
    for uniprot_id, diseases in (disease_associations or {}).items():
        if uniprot_id not in G:
            continue
        for disease_info in diseases:
            disease_name = disease_info.get("disease_name", "")
            score = disease_info.get("score") or 0
            if disease_name and score > 0.3:  # Confidence threshold
                G.add_node(disease_name, node_type="disease")
                G.add_edge(uniprot_id, disease_name,
                           edge_type="protein_disease",
                           score=score)
    return G
def calculate_network_proximity(self, graph: nx.Graph, drug_name: str,
                                disease_name: str) -> Dict:
    """
    Calculate network proximity between drug and disease.
    Uses shortest path analysis and network distance metrics.

    Args:
        graph: NetworkX graph
        drug_name: Name of the drug node
        disease_name: Name of the disease node
    Returns:
        Dictionary with the same keys in every case:
        ``distance`` (number of edges on the shortest path, ``inf`` when
        unconnected), ``shortest_path`` (list of nodes), ``proximity_score``
        (0.0-1.0), ``pathway_count``, ``intermediate_proteins`` (protein
        nodes strictly between the endpoints) and ``path_length`` (number of
        nodes on the path, 0 when unconnected).
    """
    def no_connection() -> Dict:
        # Fresh dict per call so callers may mutate the result safely.
        return {
            "distance": float('inf'),
            "shortest_path": [],
            "proximity_score": 0.0,
            "pathway_count": 0,
            "intermediate_proteins": [],
            "path_length": 0,
        }

    if drug_name not in graph or disease_name not in graph:
        return no_connection()

    try:
        # A single traversal: shortest_path itself reports "no path".
        shortest_path = nx.shortest_path(graph, drug_name, disease_name)
        distance = len(shortest_path) - 1  # Number of edges

        node_types = [graph.nodes[node].get("node_type") for node in shortest_path]
        # Extract intermediate nodes (endpoints excluded)
        intermediate_proteins = [
            node
            for node, node_type in zip(shortest_path[1:-1], node_types[1:-1])
            if node_type == "protein"
        ]
        # Count pathways in path
        pathway_count = node_types.count("pathway")

        # Proximity score: inverse of distance, normalized so that shorter
        # paths score higher.
        max_distance = 10  # Maximum expected path length
        proximity_score = max(0, 1 - (distance / max_distance))
        # Boost score if pathways are involved, capped at 1.0
        if pathway_count > 0:
            proximity_score *= (1 + 0.2 * pathway_count)
            proximity_score = min(1.0, proximity_score)

        return {
            "distance": distance,
            "shortest_path": shortest_path,
            "proximity_score": proximity_score,
            "pathway_count": pathway_count,
            "intermediate_proteins": intermediate_proteins,
            "path_length": len(shortest_path),
        }
    except nx.NetworkXNoPath:
        # Both nodes exist but lie in different connected components.
        return no_connection()
    except Exception as e:
        st.warning(f"Error calculating proximity: {str(e)}")
        return no_connection()
def calculate_confidence_score(self, proximity_metrics: Dict,
                               disease_associations: Dict,
                               pathway_count: int,
                               disease_name: Optional[str] = None) -> float:
    """
    Calculate confidence score for a repurposing prediction.

    Combines four factors:
    - Network proximity (up to 40 points)
    - Disease association strength (up to 30 points)
    - Pathway involvement (up to 20 points)
    - Number of connecting proteins (up to 10 points)

    Args:
        proximity_metrics: Results from calculate_network_proximity;
            ``distance`` is the number of edges on the drug-disease path
        disease_associations: Mapping uniprot_id -> list of disease dicts
            with a ``score`` in 0-1
        pathway_count: Number of pathways involved
        disease_name: When given, only associations for this disease count
            towards the association factor; when omitted, the strongest
            association of any disease is used
    Returns:
        Confidence score (0-100), rounded to one decimal
    """
    base_score = 0.0

    # Factor 1: Network proximity (40% weight).
    # Distance counts edges, so the shortest meaningful path
    # drug -> protein -> disease has distance 2.
    distance = proximity_metrics.get("distance", float('inf'))
    if distance is None:
        distance = float('inf')
    if 1 <= distance <= 2:
        # Direct connection (drug -> protein -> disease)
        base_score += 40.0
    elif distance == 3:
        # One intermediate (drug -> protein -> protein -> disease)
        base_score += 30.0
    elif distance == 4:
        # Two intermediates
        base_score += 20.0
    elif distance <= 6:
        # Short path
        base_score += 10.0

    # Factor 2: Disease association strength (30% weight)
    max_disease_score = 0.0
    for diseases in (disease_associations or {}).values():
        for disease_info in diseases:
            if disease_name is not None and disease_info.get("disease_name") != disease_name:
                continue
            score = disease_info.get("score") or 0.0
            max_disease_score = max(max_disease_score, score)
    # Clamp so an out-of-range score cannot exceed this factor's weight.
    base_score += min(1.0, max_disease_score) * 30.0

    # Factor 3: Pathway involvement (20% weight)
    if pathway_count and pathway_count > 0:
        base_score += min(20.0, pathway_count * 5.0)

    # Factor 4: Number of connecting proteins (10% weight)
    intermediate_count = len(proximity_metrics.get("intermediate_proteins") or [])
    if intermediate_count > 0:
        base_score += min(10.0, intermediate_count * 2.0)

    # Normalize to 0-100 scale
    confidence = min(100.0, max(0.0, base_score))
    return round(confidence, 1)
def generate_explanation(self, drug_name: str, disease_name: str,
                         proximity_metrics: Dict, graph: nx.Graph) -> str:
    """
    Generate human-readable explanation for repurposing prediction.

    Args:
        drug_name: Name of the drug
        disease_name: Name of the disease
        proximity_metrics: Proximity analysis results; ``distance`` is the
            number of edges on the drug-disease path
        graph: Network graph (only consulted to resolve display names of
            pathway and protein nodes on the path)
    Returns:
        Explanation string
    """
    distance = proximity_metrics.get("distance", float('inf'))
    if distance is None:
        distance = float('inf')
    shortest_path = proximity_metrics.get("shortest_path") or []
    intermediate_proteins = proximity_metrics.get("intermediate_proteins") or []
    pathway_count = proximity_metrics.get("pathway_count", 0)

    if distance == float('inf'):
        return f"No direct or indirect network connection found between {drug_name} and {disease_name}."

    def display_name(node) -> str:
        # Nodes may carry a missing or None name; fall back to the node id
        # so joining names can never fail.
        return str(graph.nodes[node].get("name") or node)

    explanation_parts = []

    # Mechanism wording. Distance counts edges, so drug -> protein -> disease
    # (distance 2) is the "direct target" case.
    if distance <= 2:
        explanation_parts.append(
            f"{drug_name} directly targets proteins associated with {disease_name}."
        )
    elif distance == 3:
        explanation_parts.append(
            f"{drug_name} targets proteins that interact with disease-associated proteins in {disease_name}."
        )
    else:
        # A path with `distance` edges has distance - 2 links between its
        # intermediate nodes (drug and disease edges excluded).
        explanation_parts.append(
            f"{drug_name} influences {disease_name} through a network of {distance-2} protein interactions."
        )

    # Pathway involvement
    if pathway_count and pathway_count > 0:
        pathway_nodes = [
            display_name(node)
            for node in shortest_path
            if graph.nodes[node].get("node_type") == "pathway"
        ]
        if pathway_nodes:
            explanation_parts.append(
                f"Mechanism involves {', '.join(pathway_nodes[:2])} pathways."
            )

    # Intermediate proteins (at most three are named)
    if intermediate_proteins:
        protein_names = [display_name(protein_id) for protein_id in intermediate_proteins[:3]]
        if protein_names:
            explanation_parts.append(
                f"Key intermediate proteins: {', '.join(protein_names)}."
            )

    return " ".join(explanation_parts)
async def predict_repurposing_opportunities(self, drug_name: str,
                                            drugbank_id: Optional[str] = None,
                                            max_results: int = 10) -> List[Dict]:
    """
    Main function to predict drug repurposing opportunities.

    Pipeline: resolve drug targets, fetch PPI data (first three targets) and
    pathway data (first target with a UniProt ID), fetch disease
    associations, build the network, then score every disease node.

    Args:
        drug_name: Name of the drug
        drugbank_id: Optional DrugBank ID
        max_results: Maximum number of predictions to return
    Returns:
        List of predictions sorted by descending confidence. Each is a dict
        with ``disease_name``, ``confidence``, ``explanation``,
        ``affected_proteins``, ``pathways``, ``distance`` and
        ``proximity_score``. When no targets can be resolved, a single
        placeholder entry named "No targets found" is returned instead.
    """
    # Step 1: Fetch drug targets
    drug_targets_data = await self.fetch_drug_targets(drug_name, drugbank_id)
    drug_targets = drug_targets_data.get("targets", [])
    if not drug_targets:
        return [{
            "disease_name": "No targets found",
            "confidence": 0.0,
            "explanation": f"Could not identify protein targets for {drug_name}. Please verify the drug name or DrugBank ID.",
            "affected_proteins": [],
            "pathways": []
        }]

    all_uniprot_ids = [t.get("uniprot_id") for t in drug_targets if t.get("uniprot_id")]

    # Step 2: Fetch PPI data for targets (first 3 only, to limit API calls).
    # Interactions from every queried target are kept, not just the first
    # target's, and resolved gene names are remembered for step 3.
    gene_names = {}
    ppi_interactions = []
    for target in drug_targets[:3]:
        uniprot_id = target.get("uniprot_id")
        if not uniprot_id:
            continue
        try:
            # Get gene name from UniProt
            uniprot_data = await self.api_client.fetch_uniprot_data(uniprot_id)
            gene_name = uniprot_data.get("gene_name", "")
            gene_names[uniprot_id] = gene_name
            if gene_name:
                target_ppi = await self.api_client.fetch_string_ppi(gene_name, uniprot_id, limit=15)
                if target_ppi and target_ppi.get("available"):
                    ppi_interactions.extend(target_ppi.get("interactions") or [])
        except Exception as e:
            st.warning(f"Could not fetch PPI data for {uniprot_id}: {str(e)}")
            continue
    # NOTE(review): build_network_graph links each partner to every direct
    # target, so per-target attribution of interactions is not preserved.
    combined_ppi = {
        "available": bool(ppi_interactions),
        "interactions": ppi_interactions,
    }

    # Step 3: Fetch pathway data for the first target that has a UniProt ID
    pathway_data = {}
    if all_uniprot_ids:
        primary_id = all_uniprot_ids[0]
        try:
            if primary_id in gene_names:
                gene_name = gene_names[primary_id]  # reuse step 2's lookup
            else:
                uniprot_data = await self.api_client.fetch_uniprot_data(primary_id)
                gene_name = uniprot_data.get("gene_name", "")
            if gene_name:
                pathway_data = await self.api_client.fetch_kegg_pathways(gene_name, primary_id)
        except Exception as e:
            st.warning(f"Could not fetch pathway data: {str(e)}")
            pathway_data = {}

    # Step 4: Fetch disease associations
    disease_associations = await self.fetch_disease_protein_associations(all_uniprot_ids)

    # Step 5: Build network graph
    graph = self.build_network_graph(
        drug_name, drug_targets,
        combined_ppi,
        disease_associations,
        pathway_data
    )

    # Step 6: Find all diseases in graph
    diseases_in_graph = [
        node for node in graph.nodes()
        if graph.nodes[node].get("node_type") == "disease"
    ]
    if not diseases_in_graph:
        # Fallback: Use curated disease-protein associations
        diseases_in_graph = self._get_curated_disease_associations(all_uniprot_ids)

    # Step 7: Calculate repurposing scores for each disease
    predictions = []
    for disease_name in diseases_in_graph[:50]:  # Limit to avoid too many calculations
        proximity_metrics = self.calculate_network_proximity(graph, drug_name, disease_name)

        # Score against this disease's own associations only; scoring against
        # the whole table would give every disease the same association term.
        disease_specific = {
            uniprot_id: [
                info for info in entries
                if info.get("disease_name") == disease_name
            ]
            for uniprot_id, entries in disease_associations.items()
        }
        confidence = self.calculate_confidence_score(
            proximity_metrics, disease_specific,
            proximity_metrics.get("pathway_count", 0)
        )

        explanation = self.generate_explanation(
            drug_name, disease_name, proximity_metrics, graph
        )

        # Affected proteins: intermediate protein ids followed by the display
        # names of proteins on the path, de-duplicated in a stable order so
        # the top-5 cut is deterministic.
        path = proximity_metrics.get("shortest_path") or []
        protein_names = [
            graph.nodes[node].get("name", node)
            for node in path
            if graph.nodes[node].get("node_type") == "protein"
        ]
        affected_proteins = list(dict.fromkeys(
            list(proximity_metrics.get("intermediate_proteins") or []) + protein_names
        ))
        pathway_names = [
            graph.nodes[node].get("name", node)
            for node in path
            if graph.nodes[node].get("node_type") == "pathway"
        ]

        predictions.append({
            "disease_name": disease_name,
            "confidence": confidence,
            "explanation": explanation,
            "affected_proteins": affected_proteins[:5],  # Limit to top 5
            "pathways": pathway_names,
            "distance": proximity_metrics.get("distance", float('inf')),
            "proximity_score": proximity_metrics.get("proximity_score", 0.0)
        })

    # Step 8: Sort by confidence and return top results
    predictions.sort(key=lambda x: x["confidence"], reverse=True)
    return predictions[:max(0, max_results)]
def _get_curated_disease_associations(self, uniprot_ids: List[str]) -> List[str]:
    """
    Curated disease names associated with the given proteins.
    Used as fallback when no disease node made it into the network graph.

    Args:
        uniprot_ids: UniProt IDs to look up.
    Returns:
        Unique disease names in first-seen order (protein order, then table
        order), so the result is identical from run to run.
    """
    detailed = self._get_curated_disease_associations_detailed(uniprot_ids)
    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # yield a different order per interpreter run because str hashes are salted.
    return list(dict.fromkeys(
        disease_info["disease_name"]
        for disease_list in detailed.values()
        for disease_info in disease_list
    ))