""" Phase 1: DeepPurpose -> Qdrant Ingestion Pipeline (v2 - Fixed) Fixes applied: - Proper PCA dimensionality reduction (not just first 3 dims) - No shuffle to preserve data ordering - Proper error handling and validation - Shared config import """ import os os.environ["DGL_DISABLE_GRAPHBOLT"] = "1" import sys import numpy as np import pandas as pd import torch import pickle from sklearn.decomposition import PCA from qdrant_client import QdrantClient from qdrant_client.models import VectorParams, Distance, PointStruct from DeepPurpose import utils, DTI as dp_models import warnings warnings.filterwarnings("ignore") # Import shared config from config import ( BEST_MODEL_RUN, MODEL_CONFIG, QDRANT_HOST, QDRANT_PORT, COLLECTION_NAME, METRICS ) def validate_model_path(run_dir: str) -> bool: """Check if run directory contains valid model.""" model_path = os.path.join(run_dir, "model.pt") if not os.path.exists(model_path): print(f"[ERROR] No model.pt found in {run_dir}") print(" Available runs with models:") for d in os.listdir("runs"): if os.path.exists(os.path.join("runs", d, "model.pt")): print(f" - runs/{d}") return False return True def load_model(run_dir: str): """Load DeepPurpose model with validation.""" if not validate_model_path(run_dir): return None print(f"[1/6] Loading Model from {run_dir}...") # Load the config.pkl if it exists for exact config match config_path = os.path.join(run_dir, "config.pkl") if os.path.exists(config_path): with open(config_path, "rb") as f: config = pickle.load(f) print(" Using saved config.pkl") # Override result_folder to current directory (old path may be stale) config["result_folder"] = run_dir else: # Fallback to hardcoded config config = utils.generate_config( drug_encoding=MODEL_CONFIG["drug_encoding"], target_encoding=MODEL_CONFIG["target_encoding"], cls_hidden_dims=MODEL_CONFIG["cls_hidden_dims"], train_epoch=1, LR=1e-4, batch_size=256, result_folder=run_dir ) print(" Using hardcoded config (no config.pkl found)") model = dp_models.model_initialize(**config) model.load_pretrained(os.path.join(run_dir, "model.pt")) model.model.eval() device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') model.model.to(device) print(f" Model loaded on {device}") return model def load_data(dataset_name: str = "KIBA") -> pd.DataFrame: """ Load DTI dataset from TDC (Therapeutics Data Commons). We need the original Drug/Target sequences to generate embeddings. """ print(f"[2/6] Loading {dataset_name} Dataset from TDC...") from tdc.multi_pred import DTI data = DTI(name=dataset_name) # Get just the test split (smaller, faster for demo) split = data.get_split() df = split['test'] # Use test set # TDC uses different column names df = df.rename(columns={'Drug': 'Drug', 'Target': 'Target', 'Y': 'Label'}) print(f" Loaded {len(df)} test samples") print(f" Columns: {list(df.columns)}") print(f" Sample Drug (SMILES): {df['Drug'].iloc[0][:50]}...") print(f" Sample Target (first 50 aa): {df['Target'].iloc[0][:50]}...") return df def generate_embeddings(model, df: pd.DataFrame): """ Generate embeddings WITHOUT shuffling to preserve order alignment. Returns: (drug_embeddings, target_embeddings, labels) """ print("[3/6] Generating Embeddings (no shuffle)...") drugs = df['Drug'].values targets = df['Target'].values labels = df['Label'].values drug_encoding = model.config['drug_encoding'] target_encoding = model.config['target_encoding'] print(f" Encoding {len(drugs)} drugs with {drug_encoding}...") print(f" Encoding {len(targets)} targets with {target_encoding}...") # data_process returns DataFrames with encoded columns train_df, val_df, test_df = utils.data_process( drugs, targets, labels, drug_encoding, target_encoding, split_method='random', frac=[0.01, 0.01, 0.98], # Mostly test random_seed=42 ) # Combine all DataFrames all_df = pd.concat([train_df, val_df, test_df], ignore_index=True) print(f" Combined {len(all_df)} samples") # Create proper Dataset using DeepPurpose's loader from DeepPurpose.utils import data_process_loader config = { 'drug_encoding': drug_encoding, 'target_encoding': target_encoding } # Create loader - pass indices, labels, and the dataframe with config indices = list(range(len(all_df))) label_values = all_df['Label'].values dataset = data_process_loader(indices, label_values, all_df, **config) # Create DataLoader WITHOUT shuffling - CRITICAL! loader = torch.utils.data.DataLoader( dataset, batch_size=64, shuffle=False, drop_last=False ) device = next(model.model.parameters()).device print(f" Running inference on {device}... ({len(dataset)} samples)") all_drug_emb = [] all_target_emb = [] all_labels = [] with torch.no_grad(): for v_d, v_p, label in loader: v_d = v_d.to(device) v_p = v_p.to(device) emb_drug = model.model.model_drug(v_d).cpu().numpy() emb_target = model.model.model_protein(v_p).cpu().numpy() all_drug_emb.append(emb_drug) all_target_emb.append(emb_target) all_labels.extend(label.numpy().tolist()) drug_embeddings = np.vstack(all_drug_emb) target_embeddings = np.vstack(all_target_emb) print(f" Drug embeddings: {drug_embeddings.shape}") print(f" Target embeddings: {target_embeddings.shape}") return drug_embeddings, target_embeddings, np.array(all_labels) def compute_3d_projections(drug_emb: np.ndarray, target_emb: np.ndarray): """ Compute REAL PCA projections for 3D visualization. """ print("[4/6] Computing PCA projections...") combined = np.hstack([drug_emb, target_emb]) pca = PCA(n_components=3) combined_3d = pca.fit_transform(combined) pca_drug = PCA(n_components=3) drug_3d = pca_drug.fit_transform(drug_emb) pca_target = PCA(n_components=3) target_3d = pca_target.fit_transform(target_emb) print(f" Explained variance (combined): {pca.explained_variance_ratio_.sum():.2%}") print(f" Explained variance (drug): {pca_drug.explained_variance_ratio_.sum():.2%}") print(f" Explained variance (target): {pca_target.explained_variance_ratio_.sum():.2%}") return drug_3d, target_3d, combined_3d def upload_to_qdrant( df: pd.DataFrame, drug_emb: np.ndarray, target_emb: np.ndarray, drug_3d: np.ndarray, target_3d: np.ndarray, combined_3d: np.ndarray, labels: np.ndarray ): """Upload vectors and metadata to Qdrant.""" print(f"[5/6] Connecting to Qdrant ({QDRANT_HOST}:{QDRANT_PORT})...") try: client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT, timeout=10) client.get_collections() except Exception as e: print(f"[ERROR] Cannot connect to Qdrant: {e}") print(" Make sure Qdrant is running:") print(" docker run -p 6333:6333 -p 6334:6334 qdrant/qdrant") return False drug_dim = drug_emb.shape[1] target_dim = target_emb.shape[1] existing = [c.name for c in client.get_collections().collections] if COLLECTION_NAME in existing: print(f" Collection '{COLLECTION_NAME}' exists. Recreating...") client.recreate_collection( collection_name=COLLECTION_NAME, vectors_config={ "drug": VectorParams(size=drug_dim, distance=Distance.COSINE), "target": VectorParams(size=target_dim, distance=Distance.COSINE), } ) print(f" Collection created: drug={drug_dim}d, target={target_dim}d") print("[6/6] Uploading points...") drugs = df['Drug'].values targets = df['Target'].values points = [] for i in range(len(df)): payload = { "smiles": str(drugs[i]), "target_seq": str(targets[i])[:500], "label_true": float(labels[i]), "pca_drug": drug_3d[i].tolist(), "pca_target": target_3d[i].tolist(), "pca_combined": combined_3d[i].tolist(), "affinity_class": "high" if labels[i] > 7 else "medium" if labels[i] > 5 else "low", } point = PointStruct( id=i, vector={ "drug": drug_emb[i].tolist(), "target": target_emb[i].tolist() }, payload=payload ) points.append(point) if len(points) >= 100: client.upsert(collection_name=COLLECTION_NAME, points=points) points = [] print(f" Uploaded {i+1}/{len(df)}", end='\r') if points: client.upsert(collection_name=COLLECTION_NAME, points=points) print(f"\n ✓ Successfully indexed {len(df)} points!") return True def main(): print("=" * 60) print(" DEEPPURPOSE -> QDRANT INGESTION PIPELINE (v2)") print("=" * 60) model = load_model(BEST_MODEL_RUN) if not model: sys.exit(1) # Load KIBA dataset from TDC (the model was trained on KIBA) df = load_data("KIBA") if df is None: sys.exit(1) drug_emb, target_emb, labels = generate_embeddings(model, df) drug_3d, target_3d, combined_3d = compute_3d_projections(drug_emb, target_emb) success = upload_to_qdrant(df, drug_emb, target_emb, drug_3d, target_3d, combined_3d, labels) if success: print("\n" + "=" * 60) print(" INGESTION COMPLETE") print("=" * 60) print(f" Collection: {COLLECTION_NAME}") print(f" Points: {len(df)}") print(f" Next: python -m uvicorn server.api:app --reload") else: print("\n[FAILED] Check errors above.") sys.exit(1) if __name__ == "__main__": main()