""" Variant Prioritization Service - Research-backed pathogenicity scoring. Implements CPU-friendly, evidence-grounded variant pathogenicity ranking using a model stack based on Hugging Face artifacts: - Primary: XGBoost with precomputed missense features (gpn_msa_score, cadd_raw, cadd_phred, phyloP100way_vertebrate, phyloP241way_mammalian, phastCons100way_vertebrate, phastCons241way_mammalian, esm1b_embedding_mean, esm1b_embedding_max, esm1b_embedding_norm, nt_score, hyena_dna_embedding_mean) - Fallback 1: Random Forest for protein/AA-change features - Fallback 2: Logistic Regression ultra-light model Scientific foundations from: - Frazer et al. 2021 (esm1b_embedding_* features) - Cheng et al. 2023 (gpn_msa_score, cadd_*, phyloP*, phastCons*, nt_score) - Notin et al. 2022 (hyena_dna_embedding_mean) - Landrum et al. 2018 (ClinVar ground truth) All models run CPU-only, lazy-load with in-process caching, and gracefully degrade on artifact unavailability. """ import json import logging import math import os import warnings from pathlib import Path from typing import Any, Dict, List, Optional, Tuple import numpy as np # Optional imports - will only be imported when actually needed _hf_hub = None _joblib = None _xgb = None _sklearn = None logger = logging.getLogger(__name__) # Tier definitions for pathogenicity scores TIER_THRESHOLDS = { 1: 0.90, # Tier 1: Very high pathogenicity confidence 2: 0.70, # Tier 2: High pathogenicity confidence 3: 0.50, # Tier 3: Moderate pathogenicity confidence } # Default Hugging Face repository settings DEFAULT_HF_REPO_ID = "omshrivastava/omnibimol-variant-priority" DEFAULT_HF_REVISION = "main" DEFAULT_CACHE_PATH = "./cache/hf_artifacts" # Precomputed missense feature names (order matters for schema validation) # These are the canonical feature keys used by the tested model inputs. PRE_COMPUTED_FEATURES = [ "gpn_msa_score", "cadd_raw", "cadd_phred", "phyloP100way_vertebrate", "phyloP241way_mammalian", "phastCons100way_vertebrate", "phastCons241way_mammalian", "esm1b_embedding_mean", "esm1b_embedding_max", "esm1b_embedding_norm", "nt_score", "hyena_dna_embedding_mean", ] # Protein-level / AA-change feature names PROTEIN_FEATURES = [ "aa_position", "aa_change_type", "domain_score", "conservation_score", "blosum62_score", "grantham_distance", "sift_score", "polyphen_score", ] def _lazy_import_hf_hub(): """Lazy import of huggingface_hub to avoid unnecessary dependencies.""" global _hf_hub if _hf_hub is None: try: from huggingface_hub import snapshot_download _hf_hub = snapshot_download except ImportError: _hf_hub = False # Mark as unavailable return _hf_hub if _hf_hub else None def _lazy_import_joblib(): """Lazy import of joblib for model loading.""" global _joblib if _joblib is None: try: import joblib _joblib = joblib except ImportError: _joblib = False return _joblib if _joblib else None def _lazy_import_xgboost(): """Lazy import of xgboost.""" global _xgb if _xgb is None: try: import xgboost as xgb _xgb = xgb except ImportError: _xgb = False return _xgb if _xgb else None def _lazy_import_sklearn(): """Lazy import of sklearn.""" global _sklearn if _sklearn is None: try: from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LogisticRegression from sklearn.preprocessing import StandardScaler _sklearn = { "RandomForestClassifier": RandomForestClassifier, "LogisticRegression": LogisticRegression, "StandardScaler": StandardScaler, } except ImportError: _sklearn = False return _sklearn if _sklearn else None class VariantPrioritizer: """ Research-backed variant pathogenicity prioritization service. Implements a three-tier model stack for missense variant scoring: 1. Primary: XGBoost with comprehensive precomputed features - Requires: Full precomputed missense feature set - Features: gpn_msa_score, cadd_raw, cadd_phred, phyloP100way_vertebrate, phyloP241way_mammalian, phastCons100way_vertebrate, phastCons241way_mammalian, esm1b_embedding_mean, esm1b_embedding_max, esm1b_embedding_norm, nt_score, hyena_dna_embedding_mean - Output: Pathogenicity probability [0, 1] 2. Fallback 1: Random Forest for protein-level features - Triggered when: Precomputed features missing but AA-change features available - Features: Position, conservation, physicochemical properties - Output: Pathogenicity probability [0, 1] 3. Fallback 2: Logistic Regression ultra-light model - Triggered when: Only minimal features available - Output: Pathogenicity probability [0, 1] All models use pre-trained imputers and scalers for deterministic handling of missing values. Feature schema is strictly versioned to prevent drift. Args: hf_repo_id: Hugging Face repository ID for model artifacts hf_revision: Git revision (branch/tag/commit) to use cache_path: Local directory for cached artifacts enable_remote_download: Whether to download from Hugging Face if not cached strict_schema: Whether to enforce strict feature schema validation """ def __init__( self, hf_repo_id: str = DEFAULT_HF_REPO_ID, hf_revision: str = DEFAULT_HF_REVISION, cache_path: str = DEFAULT_CACHE_PATH, enable_remote_download: bool = True, strict_schema: bool = True, ): self.hf_repo_id = hf_repo_id self.hf_revision = hf_revision self.cache_path = Path(cache_path) self.enable_remote_download = enable_remote_download self.strict_schema = strict_schema # Model state self._xgb_model = None self._xgb_imputer = None self._xgb_scaler = None self._rf_model = None self._rf_imputer = None self._rf_scaler = None self._lr_model = None self._lr_imputer = None self._lr_scaler = None self._metrics_summary = None self._model_loaded = False self._load_lock = False # Simple lock for lazy loading # Version info for reproducibility self.artifact_versions = { "xgb_precomputed": None, "rf_protein": None, "lr_protein": None, "research_memo": None, } def _ensure_models_loaded(self) -> bool: """Lazy-load model artifacts on first use.""" if self._model_loaded: return True if self._load_lock: # Another thread/process is loading; skip to avoid recursion return False self._load_lock = True try: return self._load_model_artifacts() finally: self._load_lock = False def _load_model_artifacts(self) -> bool: """Load model artifacts from Hugging Face with local caching.""" repo_path = self._get_or_download_artifacts() if repo_path is None: logger.warning( "VariantPrioritizer: Could not load model artifacts. " "Remote download may be disabled or repo unavailable." ) return False # Load XGBoost stack xgb_loaded = self._load_xgboost_models(repo_path) # Load Random Forest fallback rf_loaded = self._load_rf_models(repo_path) # Load Logistic Regression fallback lr_loaded = self._load_lr_models(repo_path) # Load metrics summary for explainability self._load_metrics_summary(repo_path) # Load version metadata self._load_version_info(repo_path) success = xgb_loaded or rf_loaded or lr_loaded if success: self._model_loaded = True logger.info( f"VariantPrioritizer: Loaded models from {repo_path}. " f"XGB={xgb_loaded}, RF={rf_loaded}, LR={lr_loaded}" ) return success def _get_or_download_artifacts(self) -> Path | None: """Get cached artifacts or download from Hugging Face.""" # Check local cache first cached_repo = self.cache_path / self.hf_repo_id.replace("/", "--") cached_repo.mkdir(parents=True, exist_ok=True) if self._has_artifacts(cached_repo): return cached_repo if not self.enable_remote_download: logger.warning( "VariantPrioritizer: Remote download disabled and no cached artifacts found." ) return None # Download from Hugging Face snapshot_download = _lazy_import_hf_hub() if snapshot_download is None: logger.warning( "VariantPrioritizer: huggingface_hub not installed. " "Install with: pip install huggingface_hub" ) return None try: logger.info( f"Downloading model artifacts from {self.hf_repo_id} " f"(revision: {self.hf_revision})..." ) repo_path = snapshot_download( repo_id=self.hf_repo_id, revision=self.hf_revision, cache_dir=str(self.cache_path.parent), local_dir_use_symlinks=False, local_dir=str(cached_repo), allow_patterns=[ "xgb_precomputed.json", "xgb_precomputed.pkl", "xgb_precomputed_imp.pkl", "xgb_precomputed_scaler.pkl", "rf_protein.pkl", "rf_protein_imp.pkl", "rf_protein_scaler.pkl", "rf_protein.json", "lr_protein.pkl", "lr_protein_imp.pkl", "lr_protein_scaler.pkl", "metrics_summary.json", "research_memo.md", ], ) downloaded_repo = Path(repo_path) return downloaded_repo if self._has_artifacts(downloaded_repo) else None except Exception as e: logger.error(f"Failed to download model artifacts: {e}") return cached_repo if self._has_artifacts(cached_repo) else None @staticmethod def _has_artifacts(repo_path: Path) -> bool: """Check whether a cache directory contains at least one expected artifact.""" expected_files = [ "xgb_precomputed.json", "xgb_precomputed.pkl", "xgb_precomputed_imp.pkl", "xgb_precomputed_scaler.pkl", "rf_protein.pkl", "rf_protein_imp.pkl", "rf_protein_scaler.pkl", "rf_protein.json", "lr_protein.pkl", "lr_protein_imp.pkl", "lr_protein_scaler.pkl", "metrics_summary.json", "research_memo.md", ] return any((repo_path / filename).exists() for filename in expected_files) def _load_xgboost_models(self, repo_path: Path) -> bool: """Load XGBoost model and preprocessing artifacts.""" joblib = _lazy_import_joblib() xgb = _lazy_import_xgboost() if joblib is None or xgb is False: return False try: # Load model config model_config_path = repo_path / "xgb_precomputed.json" if model_config_path.exists(): with open(model_config_path) as f: model_config = json.load(f) else: return False # Load model (try pickle first, then try to construct from config) model_path = repo_path / "xgb_precomputed.pkl" if model_path.exists(): self._xgb_model = joblib.load(str(model_path)) else: # Try loading as JSON model config self._xgb_model = xgb.XGBClassifier() # Note: In practice, you'd need the actual model binary # This is a fallback logger.warning("XGBoost model binary not found, using config only") # Load imputer imputer_path = repo_path / "xgb_precomputed_imp.pkl" if imputer_path.exists(): self._xgb_imputer = joblib.load(str(imputer_path)) # Load scaler scaler_path = repo_path / "xgb_precomputed_scaler.pkl" if scaler_path.exists(): self._xgb_scaler = joblib.load(str(scaler_path)) return True except Exception as e: logger.error(f"Failed to load XGBoost models: {e}") self._xgb_model = None self._xgb_imputer = None self._xgb_scaler = None return False def _load_rf_models(self, repo_path: Path) -> bool: """Load Random Forest fallback models.""" joblib = _lazy_import_joblib() sklearn = _lazy_import_sklearn() if joblib is None or sklearn is False: return False try: model_path = repo_path / "rf_protein.pkl" if not model_path.exists(): return False self._rf_model = joblib.load(str(model_path)) imputer_path = repo_path / "rf_protein_imp.pkl" if imputer_path.exists(): self._rf_imputer = joblib.load(str(imputer_path)) scaler_path = repo_path / "rf_protein_scaler.pkl" if scaler_path.exists(): self._rf_scaler = joblib.load(str(scaler_path)) self._validate_rf_feature_schema(repo_path) return True except Exception as e: logger.error(f"Failed to load Random Forest models: {e}") self._rf_model = None self._rf_imputer = None self._rf_scaler = None return False def _validate_rf_feature_schema(self, repo_path: Path) -> None: """Ensure the RF artifact schema matches the current protein feature order.""" expected_features = PROTEIN_FEATURES metadata_path = repo_path / "rf_protein.json" metadata_schema = self._read_feature_schema_metadata(metadata_path) if metadata_schema is not None: if metadata_schema != expected_features: raise ValueError( "rf_protein.json feature_order does not match PROTEIN_FEATURES. " f"Expected {expected_features}, got {metadata_schema}." ) return inferred_schema = self._infer_feature_schema_from_artifacts( { "model": self._rf_model, "imputer": self._rf_imputer, "scaler": self._rf_scaler, } ) if inferred_schema is None: raise ValueError( "Unable to validate rf_protein schema: missing rf_protein.json and " "no feature_names_in_ metadata found on loaded artifacts." ) if inferred_schema != expected_features: raise ValueError( "Loaded rf_protein artifacts do not match PROTEIN_FEATURES. " f"Expected {expected_features}, got {inferred_schema}." ) @staticmethod def _read_feature_schema_metadata(metadata_path: Path) -> list[str] | None: """Read an ordered feature schema from JSON metadata if present.""" if not metadata_path.exists(): return None try: with open(metadata_path) as f: metadata = json.load(f) except Exception as exc: raise ValueError(f"Failed to read {metadata_path.name}: {exc}") from exc for key in ("feature_order", "training_features", "features"): value = metadata.get(key) if isinstance(value, list) and all(isinstance(item, str) for item in value): return value raise ValueError( f"{metadata_path.name} does not contain a usable ordered feature list." ) @staticmethod def _infer_feature_schema_from_artifacts(artifacts: dict[str, Any]) -> list[str] | None: """Infer an ordered feature schema from fitted sklearn-style artifacts.""" named_schemas = {} for artifact_name in ("model", "imputer", "scaler"): artifact = artifacts.get(artifact_name) if artifact is None: continue feature_names = getattr(artifact, "feature_names_in_", None) if feature_names is not None: inferred = list(feature_names) if inferred: named_schemas[artifact_name] = inferred if named_schemas: schema_values = list(named_schemas.values()) first_schema = schema_values[0] for _artifact_name, schema in named_schemas.items(): if schema != first_schema: raise ValueError( "Loaded rf_protein artifacts expose inconsistent feature orders: " f"{named_schemas}." ) return first_schema for artifact_name in ("model", "imputer", "scaler"): artifact = artifacts.get(artifact_name) if artifact is None: continue n_features = getattr(artifact, "n_features_in_", None) if n_features is not None: expected_length = len(PROTEIN_FEATURES) if int(n_features) != expected_length: raise ValueError( f"{artifact_name} reports n_features_in_={n_features}, " f"but PROTEIN_FEATURES has {expected_length} entries." ) return None @staticmethod def write_feature_schema_metadata(output_dir: Path, model_name: str, feature_order: list[str]) -> Path: """Persist ordered feature metadata alongside a serialized model artifact.""" metadata_path = output_dir / f"{model_name}.json" metadata = { "model_name": model_name, "feature_order": feature_order, } with open(metadata_path, "w") as f: json.dump(metadata, f, indent=2) return metadata_path def _load_lr_models(self, repo_path: Path) -> bool: """Load Logistic Regression fallback models.""" joblib = _lazy_import_joblib() sklearn = _lazy_import_sklearn() if joblib is None or sklearn is False: return False try: model_path = repo_path / "lr_protein.pkl" if not model_path.exists(): return False self._lr_model = joblib.load(str(model_path)) imputer_path = repo_path / "lr_protein_imp.pkl" if imputer_path.exists(): self._lr_imputer = joblib.load(str(imputer_path)) scaler_path = repo_path / "lr_protein_scaler.pkl" if scaler_path.exists(): self._lr_scaler = joblib.load(str(scaler_path)) return True except Exception as e: logger.error(f"Failed to load Logistic Regression models: {e}") self._lr_model = None self._lr_imputer = None self._lr_scaler = None return False def _load_metrics_summary(self, repo_path: Path): """Load metrics summary for explainability.""" metrics_path = repo_path / "metrics_summary.json" if metrics_path.exists(): try: with open(metrics_path) as f: self._metrics_summary = json.load(f) logger.info(f"Loaded metrics summary with keys: {list(self._metrics_summary.keys())}") except Exception as e: logger.error(f"Failed to load metrics summary: {e}") def _load_version_info(self, repo_path: Path): """Load version information for reproducibility.""" for key, filename in self.artifact_versions.items(): file_path = repo_path / f"{key}.json" if file_path.exists(): try: with open(file_path) as f: metadata = json.load(f) self.artifact_versions[key] = metadata.get("version", "unknown") except Exception: pass def predict_pathogenicity( self, features: Dict[str, Any], feature_type: str = "auto", ) -> Dict[str, Any]: """ Predict variant pathogenicity using the appropriate model tier. Args: features: Dictionary of variant features. Expected keys depend on feature_type: - For 'precomputed' (XGBoost): All PRE_COMPUTED_FEATURES keys - For 'protein' (Random Forest): PROTEIN_FEATURES keys - For 'auto': Automatically detect best available feature set feature_type: One of 'auto', 'precomputed', 'protein', or 'minimal' Returns: Dictionary with prediction results: { "score": float, # Pathogenicity probability [0, 1] "model_used": str, # e.g., "xgb_precomputed", "rf_protein", "lr_protein" "confidence_label": str, # "high", "medium", "low" "evidence_features_used": List[str], "missing_features": List[str], "fallback_reason": Optional[str], "tier": int, # 1, 2, or 3 (or None if score unavailable) "metadata": Dict # Model confidence intervals, etc. } """ if not self._ensure_models_loaded(): return { "score": None, "model_used": None, "confidence_label": "low", "evidence_features_used": [], "missing_features": list(features.keys()), "fallback_reason": "models_not_available", "tier": None, "metadata": {"warning": "Model artifacts not loaded. Scoring unavailable."}, } # Auto-detect feature type if needed if feature_type == "auto": feature_type = self._detect_feature_type(features) elif feature_type in {"precomputed", "protein"}: validation = self.validate_features(features, feature_type) if not validation["valid"]: feature_type = self._detect_feature_type(features) # Route to appropriate scoring method if feature_type == "precomputed" and self._xgb_model is not None: result = self._score_xgboost(features) elif feature_type == "protein" and self._rf_model is not None: result = self._score_random_forest(features) else: # Fallback to LR or whatever is available result = self._score_logistic_regression(features) # Add tier information result["tier"] = self._assign_tier(result["score"]) return result def _detect_feature_type(self, features: Dict[str, Any]) -> str: """Auto-detect which feature set is available.""" if self.validate_features(features, "precomputed")["valid"]: return "precomputed" if self.validate_features(features, "protein")["valid"]: return "protein" return "minimal" def _score_xgboost(self, features: Dict[str, Any]) -> Dict[str, Any]: """Score using XGBoost with precomputed missense features.""" # Extract and validate features feature_vector, missing = self._extract_features( features, PRE_COMPUTED_FEATURES, "precomputed" ) if len(feature_vector) == 0: # No features available; fallback return { "score": 0.5, "model_used": "xgb_precomputed", "confidence_label": "low", "evidence_features_used": [], "missing_features": PRE_COMPUTED_FEATURES, "fallback_reason": "no_features_available", "metadata": {}, } # Impute missing values if self._xgb_imputer is not None: try: feature_vector = self._xgb_imputer.transform([feature_vector])[0] except Exception as e: logger.warning(f"XGBoost imputation failed: {e}") # Scale if scaler available if self._xgb_scaler is not None: try: feature_vector = self._xgb_scaler.transform([feature_vector])[0] except Exception as e: logger.warning(f"XGBoost scaling failed: {e}") # Predict if self._xgb_model is not None: try: # Handle both sklearn-like and native xgboost models if hasattr(self._xgb_model, "predict_proba"): proba = self._xgb_model.predict_proba([feature_vector])[0] score = float(proba[1]) if len(proba) > 1 else float(proba[0]) else: # Native xgboost import xgboost as xgb dmatrix = xgb.DMatrix([feature_vector]) pred = self._xgb_model.predict(dmatrix) score = float(pred[0]) score = max(0.0, min(1.0, score)) # Clamp to [0, 1] return { "score": score, "model_used": "xgb_precomputed", "confidence_label": "high", "evidence_features_used": [ f for f in PRE_COMPUTED_FEATURES if f not in missing ], "missing_features": missing, "fallback_reason": None, "metadata": self._get_metadata("xgb_precomputed"), } except Exception as e: logger.error(f"XGBoost prediction failed: {e}") # Fallback return self._score_random_forest(features) def _score_random_forest(self, features: Dict[str, Any]) -> Dict[str, Any]: """Score using Random Forest fallback.""" feature_vector, missing = self._extract_features( features, PROTEIN_FEATURES, "protein" ) # Impute and scale if self._rf_imputer is not None: try: feature_vector = self._rf_imputer.transform([feature_vector])[0] except Exception: pass if self._rf_scaler is not None: try: feature_vector = self._rf_scaler.transform([feature_vector])[0] except Exception: pass # Predict if self._rf_model is not None: try: if hasattr(self._rf_model, "predict_proba"): proba = self._rf_model.predict_proba([feature_vector])[0] score = float(proba[1]) if len(proba) > 1 else float(proba[0]) else: score = float(self._rf_model.predict([feature_vector])[0]) score = max(0.0, min(1.0, score)) return { "score": score, "model_used": "rf_protein", "confidence_label": "medium", "evidence_features_used": [ f for f in PROTEIN_FEATURES if f not in missing ], "missing_features": missing, "fallback_reason": "precomputed_features_missing" if missing else None, "metadata": self._get_metadata("rf_protein"), } except Exception as e: logger.error(f"Random Forest prediction failed: {e}") # Fallback to LR return self._score_logistic_regression(features) def _score_logistic_regression(self, features: Dict[str, Any]) -> Dict[str, Any]: """Score using Logistic Regression ultra-light fallback.""" # Use whatever features are available available_features = list(features.keys()) feature_vector = [float(features.get(f, 0)) for f in available_features] # Ensure we have something to work with if len(feature_vector) == 0: feature_vector = [0.0] available_features = ["intercept"] # Predict if self._lr_model is not None: try: # Impute and scale if self._lr_imputer is not None: feature_vector = self._lr_imputer.transform([feature_vector])[0] if self._lr_scaler is not None: feature_vector = self._lr_scaler.transform([feature_vector])[0] if hasattr(self._lr_model, "predict_proba"): proba = self._lr_model.predict_proba([feature_vector])[0] score = float(proba[1]) if len(proba) > 1 else float(proba[0]) else: raw = float(self._lr_model.predict([feature_vector])[0]) score = 1.0 / (1.0 + np.exp(-raw)) # Sigmoid score = max(0.0, min(1.0, score)) return { "score": score, "model_used": "lr_protein", "confidence_label": "low", "evidence_features_used": available_features, "missing_features": [], "fallback_reason": "minimal_features_available", "metadata": self._get_metadata("lr_protein"), } except Exception as e: logger.error(f"Logistic Regression prediction failed: {e}") # Ultimate fallback: return neutral score return { "score": 0.5, "model_used": None, "confidence_label": "low", "evidence_features_used": available_features, "missing_features": [], "fallback_reason": "all_models_failed", "metadata": {"warning": "All models failed; returning default"}, } def _extract_features( self, features: Dict[str, Any], expected_features: List[str], feature_set_name: str, ) -> Tuple[List[float], List[str]]: """Extract feature values in expected order, tracking missing values.""" values = [] missing = [] for feat in expected_features: if feat in features and features[feat] is not None: try: val = float(features[feat]) values.append(val) except (TypeError, ValueError): values.append(0.0) missing.append(feat) if self.strict_schema: logger.warning( f"Feature '{feat}' in '{feature_set_name}' " f"has non-numeric value: {features[feat]}" ) else: values.append(0.0) missing.append(feat) return values, missing def _assign_tier(self, score: Optional[float]) -> Optional[int]: """Assign tier based on pathogenicity score.""" if score is None: return None # Sort by threshold value descending to check highest thresholds first for tier, threshold in sorted(TIER_THRESHOLDS.items(), key=lambda x: x[1], reverse=True): if score >= threshold: return tier return 3 # Below 0.5 is still Tier 3 def _get_metadata(self, model_name: str) -> Dict[str, Any]: """Get model metadata for explainability.""" metadata = { "model_name": model_name, "artifact_versions": self.artifact_versions, "feature_sets": { "precomputed": PRE_COMPUTED_FEATURES, "protein": PROTEIN_FEATURES, }, } if self._metrics_summary: # Add model performance metrics if available for key in ["xgb_precomputed", "rf_protein", "lr_protein"]: if key in self._metrics_summary: metadata[f"{key}_metrics"] = self._metrics_summary[key] return metadata def batch_predict( self, variants: List[Dict[str, Any]], feature_type: str = "auto", ) -> List[Dict[str, Any]]: """ Score multiple variants in batch. Args: variants: List of variant feature dictionaries feature_type: Feature type for all variants Returns: List of prediction results (one per variant) """ if not self._ensure_models_loaded(): return [{ "score": None, "model_used": None, "confidence_label": "low", "evidence_features_used": [], "missing_features": [], "fallback_reason": "models_not_available", "tier": None, "metadata": {"warning": "Models not available"}, } for _ in variants] results = [] for variant in variants: try: result = self.predict_pathogenicity(variant, feature_type=feature_type) except Exception as e: logger.error(f"Batch prediction failed for variant: {e}") result = { "score": None, "model_used": None, "confidence_label": "low", "evidence_features_used": [], "missing_features": [], "fallback_reason": f"prediction_error: {e}", "tier": None, "metadata": {}, } results.append(result) return results def validate_features( self, features: Dict[str, Any], feature_type: str = "precomputed", ) -> Dict[str, Any]: """ Validate feature dictionary against expected schema. Args: features: Feature dictionary to validate feature_type: Expected feature type Returns: Dictionary with validation results """ if feature_type == "precomputed": expected = PRE_COMPUTED_FEATURES elif feature_type == "protein": expected = PROTEIN_FEATURES else: expected = [] present = [] missing = [] invalid = [] for feat in expected: if feat not in features: missing.append(feat) elif features[feat] is None: missing.append(feat) else: try: value = float(features[feat]) if math.isfinite(value): present.append(feat) else: invalid.append(feat) except (TypeError, ValueError): invalid.append(feat) completeness = len(present) / len(expected) if expected else 0 return { "valid": len(invalid) == 0 and completeness >= 0.8, "completeness": round(completeness, 4), "present": present, "missing": missing, "invalid": invalid, "expected_features": expected, }