"""Tests for the HCP behavioral adapter, GMM feature matrices and full_evaluate.

The tests pin three properties visible in the assertions below:

* feature dictionaries and matrices use ``FEATURE_KEYS`` in its declared order;
* identifier / score / label style fields placed on a record do not show up in
  the numeric feature matrix;
* ``strategy`` caches fitted models under a key that changes with ``min_k`` and
  ``full_evaluate`` reports a ``"scaled_pca"`` projection path.
"""

from __future__ import annotations

import math

import pytest

from backend.app.api.routes import strategy
from backend.app.core.config import settings
from backend.app.services.campaign_vectorizer import FEATURE_KEYS
from backend.app.services.gmm_engine import _matrix, run_gmm_clustering
from backend.app.services.vectorizer import _feature_matrix


def _raw_hcp_record(**overrides: object) -> dict[str, object]:
    """Return a fully populated raw HCP record, with ``overrides`` applied last.

    The baseline deliberately carries ``cluster_id``, ``fit_score`` and
    ``prediction_label`` entries so tests can check that such fields are not
    picked up as features.
    """
    record = {
        "Name": "Dr Test",
        "Adoption_Profile": "Early_Adopter",
        "Channel_Preference": "High_Digital",
        "Digital_Presence_Indicator": "1",
        "KOL_Status": "1",
        "Institutional_Tier": "1",
        "Academic_Affiliation": "1",
        "Primary_Specialty": "Medical Oncology",
        "Seniority_Level": "Executive / Director",
        "Seniority_Level_Encoded": "3",
        "Company": "University Cancer Center",
        "Job": "Chief Medical Officer",
        "Location": "Dubai, United Arab Emirates",
        "Skepticism_Level": "High",
        "Cognitive_Processing_Style": "Empirical",
        "Patient_Volume_Proxy": "High",
        "Total_Years_Experience": "24",
        "Expected_Age": "52",
        "Current_Role_Tenure": "8",
        "Highest_Academic_Degree": "Doctorate / Board / MD",
        "Workplace_Category": "Hospital/Clinic",
        # Leakage-style fields: present on purpose, must never become features.
        "cluster_id": 99,
        "fit_score": 999,
        "prediction_label": "leak",
    }
    record.update(overrides)
    return record


def _raw_hcp_cohort(size: int = 32) -> list[dict[str, object]]:
    """Return ``size`` raw HCP records with distinct names and cycling seniority.

    ``Seniority_Level_Encoded`` cycles through ``"0"``..``"3"`` so the cohort is
    not made of identical rows.
    """
    return [
        _raw_hcp_record(Name=f"Dr {idx}", Seniority_Level_Encoded=str(idx % 4))
        for idx in range(size)
    ]


def _behavioral_record(index: int) -> dict[str, float]:
    """Return a deterministic behavioral record keyed by ``FEATURE_KEYS``.

    Each value is ``((index + offset) % 11) / 10`` clamped to ``[0.02, 0.98]``,
    where ``offset`` is the feature's position in ``FEATURE_KEYS``.
    """
    return {
        key: min(0.98, max(0.02, ((index + offset) % 11) / 10.0))
        for offset, key in enumerate(FEATURE_KEYS)
    }


def test_hcp_behavioral_adapter_outputs_canonical_12d_space() -> None:
    """A full raw record maps to FEATURE_KEYS-ordered floats in [0, 1]."""
    from backend.app.services.hcp_behavioral_adapter import map_hcp_to_behavioral_features

    features = map_hcp_to_behavioral_features(_raw_hcp_record())

    assert list(features) == FEATURE_KEYS
    assert all(isinstance(value, float) for value in features.values())
    assert all(0.0 <= value <= 1.0 for value in features.values())
    # The baseline record is digital-first, a KOL and senior, so these three
    # features are expected to land well above the midpoint.
    assert features["channel_preference"] > 0.70
    assert features["kol_alignment"] > 0.70
    assert features["target_seniority"] > 0.70


def test_hcp_behavioral_adapter_handles_missing_unknown_and_invalid_values() -> None:
    """Empty input yields 0.5 everywhere; noisy input still yields valid floats."""
    from backend.app.services.hcp_behavioral_adapter import map_hcp_to_behavioral_features

    missing = map_hcp_to_behavioral_features({})

    assert list(missing) == FEATURE_KEYS
    assert all(value == pytest.approx(0.5) for value in missing.values())

    # Mixed key casing/separators plus unparsable numbers and unknown labels.
    noisy = map_hcp_to_behavioral_features(
        {
            "adoption_profile": "EARLY adopter",
            "channel-preference": "hybrid",
            "digital_presence_indicator": "TRUE",
            "kol_status": "not-a-number",
            "institutional_tier": "0",
            "academic_affiliation": "yes",
            "seniority_level_encoded": "2.5",
            "total_years_experience": "1,200",
            "current_role_tenure": "bad-value",
            "patient_volume_proxy": "unknown-volume",
            "skepticism_level": "MEDIUM",
            "cognitive_processing_style": "Analytical",
        }
    )

    assert list(noisy) == FEATURE_KEYS
    assert all(isinstance(value, float) for value in noisy.values())
    assert all(0.0 <= value <= 1.0 for value in noisy.values())
    assert noisy["channel_preference"] > 0.55


def test_hcp_behavioral_adapter_does_not_use_academic_metric_shortcut() -> None:
    """Bibliometric fields alone leave every feature at the 0.5 default."""
    from backend.app.services.hcp_behavioral_adapter import map_hcp_to_behavioral_features

    academic_only = map_hcp_to_behavioral_features(
        {
            "h_index": 99,
            "works_count": 999,
            "cited_by_count": 99999,
            "i10_index": 300,
        }
    )

    assert academic_only == pytest.approx({key: 0.5 for key in FEATURE_KEYS})


def test_hcp_behavioral_batch_preserves_metadata_separately() -> None:
    """Batch mapping keeps features top-level and raw fields under ``metadata``."""
    from backend.app.services.hcp_behavioral_adapter import map_hcp_records_to_behavioral

    mapped = map_hcp_records_to_behavioral([_raw_hcp_record()])

    assert len(mapped) == 1
    assert [key for key in mapped[0] if key in FEATURE_KEYS] == FEATURE_KEYS
    assert mapped[0]["metadata"]["Name"] == "Dr Test"
    # ``fit_score`` must not appear as a top-level numeric entry.
    assert "fit_score" not in [key for key in mapped[0] if isinstance(mapped[0][key], (int, float))]


def test_gmm_matrix_uses_feature_whitelist_and_excludes_leakage_fields() -> None:
    """``_matrix`` ignores extra numeric fields and keeps only FEATURE_KEYS columns."""
    record = _behavioral_record(1)
    record.update(
        {
            "cluster_id": 42,
            "id": 123,
            "index": 7,
            "score": 0.99,
            "fit_score": 0.88,
            "prediction": 1,
            "label": 2,
            "target": 3,
            "distance": 4,
            "ranking": 5,
            "centroid_x": 6,
            "pca_0": 7,
        }
    )

    matrix, feature_names = _matrix([record])

    assert feature_names == FEATURE_KEYS
    assert matrix.shape == (1, len(FEATURE_KEYS))


def test_gmm_matrix_uses_canonical_order_not_alphabetical_order() -> None:
    """``_matrix`` columns follow FEATURE_KEYS order rather than sorted order."""
    # Guard: the check below is only meaningful if the two orders differ.
    assert FEATURE_KEYS != sorted(FEATURE_KEYS)
    record = {key: idx / 100.0 for idx, key in enumerate(FEATURE_KEYS)}

    matrix, feature_names = _matrix([record])

    assert feature_names == FEATURE_KEYS
    assert matrix[0].tolist() == pytest.approx([idx / 100.0 for idx in range(len(FEATURE_KEYS))])


def test_gmm_matrix_maps_raw_hcp_records_without_numeric_leakage() -> None:
    """None of the numeric leakage values placed on a raw record reach the matrix."""
    leakage = {
        "id": 999,
        "index": 888,
        "label": 777,
        "score": 666,
        "prediction": 555,
        "distance": 444,
        "ranking": 333,
        "centroid": 222,
        "pca_component": 111,
        "campaign_vector_pca": 123,
    }
    record = _raw_hcp_record(**leakage)

    matrix, feature_names = _matrix([record])

    assert feature_names == FEATURE_KEYS
    assert matrix.shape == (1, len(FEATURE_KEYS))
    row = matrix[0].tolist()
    # Check every injected value, not just a sample, so a leak through any one
    # of the fields is caught. 999 also covers the baseline ``fit_score``.
    leaked = {name: value for name, value in leakage.items() if value in row}
    assert not leaked, f"leakage fields reached the feature matrix: {leaked}"


def test_vectorizer_feature_matrix_uses_canonical_order_and_excludes_leakage() -> None:
    """``_feature_matrix`` returns FEATURE_KEYS-ordered values and drops extras."""
    record = {key: idx / 20.0 for idx, key in enumerate(FEATURE_KEYS)}
    record.update(
        {
            "id": 1000,
            "cluster_id": 9,
            "fit_score": 0.99,
            "campaign_vector_pca": 0.88,
        }
    )

    matrix, feature_names = _feature_matrix([record])

    assert feature_names == FEATURE_KEYS
    assert matrix.shape == (1, len(FEATURE_KEYS))
    assert matrix[0].tolist() == pytest.approx([idx / 20.0 for idx in range(len(FEATURE_KEYS))])


def test_gmm_returns_scaler_artifacts_for_canonical_features() -> None:
    """Clustering returns one finite center and one positive scale per feature."""
    records = [_behavioral_record(i) for i in range(24)]

    result = run_gmm_clustering(records, min_k=2)

    assert result["feature_names"] == FEATURE_KEYS
    assert len(result["scaler_center"]) == len(FEATURE_KEYS)
    assert len(result["scaler_scale"]) == len(FEATURE_KEYS)
    assert all(math.isfinite(float(value)) for value in result["scaler_center"])
    assert all(float(value) > 0 for value in result["scaler_scale"])


def test_full_evaluate_gmm_cache_key_includes_min_k_and_adapter_version() -> None:
    """Same records and source with a different ``min_k`` produce two cache misses."""
    records = _raw_hcp_cohort()
    metadata = {"source_signature": "same-source-for-cache-key-test"}
    strategy.invalidate_strategy_model_cache()

    first = strategy._get_or_fit_gmm(records, "test_bronze", metadata, min_k=2)
    second = strategy._get_or_fit_gmm(records, "test_bronze", metadata, min_k=3)

    assert first["cache_status"] == "miss"
    assert second["cache_status"] == "miss"
    assert first["cache_key"] != second["cache_key"]
    assert first["adapter_version"]
    assert first["feature_schema_signature"]


def test_full_evaluate_uses_scaled_pca_projection(monkeypatch: pytest.MonkeyPatch) -> None:
    """``full_evaluate`` reports the ``scaled_pca`` path with no fallback."""
    monkeypatch.setattr(settings, "ollama_api_key", "")
    # NOTE(review): ``raising=False`` is kept from the original; it means a
    # renamed/missing loader would be created rather than fail the patch.
    monkeypatch.setattr(
        strategy,
        "_load_hcp_training_records",
        lambda: (
            _raw_hcp_cohort(),
            "test_bronze",
            {"source_signature": "test-source"},
        ),
        raising=False,
    )
    # Reset unconditionally, as the cache-key test does: a silently skipped
    # reset could let this test read a model cached by an earlier test.
    strategy.invalidate_strategy_model_cache()

    result = strategy.full_evaluate(
        strategy.FullEvaluateRequest(
            campaign_text="Phase 3 oncology launch with digital KOL education, access proof, and patient urgency.",
        )
    )

    assert len(result["campaign_vector_12d"]) == len(FEATURE_KEYS)
    assert result["gmm"]["feature_names"] == FEATURE_KEYS
    assert result["gmm"]["scaler_center"]
    assert result["projection"]["path"] == "scaled_pca"
    assert result["projection"]["fallback_used"] is False
    # The projected campaign vector must live in the same space as the centroids.
    assert len(result["campaign_vector_pca"]) == len(result["gmm"]["centroids"][0])


def test_strategy_cluster_labels_are_unique_for_dynamic_k_10() -> None:
    """Cluster ids 0..9 map to ten distinct names, none containing ``"Cluster "``."""
    from backend.app.services.cluster_profiles import get_cluster_name

    names = [get_cluster_name(cluster_id) for cluster_id in range(10)]

    assert len(names) == 10
    assert len(set(names)) == 10
    assert all("Cluster " not in name for name in names)