Spaces:

subisu
/

ppd-recommendation-api

Sleeping

App Files Community

surbi karki committed on Feb 5

Commit

d37e0aa

verified ·

1 Parent(s): 759a1c9

Update recommender_core.py

Browse files

Files changed (1) hide show

recommender_core.py +176 -166

recommender_core.py CHANGED Viewed

@@ -1,166 +1,176 @@
-import pandas as pd
-import joblib
-import os
-import time
-from sqlalchemy import create_engine
-from sklearn.metrics.pairwise import cosine_similarity
-from urllib.parse import quote_plus
-from text_utils import TextProcessor
-from functools import lru_cache
-# --- CONFIGURATION ---
-# For cloud deployment (HF/Production), use DATABASE_URL.
-# Fallback to local construction if not present.
-DATABASE_URL = os.getenv("DATABASE_URL")
-if not DATABASE_URL:
-    DB_USER = os.getenv("DB_USER", "postgres")
-    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
-    DB_HOST = os.getenv("DB_HOST", "localhost")
-    DB_PORT = os.getenv("DB_PORT", "5432")
-    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
-    DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
-else:
-    # Ensure URL is compatible with SQLAlchemy if it starts with postgres://
-    if DATABASE_URL.startswith("postgres://"):
-        DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
-    elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
-        DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
-    DB_URI = DATABASE_URL
-class RecommenderCore:
-    def __init__(self):
-        self.engine = create_engine(DB_URI)
-        self.vectorizer = None
-        self.tfidf_matrix = None
-        self.df = None
-        self.load_model()
-    def load_model(self):
-        try:
-            if os.path.exists('vectorizer.pkl') and os.path.exists('tfidf_matrix.pkl'):
-                self.vectorizer = joblib.load('vectorizer.pkl')
-                self.tfidf_matrix = joblib.load('tfidf_matrix.pkl')
-                print("💾 Model Loaded into Memory.")
-            self.df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
-            self.df = self.df.reset_index(drop=True)
-            print(f"📚 Indexed {len(self.df)} articles.")
-        except Exception as e:
-            print(f"Load Error: {e}")
-    @lru_cache(maxsize=128)
-    def recommend_articles(self, symptoms_text, crisis_level, top_n=5):
-        """Modular requirement: Main entry point with caching."""
-        if self.df is None or self.vectorizer is None:
-            return []
-        # 1. Preprocess user query
-        query_raw = symptoms_text
-        query_norm = TextProcessor.normalize(symptoms_text)
-        query_phased = TextProcessor.detect_phrases(query_norm)
-        # 2. Filter by Crisis Level (Safety First)
-        risk_map = {
-            "High": ["High", "Critical", "Moderate", "All"],
-            "Moderate": ["Moderate", "Low", "All"],
-            "Low": ["Low", "All"]
-        }
-        allowed = risk_map.get(crisis_level, ["All"])
-        # Determine the filtered subset
-        mask = self.df['risk_level'].apply(
-            lambda x: any(level.strip() in allowed for level in str(x).split(','))
-        )
-        filtered_df = self.df[mask].copy()
-        if filtered_df.empty: return []
-        # 3. Primary ML Scoring (Cosine Similarity)
-        user_vec = self.vectorizer.transform([query_phased])
-        all_cos_scores = cosine_similarity(user_vec, self.tfidf_matrix).flatten()
-        # 4. Final Ranking
-        # Correctly align scores using the original dataframe's index
-        filtered_df['cosine_score'] = [all_cos_scores[i] for i in filtered_df.index]
-        # Apply the hybrid ranking engine
-        ranked_results = self.apply_ranking(filtered_df, query_raw)
-        # Format for output
-        final_list = ranked_results.head(top_n).to_dict('records')
-        # 5. Live Fallback if needed
-        # Requirement: If results are too few, fetch fresh content
-        K = 3
-        if len(final_list) < K:
-            try:
-                from ingestion_service import IngestionService
-                service = IngestionService()
-                live_arts = service.fetch_from_pubmed(query_raw, limit=K)
-                for art in live_arts:
-                    if len(final_list) >= top_n: break
-                    final_list.append({
-                        "article_id": -1,
-                        "title": art['title'],
-                        "category": "Live Fallback",
-                        "format_type": "pubmed",
-                        "external_url": art['url'],
-                        "content": art['content'],
-                        "risk_level": "All"
-                    })
-                # Background ingestion (optional here, but requested in strategy)
-                if live_arts: service.store_articles(live_arts)
-            except Exception as e:
-                print(f"Fallback error: {e}")
-        for item in final_list:
-            item['access_type'] = 'External Link' if item.get('format_type') == 'pubmed' else 'Direct Text'
-            if 'created_at' in item and item['created_at']:
-                item['created_at'] = str(item['created_at'])
-        return final_list
-    def apply_ranking(self, df, raw_query):
-        """Modular requirement: Hybrid ranking engine."""
-        # Constants for weighting
-        SOURCE_WEIGHT = 1.15  # 15% boost for contributor articles
-        EXACT_MATCH_BOOST = 0.2
-        tokens = TextProcessor.normalize(raw_query).split()
-        now = pd.Timestamp.now()
-        def calculate_hybrid_score(row):
-            score = row['cosine_score']
-            # A. Source Weighting (Trusted Contributors)
-            if row['format_type'] == 'text':
-                score *= SOURCE_WEIGHT
-            # B. Exact Symptom Overlap Boost
-            # Check how many user tokens appear exactly in the normalized title
-            norm_title = TextProcessor.normalize(row['title'])
-            matches = sum(1 for t in tokens if t in norm_title)
-            score += (matches * EXACT_MATCH_BOOST)
-            # C. Recency Boost (PubMed only, newer is better)
-            if row['format_type'] == 'pubmed' and row['created_at']:
-                age_days = (now - pd.to_datetime(row['created_at'])).days
-                # Decaying boost: max 0.1 for brand new, goes to 0 over 365 days
-                recency_boost = max(0, 0.1 * (1 - (min(age_days, 365) / 365)))
-                score += recency_boost
-            return score
-        df['final_score'] = df.apply(calculate_hybrid_score, axis=1)
-        return df.sort_values(by='final_score', ascending=False)
-    def get_article_by_id(self, article_id):
-        """Fetches a single article by its ID."""
-        if self.df is None: return None
-        article = self.df[self.df['article_id'] == article_id]
-        return article.iloc[0].to_dict() if not article.empty else None
-# Singleton instance to be used by main.py
-recommender = RecommenderCore()

+import pandas as pd
+import joblib
+import os
+import time
+from sqlalchemy import create_engine
+from sklearn.metrics.pairwise import cosine_similarity
+from urllib.parse import quote_plus
+from text_utils import TextProcessor
+from functools import lru_cache
+# --- CONFIGURATION ---
+# For cloud deployment (HF/Production), use DATABASE_URL.
+# Fallback to local construction if not present.
+DATABASE_URL = os.getenv("DATABASE_URL")
+if not DATABASE_URL:
+    DB_USER = os.getenv("DB_USER", "postgres")
+    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
+    DB_HOST = os.getenv("DB_HOST", "localhost")
+    DB_PORT = os.getenv("DB_PORT", "5432")
+    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
+    DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
+else:
+    # Ensure URL is compatible with SQLAlchemy if it starts with postgres://
+    if DATABASE_URL.startswith("postgres://"):
+        DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
+    elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
+        DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
+    DB_URI = DATABASE_URL
+class RecommenderCore:
+    def __init__(self):
+        self.engine = create_engine(DB_URI)
+        self.vectorizer = None
+        self.tfidf_matrix = None
+        self.df = None
+        self.load_model()
+    def load_model(self):
+        try:
+            if os.path.exists('vectorizer.pkl') and os.path.exists('tfidf_matrix.pkl'):
+                self.vectorizer = joblib.load('vectorizer.pkl')
+                self.tfidf_matrix = joblib.load('tfidf_matrix.pkl')
+                print("💾 Model Loaded into Memory.")
+            self.df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
+            self.df = self.df.reset_index(drop=True)
+            print(f"📚 Indexed {len(self.df)} articles.")
+        except Exception as e:
+            print(f"Load Error: {e}")
+    @lru_cache(maxsize=128)
+    def recommend_articles(self, symptoms_text, crisis_level, top_n=5):
+        """Modular requirement: Main entry point with caching."""
+        if self.df is None or self.vectorizer is None:
+            return []
+        # 1. Preprocess user query
+        query_raw = symptoms_text
+        query_norm = TextProcessor.normalize(symptoms_text)
+        query_phased = TextProcessor.detect_phrases(query_norm)
+        # 2. Filter by Crisis Level (Safety First)
+        risk_map = {
+            "High": ["High", "Critical", "Moderate", "All"],
+            "Moderate": ["Moderate", "Low", "All"],
+            "Low": ["Low", "All"]
+        }
+        allowed = risk_map.get(crisis_level, ["All"])
+        # Determine the filtered subset
+        mask = self.df['risk_level'].apply(
+            lambda x: any(level.strip() in allowed for level in str(x).split(','))
+        )
+        filtered_df = self.df[mask].copy()
+        if filtered_df.empty: return []
+        # 3. Primary ML Scoring (Cosine Similarity)
+        user_vec = self.vectorizer.transform([query_phased])
+        all_cos_scores = cosine_similarity(user_vec, self.tfidf_matrix).flatten()
+        # 4. Final Ranking
+        # Correctly align scores using the original dataframe's index
+        # SAFETY: Ensure we don't exceed the bounds of the scores array (mismatch protection)
+        max_idx = len(all_cos_scores)
+        cos_scores_for_filtered = []
+        for i in filtered_df.index:
+            if i < max_idx:
+                cos_scores_for_filtered.append(all_cos_scores[i])
+            else:
+                cos_scores_for_filtered.append(0.0)
+        filtered_df['cosine_score'] = cos_scores_for_filtered
+        # Apply the hybrid ranking engine
+        ranked_results = self.apply_ranking(filtered_df, query_raw)
+        # Format for output
+        final_list = ranked_results.head(top_n).to_dict('records')
+        # 5. Live Fallback if needed
+        # Requirement: If results are too few, fetch fresh content
+        K = 3
+        if len(final_list) < K:
+            try:
+                from ingestion_service import IngestionService
+                service = IngestionService()
+                live_arts = service.fetch_from_pubmed(query_raw, limit=K)
+                for art in live_arts:
+                    if len(final_list) >= top_n: break
+                    final_list.append({
+                        "article_id": -1,
+                        "title": art['title'],
+                        "category": "Live Fallback",
+                        "format_type": "pubmed",
+                        "external_url": art['url'],
+                        "content": art['content'],
+                        "risk_level": "All"
+                    })
+                # Background ingestion (optional here, but requested in strategy)
+                if live_arts: service.store_articles(live_arts)
+            except Exception as e:
+                print(f"Fallback error: {e}")
+        for item in final_list:
+            item['access_type'] = 'External Link' if item.get('format_type') == 'pubmed' else 'Direct Text'
+            if 'created_at' in item and item['created_at']:
+                item['created_at'] = str(item['created_at'])
+        return final_list
+    def apply_ranking(self, df, raw_query):
+        """Modular requirement: Hybrid ranking engine."""
+        # Constants for weighting
+        SOURCE_WEIGHT = 1.15  # 15% boost for contributor articles
+        EXACT_MATCH_BOOST = 0.2
+        tokens = TextProcessor.normalize(raw_query).split()
+        now = pd.Timestamp.now()
+        def calculate_hybrid_score(row):
+            score = row['cosine_score']
+            # A. Source Weighting (Trusted Contributors)
+            if row['format_type'] == 'text':
+                score *= SOURCE_WEIGHT
+            # B. Exact Symptom Overlap Boost
+            # Check how many user tokens appear exactly in the normalized title
+            norm_title = TextProcessor.normalize(row['title'])
+            matches = sum(1 for t in tokens if t in norm_title)
+            score += (matches * EXACT_MATCH_BOOST)
+            # C. Recency Boost (PubMed only, newer is better)
+            if row['format_type'] == 'pubmed' and row['created_at']:
+                age_days = (now - pd.to_datetime(row['created_at'])).days
+                # Decaying boost: max 0.1 for brand new, goes to 0 over 365 days
+                recency_boost = max(0, 0.1 * (1 - (min(age_days, 365) / 365)))
+                score += recency_boost
+            return score
+        df['final_score'] = df.apply(calculate_hybrid_score, axis=1)
+        return df.sort_values(by='final_score', ascending=False)
+    def get_article_by_id(self, article_id):
+        """Fetches a single article by its ID."""
+        if self.df is None: return None
+        article = self.df[self.df['article_id'] == article_id]
+        return article.iloc[0].to_dict() if not article.empty else None
+# Singleton instance to be used by main.py
+recommender = RecommenderCore()