Spaces:

subisu
/

ppd-recommendation-api

Sleeping

App Files Community

surbi karki committed on Feb 5

Commit

16f0d20

verified ·

1 Parent(s): d37e0aa

Update ingestion_service.py

Browse files

Files changed (1) hide show

ingestion_service.py +142 -129

ingestion_service.py CHANGED Viewed

@@ -1,129 +1,142 @@
-import time
-import pandas as pd
-import joblib
-from Bio import Entrez
-from sqlalchemy import create_engine, text
-from urllib.parse import quote_plus
-from text_utils import TextProcessor
-from sklearn.feature_extraction.text import TfidfVectorizer
-# --- CONFIGURATION ---
-DB_USER = "postgres"
-DB_PASSWORD = quote_plus("subisu")
-DB_NAME = "ppd_project_db"
-DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost:5432/{DB_NAME}'
-Entrez.email = "surbi.211740@ncit.edu.np"
-Entrez.tool = "PPD_Recommender_App"
-class IngestionService:
-    def __init__(self):
-        self.engine = create_engine(DB_URI)
-    def fetch_from_pubmed(self, query, limit=100):
-        print(f"🔍 Searching PubMed: '{query}'...")
-        try:
-            h1 = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
-            ids = Entrez.read(h1)["IdList"]
-            if not ids: return []
-            h2 = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
-            papers = Entrez.read(h2)
-            results = []
-            for paper in papers['PubmedArticle']:
-                try:
-                    article = paper['MedlineCitation']['Article']
-                    title = article.get('ArticleTitle', '')
-                    abstract_data = article.get('Abstract', {}).get('AbstractText', [])
-                    abstract = " ".join([str(x) for x in abstract_data]) if isinstance(abstract_data, list) else str(abstract_data)
-                    if not abstract: continue
-                    results.append({
-                        "title": title,
-                        "content": abstract,
-                        "url": f"https://pubmed.ncbi.nlm.nih.gov/{paper['MedlineCitation']['PMID']}/"
-                    })
-                except: continue
-            return results
-        except Exception as e:
-            print(f"Pubmed Error: {e}")
-            return []
-    def store_articles(self, articles, category="General", risk="All"):
-        """Modular requirement: Stores articles with deduplication."""
-        added = 0
-        with self.engine.connect() as conn:
-            for art in articles:
-                # Preprocessing
-                clean_title = TextProcessor.clean_html(art['title'])
-                clean_content = TextProcessor.clean_html(art['content'])
-                query = text("""
-                    INSERT INTO articles
-                    (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
-                    VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
-                    ON CONFLICT (external_url) DO NOTHING
-                """)
-                try:
-                    res = conn.execute(query, {
-                        "t": clean_title,
-                        "cc": clean_content,
-                        "cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
-                        "cat": category,
-                        "risk": risk,
-                        "url": art['url']
-                    })
-                    conn.commit()
-                    if res.rowcount > 0: added += 1
-                except Exception as e:
-                    print(f"DB Error: {e}")
-        print(f"✅ Stored {added} new articles.")
-        return added
-    def build_tfidf_model(self, force=False):
-        """Modular requirement: Builds the TF-IDF model with weighted fields."""
-        print("🧠 Building Weighted TF-IDF Model...")
-        # Use ORDER BY for deterministic indexing
-        df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
-        df = df.reset_index(drop=True)
-        if df.empty:
-            print("⚠️ No articles to build model.")
-            return
-        # Multi-Field Weighting
-        # Title (3x) + Content (1x) + Tags/Categories (1x)
-        # We also apply normalization and phrase detection
-        def prepare_features(row):
-            title = TextProcessor.normalize(row['title'])
-            content = TextProcessor.normalize(row['content_clean'])
-            tags = TextProcessor.normalize(str(row['tags']) + " " + str(row['category']))
-            # Phrase detection on title and content
-            title = TextProcessor.detect_phrases(title)
-            content = TextProcessor.detect_phrases(content)
-            # Weighted concatenation
-            return (title + " ") * 3 + content + " " + tags
-        features = df.apply(prepare_features, axis=1)
-        vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Support bigrams natively
-        tfidf_matrix = vectorizer.fit_transform(features)
-        joblib.dump(vectorizer, 'vectorizer.pkl')
-        joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
-        print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")
-if __name__ == "__main__":
-    service = IngestionService()
-    # 24-hour broad update
-    arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
-    if arts:
-        service.store_articles(arts)
-        service.build_tfidf_model()

+import time
+import pandas as pd
+import joblib
+from Bio import Entrez
+from sqlalchemy import create_engine, text
+from urllib.parse import quote_plus
+from text_utils import TextProcessor
+from sklearn.feature_extraction.text import TfidfVectorizer
+# --- CONFIGURATION ---
+import os
+DATABASE_URL = os.getenv("DATABASE_URL")
+if not DATABASE_URL:
+    DB_USER = os.getenv("DB_USER", "postgres")
+    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
+    DB_HOST = os.getenv("DB_HOST", "localhost")
+    DB_PORT = os.getenv("DB_PORT", "5432")
+    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
+    DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
+else:
+    # Ensure URL is compatible with SQLAlchemy if it starts with postgres://
+    if DATABASE_URL.startswith("postgres://"):
+        DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
+    elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
+        DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
+    DB_URI = DATABASE_URL
+Entrez.email = "surbi.211740@ncit.edu.np"
+Entrez.tool = "PPD_Recommender_App"
+class IngestionService:
+    def __init__(self):
+        self.engine = create_engine(DB_URI)
+    def fetch_from_pubmed(self, query, limit=100):
+        print(f"🔍 Searching PubMed: '{query}'...")
+        try:
+            h1 = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
+            ids = Entrez.read(h1)["IdList"]
+            if not ids: return []
+            h2 = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
+            papers = Entrez.read(h2)
+            results = []
+            for paper in papers['PubmedArticle']:
+                try:
+                    article = paper['MedlineCitation']['Article']
+                    title = article.get('ArticleTitle', '')
+                    abstract_data = article.get('Abstract', {}).get('AbstractText', [])
+                    abstract = " ".join([str(x) for x in abstract_data]) if isinstance(abstract_data, list) else str(abstract_data)
+                    if not abstract: continue
+                    results.append({
+                        "title": title,
+                        "content": abstract,
+                        "url": f"https://pubmed.ncbi.nlm.nih.gov/{paper['MedlineCitation']['PMID']}/"
+                    })
+                except: continue
+            return results
+        except Exception as e:
+            print(f"Pubmed Error: {e}")
+            return []
+    def store_articles(self, articles, category="General", risk="All"):
+        """Modular requirement: Stores articles with deduplication."""
+        added = 0
+        with self.engine.connect() as conn:
+            for art in articles:
+                # Preprocessing
+                clean_title = TextProcessor.clean_html(art['title'])
+                clean_content = TextProcessor.clean_html(art['content'])
+                query = text("""
+                    INSERT INTO articles
+                    (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
+                    VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
+                    ON CONFLICT (external_url) DO NOTHING
+                """)
+                try:
+                    res = conn.execute(query, {
+                        "t": clean_title,
+                        "cc": clean_content,
+                        "cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
+                        "cat": category,
+                        "risk": risk,
+                        "url": art['url']
+                    })
+                    conn.commit()
+                    if res.rowcount > 0: added += 1
+                except Exception as e:
+                    print(f"DB Error: {e}")
+        print(f"✅ Stored {added} new articles.")
+        return added
+    def build_tfidf_model(self, force=False):
+        """Modular requirement: Builds the TF-IDF model with weighted fields."""
+        print("🧠 Building Weighted TF-IDF Model...")
+        # Use ORDER BY for deterministic indexing
+        df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
+        df = df.reset_index(drop=True)
+        if df.empty:
+            print("⚠️ No articles to build model.")
+            return
+        # Multi-Field Weighting
+        # Title (3x) + Content (1x) + Tags/Categories (1x)
+        # We also apply normalization and phrase detection
+        def prepare_features(row):
+            title = TextProcessor.normalize(row['title'])
+            content = TextProcessor.normalize(row['content_clean'])
+            tags = TextProcessor.normalize(str(row['tags']) + " " + str(row['category']))
+            # Phrase detection on title and content
+            title = TextProcessor.detect_phrases(title)
+            content = TextProcessor.detect_phrases(content)
+            # Weighted concatenation
+            return (title + " ") * 3 + content + " " + tags
+        features = df.apply(prepare_features, axis=1)
+        vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Support bigrams natively
+        tfidf_matrix = vectorizer.fit_transform(features)
+        joblib.dump(vectorizer, 'vectorizer.pkl')
+        joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
+        print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")
+if __name__ == "__main__":
+    service = IngestionService()
+    # 24-hour broad update
+    arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
+    if arts:
+        service.store_articles(arts)
+        service.build_tfidf_model()