import os
import time
from urllib.parse import quote_plus

import joblib
import pandas as pd
from Bio import Entrez
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine, text

from text_utils import TextProcessor

# --- CONFIGURATION ---
# DATABASE_URL, when set, takes precedence over the individual DB_* variables.
DATABASE_URL = os.getenv("DATABASE_URL")

if not DATABASE_URL:
    DB_USER = os.getenv("DB_USER", "postgres")
    # NOTE(review): a concrete default password is committed in source here;
    # consider requiring DB_PASSWORD from the environment instead.
    # quote_plus keeps special characters in the password from breaking the URI.
    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
    DB_HOST = os.getenv("DB_HOST", "localhost")
    DB_PORT = os.getenv("DB_PORT", "5432")
    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
    DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
else:
    # Ensure the URL names the psycopg2 driver explicitly so SQLAlchemy
    # accepts it: rewrite a leading "postgres://" or a driverless
    # "postgresql://" scheme.
    if DATABASE_URL.startswith("postgres://"):
        DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
    elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
        DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
    DB_URI = DATABASE_URL

# Identification sent along with every Entrez request made by this module.
Entrez.email = "surbi.211740@ncit.edu.np"
Entrez.tool = "PPD_Recommender_App"


class IngestionService:
    """Fetch PubMed abstracts, store them in the database and build a TF-IDF model."""

    def __init__(self):
        """Create the SQLAlchemy engine from the module-level ``DB_URI``."""
        self.engine = create_engine(DB_URI)

    def fetch_from_pubmed(self, query, limit=100):
        """Search PubMed and return the matching articles that have an abstract.

        Args:
            query: Search term passed to ``Entrez.esearch``.
            limit: Maximum number of ids requested (``retmax``).

        Returns:
            A list of dicts with the keys ``"title"``, ``"content"`` (the
            abstract text) and ``"url"``. Records without an abstract, or
            records that cannot be parsed, are skipped. An empty list is
            returned when nothing matches or when the request itself fails.
        """
        print(f"🔍 Searching PubMed: '{query}'...")
        try:
            search_handle = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
            try:
                ids = Entrez.read(search_handle)["IdList"]
            finally:
                # Release the underlying connection even if parsing fails.
                search_handle.close()

            if not ids:
                return []

            fetch_handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
            try:
                papers = Entrez.read(fetch_handle)
            finally:
                fetch_handle.close()

            results = []
            for paper in papers['PubmedArticle']:
                try:
                    article = paper['MedlineCitation']['Article']
                    title = article.get('ArticleTitle', '')
                    abstract_data = article.get('Abstract', {}).get('AbstractText', [])
                    # AbstractText may arrive as a list of sections or as a
                    # single value; flatten either form into one string.
                    if isinstance(abstract_data, list):
                        abstract = " ".join([str(x) for x in abstract_data])
                    else:
                        abstract = str(abstract_data)
                    if not abstract:
                        continue
                    results.append({
                        "title": title,
                        "content": abstract,
                        "url": f"https://pubmed.ncbi.nlm.nih.gov/{paper['MedlineCitation']['PMID']}/"
                    })
                except Exception as e:
                    # Best effort: one malformed record must not discard the
                    # rest of the batch. (Was a bare ``except:``, which also
                    # swallowed KeyboardInterrupt/SystemExit.)
                    print(f"Skipping malformed PubMed record: {e}")
                    continue
            return results
        except Exception as e:
            print(f"Pubmed Error: {e}")
            return []

    def store_articles(self, articles, category="General", risk="All"):
        """Insert articles into the ``articles`` table, skipping known URLs.

        Deduplication relies on ``ON CONFLICT (external_url) DO NOTHING``.
        Each row is committed on its own so that a failing row does not
        discard rows that were already stored.

        Args:
            articles: Iterable of dicts with ``"title"``, ``"content"`` and
                ``"url"`` keys (the shape returned by ``fetch_from_pubmed``).
            category: Value written to the ``category`` column of every row.
            risk: Value written to the ``risk_level`` column of every row.

        Returns:
            The number of rows actually inserted.
        """
        # The statement does not depend on the article, so build it once.
        insert_stmt = text("""
            INSERT INTO articles (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
            VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
            ON CONFLICT (external_url) DO NOTHING
        """)

        added = 0
        with self.engine.connect() as conn:
            for art in articles:
                # Preprocessing
                clean_title = TextProcessor.clean_html(art['title'])
                clean_content = TextProcessor.clean_html(art['content'])

                try:
                    res = conn.execute(insert_stmt, {
                        "t": clean_title,
                        "cc": clean_content,
                        # Written with explicit \n escapes: a single-quoted
                        # f-string cannot span physical lines.
                        "cr": f"\n\nSource: PubMed\n\n{art['content']}\n\n",
                        "cat": category,
                        "risk": risk,
                        "url": art['url']
                    })
                    conn.commit()
                    if res.rowcount > 0:
                        added += 1
                except Exception as e:
                    print(f"DB Error: {e}")
                    # Without a rollback the connection stays inside the
                    # failed transaction and every following insert fails too.
                    conn.rollback()

        print(f"✅ Stored {added} new articles.")
        return added

    def build_tfidf_model(self, force=False):
        """Fit a TF-IDF model on all approved articles and save it to disk.

        The vectorizer is written to ``vectorizer.pkl`` and the document-term
        matrix to ``tfidf_matrix.pkl`` in the current working directory.
        Matrix rows follow the ``ORDER BY article_id`` order of the query.
        Nothing is written when there are no approved articles.

        Args:
            force: Accepted for backward compatibility; currently unused.

        Returns:
            None.
        """
        print("🧠 Building Weighted TF-IDF Model...")
        # Use ORDER BY for deterministic indexing
        df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
        df = df.reset_index(drop=True)

        if df.empty:
            print("⚠️ No articles to build model.")
            return

        def as_text(value):
            """Return ``str(value)``, or ``""`` for NULL (None / NaN) cells."""
            # ``value != value`` is the NaN test; restricted to floats so
            # list-like cells are never compared element-wise.
            if value is None or (isinstance(value, float) and value != value):
                return ""
            return str(value)

        # Multi-Field Weighting
        # Title (3x) + Content (1x) + Tags/Categories (1x)
        def prepare_features(row):
            """Build the weighted text document for one article row."""
            title = TextProcessor.normalize(row['title'])
            content = TextProcessor.normalize(row['content_clean'])
            # NULL tags/category would otherwise add the literal token "None".
            tags = TextProcessor.normalize(as_text(row['tags']) + " " + as_text(row['category']))

            # Phrase detection on title and content
            title = TextProcessor.detect_phrases(title)
            content = TextProcessor.detect_phrases(content)

            # Weighted concatenation: repeating the title three times
            # triples its term counts relative to the body.
            return (title + " ") * 3 + content + " " + tags

        features = df.apply(prepare_features, axis=1)

        vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Support bigrams natively
        tfidf_matrix = vectorizer.fit_transform(features)

        joblib.dump(vectorizer, 'vectorizer.pkl')
        joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
        print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")


if __name__ == "__main__":
    service = IngestionService()
    # 24-hour broad update
    arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
    if arts:
        service.store_articles(arts)
    # NOTE(review): the model is rebuilt even when nothing new was fetched;
    # confirm this matches the intended schedule.
    service.build_tfidf_model()