Spaces:
Sleeping
Sleeping
File size: 5,926 Bytes
16f0d20 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | import time
import pandas as pd
import joblib
from Bio import Entrez
from sqlalchemy import create_engine, text
from urllib.parse import quote_plus
from text_utils import TextProcessor
from sklearn.feature_extraction.text import TfidfVectorizer
# --- CONFIGURATION ---
import os
def normalize_database_url(url):
    """Return *url* with a SQLAlchemy-compatible PostgreSQL scheme.

    ``postgres://`` and bare ``postgresql://`` URLs are rewritten to use the
    explicit ``postgresql+psycopg2://`` scheme. Only the scheme at the very
    start of the URL is touched, so text that merely contains
    ``postgresql://`` elsewhere (e.g. inside a query parameter) is left alone.
    Any other URL, including one that already names a driver, is returned
    unchanged.
    """
    for scheme in ("postgres://", "postgresql://"):
        if url.startswith(scheme):
            return "postgresql+psycopg2://" + url[len(scheme):]
    return url


DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # No full URL supplied: assemble one from the individual DB_* variables.
    DB_USER = os.getenv("DB_USER", "postgres")
    # NOTE(review): a fallback password is hard-coded here; consider requiring
    # DB_PASSWORD to be set in the environment instead.
    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
    DB_HOST = os.getenv("DB_HOST", "localhost")
    DB_PORT = os.getenv("DB_PORT", "5432")
    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
    # The user name is URL-escaped like the password so characters such as
    # '@' or ':' cannot break the URI. DB_USER itself keeps the raw value.
    DB_URI = (
        f'postgresql+psycopg2://{quote_plus(DB_USER)}:{DB_PASSWORD}'
        f'@{DB_HOST}:{DB_PORT}/{DB_NAME}'
    )
else:
    # Ensure URL is compatible with SQLAlchemy if it starts with postgres://
    DATABASE_URL = normalize_database_url(DATABASE_URL)
    DB_URI = DATABASE_URL
# Set the contact e-mail and tool name on the Bio.Entrez module; presumably
# Biopython attaches these to the PubMed requests made below — confirm against
# the Bio.Entrez documentation.
# NOTE(review): the address is hard-coded; consider reading it from
# configuration.
Entrez.email = "surbi.211740@ncit.edu.np"
Entrez.tool = "PPD_Recommender_App"
class IngestionService:
    """Fetch PubMed abstracts, persist them, and rebuild the TF-IDF artefacts.

    The service owns a single SQLAlchemy engine created from the module-level
    ``DB_URI``; it is used both for inserting articles and for loading the
    approved corpus when the model is rebuilt.
    """

    def __init__(self):
        self.engine = create_engine(DB_URI)

    @staticmethod
    def _parse_article(paper):
        """Convert one parsed ``PubmedArticle`` record into a plain dict.

        Args:
            paper: Mapping with a ``MedlineCitation`` entry that holds
                ``Article`` and ``PMID``.

        Returns:
            A dict with ``title``, ``content`` and ``url`` keys, or ``None``
            when the record carries no abstract text.

        Raises:
            KeyError: If ``MedlineCitation``, ``Article`` or ``PMID`` is
                missing from the record.
        """
        citation = paper['MedlineCitation']
        article = citation['Article']
        title = article.get('ArticleTitle', '')
        abstract_data = article.get('Abstract', {}).get('AbstractText', [])
        # The abstract may be a list of sections or a single value.
        if isinstance(abstract_data, list):
            abstract = " ".join([str(part) for part in abstract_data])
        else:
            abstract = str(abstract_data)
        if not abstract:
            return None
        return {
            "title": title,
            "content": abstract,
            "url": f"https://pubmed.ncbi.nlm.nih.gov/{citation['PMID']}/",
        }

    def fetch_from_pubmed(self, query, limit=100):
        """Search PubMed and return the matching articles that have abstracts.

        Args:
            query: PubMed search term.
            limit: Maximum number of ids requested from the search.

        Returns:
            A list of ``{"title", "content", "url"}`` dicts. Records without
            an abstract, or that cannot be parsed, are skipped. An empty list
            is returned when nothing matches or when the request fails.
        """
        print(f"🔍 Searching PubMed: '{query}'...")
        try:
            search_handle = Entrez.esearch(
                db="pubmed", term=query, retmax=limit, sort="relevance"
            )
            try:
                ids = Entrez.read(search_handle)["IdList"]
            finally:
                # Release the handle even when parsing fails.
                search_handle.close()
            if not ids:
                return []

            fetch_handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
            try:
                papers = Entrez.read(fetch_handle)
            finally:
                fetch_handle.close()

            results = []
            for paper in papers['PubmedArticle']:
                try:
                    parsed = self._parse_article(paper)
                except Exception as exc:
                    # Best effort: one malformed record must not abort the
                    # batch, but KeyboardInterrupt/SystemExit still propagate.
                    print(f"Skipping malformed PubMed record: {exc}")
                    continue
                if parsed is not None:
                    results.append(parsed)
            return results
        except Exception as e:
            print(f"Pubmed Error: {e}")
            return []

    def store_articles(self, articles, category="General", risk="All"):
        """Modular requirement: Stores articles with deduplication.

        Each article is inserted with status ``'Approved'`` and format
        ``'pubmed'``; rows whose ``external_url`` already exists are ignored
        via ``ON CONFLICT (external_url) DO NOTHING``.

        Args:
            articles: Iterable of dicts with ``title``, ``content``, ``url``.
            category: Value stored in the ``category`` column.
            risk: Value stored in the ``risk_level`` column.

        Returns:
            Number of rows actually inserted.
        """
        # The statement is loop-invariant, so build it once.
        query = text("""
            INSERT INTO articles
            (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
            VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
            ON CONFLICT (external_url) DO NOTHING
        """)
        added = 0
        with self.engine.connect() as conn:
            for art in articles:
                # Preprocessing
                clean_title = TextProcessor.clean_html(art['title'])
                clean_content = TextProcessor.clean_html(art['content'])
                try:
                    res = conn.execute(query, {
                        "t": clean_title,
                        "cc": clean_content,
                        "cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
                        "cat": category,
                        "risk": risk,
                        "url": art['url'],
                    })
                    conn.commit()
                    if res.rowcount > 0:
                        added += 1
                except Exception as e:
                    print(f"DB Error: {e}")
                    # Roll back the failed transaction so the remaining
                    # articles can still be inserted on this connection.
                    conn.rollback()
        print(f"✅ Stored {added} new articles.")
        return added

    def build_tfidf_model(self, force=False):
        """Modular requirement: Builds the TF-IDF model with weighted fields.

        Loads every approved article ordered by ``article_id``, builds one
        weighted text per row, fits a ``TfidfVectorizer`` on unigrams and
        bigrams, and writes ``vectorizer.pkl`` and ``tfidf_matrix.pkl`` to the
        current working directory. Returns ``None``; nothing is written when
        there are no approved articles.

        Args:
            force: Accepted but not used by this implementation.
        """
        print("🧠 Building Weighted TF-IDF Model...")
        # Use ORDER BY for deterministic indexing
        df = pd.read_sql(
            "SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id",
            self.engine,
        )
        df = df.reset_index(drop=True)
        if df.empty:
            print("⚠️ No articles to build model.")
            return

        # Multi-Field Weighting
        # Title (3x) + Content (1x) + Tags/Categories (1x)
        # We also apply normalization and phrase detection
        def prepare_features(row):
            title = TextProcessor.normalize(row['title'])
            content = TextProcessor.normalize(row['content_clean'])
            # NOTE(review): str() turns a missing tags/category value into the
            # literal text "None"/"nan" before normalization — confirm intent.
            tags = TextProcessor.normalize(str(row['tags']) + " " + str(row['category']))
            # Phrase detection on title and content
            title = TextProcessor.detect_phrases(title)
            content = TextProcessor.detect_phrases(content)
            # Weighted concatenation
            return (title + " ") * 3 + content + " " + tags

        features = df.apply(prepare_features, axis=1)
        vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Support bigrams natively
        tfidf_matrix = vectorizer.fit_transform(features)
        joblib.dump(vectorizer, 'vectorizer.pkl')
        joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
        print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")
# Script entry point: create the service, fetch up to 100 PubMed results for
# the broad query, store them, and rebuild the TF-IDF model.
# NOTE(review): indentation was lost in this copy of the file, so it is unclear
# whether build_tfidf_model() runs only when articles were fetched or
# unconditionally — confirm against the original source.
if __name__ == "__main__":
service = IngestionService()
# 24-hour broad update
arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
if arts:
service.store_articles(arts)
service.build_tfidf_model()
|