File size: 5,926 Bytes
16f0d20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import time
import pandas as pd
import joblib
from Bio import Entrez
from sqlalchemy import create_engine, text
from urllib.parse import quote_plus
from text_utils import TextProcessor
from sklearn.feature_extraction.text import TfidfVectorizer

# --- CONFIGURATION ---
# Resolve the SQLAlchemy connection string (DB_URI) from the environment.
# DATABASE_URL, when set, takes precedence; otherwise the URI is assembled
# from the individual DB_* variables using the defaults below.
import os
DATABASE_URL = os.getenv("DATABASE_URL")
if not DATABASE_URL:
    # Both credentials are percent-encoded so characters such as '@', ':' or
    # '/' cannot be mistaken for URI delimiters.
    DB_USER = quote_plus(os.getenv("DB_USER", "postgres"))
    # NOTE(review): hard-coded fallback password kept for backward
    # compatibility; consider requiring DB_PASSWORD outside local development.
    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
    DB_HOST = os.getenv("DB_HOST", "localhost")
    DB_PORT = os.getenv("DB_PORT", "5432")
    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
    DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
else:
    # Ensure the URL is compatible with SQLAlchemy and names the psycopg2
    # driver explicitly. Only the scheme prefix is rewritten; a URL that
    # already names a driver (or uses another scheme) is passed through as-is.
    if DATABASE_URL.startswith("postgres://"):
        DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
    elif DATABASE_URL.startswith("postgresql://"):
        DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
    DB_URI = DATABASE_URL


# Identify this client to NCBI's E-utilities. These are module-level settings
# on Bio.Entrez, so they apply to every Entrez call made from this process.
Entrez.email = "surbi.211740@ncit.edu.np"
Entrez.tool = "PPD_Recommender_App"

class IngestionService:
    """Fetch PubMed abstracts, persist them, and rebuild the TF-IDF model.

    The service owns one SQLAlchemy engine created from the module-level
    ``DB_URI``. All progress and error reporting goes to stdout via ``print``.
    """

    def __init__(self):
        self.engine = create_engine(DB_URI)

    @staticmethod
    def _parse_pubmed_article(paper):
        """Turn one parsed ``PubmedArticle`` record into an article dict.

        Args:
            paper: A mapping shaped like the entries of
                ``Entrez.read(...)['PubmedArticle']``.

        Returns:
            ``{"title", "content", "url"}`` for a record that has an abstract,
            or ``None`` when the abstract is empty or the record is malformed.
        """
        try:
            citation = paper['MedlineCitation']
            article = citation['Article']
            title = article.get('ArticleTitle', '')
            abstract_data = article.get('Abstract', {}).get('AbstractText', [])
            # Abstracts may arrive as a list of sections or as a single value.
            if isinstance(abstract_data, list):
                abstract = " ".join(str(part) for part in abstract_data)
            else:
                abstract = str(abstract_data)

            if not abstract:
                return None

            return {
                "title": title,
                "content": abstract,
                "url": f"https://pubmed.ncbi.nlm.nih.gov/{citation['PMID']}/",
            }
        except Exception:
            # Best effort: one malformed record must not discard the batch.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            return None

    @staticmethod
    def _blank_if_null(value):
        """Return ``""`` for ``None``/NaN, otherwise ``value`` unchanged.

        Prevents SQL NULLs from being stringified into the literal tokens
        "None" or "nan" when feature text is assembled.
        """
        if value is None:
            return ""
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ""
        return value

    def fetch_from_pubmed(self, query, limit=100):
        """Search PubMed and return the matching articles that have abstracts.

        Args:
            query: Search term passed to ``Entrez.esearch``.
            limit: Maximum number of ids requested (``retmax``).

        Returns:
            A list of ``{"title", "content", "url"}`` dicts. The list is empty
            when nothing matches or when any Entrez call fails; failures are
            printed, not raised.
        """
        print(f"🔍 Searching PubMed: '{query}'...")
        try:
            search_handle = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
            try:
                ids = Entrez.read(search_handle)["IdList"]
            finally:
                # Release the HTTP response even if parsing fails.
                search_handle.close()

            if not ids:
                return []

            fetch_handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
            try:
                papers = Entrez.read(fetch_handle)
            finally:
                fetch_handle.close()

            parsed = (self._parse_pubmed_article(paper) for paper in papers['PubmedArticle'])
            return [article for article in parsed if article is not None]
        except Exception as e:
            print(f"Pubmed Error: {e}")
            return []

    def store_articles(self, articles, category="General", risk="All"):
        """Insert articles into the ``articles`` table, skipping duplicates.

        Deduplication is delegated to the database through
        ``ON CONFLICT (external_url) DO NOTHING``. Each article is committed
        on its own, so one failing row does not discard the others.

        Args:
            articles: Iterable of dicts with ``title``, ``content`` and
                ``url`` keys, as produced by ``fetch_from_pubmed``.
            category: Value stored in the ``category`` column.
            risk: Value stored in the ``risk_level`` column.

        Returns:
            The number of rows actually inserted.
        """
        added = 0
        # The statement does not vary per article, so build it once.
        query = text("""
            INSERT INTO articles
            (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
            VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
            ON CONFLICT (external_url) DO NOTHING
        """)
        with self.engine.connect() as conn:
            for art in articles:
                # Preprocessing
                clean_title = TextProcessor.clean_html(art['title'])
                clean_content = TextProcessor.clean_html(art['content'])

                try:
                    res = conn.execute(query, {
                        "t": clean_title,
                        "cc": clean_content,
                        "cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
                        "cat": category,
                        "risk": risk,
                        "url": art['url'],
                    })
                    conn.commit()
                    if res.rowcount > 0:
                        added += 1
                except Exception as e:
                    print(f"DB Error: {e}")
                    # A failed statement leaves the transaction aborted; without
                    # a rollback every later insert on this connection would
                    # fail as well.
                    conn.rollback()
        print(f"✅ Stored {added} new articles.")
        return added

    def build_tfidf_model(self, force=False):
        """Fit a weighted TF-IDF model over approved articles and save it.

        Each article becomes one document: the title repeated three times,
        then the cleaned content, then tags and category. The fitted
        vectorizer and the matrix are written to ``vectorizer.pkl`` and
        ``tfidf_matrix.pkl`` in the current working directory.

        Args:
            force: Accepted for interface compatibility; the current
                implementation does not read it.

        Returns:
            ``None``. Nothing is written when there are no approved articles.
        """
        print("🧠 Building Weighted TF-IDF Model...")
        # Use ORDER BY for deterministic indexing
        df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
        df = df.reset_index(drop=True)

        if df.empty:
            print("⚠️ No articles to build model.")
            return

        blank_if_null = self._blank_if_null

        # Multi-Field Weighting
        # Title (3x) + Content (1x) + Tags/Categories (1x)
        # We also apply normalization and phrase detection
        def prepare_features(row):
            title = TextProcessor.normalize(blank_if_null(row['title']))
            content = TextProcessor.normalize(blank_if_null(row['content_clean']))
            tags = TextProcessor.normalize(
                str(blank_if_null(row['tags'])) + " " + str(blank_if_null(row['category']))
            )

            # Phrase detection on title and content
            title = TextProcessor.detect_phrases(title)
            content = TextProcessor.detect_phrases(content)

            # Weighted concatenation
            return (title + " ") * 3 + content + " " + tags

        features = df.apply(prepare_features, axis=1)

        vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # Support bigrams natively
        tfidf_matrix = vectorizer.fit_transform(features)

        joblib.dump(vectorizer, 'vectorizer.pkl')
        joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
        print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")

if __name__ == "__main__":
    # Script entry point for the 24-hour broad update: fetch, store, rebuild.
    ingestion = IngestionService()
    fetched = ingestion.fetch_from_pubmed(
        "postpartum depression OR maternal mental health",
        100,
    )
    # An empty fetch leaves both the database and the saved model untouched.
    if fetched:
        ingestion.store_articles(fetched)
        ingestion.build_tfidf_model()