surbi karki commited on
Commit
700d42b
·
verified ·
1 Parent(s): a0cce43

Upload 8 files

Browse files
Files changed (8) hide show
  1. Dockerfile_HF +34 -0
  2. ingestion_service.py +129 -0
  3. main.py +178 -0
  4. recommender_core.py +166 -0
  5. requirements.txt +9 -0
  6. text_utils.py +58 -0
  7. tfidf_matrix.pkl +3 -0
  8. vectorizer.pkl +3 -0
Dockerfile_HF ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python Backend API for Hugging Face Spaces
2
+ FROM python:3.11-slim
3
+
4
+ # Set working directory
5
+ WORKDIR /app
6
+
7
+ # Install system dependencies
8
+ RUN apt-get update && apt-get install -y \
9
+ gcc \
10
+ postgresql-client \
11
+ libpq-dev \
12
+ && rm -rf /var/lib/apt/lists/*
13
+
14
+ # Copy requirements first for better caching
15
+ COPY requirements.txt .
16
+
17
+ # Install Python dependencies
18
+ RUN pip install --no-cache-dir -r requirements.txt
19
+
20
+ # Copy application code
21
+ COPY . .
22
+
23
+ # Create directory for model files
24
+ RUN mkdir -p /app/models
25
+
26
+ # Hugging Face Spaces default port is 7860
27
+ EXPOSE 7860
28
+
29
+ # Health check (pointing to the correct port)
30
+ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
31
+ CMD python -c "import requests; requests.get('http://localhost:7860/api/health')" || exit 1
32
+
33
+ # Run the application on port 7860 as required by HF Spaces
34
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
ingestion_service.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ import joblib
4
+ from Bio import Entrez
5
+ from sqlalchemy import create_engine, text
6
+ from urllib.parse import quote_plus
7
+ from text_utils import TextProcessor
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+
10
+ # --- CONFIGURATION ---
11
+ DB_USER = "postgres"
12
+ DB_PASSWORD = quote_plus("subisu")
13
+ DB_NAME = "ppd_project_db"
14
+ DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost:5432/{DB_NAME}'
15
+
16
+ Entrez.email = "surbi.211740@ncit.edu.np"
17
+ Entrez.tool = "PPD_Recommender_App"
18
+
19
+ class IngestionService:
20
+ def __init__(self):
21
+ self.engine = create_engine(DB_URI)
22
+
23
+ def fetch_from_pubmed(self, query, limit=100):
24
+ print(f"🔍 Searching PubMed: '{query}'...")
25
+ try:
26
+ h1 = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
27
+ ids = Entrez.read(h1)["IdList"]
28
+
29
+ if not ids: return []
30
+
31
+ h2 = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
32
+ papers = Entrez.read(h2)
33
+
34
+ results = []
35
+ for paper in papers['PubmedArticle']:
36
+ try:
37
+ article = paper['MedlineCitation']['Article']
38
+ title = article.get('ArticleTitle', '')
39
+ abstract_data = article.get('Abstract', {}).get('AbstractText', [])
40
+ abstract = " ".join([str(x) for x in abstract_data]) if isinstance(abstract_data, list) else str(abstract_data)
41
+
42
+ if not abstract: continue
43
+
44
+ results.append({
45
+ "title": title,
46
+ "content": abstract,
47
+ "url": f"https://pubmed.ncbi.nlm.nih.gov/{paper['MedlineCitation']['PMID']}/"
48
+ })
49
+ except: continue
50
+ return results
51
+ except Exception as e:
52
+ print(f"Pubmed Error: {e}")
53
+ return []
54
+
55
+ def store_articles(self, articles, category="General", risk="All"):
56
+ """Modular requirement: Stores articles with deduplication."""
57
+ added = 0
58
+ with self.engine.connect() as conn:
59
+ for art in articles:
60
+ # Preprocessing
61
+ clean_title = TextProcessor.clean_html(art['title'])
62
+ clean_content = TextProcessor.clean_html(art['content'])
63
+
64
+ query = text("""
65
+ INSERT INTO articles
66
+ (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
67
+ VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
68
+ ON CONFLICT (external_url) DO NOTHING
69
+ """)
70
+
71
+ try:
72
+ res = conn.execute(query, {
73
+ "t": clean_title,
74
+ "cc": clean_content,
75
+ "cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
76
+ "cat": category,
77
+ "risk": risk,
78
+ "url": art['url']
79
+ })
80
+ conn.commit()
81
+ if res.rowcount > 0: added += 1
82
+ except Exception as e:
83
+ print(f"DB Error: {e}")
84
+ print(f"✅ Stored {added} new articles.")
85
+ return added
86
+
87
+ def build_tfidf_model(self, force=False):
88
+ """Modular requirement: Builds the TF-IDF model with weighted fields."""
89
+ print("🧠 Building Weighted TF-IDF Model...")
90
+ # Use ORDER BY for deterministic indexing
91
+ df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
92
+ df = df.reset_index(drop=True)
93
+
94
+ if df.empty:
95
+ print("⚠️ No articles to build model.")
96
+ return
97
+
98
+ # Multi-Field Weighting
99
+ # Title (3x) + Content (1x) + Tags/Categories (1x)
100
+ # We also apply normalization and phrase detection
101
+
102
+ def prepare_features(row):
103
+ title = TextProcessor.normalize(row['title'])
104
+ content = TextProcessor.normalize(row['content_clean'])
105
+ tags = TextProcessor.normalize(str(row['tags']) + " " + str(row['category']))
106
+
107
+ # Phrase detection on title and content
108
+ title = TextProcessor.detect_phrases(title)
109
+ content = TextProcessor.detect_phrases(content)
110
+
111
+ # Weighted concatenation
112
+ return (title + " ") * 3 + content + " " + tags
113
+
114
+ features = df.apply(prepare_features, axis=1)
115
+
116
+ vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Support bigrams natively
117
+ tfidf_matrix = vectorizer.fit_transform(features)
118
+
119
+ joblib.dump(vectorizer, 'vectorizer.pkl')
120
+ joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
121
+ print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")
122
+
123
+ if __name__ == "__main__":
124
+ service = IngestionService()
125
+ # 24-hour broad update
126
+ arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
127
+ if arts:
128
+ service.store_articles(arts)
129
+ service.build_tfidf_model()
main.py ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from contextlib import asynccontextmanager
2
+ from fastapi import FastAPI, HTTPException
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from typing import List, Dict, Any, Optional
6
+ import os
7
+ import logging
8
+ from datetime import datetime
9
+
10
+ # Import refactored engines
11
+ from recommender_core import recommender
12
+ from ingestion_service import IngestionService
13
+
14
+ # Set up logging
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # --- LIFESPAN MANAGER ---
19
+ @asynccontextmanager
20
+ async def lifespan(app: FastAPI):
21
+ logger.info("⏳ Starting up... RecommenderCore is ready.")
22
+ yield
23
+ logger.info("🛑 Shutting down...")
24
+
25
+ # --- APP CONFIGURATION ---
26
+ app = FastAPI(
27
+ title="PPD Risk & Recommendation Engine",
28
+ version="1.5",
29
+ description="Advanced system with hybrid scoring, multi-field TF-IDF, and offline-first PubMed integration.",
30
+ lifespan=lifespan,
31
+ docs_url="/docs", # Swagger UI
32
+ redoc_url="/redoc" # ReDoc alternative
33
+ )
34
+
35
+ # --- CORS SETUP ---
36
+ app.add_middleware(
37
+ CORSMiddleware,
38
+ allow_origins=["*"],
39
+ allow_credentials=True,
40
+ allow_methods=["*"],
41
+ allow_headers=["*"],
42
+ )
43
+
44
+ # --- DATA MODELS ---
45
+ class RecommendationRequest(BaseModel):
46
+ risk_level: str
47
+ symptoms_text: str
48
+ top_n: Optional[int] = 5
49
+
50
+ class APIResponse(BaseModel):
51
+ status: str
52
+ risk_assessment: str
53
+ recommendations: List[Dict[str, Any]]
54
+
55
+ # --- API ENDPOINTS ---
56
+
57
+ @app.get("/")
58
+ def health_check():
59
+ is_ready = recommender is not None and recommender.df is not None and not recommender.df.empty
60
+ return {"status": "online", "engine_ready": is_ready, "version": "1.5"}
61
+
62
+ @app.get("/api/health")
63
+ def api_health():
64
+ """Detailed health check for container monitoring."""
65
+ try:
66
+ is_ready = recommender is not None and recommender.df is not None and not recommender.df.empty
67
+ db_connected = recommender.engine is not None
68
+ model_loaded = recommender.vectorizer is not None and recommender.tfidf_matrix is not None
69
+
70
+ return {
71
+ "status": "healthy" if is_ready else "degraded",
72
+ "timestamp": datetime.now().isoformat(),
73
+ "checks": {
74
+ "database": "ok" if db_connected else "error",
75
+ "model": "ok" if model_loaded else "error",
76
+ "articles_loaded": len(recommender.df) if is_ready else 0
77
+ }
78
+ }
79
+ except Exception as e:
80
+ logger.error(f"Health check failed: {e}")
81
+ return {"status": "unhealthy", "error": str(e)}
82
+
83
+ @app.get("/api/stats")
84
+ def get_stats():
85
+ """System statistics for monitoring."""
86
+ try:
87
+ if recommender.df is None:
88
+ return {"error": "System not initialized"}
89
+
90
+ stats = {
91
+ "total_articles": len(recommender.df),
92
+ "articles_by_type": recommender.df['format_type'].value_counts().to_dict(),
93
+ "articles_by_risk": recommender.df['risk_level'].value_counts().to_dict(),
94
+ "model_vocabulary_size": len(recommender.vectorizer.vocabulary_) if recommender.vectorizer else 0,
95
+ "last_updated": datetime.now().isoformat()
96
+ }
97
+ return stats
98
+ except Exception as e:
99
+ logger.error(f"Stats error: {e}")
100
+ raise HTTPException(status_code=500, detail=str(e))
101
+
102
+ @app.post("/api/recommend", response_model=APIResponse)
103
+ def get_recommendations(request: RecommendationRequest):
104
+ """
105
+ Main recommendation endpoint.
106
+ Uses hybrid scoring: Cosine Similarity + Exact Symptom Boost + Source Weighting + Recency Boost.
107
+ """
108
+ try:
109
+ results = recommender.recommend_articles(
110
+ symptoms_text=request.symptoms_text,
111
+ crisis_level=request.risk_level,
112
+ top_n=request.top_n
113
+ )
114
+
115
+ return {
116
+ "status": "success",
117
+ "risk_assessment": request.risk_level,
118
+ "recommendations": results
119
+ }
120
+
121
+ except Exception as e:
122
+ logger.error(f"Recommendation error: {e}")
123
+ raise HTTPException(status_code=500, detail="Internal processing error.")
124
+
125
+ @app.get("/api/article/{article_id}")
126
+ def get_article_content(article_id: int):
127
+ """
128
+ Retrieves full article content.
129
+ Handles both direct contributor text and curated PubMed abstracts.
130
+ """
131
+ article_data = recommender.get_article_by_id(article_id)
132
+
133
+ if not article_data:
134
+ raise HTTPException(status_code=404, detail="Article not found")
135
+
136
+ return {
137
+ "article_id": article_data['article_id'],
138
+ "title": article_data['title'],
139
+ "category": article_data['category'],
140
+ "format_type": article_data.get('format_type', 'text'),
141
+ "external_url": article_data.get('external_url'),
142
+ "content": article_data.get('content_raw') or article_data.get('content_clean')
143
+ }
144
+
145
+ @app.post("/api/admin/rebuild-model")
146
+ def rebuild_model():
147
+ """Admin endpoint to trigger a weighted TF-IDF rebuild."""
148
+ try:
149
+ service = IngestionService()
150
+ service.build_tfidf_model()
151
+ recommender.load_model()
152
+ return {"status": "success", "message": "Weighted TF-IDF model rebuilt and reloaded."}
153
+ except Exception as e:
154
+ logger.error(f"Rebuild error: {e}")
155
+ raise HTTPException(status_code=500, detail=str(e))
156
+
157
+ @app.post("/api/admin/trigger-ingestion")
158
+ def trigger_ingestion():
159
+ """Admin endpoint to manually trigger PubMed ingestion."""
160
+ try:
161
+ service = IngestionService()
162
+ articles = service.fetch_from_pubmed("postpartum depression OR maternal mental health", limit=100)
163
+ if articles:
164
+ count = service.store_articles(articles)
165
+ service.build_tfidf_model()
166
+ recommender.load_model()
167
+ return {
168
+ "status": "success",
169
+ "message": f"Ingested {count} new articles and rebuilt model."
170
+ }
171
+ return {"status": "success", "message": "No new articles found."}
172
+ except Exception as e:
173
+ logger.error(f"Ingestion error: {e}")
174
+ raise HTTPException(status_code=500, detail=str(e))
175
+
176
+ if __name__ == "__main__":
177
+ import uvicorn
178
+ uvicorn.run(app, host="0.0.0.0", port=8000)
recommender_core.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import joblib
3
+ import os
4
+ import time
5
+ from sqlalchemy import create_engine
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from urllib.parse import quote_plus
8
+ from text_utils import TextProcessor
9
+ from functools import lru_cache
10
+
11
+ # --- CONFIGURATION ---
12
+ # For cloud deployment (HF/Production), use DATABASE_URL.
13
+ # Fallback to local construction if not present.
14
+ DATABASE_URL = os.getenv("DATABASE_URL")
15
+ if not DATABASE_URL:
16
+ DB_USER = os.getenv("DB_USER", "postgres")
17
+ DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
18
+ DB_HOST = os.getenv("DB_HOST", "localhost")
19
+ DB_PORT = os.getenv("DB_PORT", "5432")
20
+ DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
21
+ DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
22
+ else:
23
+ # Ensure URL is compatible with SQLAlchemy if it starts with postgres://
24
+ if DATABASE_URL.startswith("postgres://"):
25
+ DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
26
+ elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
27
+ DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
28
+ DB_URI = DATABASE_URL
29
+
30
+
31
+ class RecommenderCore:
32
+ def __init__(self):
33
+ self.engine = create_engine(DB_URI)
34
+ self.vectorizer = None
35
+ self.tfidf_matrix = None
36
+ self.df = None
37
+ self.load_model()
38
+
39
+ def load_model(self):
40
+ try:
41
+ if os.path.exists('vectorizer.pkl') and os.path.exists('tfidf_matrix.pkl'):
42
+ self.vectorizer = joblib.load('vectorizer.pkl')
43
+ self.tfidf_matrix = joblib.load('tfidf_matrix.pkl')
44
+ print("💾 Model Loaded into Memory.")
45
+
46
+ self.df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
47
+ self.df = self.df.reset_index(drop=True)
48
+ print(f"📚 Indexed {len(self.df)} articles.")
49
+ except Exception as e:
50
+ print(f"Load Error: {e}")
51
+
52
+ @lru_cache(maxsize=128)
53
+ def recommend_articles(self, symptoms_text, crisis_level, top_n=5):
54
+ """Modular requirement: Main entry point with caching."""
55
+ if self.df is None or self.vectorizer is None:
56
+ return []
57
+
58
+ # 1. Preprocess user query
59
+ query_raw = symptoms_text
60
+ query_norm = TextProcessor.normalize(symptoms_text)
61
+ query_phased = TextProcessor.detect_phrases(query_norm)
62
+
63
+ # 2. Filter by Crisis Level (Safety First)
64
+ risk_map = {
65
+ "High": ["High", "Critical", "Moderate", "All"],
66
+ "Moderate": ["Moderate", "Low", "All"],
67
+ "Low": ["Low", "All"]
68
+ }
69
+ allowed = risk_map.get(crisis_level, ["All"])
70
+
71
+ # Determine the filtered subset
72
+ mask = self.df['risk_level'].apply(
73
+ lambda x: any(level.strip() in allowed for level in str(x).split(','))
74
+ )
75
+ filtered_df = self.df[mask].copy()
76
+
77
+ if filtered_df.empty: return []
78
+
79
+ # 3. Primary ML Scoring (Cosine Similarity)
80
+ user_vec = self.vectorizer.transform([query_phased])
81
+ all_cos_scores = cosine_similarity(user_vec, self.tfidf_matrix).flatten()
82
+
83
+ # 4. Final Ranking
84
+ # Correctly align scores using the original dataframe's index
85
+ filtered_df['cosine_score'] = [all_cos_scores[i] for i in filtered_df.index]
86
+
87
+ # Apply the hybrid ranking engine
88
+ ranked_results = self.apply_ranking(filtered_df, query_raw)
89
+
90
+ # Format for output
91
+ final_list = ranked_results.head(top_n).to_dict('records')
92
+
93
+ # 5. Live Fallback if needed
94
+ # Requirement: If results are too few, fetch fresh content
95
+ K = 3
96
+ if len(final_list) < K:
97
+ try:
98
+ from ingestion_service import IngestionService
99
+ service = IngestionService()
100
+ live_arts = service.fetch_from_pubmed(query_raw, limit=K)
101
+ for art in live_arts:
102
+ if len(final_list) >= top_n: break
103
+ final_list.append({
104
+ "article_id": -1,
105
+ "title": art['title'],
106
+ "category": "Live Fallback",
107
+ "format_type": "pubmed",
108
+ "external_url": art['url'],
109
+ "content": art['content'],
110
+ "risk_level": "All"
111
+ })
112
+ # Background ingestion (optional here, but requested in strategy)
113
+ if live_arts: service.store_articles(live_arts)
114
+ except Exception as e:
115
+ print(f"Fallback error: {e}")
116
+
117
+ for item in final_list:
118
+ item['access_type'] = 'External Link' if item.get('format_type') == 'pubmed' else 'Direct Text'
119
+ if 'created_at' in item and item['created_at']:
120
+ item['created_at'] = str(item['created_at'])
121
+
122
+ return final_list
123
+
124
+ def apply_ranking(self, df, raw_query):
125
+ """Modular requirement: Hybrid ranking engine."""
126
+ # Constants for weighting
127
+ SOURCE_WEIGHT = 1.15 # 15% boost for contributor articles
128
+ EXACT_MATCH_BOOST = 0.2
129
+
130
+ tokens = TextProcessor.normalize(raw_query).split()
131
+
132
+ now = pd.Timestamp.now()
133
+
134
+ def calculate_hybrid_score(row):
135
+ score = row['cosine_score']
136
+
137
+ # A. Source Weighting (Trusted Contributors)
138
+ if row['format_type'] == 'text':
139
+ score *= SOURCE_WEIGHT
140
+
141
+ # B. Exact Symptom Overlap Boost
142
+ # Check how many user tokens appear exactly in the normalized title
143
+ norm_title = TextProcessor.normalize(row['title'])
144
+ matches = sum(1 for t in tokens if t in norm_title)
145
+ score += (matches * EXACT_MATCH_BOOST)
146
+
147
+ # C. Recency Boost (PubMed only, newer is better)
148
+ if row['format_type'] == 'pubmed' and row['created_at']:
149
+ age_days = (now - pd.to_datetime(row['created_at'])).days
150
+ # Decaying boost: max 0.1 for brand new, goes to 0 over 365 days
151
+ recency_boost = max(0, 0.1 * (1 - (min(age_days, 365) / 365)))
152
+ score += recency_boost
153
+
154
+ return score
155
+
156
+ df['final_score'] = df.apply(calculate_hybrid_score, axis=1)
157
+ return df.sort_values(by='final_score', ascending=False)
158
+
159
+ def get_article_by_id(self, article_id):
160
+ """Fetches a single article by its ID."""
161
+ if self.df is None: return None
162
+ article = self.df[self.df['article_id'] == article_id]
163
+ return article.iloc[0].to_dict() if not article.empty else None
164
+
165
+ # Singleton instance to be used by main.py
166
+ recommender = RecommenderCore()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ pandas
4
+ sqlalchemy
5
+ psycopg2-binary
6
+ scikit-learn
7
+ biopython
8
+ beautifulsoup4
9
+ requests
text_utils.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import string
3
+
4
+ class TextProcessor:
5
+ """Handles normalization, cleaning, and phrase detection."""
6
+
7
+ STOPWORDS = {
8
+ 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
9
+ 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
10
+ 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
11
+ 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
12
+ 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
13
+ 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
14
+ 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
15
+ 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
16
+ 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
17
+ 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should',
18
+ "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
19
+ 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
20
+ 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
21
+ 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
22
+ }
23
+
24
+ @staticmethod
25
+ def clean_html(text):
26
+ if not text: return ""
27
+ # Remove HTML tags
28
+ clean = re.sub(r'<[^>]+>', ' ', text)
29
+ # Remove extra whitespace
30
+ clean = re.sub(r'\s+', ' ', clean).strip()
31
+ return clean
32
+
33
+ @classmethod
34
+ def normalize(cls, text):
35
+ if not text: return ""
36
+ # Lowercase
37
+ text = text.lower()
38
+ # Remove punctuation
39
+ text = text.translate(str.maketrans('', '', string.punctuation))
40
+ # Remove stopwords
41
+ tokens = [t for t in text.split() if t not in cls.STOPWORDS]
42
+ return " ".join(tokens)
43
+
44
+ @staticmethod
45
+ def detect_phrases(text):
46
+ """Simple bigram detection for important PPD concepts."""
47
+ phrases = [
48
+ "postpartum depression", "maternal mental health", "sleep disturbance",
49
+ "crying spells", "suicidal ideation", "mood swings", "baby blues"
50
+ ]
51
+ for p in phrases:
52
+ # We don't replace, we just ensure they are treated as one token for TF-IDF if we want,
53
+ # but scikit-learn's ngram_range can also do this.
54
+ # To force it, we could underscore them:
55
+ if p in text.lower():
56
+ underscored = p.replace(" ", "_")
57
+ text = re.sub(p, underscored, text, flags=re.IGNORECASE)
58
+ return text
tfidf_matrix.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13753f66236968d06e263b5ccfbca12d51730c348683e1913e613bba0ac8c6d3
3
+ size 406971
vectorizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:020b250895c3135ec8f61bba8663693461c2dc4b94e4bfef5ff89d72e489a6e7
3
+ size 598132