Spaces:
Sleeping
Sleeping
surbi karki commited on
Upload 8 files
Browse files- Dockerfile_HF +34 -0
- ingestion_service.py +129 -0
- main.py +178 -0
- recommender_core.py +166 -0
- requirements.txt +9 -0
- text_utils.py +58 -0
- tfidf_matrix.pkl +3 -0
- vectorizer.pkl +3 -0
Dockerfile_HF
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python Backend API for Hugging Face Spaces
|
| 2 |
+
FROM python:3.11-slim
|
| 3 |
+
|
| 4 |
+
# Set working directory
|
| 5 |
+
WORKDIR /app
|
| 6 |
+
|
| 7 |
+
# Install system dependencies
|
| 8 |
+
RUN apt-get update && apt-get install -y \
|
| 9 |
+
gcc \
|
| 10 |
+
postgresql-client \
|
| 11 |
+
libpq-dev \
|
| 12 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
+
|
| 14 |
+
# Copy requirements first for better caching
|
| 15 |
+
COPY requirements.txt .
|
| 16 |
+
|
| 17 |
+
# Install Python dependencies
|
| 18 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 19 |
+
|
| 20 |
+
# Copy application code
|
| 21 |
+
COPY . .
|
| 22 |
+
|
| 23 |
+
# Create directory for model files
|
| 24 |
+
RUN mkdir -p /app/models
|
| 25 |
+
|
| 26 |
+
# Hugging Face Spaces default port is 7860
|
| 27 |
+
EXPOSE 7860
|
| 28 |
+
|
| 29 |
+
# Health check (pointing to the correct port)
|
| 30 |
+
HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
| 31 |
+
CMD python -c "import requests; requests.get('http://localhost:7860/api/health')" || exit 1
|
| 32 |
+
|
| 33 |
+
# Run the application on port 7860 as required by HF Spaces
|
| 34 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
ingestion_service.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import joblib
|
| 4 |
+
from Bio import Entrez
|
| 5 |
+
from sqlalchemy import create_engine, text
|
| 6 |
+
from urllib.parse import quote_plus
|
| 7 |
+
from text_utils import TextProcessor
|
| 8 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 9 |
+
|
| 10 |
+
# --- CONFIGURATION ---
|
| 11 |
+
DB_USER = "postgres"
|
| 12 |
+
DB_PASSWORD = quote_plus("subisu")
|
| 13 |
+
DB_NAME = "ppd_project_db"
|
| 14 |
+
DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost:5432/{DB_NAME}'
|
| 15 |
+
|
| 16 |
+
Entrez.email = "surbi.211740@ncit.edu.np"
|
| 17 |
+
Entrez.tool = "PPD_Recommender_App"
|
| 18 |
+
|
| 19 |
+
class IngestionService:
|
| 20 |
+
def __init__(self):
|
| 21 |
+
self.engine = create_engine(DB_URI)
|
| 22 |
+
|
| 23 |
+
def fetch_from_pubmed(self, query, limit=100):
|
| 24 |
+
print(f"🔍 Searching PubMed: '{query}'...")
|
| 25 |
+
try:
|
| 26 |
+
h1 = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
|
| 27 |
+
ids = Entrez.read(h1)["IdList"]
|
| 28 |
+
|
| 29 |
+
if not ids: return []
|
| 30 |
+
|
| 31 |
+
h2 = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
|
| 32 |
+
papers = Entrez.read(h2)
|
| 33 |
+
|
| 34 |
+
results = []
|
| 35 |
+
for paper in papers['PubmedArticle']:
|
| 36 |
+
try:
|
| 37 |
+
article = paper['MedlineCitation']['Article']
|
| 38 |
+
title = article.get('ArticleTitle', '')
|
| 39 |
+
abstract_data = article.get('Abstract', {}).get('AbstractText', [])
|
| 40 |
+
abstract = " ".join([str(x) for x in abstract_data]) if isinstance(abstract_data, list) else str(abstract_data)
|
| 41 |
+
|
| 42 |
+
if not abstract: continue
|
| 43 |
+
|
| 44 |
+
results.append({
|
| 45 |
+
"title": title,
|
| 46 |
+
"content": abstract,
|
| 47 |
+
"url": f"https://pubmed.ncbi.nlm.nih.gov/{paper['MedlineCitation']['PMID']}/"
|
| 48 |
+
})
|
| 49 |
+
except: continue
|
| 50 |
+
return results
|
| 51 |
+
except Exception as e:
|
| 52 |
+
print(f"Pubmed Error: {e}")
|
| 53 |
+
return []
|
| 54 |
+
|
| 55 |
+
def store_articles(self, articles, category="General", risk="All"):
|
| 56 |
+
"""Modular requirement: Stores articles with deduplication."""
|
| 57 |
+
added = 0
|
| 58 |
+
with self.engine.connect() as conn:
|
| 59 |
+
for art in articles:
|
| 60 |
+
# Preprocessing
|
| 61 |
+
clean_title = TextProcessor.clean_html(art['title'])
|
| 62 |
+
clean_content = TextProcessor.clean_html(art['content'])
|
| 63 |
+
|
| 64 |
+
query = text("""
|
| 65 |
+
INSERT INTO articles
|
| 66 |
+
(title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
|
| 67 |
+
VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
|
| 68 |
+
ON CONFLICT (external_url) DO NOTHING
|
| 69 |
+
""")
|
| 70 |
+
|
| 71 |
+
try:
|
| 72 |
+
res = conn.execute(query, {
|
| 73 |
+
"t": clean_title,
|
| 74 |
+
"cc": clean_content,
|
| 75 |
+
"cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
|
| 76 |
+
"cat": category,
|
| 77 |
+
"risk": risk,
|
| 78 |
+
"url": art['url']
|
| 79 |
+
})
|
| 80 |
+
conn.commit()
|
| 81 |
+
if res.rowcount > 0: added += 1
|
| 82 |
+
except Exception as e:
|
| 83 |
+
print(f"DB Error: {e}")
|
| 84 |
+
print(f"✅ Stored {added} new articles.")
|
| 85 |
+
return added
|
| 86 |
+
|
| 87 |
+
def build_tfidf_model(self, force=False):
|
| 88 |
+
"""Modular requirement: Builds the TF-IDF model with weighted fields."""
|
| 89 |
+
print("🧠 Building Weighted TF-IDF Model...")
|
| 90 |
+
# Use ORDER BY for deterministic indexing
|
| 91 |
+
df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
|
| 92 |
+
df = df.reset_index(drop=True)
|
| 93 |
+
|
| 94 |
+
if df.empty:
|
| 95 |
+
print("⚠️ No articles to build model.")
|
| 96 |
+
return
|
| 97 |
+
|
| 98 |
+
# Multi-Field Weighting
|
| 99 |
+
# Title (3x) + Content (1x) + Tags/Categories (1x)
|
| 100 |
+
# We also apply normalization and phrase detection
|
| 101 |
+
|
| 102 |
+
def prepare_features(row):
|
| 103 |
+
title = TextProcessor.normalize(row['title'])
|
| 104 |
+
content = TextProcessor.normalize(row['content_clean'])
|
| 105 |
+
tags = TextProcessor.normalize(str(row['tags']) + " " + str(row['category']))
|
| 106 |
+
|
| 107 |
+
# Phrase detection on title and content
|
| 108 |
+
title = TextProcessor.detect_phrases(title)
|
| 109 |
+
content = TextProcessor.detect_phrases(content)
|
| 110 |
+
|
| 111 |
+
# Weighted concatenation
|
| 112 |
+
return (title + " ") * 3 + content + " " + tags
|
| 113 |
+
|
| 114 |
+
features = df.apply(prepare_features, axis=1)
|
| 115 |
+
|
| 116 |
+
vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Support bigrams natively
|
| 117 |
+
tfidf_matrix = vectorizer.fit_transform(features)
|
| 118 |
+
|
| 119 |
+
joblib.dump(vectorizer, 'vectorizer.pkl')
|
| 120 |
+
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
|
| 121 |
+
print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")
|
| 122 |
+
|
| 123 |
+
if __name__ == "__main__":
|
| 124 |
+
service = IngestionService()
|
| 125 |
+
# 24-hour broad update
|
| 126 |
+
arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
|
| 127 |
+
if arts:
|
| 128 |
+
service.store_articles(arts)
|
| 129 |
+
service.build_tfidf_model()
|
main.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from contextlib import asynccontextmanager
|
| 2 |
+
from fastapi import FastAPI, HTTPException
|
| 3 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 4 |
+
from pydantic import BaseModel
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
from datetime import datetime
|
| 9 |
+
|
| 10 |
+
# Import refactored engines
|
| 11 |
+
from recommender_core import recommender
|
| 12 |
+
from ingestion_service import IngestionService
|
| 13 |
+
|
| 14 |
+
# Set up logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
# --- LIFESPAN MANAGER ---
|
| 19 |
+
@asynccontextmanager
|
| 20 |
+
async def lifespan(app: FastAPI):
|
| 21 |
+
logger.info("⏳ Starting up... RecommenderCore is ready.")
|
| 22 |
+
yield
|
| 23 |
+
logger.info("🛑 Shutting down...")
|
| 24 |
+
|
| 25 |
+
# --- APP CONFIGURATION ---
|
| 26 |
+
app = FastAPI(
|
| 27 |
+
title="PPD Risk & Recommendation Engine",
|
| 28 |
+
version="1.5",
|
| 29 |
+
description="Advanced system with hybrid scoring, multi-field TF-IDF, and offline-first PubMed integration.",
|
| 30 |
+
lifespan=lifespan,
|
| 31 |
+
docs_url="/docs", # Swagger UI
|
| 32 |
+
redoc_url="/redoc" # ReDoc alternative
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# --- CORS SETUP ---
|
| 36 |
+
app.add_middleware(
|
| 37 |
+
CORSMiddleware,
|
| 38 |
+
allow_origins=["*"],
|
| 39 |
+
allow_credentials=True,
|
| 40 |
+
allow_methods=["*"],
|
| 41 |
+
allow_headers=["*"],
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# --- DATA MODELS ---
|
| 45 |
+
class RecommendationRequest(BaseModel):
|
| 46 |
+
risk_level: str
|
| 47 |
+
symptoms_text: str
|
| 48 |
+
top_n: Optional[int] = 5
|
| 49 |
+
|
| 50 |
+
class APIResponse(BaseModel):
|
| 51 |
+
status: str
|
| 52 |
+
risk_assessment: str
|
| 53 |
+
recommendations: List[Dict[str, Any]]
|
| 54 |
+
|
| 55 |
+
# --- API ENDPOINTS ---
|
| 56 |
+
|
| 57 |
+
@app.get("/")
|
| 58 |
+
def health_check():
|
| 59 |
+
is_ready = recommender is not None and recommender.df is not None and not recommender.df.empty
|
| 60 |
+
return {"status": "online", "engine_ready": is_ready, "version": "1.5"}
|
| 61 |
+
|
| 62 |
+
@app.get("/api/health")
|
| 63 |
+
def api_health():
|
| 64 |
+
"""Detailed health check for container monitoring."""
|
| 65 |
+
try:
|
| 66 |
+
is_ready = recommender is not None and recommender.df is not None and not recommender.df.empty
|
| 67 |
+
db_connected = recommender.engine is not None
|
| 68 |
+
model_loaded = recommender.vectorizer is not None and recommender.tfidf_matrix is not None
|
| 69 |
+
|
| 70 |
+
return {
|
| 71 |
+
"status": "healthy" if is_ready else "degraded",
|
| 72 |
+
"timestamp": datetime.now().isoformat(),
|
| 73 |
+
"checks": {
|
| 74 |
+
"database": "ok" if db_connected else "error",
|
| 75 |
+
"model": "ok" if model_loaded else "error",
|
| 76 |
+
"articles_loaded": len(recommender.df) if is_ready else 0
|
| 77 |
+
}
|
| 78 |
+
}
|
| 79 |
+
except Exception as e:
|
| 80 |
+
logger.error(f"Health check failed: {e}")
|
| 81 |
+
return {"status": "unhealthy", "error": str(e)}
|
| 82 |
+
|
| 83 |
+
@app.get("/api/stats")
|
| 84 |
+
def get_stats():
|
| 85 |
+
"""System statistics for monitoring."""
|
| 86 |
+
try:
|
| 87 |
+
if recommender.df is None:
|
| 88 |
+
return {"error": "System not initialized"}
|
| 89 |
+
|
| 90 |
+
stats = {
|
| 91 |
+
"total_articles": len(recommender.df),
|
| 92 |
+
"articles_by_type": recommender.df['format_type'].value_counts().to_dict(),
|
| 93 |
+
"articles_by_risk": recommender.df['risk_level'].value_counts().to_dict(),
|
| 94 |
+
"model_vocabulary_size": len(recommender.vectorizer.vocabulary_) if recommender.vectorizer else 0,
|
| 95 |
+
"last_updated": datetime.now().isoformat()
|
| 96 |
+
}
|
| 97 |
+
return stats
|
| 98 |
+
except Exception as e:
|
| 99 |
+
logger.error(f"Stats error: {e}")
|
| 100 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 101 |
+
|
| 102 |
+
@app.post("/api/recommend", response_model=APIResponse)
|
| 103 |
+
def get_recommendations(request: RecommendationRequest):
|
| 104 |
+
"""
|
| 105 |
+
Main recommendation endpoint.
|
| 106 |
+
Uses hybrid scoring: Cosine Similarity + Exact Symptom Boost + Source Weighting + Recency Boost.
|
| 107 |
+
"""
|
| 108 |
+
try:
|
| 109 |
+
results = recommender.recommend_articles(
|
| 110 |
+
symptoms_text=request.symptoms_text,
|
| 111 |
+
crisis_level=request.risk_level,
|
| 112 |
+
top_n=request.top_n
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
return {
|
| 116 |
+
"status": "success",
|
| 117 |
+
"risk_assessment": request.risk_level,
|
| 118 |
+
"recommendations": results
|
| 119 |
+
}
|
| 120 |
+
|
| 121 |
+
except Exception as e:
|
| 122 |
+
logger.error(f"Recommendation error: {e}")
|
| 123 |
+
raise HTTPException(status_code=500, detail="Internal processing error.")
|
| 124 |
+
|
| 125 |
+
@app.get("/api/article/{article_id}")
|
| 126 |
+
def get_article_content(article_id: int):
|
| 127 |
+
"""
|
| 128 |
+
Retrieves full article content.
|
| 129 |
+
Handles both direct contributor text and curated PubMed abstracts.
|
| 130 |
+
"""
|
| 131 |
+
article_data = recommender.get_article_by_id(article_id)
|
| 132 |
+
|
| 133 |
+
if not article_data:
|
| 134 |
+
raise HTTPException(status_code=404, detail="Article not found")
|
| 135 |
+
|
| 136 |
+
return {
|
| 137 |
+
"article_id": article_data['article_id'],
|
| 138 |
+
"title": article_data['title'],
|
| 139 |
+
"category": article_data['category'],
|
| 140 |
+
"format_type": article_data.get('format_type', 'text'),
|
| 141 |
+
"external_url": article_data.get('external_url'),
|
| 142 |
+
"content": article_data.get('content_raw') or article_data.get('content_clean')
|
| 143 |
+
}
|
| 144 |
+
|
| 145 |
+
@app.post("/api/admin/rebuild-model")
|
| 146 |
+
def rebuild_model():
|
| 147 |
+
"""Admin endpoint to trigger a weighted TF-IDF rebuild."""
|
| 148 |
+
try:
|
| 149 |
+
service = IngestionService()
|
| 150 |
+
service.build_tfidf_model()
|
| 151 |
+
recommender.load_model()
|
| 152 |
+
return {"status": "success", "message": "Weighted TF-IDF model rebuilt and reloaded."}
|
| 153 |
+
except Exception as e:
|
| 154 |
+
logger.error(f"Rebuild error: {e}")
|
| 155 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 156 |
+
|
| 157 |
+
@app.post("/api/admin/trigger-ingestion")
|
| 158 |
+
def trigger_ingestion():
|
| 159 |
+
"""Admin endpoint to manually trigger PubMed ingestion."""
|
| 160 |
+
try:
|
| 161 |
+
service = IngestionService()
|
| 162 |
+
articles = service.fetch_from_pubmed("postpartum depression OR maternal mental health", limit=100)
|
| 163 |
+
if articles:
|
| 164 |
+
count = service.store_articles(articles)
|
| 165 |
+
service.build_tfidf_model()
|
| 166 |
+
recommender.load_model()
|
| 167 |
+
return {
|
| 168 |
+
"status": "success",
|
| 169 |
+
"message": f"Ingested {count} new articles and rebuilt model."
|
| 170 |
+
}
|
| 171 |
+
return {"status": "success", "message": "No new articles found."}
|
| 172 |
+
except Exception as e:
|
| 173 |
+
logger.error(f"Ingestion error: {e}")
|
| 174 |
+
raise HTTPException(status_code=500, detail=str(e))
|
| 175 |
+
|
| 176 |
+
if __name__ == "__main__":
|
| 177 |
+
import uvicorn
|
| 178 |
+
uvicorn.run(app, host="0.0.0.0", port=8000)
|
recommender_core.py
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import joblib
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from sqlalchemy import create_engine
|
| 6 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 7 |
+
from urllib.parse import quote_plus
|
| 8 |
+
from text_utils import TextProcessor
|
| 9 |
+
from functools import lru_cache
|
| 10 |
+
|
| 11 |
+
# --- CONFIGURATION ---
|
| 12 |
+
# For cloud deployment (HF/Production), use DATABASE_URL.
|
| 13 |
+
# Fallback to local construction if not present.
|
| 14 |
+
DATABASE_URL = os.getenv("DATABASE_URL")
|
| 15 |
+
if not DATABASE_URL:
|
| 16 |
+
DB_USER = os.getenv("DB_USER", "postgres")
|
| 17 |
+
DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
|
| 18 |
+
DB_HOST = os.getenv("DB_HOST", "localhost")
|
| 19 |
+
DB_PORT = os.getenv("DB_PORT", "5432")
|
| 20 |
+
DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
|
| 21 |
+
DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
|
| 22 |
+
else:
|
| 23 |
+
# Ensure URL is compatible with SQLAlchemy if it starts with postgres://
|
| 24 |
+
if DATABASE_URL.startswith("postgres://"):
|
| 25 |
+
DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
|
| 26 |
+
elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
|
| 27 |
+
DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
|
| 28 |
+
DB_URI = DATABASE_URL
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class RecommenderCore:
|
| 32 |
+
def __init__(self):
|
| 33 |
+
self.engine = create_engine(DB_URI)
|
| 34 |
+
self.vectorizer = None
|
| 35 |
+
self.tfidf_matrix = None
|
| 36 |
+
self.df = None
|
| 37 |
+
self.load_model()
|
| 38 |
+
|
| 39 |
+
def load_model(self):
|
| 40 |
+
try:
|
| 41 |
+
if os.path.exists('vectorizer.pkl') and os.path.exists('tfidf_matrix.pkl'):
|
| 42 |
+
self.vectorizer = joblib.load('vectorizer.pkl')
|
| 43 |
+
self.tfidf_matrix = joblib.load('tfidf_matrix.pkl')
|
| 44 |
+
print("💾 Model Loaded into Memory.")
|
| 45 |
+
|
| 46 |
+
self.df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
|
| 47 |
+
self.df = self.df.reset_index(drop=True)
|
| 48 |
+
print(f"📚 Indexed {len(self.df)} articles.")
|
| 49 |
+
except Exception as e:
|
| 50 |
+
print(f"Load Error: {e}")
|
| 51 |
+
|
| 52 |
+
@lru_cache(maxsize=128)
|
| 53 |
+
def recommend_articles(self, symptoms_text, crisis_level, top_n=5):
|
| 54 |
+
"""Modular requirement: Main entry point with caching."""
|
| 55 |
+
if self.df is None or self.vectorizer is None:
|
| 56 |
+
return []
|
| 57 |
+
|
| 58 |
+
# 1. Preprocess user query
|
| 59 |
+
query_raw = symptoms_text
|
| 60 |
+
query_norm = TextProcessor.normalize(symptoms_text)
|
| 61 |
+
query_phased = TextProcessor.detect_phrases(query_norm)
|
| 62 |
+
|
| 63 |
+
# 2. Filter by Crisis Level (Safety First)
|
| 64 |
+
risk_map = {
|
| 65 |
+
"High": ["High", "Critical", "Moderate", "All"],
|
| 66 |
+
"Moderate": ["Moderate", "Low", "All"],
|
| 67 |
+
"Low": ["Low", "All"]
|
| 68 |
+
}
|
| 69 |
+
allowed = risk_map.get(crisis_level, ["All"])
|
| 70 |
+
|
| 71 |
+
# Determine the filtered subset
|
| 72 |
+
mask = self.df['risk_level'].apply(
|
| 73 |
+
lambda x: any(level.strip() in allowed for level in str(x).split(','))
|
| 74 |
+
)
|
| 75 |
+
filtered_df = self.df[mask].copy()
|
| 76 |
+
|
| 77 |
+
if filtered_df.empty: return []
|
| 78 |
+
|
| 79 |
+
# 3. Primary ML Scoring (Cosine Similarity)
|
| 80 |
+
user_vec = self.vectorizer.transform([query_phased])
|
| 81 |
+
all_cos_scores = cosine_similarity(user_vec, self.tfidf_matrix).flatten()
|
| 82 |
+
|
| 83 |
+
# 4. Final Ranking
|
| 84 |
+
# Correctly align scores using the original dataframe's index
|
| 85 |
+
filtered_df['cosine_score'] = [all_cos_scores[i] for i in filtered_df.index]
|
| 86 |
+
|
| 87 |
+
# Apply the hybrid ranking engine
|
| 88 |
+
ranked_results = self.apply_ranking(filtered_df, query_raw)
|
| 89 |
+
|
| 90 |
+
# Format for output
|
| 91 |
+
final_list = ranked_results.head(top_n).to_dict('records')
|
| 92 |
+
|
| 93 |
+
# 5. Live Fallback if needed
|
| 94 |
+
# Requirement: If results are too few, fetch fresh content
|
| 95 |
+
K = 3
|
| 96 |
+
if len(final_list) < K:
|
| 97 |
+
try:
|
| 98 |
+
from ingestion_service import IngestionService
|
| 99 |
+
service = IngestionService()
|
| 100 |
+
live_arts = service.fetch_from_pubmed(query_raw, limit=K)
|
| 101 |
+
for art in live_arts:
|
| 102 |
+
if len(final_list) >= top_n: break
|
| 103 |
+
final_list.append({
|
| 104 |
+
"article_id": -1,
|
| 105 |
+
"title": art['title'],
|
| 106 |
+
"category": "Live Fallback",
|
| 107 |
+
"format_type": "pubmed",
|
| 108 |
+
"external_url": art['url'],
|
| 109 |
+
"content": art['content'],
|
| 110 |
+
"risk_level": "All"
|
| 111 |
+
})
|
| 112 |
+
# Background ingestion (optional here, but requested in strategy)
|
| 113 |
+
if live_arts: service.store_articles(live_arts)
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"Fallback error: {e}")
|
| 116 |
+
|
| 117 |
+
for item in final_list:
|
| 118 |
+
item['access_type'] = 'External Link' if item.get('format_type') == 'pubmed' else 'Direct Text'
|
| 119 |
+
if 'created_at' in item and item['created_at']:
|
| 120 |
+
item['created_at'] = str(item['created_at'])
|
| 121 |
+
|
| 122 |
+
return final_list
|
| 123 |
+
|
| 124 |
+
def apply_ranking(self, df, raw_query):
|
| 125 |
+
"""Modular requirement: Hybrid ranking engine."""
|
| 126 |
+
# Constants for weighting
|
| 127 |
+
SOURCE_WEIGHT = 1.15 # 15% boost for contributor articles
|
| 128 |
+
EXACT_MATCH_BOOST = 0.2
|
| 129 |
+
|
| 130 |
+
tokens = TextProcessor.normalize(raw_query).split()
|
| 131 |
+
|
| 132 |
+
now = pd.Timestamp.now()
|
| 133 |
+
|
| 134 |
+
def calculate_hybrid_score(row):
|
| 135 |
+
score = row['cosine_score']
|
| 136 |
+
|
| 137 |
+
# A. Source Weighting (Trusted Contributors)
|
| 138 |
+
if row['format_type'] == 'text':
|
| 139 |
+
score *= SOURCE_WEIGHT
|
| 140 |
+
|
| 141 |
+
# B. Exact Symptom Overlap Boost
|
| 142 |
+
# Check how many user tokens appear exactly in the normalized title
|
| 143 |
+
norm_title = TextProcessor.normalize(row['title'])
|
| 144 |
+
matches = sum(1 for t in tokens if t in norm_title)
|
| 145 |
+
score += (matches * EXACT_MATCH_BOOST)
|
| 146 |
+
|
| 147 |
+
# C. Recency Boost (PubMed only, newer is better)
|
| 148 |
+
if row['format_type'] == 'pubmed' and row['created_at']:
|
| 149 |
+
age_days = (now - pd.to_datetime(row['created_at'])).days
|
| 150 |
+
# Decaying boost: max 0.1 for brand new, goes to 0 over 365 days
|
| 151 |
+
recency_boost = max(0, 0.1 * (1 - (min(age_days, 365) / 365)))
|
| 152 |
+
score += recency_boost
|
| 153 |
+
|
| 154 |
+
return score
|
| 155 |
+
|
| 156 |
+
df['final_score'] = df.apply(calculate_hybrid_score, axis=1)
|
| 157 |
+
return df.sort_values(by='final_score', ascending=False)
|
| 158 |
+
|
| 159 |
+
def get_article_by_id(self, article_id):
|
| 160 |
+
"""Fetches a single article by its ID."""
|
| 161 |
+
if self.df is None: return None
|
| 162 |
+
article = self.df[self.df['article_id'] == article_id]
|
| 163 |
+
return article.iloc[0].to_dict() if not article.empty else None
|
| 164 |
+
|
| 165 |
+
# Singleton instance to be used by main.py
|
| 166 |
+
recommender = RecommenderCore()
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi
|
| 2 |
+
uvicorn
|
| 3 |
+
pandas
|
| 4 |
+
sqlalchemy
|
| 5 |
+
psycopg2-binary
|
| 6 |
+
scikit-learn
|
| 7 |
+
biopython
|
| 8 |
+
beautifulsoup4
|
| 9 |
+
requests
|
text_utils.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import string
|
| 3 |
+
|
| 4 |
+
class TextProcessor:
|
| 5 |
+
"""Handles normalization, cleaning, and phrase detection."""
|
| 6 |
+
|
| 7 |
+
STOPWORDS = {
|
| 8 |
+
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
|
| 9 |
+
'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
|
| 10 |
+
'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
|
| 11 |
+
'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
|
| 12 |
+
'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
|
| 13 |
+
'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
|
| 14 |
+
'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
|
| 15 |
+
'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
|
| 16 |
+
'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
|
| 17 |
+
'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should',
|
| 18 |
+
"should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
|
| 19 |
+
'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
|
| 20 |
+
'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
|
| 21 |
+
'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
@staticmethod
|
| 25 |
+
def clean_html(text):
|
| 26 |
+
if not text: return ""
|
| 27 |
+
# Remove HTML tags
|
| 28 |
+
clean = re.sub(r'<[^>]+>', ' ', text)
|
| 29 |
+
# Remove extra whitespace
|
| 30 |
+
clean = re.sub(r'\s+', ' ', clean).strip()
|
| 31 |
+
return clean
|
| 32 |
+
|
| 33 |
+
@classmethod
|
| 34 |
+
def normalize(cls, text):
|
| 35 |
+
if not text: return ""
|
| 36 |
+
# Lowercase
|
| 37 |
+
text = text.lower()
|
| 38 |
+
# Remove punctuation
|
| 39 |
+
text = text.translate(str.maketrans('', '', string.punctuation))
|
| 40 |
+
# Remove stopwords
|
| 41 |
+
tokens = [t for t in text.split() if t not in cls.STOPWORDS]
|
| 42 |
+
return " ".join(tokens)
|
| 43 |
+
|
| 44 |
+
@staticmethod
|
| 45 |
+
def detect_phrases(text):
|
| 46 |
+
"""Simple bigram detection for important PPD concepts."""
|
| 47 |
+
phrases = [
|
| 48 |
+
"postpartum depression", "maternal mental health", "sleep disturbance",
|
| 49 |
+
"crying spells", "suicidal ideation", "mood swings", "baby blues"
|
| 50 |
+
]
|
| 51 |
+
for p in phrases:
|
| 52 |
+
# We don't replace, we just ensure they are treated as one token for TF-IDF if we want,
|
| 53 |
+
# but scikit-learn's ngram_range can also do this.
|
| 54 |
+
# To force it, we could underscore them:
|
| 55 |
+
if p in text.lower():
|
| 56 |
+
underscored = p.replace(" ", "_")
|
| 57 |
+
text = re.sub(p, underscored, text, flags=re.IGNORECASE)
|
| 58 |
+
return text
|
tfidf_matrix.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13753f66236968d06e263b5ccfbca12d51730c348683e1913e613bba0ac8c6d3
|
| 3 |
+
size 406971
|
vectorizer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:020b250895c3135ec8f61bba8663693461c2dc4b94e4bfef5ff89d72e489a6e7
|
| 3 |
+
size 598132
|