Spaces:

subisu
/

ppd-recommendation-api

Sleeping

App Files Files Community

surbi karki committed on Feb 5

Commit

700d42b

verified ·

1 Parent(s): a0cce43

Upload 8 files

Browse files

Files changed (8) hide show

Dockerfile_HF +34 -0
ingestion_service.py +129 -0
main.py +178 -0
recommender_core.py +166 -0
requirements.txt +9 -0
text_utils.py +58 -0
tfidf_matrix.pkl +3 -0
vectorizer.pkl +3 -0

Dockerfile_HF ADDED Viewed

	@@ -0,0 +1,34 @@

+# Python Backend API for Hugging Face Spaces
+FROM python:3.11-slim
+# Set working directory
+WORKDIR /app
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    gcc \
+    postgresql-client \
+    libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for better caching
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy application code
+COPY . .
+# Create directory for model files
+RUN mkdir -p /app/models
+# Hugging Face Spaces default port is 7860
+EXPOSE 7860
+# Health check (pointing to the correct port).
+# The request timeout stays below the Docker --timeout so a hung server fails
+# the probe cleanly, and raise_for_status() turns any non-2xx answer into a failure
+# (requests.get alone only fails on connection errors).
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+    CMD python -c "import requests; requests.get('http://localhost:7860/api/health', timeout=5).raise_for_status()" || exit 1
+# Run the application on port 7860 as required by HF Spaces
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]

ingestion_service.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import os
+import time
+from urllib.parse import quote_plus
+import joblib
+import pandas as pd
+from Bio import Entrez
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sqlalchemy import create_engine, text
+from text_utils import TextProcessor
+# --- CONFIGURATION ---
+DB_USER = "postgres"
+DB_PASSWORD = quote_plus("subisu")
+DB_NAME = "ppd_project_db"
+DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@localhost:5432/{DB_NAME}'
+Entrez.email = "surbi.211740@ncit.edu.np"
+Entrez.tool = "PPD_Recommender_App"
+class IngestionService:
+    def __init__(self):
+        self.engine = create_engine(DB_URI)
+    def fetch_from_pubmed(self, query, limit=100):
+        print(f"🔍 Searching PubMed: '{query}'...")
+        try:
+            h1 = Entrez.esearch(db="pubmed", term=query, retmax=limit, sort="relevance")
+            ids = Entrez.read(h1)["IdList"]
+            if not ids: return []
+            h2 = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
+            papers = Entrez.read(h2)
+            results = []
+            for paper in papers['PubmedArticle']:
+                try:
+                    article = paper['MedlineCitation']['Article']
+                    title = article.get('ArticleTitle', '')
+                    abstract_data = article.get('Abstract', {}).get('AbstractText', [])
+                    abstract = " ".join([str(x) for x in abstract_data]) if isinstance(abstract_data, list) else str(abstract_data)
+                    if not abstract: continue
+                    results.append({
+                        "title": title,
+                        "content": abstract,
+                        "url": f"https://pubmed.ncbi.nlm.nih.gov/{paper['MedlineCitation']['PMID']}/"
+                    })
+                except: continue
+            return results
+        except Exception as e:
+            print(f"Pubmed Error: {e}")
+            return []
+    def store_articles(self, articles, category="General", risk="All"):
+        """Modular requirement: Stores articles with deduplication."""
+        added = 0
+        with self.engine.connect() as conn:
+            for art in articles:
+                # Preprocessing
+                clean_title = TextProcessor.clean_html(art['title'])
+                clean_content = TextProcessor.clean_html(art['content'])
+                query = text("""
+                    INSERT INTO articles
+                    (title, content_clean, content_raw, category, risk_level, status, format_type, external_url)
+                    VALUES (:t, :cc, :cr, :cat, :risk, 'Approved', 'pubmed', :url)
+                    ON CONFLICT (external_url) DO NOTHING
+                """)
+                try:
+                    res = conn.execute(query, {
+                        "t": clean_title,
+                        "cc": clean_content,
+                        "cr": f"<h3>Source: PubMed</h3><p>{art['content']}</p>",
+                        "cat": category,
+                        "risk": risk,
+                        "url": art['url']
+                    })
+                    conn.commit()
+                    if res.rowcount > 0: added += 1
+                except Exception as e:
+                    print(f"DB Error: {e}")
+        print(f"✅ Stored {added} new articles.")
+        return added
+    def build_tfidf_model(self, force=False):
+        """Modular requirement: Builds the TF-IDF model with weighted fields."""
+        print("🧠 Building Weighted TF-IDF Model...")
+        # Use ORDER BY for deterministic indexing
+        df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
+        df = df.reset_index(drop=True)
+        if df.empty:
+            print("⚠️ No articles to build model.")
+            return
+        # Multi-Field Weighting
+        # Title (3x) + Content (1x) + Tags/Categories (1x)
+        # We also apply normalization and phrase detection
+        def prepare_features(row):
+            title = TextProcessor.normalize(row['title'])
+            content = TextProcessor.normalize(row['content_clean'])
+            tags = TextProcessor.normalize(str(row['tags']) + " " + str(row['category']))
+            # Phrase detection on title and content
+            title = TextProcessor.detect_phrases(title)
+            content = TextProcessor.detect_phrases(content)
+            # Weighted concatenation
+            return (title + " ") * 3 + content + " " + tags
+        features = df.apply(prepare_features, axis=1)
+        vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Support bigrams natively
+        tfidf_matrix = vectorizer.fit_transform(features)
+        joblib.dump(vectorizer, 'vectorizer.pkl')
+        joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
+        print(f"💾 Model optimized and saved. Vocabulary size: {len(vectorizer.vocabulary_)}")
+if __name__ == "__main__":
+    service = IngestionService()
+    # 24-hour broad update
+    arts = service.fetch_from_pubmed("postpartum depression OR maternal mental health", 100)
+    if arts:
+        service.store_articles(arts)
+        service.build_tfidf_model()

main.py ADDED Viewed

	@@ -0,0 +1,178 @@

+from contextlib import asynccontextmanager
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Dict, Any, Optional
+import os
+import logging
+from datetime import datetime
+# Import refactored engines
+from recommender_core import recommender
+from ingestion_service import IngestionService
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# --- LIFESPAN MANAGER ---
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Log one message before the application starts serving and one after it stops.
+    No resources are created here; the recommender is built when recommender_core
+    is imported.
+    """
+    logger.info("⏳ Starting up... RecommenderCore is ready.")
+    # Control returns to FastAPI here; the code after yield runs on shutdown.
+    yield
+    logger.info("🛑 Shutting down...")
+# --- APP CONFIGURATION ---
+app = FastAPI(
+    title="PPD Risk & Recommendation Engine",
+    version="1.5",
+    description="Advanced system with hybrid scoring, multi-field TF-IDF, and offline-first PubMed integration.",
+    lifespan=lifespan,
+    docs_url="/docs",  # Swagger UI
+    redoc_url="/redoc"  # ReDoc alternative
+)
+# --- CORS SETUP ---
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# --- DATA MODELS ---
+class RecommendationRequest(BaseModel):
+    """Request body of POST /api/recommend."""
+    # Passed to the recommender as crisis_level and echoed back as risk_assessment.
+    risk_level: str
+    # Free-text symptom description used as the recommender query.
+    symptoms_text: str
+    # Number of recommendations requested; defaults to 5.
+    top_n: Optional[int] = 5
+class APIResponse(BaseModel):
+    """Response body of POST /api/recommend."""
+    status: str
+    risk_assessment: str
+    # One dict per recommended article, as produced by the recommender.
+    recommendations: List[Dict[str, Any]]
+# --- API ENDPOINTS ---
+@app.get("/")
+def health_check():
+    is_ready = recommender is not None and recommender.df is not None and not recommender.df.empty
+    return {"status": "online", "engine_ready": is_ready, "version": "1.5"}
+@app.get("/api/health")
+def api_health():
+    """Detailed health check for container monitoring."""
+    try:
+        is_ready = recommender is not None and recommender.df is not None and not recommender.df.empty
+        db_connected = recommender.engine is not None
+        model_loaded = recommender.vectorizer is not None and recommender.tfidf_matrix is not None
+        return {
+            "status": "healthy" if is_ready else "degraded",
+            "timestamp": datetime.now().isoformat(),
+            "checks": {
+                "database": "ok" if db_connected else "error",
+                "model": "ok" if model_loaded else "error",
+                "articles_loaded": len(recommender.df) if is_ready else 0
+            }
+        }
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        return {"status": "unhealthy", "error": str(e)}
+@app.get("/api/stats")
+def get_stats():
+    """System statistics for monitoring."""
+    try:
+        if recommender.df is None:
+            return {"error": "System not initialized"}
+        stats = {
+            "total_articles": len(recommender.df),
+            "articles_by_type": recommender.df['format_type'].value_counts().to_dict(),
+            "articles_by_risk": recommender.df['risk_level'].value_counts().to_dict(),
+            "model_vocabulary_size": len(recommender.vectorizer.vocabulary_) if recommender.vectorizer else 0,
+            "last_updated": datetime.now().isoformat()
+        }
+        return stats
+    except Exception as e:
+        logger.error(f"Stats error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/api/recommend", response_model=APIResponse)
+def get_recommendations(request: RecommendationRequest):
+    """
+    Main recommendation endpoint.
+    Uses hybrid scoring: Cosine Similarity + Exact Symptom Boost + Source Weighting + Recency Boost.
+    """
+    try:
+        results = recommender.recommend_articles(
+            symptoms_text=request.symptoms_text,
+            crisis_level=request.risk_level,
+            top_n=request.top_n
+        )
+        return {
+            "status": "success",
+            "risk_assessment": request.risk_level,
+            "recommendations": results
+        }
+    except Exception as e:
+        logger.error(f"Recommendation error: {e}")
+        raise HTTPException(status_code=500, detail="Internal processing error.")
+@app.get("/api/article/{article_id}")
+def get_article_content(article_id: int):
+    """
+    Retrieves full article content.
+    Handles both direct contributor text and curated PubMed abstracts.
+    """
+    article_data = recommender.get_article_by_id(article_id)
+    if not article_data:
+        raise HTTPException(status_code=404, detail="Article not found")
+    return {
+        "article_id": article_data['article_id'],
+        "title": article_data['title'],
+        "category": article_data['category'],
+        "format_type": article_data.get('format_type', 'text'),
+        "external_url": article_data.get('external_url'),
+        "content": article_data.get('content_raw') or article_data.get('content_clean')
+    }
+@app.post("/api/admin/rebuild-model")
+def rebuild_model():
+    """Admin endpoint to trigger a weighted TF-IDF rebuild."""
+    try:
+        service = IngestionService()
+        service.build_tfidf_model()
+        recommender.load_model()
+        return {"status": "success", "message": "Weighted TF-IDF model rebuilt and reloaded."}
+    except Exception as e:
+        logger.error(f"Rebuild error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+@app.post("/api/admin/trigger-ingestion")
+def trigger_ingestion():
+    """Admin endpoint to manually trigger PubMed ingestion."""
+    try:
+        service = IngestionService()
+        articles = service.fetch_from_pubmed("postpartum depression OR maternal mental health", limit=100)
+        if articles:
+            count = service.store_articles(articles)
+            service.build_tfidf_model()
+            recommender.load_model()
+            return {
+                "status": "success",
+                "message": f"Ingested {count} new articles and rebuilt model."
+            }
+        return {"status": "success", "message": "No new articles found."}
+    except Exception as e:
+        logger.error(f"Ingestion error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+if __name__ == "__main__":
+    # Local development entry point: serves on port 8000 on all interfaces.
+    # uvicorn is imported here so it is only needed when run as a script.
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

recommender_core.py ADDED Viewed

	@@ -0,0 +1,166 @@

+import pandas as pd
+import joblib
+import os
+import time
+from sqlalchemy import create_engine
+from sklearn.metrics.pairwise import cosine_similarity
+from urllib.parse import quote_plus
+from text_utils import TextProcessor
+from functools import lru_cache
+# --- CONFIGURATION ---
+# For cloud deployment (HF/Production), use DATABASE_URL.
+# Fallback to local construction if not present.
+DATABASE_URL = os.getenv("DATABASE_URL")
+if not DATABASE_URL:
+    DB_USER = os.getenv("DB_USER", "postgres")
+    DB_PASSWORD = quote_plus(os.getenv("DB_PASSWORD", "subisu"))
+    DB_HOST = os.getenv("DB_HOST", "localhost")
+    DB_PORT = os.getenv("DB_PORT", "5432")
+    DB_NAME = os.getenv("DB_NAME", "ppd_project_db")
+    DB_URI = f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
+else:
+    # Ensure URL is compatible with SQLAlchemy if it starts with postgres://
+    if DATABASE_URL.startswith("postgres://"):
+        DATABASE_URL = DATABASE_URL.replace("postgres://", "postgresql+psycopg2://", 1)
+    elif "postgresql://" in DATABASE_URL and "+psycopg2" not in DATABASE_URL:
+        DATABASE_URL = DATABASE_URL.replace("postgresql://", "postgresql+psycopg2://", 1)
+    DB_URI = DATABASE_URL
+class RecommenderCore:
+    def __init__(self):
+        self.engine = create_engine(DB_URI)
+        self.vectorizer = None
+        self.tfidf_matrix = None
+        self.df = None
+        self.load_model()
+    def load_model(self):
+        try:
+            if os.path.exists('vectorizer.pkl') and os.path.exists('tfidf_matrix.pkl'):
+                self.vectorizer = joblib.load('vectorizer.pkl')
+                self.tfidf_matrix = joblib.load('tfidf_matrix.pkl')
+                print("💾 Model Loaded into Memory.")
+            self.df = pd.read_sql("SELECT * FROM articles WHERE status = 'Approved' ORDER BY article_id", self.engine)
+            self.df = self.df.reset_index(drop=True)
+            print(f"📚 Indexed {len(self.df)} articles.")
+        except Exception as e:
+            print(f"Load Error: {e}")
+    @lru_cache(maxsize=128)
+    def recommend_articles(self, symptoms_text, crisis_level, top_n=5):
+        """Modular requirement: Main entry point with caching."""
+        if self.df is None or self.vectorizer is None:
+            return []
+        # 1. Preprocess user query
+        query_raw = symptoms_text
+        query_norm = TextProcessor.normalize(symptoms_text)
+        query_phased = TextProcessor.detect_phrases(query_norm)
+        # 2. Filter by Crisis Level (Safety First)
+        risk_map = {
+            "High": ["High", "Critical", "Moderate", "All"],
+            "Moderate": ["Moderate", "Low", "All"],
+            "Low": ["Low", "All"]
+        }
+        allowed = risk_map.get(crisis_level, ["All"])
+        # Determine the filtered subset
+        mask = self.df['risk_level'].apply(
+            lambda x: any(level.strip() in allowed for level in str(x).split(','))
+        )
+        filtered_df = self.df[mask].copy()
+        if filtered_df.empty: return []
+        # 3. Primary ML Scoring (Cosine Similarity)
+        user_vec = self.vectorizer.transform([query_phased])
+        all_cos_scores = cosine_similarity(user_vec, self.tfidf_matrix).flatten()
+        # 4. Final Ranking
+        # Correctly align scores using the original dataframe's index
+        filtered_df['cosine_score'] = [all_cos_scores[i] for i in filtered_df.index]
+        # Apply the hybrid ranking engine
+        ranked_results = self.apply_ranking(filtered_df, query_raw)
+        # Format for output
+        final_list = ranked_results.head(top_n).to_dict('records')
+        # 5. Live Fallback if needed
+        # Requirement: If results are too few, fetch fresh content
+        K = 3
+        if len(final_list) < K:
+            try:
+                from ingestion_service import IngestionService
+                service = IngestionService()
+                live_arts = service.fetch_from_pubmed(query_raw, limit=K)
+                for art in live_arts:
+                    if len(final_list) >= top_n: break
+                    final_list.append({
+                        "article_id": -1,
+                        "title": art['title'],
+                        "category": "Live Fallback",
+                        "format_type": "pubmed",
+                        "external_url": art['url'],
+                        "content": art['content'],
+                        "risk_level": "All"
+                    })
+                # Background ingestion (optional here, but requested in strategy)
+                if live_arts: service.store_articles(live_arts)
+            except Exception as e:
+                print(f"Fallback error: {e}")
+        for item in final_list:
+            item['access_type'] = 'External Link' if item.get('format_type') == 'pubmed' else 'Direct Text'
+            if 'created_at' in item and item['created_at']:
+                item['created_at'] = str(item['created_at'])
+        return final_list
+    def apply_ranking(self, df, raw_query):
+        """Modular requirement: Hybrid ranking engine."""
+        # Constants for weighting
+        SOURCE_WEIGHT = 1.15  # 15% boost for contributor articles
+        EXACT_MATCH_BOOST = 0.2
+        tokens = TextProcessor.normalize(raw_query).split()
+        now = pd.Timestamp.now()
+        def calculate_hybrid_score(row):
+            score = row['cosine_score']
+            # A. Source Weighting (Trusted Contributors)
+            if row['format_type'] == 'text':
+                score *= SOURCE_WEIGHT
+            # B. Exact Symptom Overlap Boost
+            # Check how many user tokens appear exactly in the normalized title
+            norm_title = TextProcessor.normalize(row['title'])
+            matches = sum(1 for t in tokens if t in norm_title)
+            score += (matches * EXACT_MATCH_BOOST)
+            # C. Recency Boost (PubMed only, newer is better)
+            if row['format_type'] == 'pubmed' and row['created_at']:
+                age_days = (now - pd.to_datetime(row['created_at'])).days
+                # Decaying boost: max 0.1 for brand new, goes to 0 over 365 days
+                recency_boost = max(0, 0.1 * (1 - (min(age_days, 365) / 365)))
+                score += recency_boost
+            return score
+        df['final_score'] = df.apply(calculate_hybrid_score, axis=1)
+        return df.sort_values(by='final_score', ascending=False)
+    def get_article_by_id(self, article_id):
+        """Fetches a single article by its ID."""
+        if self.df is None: return None
+        article = self.df[self.df['article_id'] == article_id]
+        return article.iloc[0].to_dict() if not article.empty else None
+# Singleton instance to be used by main.py.
+# Created at import time: the constructor calls load_model(), which reads the
+# pickled model files (when present) and queries the articles table.
+recommender = RecommenderCore()

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+fastapi
+uvicorn
+pandas
+sqlalchemy
+psycopg2-binary
+scikit-learn
+# joblib is imported directly (model persistence), so declare it explicitly
+# rather than relying on it arriving as a scikit-learn dependency.
+joblib
+biopython
+beautifulsoup4
+requests

text_utils.py ADDED Viewed

	@@ -0,0 +1,58 @@

+import re
+import string
+class TextProcessor:
+    """Handles normalization, cleaning, and phrase detection."""
+    STOPWORDS = {
+        'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
+        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
+        'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
+        'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
+        'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
+        'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
+        'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out',
+        'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
+        'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not',
+        'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should',
+        "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
+        'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
+        'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
+        'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
+    }
+    @staticmethod
+    def clean_html(text):
+        if not text: return ""
+        # Remove HTML tags
+        clean = re.sub(r'<[^>]+>', ' ', text)
+        # Remove extra whitespace
+        clean = re.sub(r'\s+', ' ', clean).strip()
+        return clean
+    @classmethod
+    def normalize(cls, text):
+        if not text: return ""
+        # Lowercase
+        text = text.lower()
+        # Remove punctuation
+        text = text.translate(str.maketrans('', '', string.punctuation))
+        # Remove stopwords
+        tokens = [t for t in text.split() if t not in cls.STOPWORDS]
+        return " ".join(tokens)
+    @staticmethod
+    def detect_phrases(text):
+        """Simple bigram detection for important PPD concepts."""
+        phrases = [
+            "postpartum depression", "maternal mental health", "sleep disturbance",
+            "crying spells", "suicidal ideation", "mood swings", "baby blues"
+        ]
+        for p in phrases:
+            # We don't replace, we just ensure they are treated as one token for TF-IDF if we want,
+            # but scikit-learn's ngram_range can also do this.
+            # To force it, we could underscore them:
+            if p in text.lower():
+                underscored = p.replace(" ", "_")
+                text = re.sub(p, underscored, text, flags=re.IGNORECASE)
+        return text

tfidf_matrix.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:13753f66236968d06e263b5ccfbca12d51730c348683e1913e613bba0ac8c6d3
+size 406971

vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:020b250895c3135ec8f61bba8663693461c2dc4b94e4bfef5ff89d72e489a6e7
+size 598132