Spaces:

barissozudogru
/

research-papers-mcp

Sleeping

File size: 5,702 Bytes

57272d3

"""PubMed source: NCBI E-utilities esearch + efetch."""

import datetime
import logging
import time
from xml.etree import ElementTree

from ._http import _http
from .base import Capability, PaperSource, RateLimitHint, paper_dict

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# NCBI E-utilities endpoints: esearch is queried for matching PMIDs, efetch
# for the full article records.
_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def _parse_pubmed_date(pub_date_elem) -> datetime.datetime | None:
    if pub_date_elem is None:
        return None
    year = pub_date_elem.findtext("Year")
    month = pub_date_elem.findtext("Month") or "Jan"
    day = pub_date_elem.findtext("Day") or "1"
    if not year:
        return None
    for fmt in ("%Y %b %d", "%Y %m %d"):
        try:
            return datetime.datetime.strptime(f"{year} {month} {day}", fmt)
        except ValueError:
            continue
    try:
        return datetime.datetime(int(year), 1, 1)
    except (ValueError, TypeError):
        return None


def _parse_pubmed_article(article) -> dict | None:
    """Parse a `<PubmedArticle>` element into the standard paper dict.

    Extracts PMID, title, abstract, authors, publication date, up to ten MeSH
    descriptor names and the DOI, and hands them to ``paper_dict``.

    Args:
        article: A ``<PubmedArticle>`` element from an efetch XML response.

    Returns:
        The dict built by ``paper_dict``, or ``None`` when the record has no
        ``MedlineCitation``/``Article``, lacks a title or PMID, or raises
        while being parsed (the error is logged and the record skipped).
    """
    try:
        medline = article.find("MedlineCitation")
        if medline is None:
            return None
        article_elem = medline.find("Article")
        if article_elem is None:
            return None

        pmid = (medline.findtext("PMID") or "").strip()

        # Titles and abstracts may contain inline markup (<i>, <sub>, <sup>, ...).
        # findtext()/.text stop at the first child element, so gather the full
        # text with itertext() instead of truncating at the markup.
        title_elem = article_elem.find("ArticleTitle")
        title = "".join(title_elem.itertext()).strip() if title_elem is not None else ""
        if not title or not pmid:
            return None

        section_texts = (
            "".join(section.itertext())
            for section in article_elem.findall(".//AbstractText")
        )
        abstract = " ".join(text for text in section_texts if text).strip() or None

        author_names = []
        for author in article_elem.findall(".//Author"):
            last = (author.findtext("LastName") or "").strip()
            first = (author.findtext("ForeName") or "").strip()
            if last:
                author_names.append(f"{last} {first}".strip())
                continue
            # Group authors (consortia, committees) carry a CollectiveName
            # instead of LastName/ForeName.
            collective = (author.findtext("CollectiveName") or "").strip()
            if collective:
                author_names.append(collective)
        authors = ", ".join(author_names) if author_names else None

        pub_date_elem = article_elem.find(".//Journal/JournalIssue/PubDate")

        mesh = medline.findall(".//MeshHeading/DescriptorName")
        topics = [m.text for m in mesh if m.text][:10]

        # Only look at the article's own id list. A descendant search would
        # also match the ArticleIdList of every cited reference and could
        # return a cited paper's DOI when this article has none.
        doi = None
        for id_elem in article.findall("PubmedData/ArticleIdList/ArticleId"):
            candidate = (id_elem.text or "").strip()
            if id_elem.get("IdType") == "doi" and candidate:
                doi = candidate
                break
        if doi is None:
            # Some records expose the DOI only as an electronic location id.
            for location in article_elem.findall("ELocationID"):
                candidate = (location.text or "").strip()
                if location.get("EIdType") == "doi" and candidate:
                    doi = candidate
                    break

        return paper_dict(
            title=title,
            abstract=abstract,
            authors=authors,
            publication_date=_parse_pubmed_date(pub_date_elem),
            source="pubmed",
            source_id=pmid,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            doi=doi,
            topics=topics,
        )
    except Exception as exc:
        # Best-effort boundary: one bad record must not abort the whole batch.
        logger.warning("Skipping malformed PubMed article: %s", exc)
        return None


def fetch_pubmed(query: str = "machine learning", max_results: int = 20) -> list[dict]:
    """Search PubMed and return the parsed papers for the matching PMIDs.

    An esearch request (sorted by publication date, at most *max_results* ids)
    yields the PMIDs; after a short pause a single efetch request retrieves the
    records. A failure in either request is logged and produces an empty list.
    Records that cannot be parsed are left out of the result.
    """
    esearch_query = dict(
        db="pubmed",
        term=query,
        retmax=max_results,
        retmode="json",
        sort="pub_date",
    )
    try:
        search_response = _http.get(_ESEARCH_URL, params=esearch_query, timeout=15)
        search_response.raise_for_status()
        search_payload = search_response.json()
        pmids = search_payload.get("esearchresult", {}).get("idlist", [])
    except Exception as exc:
        logger.warning("PubMed search failed: %s", exc)
        return []

    if not pmids:
        return []

    # Space out the two requests (presumably to respect NCBI's rate limit).
    time.sleep(0.4)

    efetch_query = dict(
        db="pubmed",
        id=",".join(pmids),
        retmode="xml",
        rettype="abstract",
    )
    try:
        fetch_response = _http.get(_EFETCH_URL, params=efetch_query, timeout=30)
        fetch_response.raise_for_status()
        document = ElementTree.fromstring(fetch_response.content)
    except Exception as exc:
        logger.warning("PubMed fetch failed: %s", exc)
        return []

    candidates = (
        _parse_pubmed_article(node) for node in document.findall(".//PubmedArticle")
    )
    return [paper for paper in candidates if paper]


def fetch_pubmed_by_id(pmid: str) -> dict | None:
    """Look up one PubMed record by PMID through efetch.

    Returns the parsed paper dict, or ``None`` when the request or XML parsing
    fails (logged as a warning), when the response holds no
    ``<PubmedArticle>``, or when the record itself cannot be parsed.
    """
    request_params = dict(db="pubmed", id=pmid, retmode="xml", rettype="abstract")
    try:
        response = _http.get(_EFETCH_URL, params=request_params, timeout=15)
        response.raise_for_status()
        document = ElementTree.fromstring(response.content)
    except Exception as exc:
        logger.warning("PubMed get_by_id failed for %s: %s", pmid, exc)
        return None

    record = document.find(".//PubmedArticle")
    return None if record is None else _parse_pubmed_article(record)


class PubmedSource(PaperSource):
    """Paper source backed by NCBI PubMed (E-utilities): search and PMID lookup only."""

    name = "pubmed"

    def search(
        self,
        query: str,
        max_results: int = 20,
        filters: dict | None = None,
    ) -> list[dict]:
        """Run a PubMed search; ``filters`` is accepted but not applied."""
        papers = fetch_pubmed(query, max_results)
        return papers

    def get_by_id(self, identifier: str) -> dict | None:
        """Fetch a single paper, treating ``identifier`` as a PMID."""
        paper = fetch_pubmed_by_id(identifier)
        return paper

    def supports(self, capability: Capability) -> bool:
        """Report that no optional capability is available, whichever is asked."""
        # E-utilities can do elink for refs, but we don't expose that today.
        return False

    def rate_limit_hint(self) -> RateLimitHint:
        """Describe the request pacing this source advertises."""
        hint = RateLimitHint(
            min_interval_seconds=0.34,  # 3 req/s without an API key
            daily_quota=None,
            notes="NCBI E-utilities: 3 req/s without API key, 10 req/s with NCBI_API_KEY",
        )
        return hint