barissozudogru's picture
bundle research_papers_mcp source, drop git+install
57272d3 verified
Raw
History Blame Contribute Delete
5.7 kB
"""PubMed source: NCBI E-utilities esearch + efetch."""
import datetime
import logging
import time
from xml.etree import ElementTree
from ._http import _http
from .base import Capability, PaperSource, RateLimitHint, paper_dict
# Module-level logger; the fetch/parse helpers below report failures through it.
logger = logging.getLogger(__name__)
# NCBI E-utilities endpoints: esearch is queried for matching PMIDs,
# efetch is queried for the XML records of those PMIDs.
_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
def _parse_pubmed_date(pub_date_elem) -> datetime.datetime | None:
if pub_date_elem is None:
return None
year = pub_date_elem.findtext("Year")
month = pub_date_elem.findtext("Month") or "Jan"
day = pub_date_elem.findtext("Day") or "1"
if not year:
return None
for fmt in ("%Y %b %d", "%Y %m %d"):
try:
return datetime.datetime.strptime(f"{year} {month} {day}", fmt)
except ValueError:
continue
try:
return datetime.datetime(int(year), 1, 1)
except (ValueError, TypeError):
return None
def _parse_pubmed_article(article) -> dict | None:
    """Parse a `<PubmedArticle>` element into the standard paper dict.

    Args:
        article: An ``ElementTree`` element for one ``<PubmedArticle>``.

    Returns:
        The value of ``paper_dict(...)`` for the article, or ``None`` when
        the record lacks a ``MedlineCitation``/``Article`` node, a PMID or a
        title, or when any unexpected error occurs while parsing (the error
        is logged as a warning and the record is skipped).
    """
    try:
        medline = article.find("MedlineCitation")
        if medline is None:
            return None
        article_elem = medline.find("Article")
        if article_elem is None:
            return None

        pmid = (medline.findtext("PMID") or "").strip()

        # Titles and abstracts can contain inline markup (<i>, <sub>, <sup>);
        # itertext() keeps the text on both sides of those child elements,
        # whereas findtext()/.text stops at the first child.
        title_elem = article_elem.find("ArticleTitle")
        title = (
            "".join(title_elem.itertext()).strip()
            if title_elem is not None
            else ""
        )
        if not title or not pmid:
            return None

        abstract_chunks = (
            "".join(section.itertext()).strip()
            for section in article_elem.findall(".//AbstractText")
        )
        abstract = " ".join(chunk for chunk in abstract_chunks if chunk) or None

        author_names = []
        for author in article_elem.findall(".//Author"):
            last = (author.findtext("LastName") or "").strip()
            first = (author.findtext("ForeName") or "").strip()
            if last:
                author_names.append(f"{last} {first}".strip())
                continue
            # Consortium / group authors carry CollectiveName instead of
            # LastName; keep them rather than dropping them silently.
            collective = (author.findtext("CollectiveName") or "").strip()
            if collective:
                author_names.append(collective)
        authors = ", ".join(author_names) if author_names else None

        pub_date_elem = article_elem.find(".//Journal/JournalIssue/PubDate")

        mesh = medline.findall(".//MeshHeading/DescriptorName")
        topics = [m.text for m in mesh if m.text][:10]

        # Only the article's own ID list is consulted. A descendant search
        # (".//ArticleIdList") would also match IDs inside ReferenceList and
        # could return the DOI of a cited paper.
        doi = None
        for id_elem in article.findall("PubmedData/ArticleIdList/ArticleId"):
            if id_elem.get("IdType") == "doi" and (id_elem.text or "").strip():
                doi = id_elem.text.strip()
                break
        if doi is None:
            # Some records expose the DOI only as an ELocationID on Article.
            for eloc in article_elem.findall("ELocationID"):
                if eloc.get("EIdType") == "doi" and (eloc.text or "").strip():
                    doi = eloc.text.strip()
                    break

        return paper_dict(
            title=title,
            abstract=abstract,
            authors=authors,
            publication_date=_parse_pubmed_date(pub_date_elem),
            source="pubmed",
            source_id=pmid,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            doi=doi,
            topics=topics,
        )
    except Exception as exc:
        # Deliberate best-effort: one malformed record must not abort a batch.
        logger.warning("Skipping malformed PubMed article: %s", exc)
        return None
def fetch_pubmed(query: str = "machine learning", max_results: int = 20) -> list[dict]:
    """Fetch papers from PubMed via NCBI E-utilities.

    Runs an esearch request (sorted by publication date) to obtain matching
    PMIDs, then a single efetch request for their XML records, and parses
    each ``<PubmedArticle>`` with ``_parse_pubmed_article``.

    Args:
        query: Search term passed to esearch as ``term``.
        max_results: Maximum number of PMIDs requested (``retmax``).

    Returns:
        A list of parsed paper dicts. The list is empty when the query is
        blank, ``max_results`` is not positive, nothing matches, or either
        HTTP request / XML parse fails (failures are logged as warnings).
    """
    # Nothing can match: skip the network round-trip (and the rate-limited
    # request it would spend) instead of asking NCBI for an empty result.
    if not query or not query.strip() or max_results <= 0:
        return []

    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "sort": "pub_date",
    }
    try:
        resp = _http.get(_ESEARCH_URL, params=search_params, timeout=15)
        resp.raise_for_status()
        id_list = resp.json().get("esearchresult", {}).get("idlist", [])
    except Exception as exc:
        logger.warning("PubMed search failed: %s", exc)
        return []
    if not id_list:
        return []

    # Pause between the two calls to stay under the 3 req/s limit that
    # applies without an API key.
    time.sleep(0.4)

    fetch_params = {
        "db": "pubmed",
        # str() guards the join against non-string IDs in the JSON payload.
        "id": ",".join(str(uid) for uid in id_list),
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=30)
        resp.raise_for_status()
        root = ElementTree.fromstring(resp.content)
    except Exception as exc:
        logger.warning("PubMed fetch failed: %s", exc)
        return []

    # Records that fail to parse come back as None and are dropped.
    return [
        parsed
        for article in root.findall(".//PubmedArticle")
        if (parsed := _parse_pubmed_article(article))
    ]
def fetch_pubmed_by_id(pmid: str) -> dict | None:
"""Fetch a single PubMed article by its PMID."""
fetch_params = {
"db": "pubmed",
"id": pmid,
"retmode": "xml",
"rettype": "abstract",
}
try:
resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=15)
resp.raise_for_status()
root = ElementTree.fromstring(resp.content)
except Exception as exc:
logger.warning("PubMed get_by_id failed for %s: %s", pmid, exc)
return None
article = root.find(".//PubmedArticle")
if article is None:
return None
return _parse_pubmed_article(article)
class PubmedSource(PaperSource):
    """Paper source backed by NCBI PubMed (E-utilities).

    Offers keyword search and lookup by PMID; no optional capability is
    advertised.
    """

    name = "pubmed"

    def search(
        self,
        query: str,
        max_results: int = 20,
        filters: dict | None = None,
    ) -> list[dict]:
        """Search PubMed for ``query``; ``filters`` is accepted but not used."""
        papers = fetch_pubmed(query=query, max_results=max_results)
        return papers

    def get_by_id(self, identifier: str) -> dict | None:
        """Look up one article, treating ``identifier`` as a PMID."""
        paper = fetch_pubmed_by_id(identifier)
        return paper

    def supports(self, capability: Capability) -> bool:
        """Report whether ``capability`` is available; always ``False`` here."""
        # E-utilities can do elink for refs, but we don't expose that today.
        return False

    def rate_limit_hint(self) -> RateLimitHint:
        """Describe the request pacing callers should apply to this source."""
        hint = RateLimitHint(
            min_interval_seconds=0.34,  # 3 req/s without an API key
            daily_quota=None,
            notes="NCBI E-utilities: 3 req/s without API key, 10 req/s with NCBI_API_KEY",
        )
        return hint