Spaces:

barissozudogru
/

research-papers-mcp

Sleeping

App Files Files Community

research-papers-mcp / research_papers_mcp /sources /pubmed.py

barissozudogru

bundle research_papers_mcp source, drop git+install

57272d3 verified about 1 month ago

Raw

History Blame Contribute Delete

5.7 kB

	"""PubMed source: NCBI E-utilities esearch + efetch."""

	import datetime
	import logging
	import time
	from xml.etree import ElementTree

	from ._http import _http
	from .base import Capability, PaperSource, RateLimitHint, paper_dict

	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)

	# NCBI E-utilities endpoints used by the fetch helpers in this module:
	# esearch is queried for the list of matching IDs, efetch for the records.
	_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
	_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


	def _parse_pubmed_date(pub_date_elem) -> datetime.datetime \| None:
	if pub_date_elem is None:
	return None
	year = pub_date_elem.findtext("Year")
	month = pub_date_elem.findtext("Month") or "Jan"
	day = pub_date_elem.findtext("Day") or "1"
	if not year:
	return None
	for fmt in ("%Y %b %d", "%Y %m %d"):
	try:
	return datetime.datetime.strptime(f"{year} {month} {day}", fmt)
	except ValueError:
	continue
	try:
	return datetime.datetime(int(year), 1, 1)
	except (ValueError, TypeError):
	return None


	def _flatten_text(elem) -> str:
	    """Return all text inside *elem*, including text nested in inline markup."""
	    return "".join(elem.itertext()).strip()


	def _parse_pubmed_article(article) -> dict | None:
	    """Parse a `<PubmedArticle>` element into the standard paper dict.

	    Title and abstract are read with their inline markup flattened, so text
	    following elements such as `<i>` or `<sub>` is kept rather than cut off.
	    The DOI is taken from the article's own `PubmedData/ArticleIdList` (never
	    from the ID lists of cited references), falling back to an
	    `<ELocationID EIdType="doi">` on the `<Article>` element.

	    Args:
	        article: A `<PubmedArticle>` element from an efetch XML response.

	    Returns:
	        The dict built by ``paper_dict``, or ``None`` when the record lacks a
	        `MedlineCitation`, an `Article`, a PMID or a title, or when parsing
	        raises (the error is logged and the record skipped).
	    """
	    try:
	        medline = article.find("MedlineCitation")
	        if medline is None:
	            return None
	        article_elem = medline.find("Article")
	        if article_elem is None:
	            return None

	        pmid = (medline.findtext("PMID") or "").strip()
	        title_elem = article_elem.find("ArticleTitle")
	        title = _flatten_text(title_elem) if title_elem is not None else ""
	        if not title or not pmid:
	            return None

	        # Structured abstracts have several AbstractText sections; join the
	        # non-empty ones with a space.
	        sections = (
	            _flatten_text(section)
	            for section in article_elem.findall(".//AbstractText")
	        )
	        abstract = " ".join(text for text in sections if text) or None

	        author_names = []
	        for author in article_elem.findall(".//Author"):
	            last = author.findtext("LastName") or ""
	            first = author.findtext("ForeName") or ""
	            if last:
	                author_names.append(f"{last} {first}".strip())
	        authors = ", ".join(author_names) if author_names else None

	        pub_date_elem = article_elem.find(".//Journal/JournalIssue/PubDate")

	        mesh = medline.findall(".//MeshHeading/DescriptorName")
	        topics = [m.text for m in mesh if m.text][:10]

	        # Only look at this article's own identifiers. A descendant search
	        # for ArticleIdList would also hit the lists nested under
	        # ReferenceList/Reference and could return a cited paper's DOI.
	        doi_candidates = [
	            id_elem
	            for id_elem in article.findall("PubmedData/ArticleIdList/ArticleId")
	            if id_elem.get("IdType") == "doi"
	        ]
	        doi_candidates += [
	            loc_elem
	            for loc_elem in article_elem.findall("ELocationID")
	            if loc_elem.get("EIdType") == "doi"
	        ]
	        doi = None
	        for candidate in doi_candidates:
	            value = (candidate.text or "").strip()
	            if value:
	                doi = value
	                break

	        return paper_dict(
	            title=title,
	            abstract=abstract,
	            authors=authors,
	            publication_date=_parse_pubmed_date(pub_date_elem),
	            source="pubmed",
	            source_id=pmid,
	            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
	            doi=doi,
	            topics=topics,
	        )
	    except Exception as exc:
	        # Best effort: one malformed record must not abort the whole batch.
	        logger.warning("Skipping malformed PubMed article: %s", exc)
	        return None


	def fetch_pubmed(query: str = "machine learning", max_results: int = 20) -> list[dict]:
	    """Fetch papers from PubMed via NCBI E-utilities.

	    Runs an esearch request for matching PMIDs (sorted by publication date),
	    then one efetch request for those records and parses every
	    `<PubmedArticle>` in the response.

	    Args:
	        query: PubMed search term.
	        max_results: Maximum number of IDs requested from esearch.

	    Returns:
	        A list of paper dicts. Empty when the query is blank, ``max_results``
	        is not positive, nothing matches, or either request fails (failures
	        are logged, never raised).
	    """
	    # Nothing can come back for a blank query or a non-positive limit, so
	    # skip the two network round trips and the sleep between them.
	    if not str(query or "").strip():
	        return []
	    if isinstance(max_results, int) and max_results <= 0:
	        return []

	    search_params = {
	        "db": "pubmed",
	        "term": query,
	        "retmax": max_results,
	        "retmode": "json",
	        "sort": "pub_date",
	    }
	    try:
	        resp = _http.get(_ESEARCH_URL, params=search_params, timeout=15)
	        resp.raise_for_status()
	        id_list = resp.json().get("esearchresult", {}).get("idlist", [])
	    except Exception as exc:
	        logger.warning("PubMed search failed: %s", exc)
	        return []

	    if not id_list:
	        return []

	    # Pause between the two requests to stay within NCBI's request rate.
	    time.sleep(0.4)

	    fetch_params = {
	        "db": "pubmed",
	        "id": ",".join(id_list),
	        "retmode": "xml",
	        "rettype": "abstract",
	    }
	    try:
	        resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=30)
	        resp.raise_for_status()
	        root = ElementTree.fromstring(resp.content)
	    except Exception as exc:
	        logger.warning("PubMed fetch failed: %s", exc)
	        return []

	    # Records that fail to parse come back as None and are dropped.
	    parsed = (
	        _parse_pubmed_article(article)
	        for article in root.findall(".//PubmedArticle")
	    )
	    return [paper for paper in parsed if paper]


	def fetch_pubmed_by_id(pmid: str) -> dict \| None:
	"""Fetch a single PubMed article by its PMID."""
	fetch_params = {
	"db": "pubmed",
	"id": pmid,
	"retmode": "xml",
	"rettype": "abstract",
	}
	try:
	resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=15)
	resp.raise_for_status()
	root = ElementTree.fromstring(resp.content)
	except Exception as exc:
	logger.warning("PubMed get_by_id failed for %s: %s", pmid, exc)
	return None

	article = root.find(".//PubmedArticle")
	if article is None:
	return None
	return _parse_pubmed_article(article)


	class PubmedSource(PaperSource):
	    """Paper source backed by NCBI PubMed (E-utilities).

	    Provides keyword search and lookup by PMID; nothing else is exposed.
	    """

	    name = "pubmed"

	    def search(
	        self,
	        query: str,
	        max_results: int = 20,
	        filters: dict | None = None,
	    ) -> list[dict]:
	        """Search PubMed for ``query``; ``filters`` is accepted but not used."""
	        papers = fetch_pubmed(query=query, max_results=max_results)
	        return papers

	    def get_by_id(self, identifier: str) -> dict | None:
	        """Look up a single article, passing ``identifier`` on as the PMID."""
	        paper = fetch_pubmed_by_id(pmid=identifier)
	        return paper

	    def supports(self, capability: Capability) -> bool:
	        """Report every capability as unsupported."""
	        # E-utilities can do elink for refs, but we don't expose that today.
	        return False

	    def rate_limit_hint(self) -> RateLimitHint:
	        """Describe the request pacing callers should observe for this source."""
	        hint = RateLimitHint(
	            min_interval_seconds=0.34,  # 3 req/s without an API key
	            daily_quota=None,
	            notes="NCBI E-utilities: 3 req/s without API key, 10 req/s with NCBI_API_KEY",
	        )
	        return hint