"""PubMed source: NCBI E-utilities esearch + efetch."""

import datetime
import logging
import time
from xml.etree import ElementTree

from ._http import _http
from .base import Capability, PaperSource, RateLimitHint, paper_dict

logger = logging.getLogger(__name__)

_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def _element_text(elem) -> str:
    """Return the full text of *elem*, including text inside child elements.

    ``Element.text`` stops at the first child element, so a title such as
    ``<i>E. coli</i> in soil`` would otherwise come back empty. ``itertext``
    walks the whole subtree instead.
    """
    if elem is None:
        return ""
    return "".join(elem.itertext()).strip()


def _parse_pubmed_date(pub_date_elem) -> datetime.datetime | None:
    """Convert a ``<PubDate>`` element into a ``datetime``.

    Reads the ``Year``/``Month``/``Day`` children, defaulting a missing month
    to ``Jan`` and a missing day to ``1``. When there is no ``Year`` child the
    leading four digits of ``MedlineDate`` (e.g. ``"2019 Nov-Dec"``) are used
    as the year. If the month/day cannot be parsed the result falls back to
    1 January of the year.

    Returns:
        The parsed date, or ``None`` when the element is ``None`` or no usable
        year can be found.
    """
    if pub_date_elem is None:
        return None

    year = (pub_date_elem.findtext("Year") or "").strip()
    if not year:
        # Non-standard dates are delivered as free text in <MedlineDate>;
        # salvage the year when the text starts with four ASCII digits.
        candidate = (pub_date_elem.findtext("MedlineDate") or "").strip()[:4]
        if len(candidate) == 4 and candidate.isascii() and candidate.isdigit():
            year = candidate
    if not year:
        return None

    month = (pub_date_elem.findtext("Month") or "").strip() or "Jan"
    day = (pub_date_elem.findtext("Day") or "").strip() or "1"

    # "%b" handles abbreviated month names ("Jan"), "%m" numeric months.
    for fmt in ("%Y %b %d", "%Y %m %d"):
        try:
            return datetime.datetime.strptime(f"{year} {month} {day}", fmt)
        except ValueError:
            continue

    # Month/day were unusable; keep at least the year.
    try:
        return datetime.datetime(int(year), 1, 1)
    except (ValueError, TypeError):
        return None


def _extract_doi(article, article_elem) -> str | None:
    """Return the DOI of the article itself, or ``None`` if none is listed.

    Args:
        article: The ``<PubmedArticle>`` element.
        article_elem: The ``<Article>`` element nested in its citation.
    """
    # Use the explicit path rather than a ".//" descendant search: cited
    # references under PubmedData/ReferenceList carry their own ArticleIdList,
    # and a descendant search could return a reference's DOI.
    for id_elem in article.findall("PubmedData/ArticleIdList/ArticleId"):
        if id_elem.get("IdType") == "doi":
            doi = (id_elem.text or "").strip()
            if doi:
                return doi

    # Fall back to the electronic location id recorded on the article.
    for loc_elem in article_elem.findall("ELocationID"):
        if loc_elem.get("EIdType") == "doi":
            doi = (loc_elem.text or "").strip()
            if doi:
                return doi
    return None


def _parse_pubmed_article(article) -> dict | None:
    """Parse a ``<PubmedArticle>`` element into the standard paper dict.

    Returns:
        The dict built by ``paper_dict``, or ``None`` when the element lacks a
        ``MedlineCitation``, an ``Article``, a PMID or a title, or when any
        unexpected error occurs while parsing (the error is logged).
    """
    try:
        medline = article.find("MedlineCitation")
        if medline is None:
            return None
        pmid = (medline.findtext("PMID") or "").strip()
        article_elem = medline.find("Article")
        if article_elem is None:
            return None

        title = _element_text(article_elem.find("ArticleTitle"))
        if not title or not pmid:
            return None

        # Structured abstracts have several <AbstractText> sections; join the
        # non-empty ones with a space.
        abstract_sections = (
            _element_text(section)
            for section in article_elem.findall(".//AbstractText")
        )
        abstract = " ".join(text for text in abstract_sections if text) or None

        # Authors are rendered as "LastName ForeName"; entries without a
        # LastName are skipped.
        author_names = []
        for author in article_elem.findall(".//Author"):
            last = author.findtext("LastName") or ""
            first = author.findtext("ForeName") or ""
            if last:
                author_names.append(f"{last} {first}".strip())
        authors = ", ".join(author_names) if author_names else None

        pub_date_elem = article_elem.find(".//Journal/JournalIssue/PubDate")

        # At most ten MeSH descriptor names are kept as topics.
        mesh = medline.findall(".//MeshHeading/DescriptorName")
        topics = [m.text for m in mesh if m.text][:10]

        return paper_dict(
            title=title,
            abstract=abstract,
            authors=authors,
            publication_date=_parse_pubmed_date(pub_date_elem),
            source="pubmed",
            source_id=pmid,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            doi=_extract_doi(article, article_elem),
            topics=topics,
        )
    except Exception as exc:
        # Best effort: one malformed record must not abort the whole batch.
        logger.warning("Skipping malformed PubMed article: %s", exc)
        return None


def fetch_pubmed(query: str = "machine learning", max_results: int = 20) -> list[dict]:
    """Fetch papers from PubMed via NCBI E-utilities.

    Runs an esearch request (sorted by ``pub_date``) to obtain up to
    *max_results* PMIDs, then a single efetch request for their XML records.

    Args:
        query: Search term passed to esearch as ``term``.
        max_results: Value passed to esearch as ``retmax``.

    Returns:
        The successfully parsed papers. An empty list when the search yields
        no ids or when either request fails (failures are logged).
    """
    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "sort": "pub_date",
    }
    try:
        resp = _http.get(_ESEARCH_URL, params=search_params, timeout=15)
        resp.raise_for_status()
        id_list = resp.json().get("esearchresult", {}).get("idlist", [])
        # Joined inside the guarded section so an unexpected id payload is
        # reported as a failed search instead of raising to the caller.
        joined_ids = ",".join(str(pmid) for pmid in id_list)
    except Exception as exc:
        logger.warning("PubMed search failed: %s", exc)
        return []

    if not joined_ids:
        return []

    # Pause between the two requests; presumably this keeps the pair under
    # the 3 req/s unauthenticated limit noted in PubmedSource.rate_limit_hint.
    time.sleep(0.4)

    fetch_params = {
        "db": "pubmed",
        "id": joined_ids,
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=30)
        resp.raise_for_status()
        root = ElementTree.fromstring(resp.content)
    except Exception as exc:
        logger.warning("PubMed fetch failed: %s", exc)
        return []

    papers = []
    for article in root.findall(".//PubmedArticle"):
        parsed = _parse_pubmed_article(article)
        if parsed:
            papers.append(parsed)
    return papers


def fetch_pubmed_by_id(pmid: str) -> dict | None:
    """Fetch a single PubMed article by its PMID.

    Returns:
        The parsed paper dict, or ``None`` when the request fails (logged),
        the response holds no ``<PubmedArticle>``, or the record cannot be
        parsed.
    """
    fetch_params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=15)
        resp.raise_for_status()
        root = ElementTree.fromstring(resp.content)
    except Exception as exc:
        logger.warning("PubMed get_by_id failed for %s: %s", pmid, exc)
        return None

    article = root.find(".//PubmedArticle")
    if article is None:
        return None
    return _parse_pubmed_article(article)


class PubmedSource(PaperSource):
    """NCBI PubMed via E-utilities.

    Search + PMID lookup only.
    """

    name = "pubmed"

    def search(
        self,
        query: str,
        max_results: int = 20,
        filters: dict | None = None,
    ) -> list[dict]:
        """Search PubMed for *query*; *filters* is accepted but not used."""
        return fetch_pubmed(query=query, max_results=max_results)

    def get_by_id(self, identifier: str) -> dict | None:
        """Look up one article, treating *identifier* as a PMID."""
        return fetch_pubmed_by_id(identifier)

    def supports(self, capability: Capability) -> bool:
        """Report that no optional capability is supported."""
        # E-utilities can do elink for refs, but we don't expose that today.
        return False

    def rate_limit_hint(self) -> RateLimitHint:
        """Return the pacing hint for unauthenticated E-utilities access."""
        return RateLimitHint(
            min_interval_seconds=0.34,  # 3 req/s without an API key
            daily_quota=None,
            notes="NCBI E-utilities: 3 req/s without API key, 10 req/s with NCBI_API_KEY",
        )