Spaces:
Sleeping
Sleeping
| """PubMed source: NCBI E-utilities esearch + efetch.""" | |
| import datetime | |
| import logging | |
| import time | |
| from xml.etree import ElementTree | |
| from ._http import _http | |
| from .base import Capability, PaperSource, RateLimitHint, paper_dict | |
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# NCBI E-utilities endpoints used below: esearch is queried for the list of
# matching PubMed IDs, efetch for the XML records of those IDs.
_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
| def _parse_pubmed_date(pub_date_elem) -> datetime.datetime | None: | |
| if pub_date_elem is None: | |
| return None | |
| year = pub_date_elem.findtext("Year") | |
| month = pub_date_elem.findtext("Month") or "Jan" | |
| day = pub_date_elem.findtext("Day") or "1" | |
| if not year: | |
| return None | |
| for fmt in ("%Y %b %d", "%Y %m %d"): | |
| try: | |
| return datetime.datetime.strptime(f"{year} {month} {day}", fmt) | |
| except ValueError: | |
| continue | |
| try: | |
| return datetime.datetime(int(year), 1, 1) | |
| except (ValueError, TypeError): | |
| return None | |
def _parse_pubmed_article(article) -> dict | None:
    """Parse a `<PubmedArticle>` element into the standard paper dict.

    Args:
        article: A ``<PubmedArticle>`` element from an efetch XML response.

    Returns:
        The result of ``paper_dict(...)`` populated with title, abstract,
        authors, publication date, PMID, PubMed URL, DOI and up to ten MeSH
        descriptor names. Returns ``None`` when ``MedlineCitation`` or
        ``Article`` is missing, when the title or PMID is empty, or when
        parsing raises (the error is logged as a warning).
    """
    try:
        medline = article.find("MedlineCitation")
        if medline is None:
            return None
        pmid = (medline.findtext("PMID") or "").strip()
        article_elem = medline.find("Article")
        if article_elem is None:
            return None

        # ArticleTitle may contain inline markup (<i>, <sub>, <sup>, ...).
        # ``findtext`` only returns the text before the first child tag, so
        # collect every text node to avoid truncated titles.
        title_elem = article_elem.find("ArticleTitle")
        title = ""
        if title_elem is not None:
            title = "".join(title_elem.itertext()).strip()
        if not title or not pmid:
            return None

        # Same markup issue for abstracts; structured abstracts have several
        # AbstractText sections, which are joined with single spaces.
        abstract_chunks = (
            "".join(section.itertext()).strip()
            for section in article_elem.findall(".//AbstractText")
        )
        abstract = " ".join(chunk for chunk in abstract_chunks if chunk) or None

        author_names = []
        for author in article_elem.findall(".//Author"):
            last = author.findtext("LastName") or ""
            first = author.findtext("ForeName") or ""
            if last:
                author_names.append(f"{last} {first}".strip())
        authors = ", ".join(author_names) if author_names else None

        pub_date_elem = article_elem.find(".//Journal/JournalIssue/PubDate")

        mesh = medline.findall(".//MeshHeading/DescriptorName")
        topics = [m.text for m in mesh if m.text][:10]

        # Restrict the lookup to the article's own identifier list. A
        # descendant search (".//ArticleIdList") would also match the lists
        # under ReferenceList/Reference and could return a cited paper's DOI.
        doi = None
        for id_elem in article.findall("PubmedData/ArticleIdList/ArticleId"):
            candidate = (id_elem.text or "").strip()
            if id_elem.get("IdType") == "doi" and candidate:
                doi = candidate
                break
        if doi is None:
            # Fall back to the electronic location ID recorded on the Article.
            for eloc in article_elem.findall("ELocationID"):
                candidate = (eloc.text or "").strip()
                if eloc.get("EIdType") == "doi" and candidate:
                    doi = candidate
                    break

        return paper_dict(
            title=title,
            abstract=abstract,
            authors=authors,
            publication_date=_parse_pubmed_date(pub_date_elem),
            source="pubmed",
            source_id=pmid,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            doi=doi,
            topics=topics,
        )
    except Exception as exc:
        # Best-effort: one malformed record must not abort the whole batch.
        logger.warning("Skipping malformed PubMed article: %s", exc)
        return None
def fetch_pubmed(query: str = "machine learning", max_results: int = 20) -> list[dict]:
    """Search PubMed and return the matching articles as paper dicts.

    Two requests are made: esearch (JSON, ``sort=pub_date``) to obtain up to
    ``max_results`` PubMed IDs, then efetch (XML) to load those records.
    The function sleeps 0.4 s between the two calls.

    Args:
        query: Search term passed to esearch as ``term``.
        max_results: Value sent as ``retmax``.

    Returns:
        One dict per article that parsed successfully. An empty list is
        returned when either request fails (logged as a warning) or the
        search yields no IDs.
    """
    try:
        search_resp = _http.get(
            _ESEARCH_URL,
            params={
                "db": "pubmed",
                "term": query,
                "retmax": max_results,
                "retmode": "json",
                "sort": "pub_date",
            },
            timeout=15,
        )
        search_resp.raise_for_status()
        pmids = search_resp.json().get("esearchresult", {}).get("idlist", [])
    except Exception as exc:
        logger.warning("PubMed search failed: %s", exc)
        return []

    if not pmids:
        return []

    # Pause before the second E-utilities request.
    time.sleep(0.4)

    efetch_query = {
        "db": "pubmed",
        "id": ",".join(pmids),
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        fetch_resp = _http.get(_EFETCH_URL, params=efetch_query, timeout=30)
        fetch_resp.raise_for_status()
        tree_root = ElementTree.fromstring(fetch_resp.content)
    except Exception as exc:
        logger.warning("PubMed fetch failed: %s", exc)
        return []

    # Drop records the parser rejected (it returns None for those).
    parsed_records = map(_parse_pubmed_article, tree_root.findall(".//PubmedArticle"))
    return [record for record in parsed_records if record]
def fetch_pubmed_by_id(pmid: str) -> dict | None:
    """Look up one PubMed record by PMID via efetch.

    Args:
        pmid: The PubMed ID sent as the efetch ``id`` parameter.

    Returns:
        The parsed paper dict, or ``None`` when the request or XML parsing
        fails (logged as a warning), when the response contains no
        ``<PubmedArticle>``, or when the article itself cannot be parsed.
    """
    efetch_query = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        response = _http.get(_EFETCH_URL, params=efetch_query, timeout=15)
        response.raise_for_status()
        tree_root = ElementTree.fromstring(response.content)
    except Exception as exc:
        logger.warning("PubMed get_by_id failed for %s: %s", pmid, exc)
        return None

    # Only the first article in the response is considered.
    first_article = tree_root.find(".//PubmedArticle")
    return None if first_article is None else _parse_pubmed_article(first_article)
class PubmedSource(PaperSource):
    """Paper source backed by NCBI PubMed E-utilities.

    Offers keyword search and lookup by PMID; no optional capability is
    advertised.
    """

    name = "pubmed"

    def search(
        self,
        query: str,
        max_results: int = 20,
        filters: dict | None = None,
    ) -> list[dict]:
        """Return PubMed results for ``query``.

        ``filters`` is accepted but not used by this source.
        """
        return fetch_pubmed(query, max_results)

    def get_by_id(self, identifier: str) -> dict | None:
        """Return the article whose PMID is ``identifier``, or ``None``."""
        return fetch_pubmed_by_id(identifier)

    def supports(self, capability: Capability) -> bool:
        """Report every capability as unsupported."""
        # E-utilities could serve references through elink, but that is not
        # exposed here yet.
        return False

    def rate_limit_hint(self) -> RateLimitHint:
        """Describe the request pacing suggested for NCBI E-utilities."""
        hint = RateLimitHint(
            min_interval_seconds=0.34,  # 3 req/s without an API key
            daily_quota=None,
            notes="NCBI E-utilities: 3 req/s without API key, 10 req/s with NCBI_API_KEY",
        )
        return hint