File size: 5,702 Bytes
57272d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""PubMed source: NCBI E-utilities esearch + efetch."""

import datetime
import logging
import time
from xml.etree import ElementTree

from ._http import _http
from .base import Capability, PaperSource, RateLimitHint, paper_dict

logger = logging.getLogger(__name__)

_ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
_EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"


def _parse_pubmed_date(pub_date_elem) -> datetime.datetime | None:
    if pub_date_elem is None:
        return None
    year = pub_date_elem.findtext("Year")
    month = pub_date_elem.findtext("Month") or "Jan"
    day = pub_date_elem.findtext("Day") or "1"
    if not year:
        return None
    for fmt in ("%Y %b %d", "%Y %m %d"):
        try:
            return datetime.datetime.strptime(f"{year} {month} {day}", fmt)
        except ValueError:
            continue
    try:
        return datetime.datetime(int(year), 1, 1)
    except (ValueError, TypeError):
        return None


def _parse_pubmed_article(article) -> dict | None:
    """Parse a `<PubmedArticle>` element into the standard paper dict.

    Extracts the PMID, title, abstract, authors, journal publication date,
    up to ten MeSH descriptor names and the DOI, and passes them to
    `paper_dict`.

    Returns None when `MedlineCitation` or `Article` is missing, when the
    title or PMID is empty, or when any exception is raised while parsing
    (the exception is logged as a warning, not propagated).
    """
    try:
        medline = article.find("MedlineCitation")
        if medline is None:
            return None
        pmid = (medline.findtext("PMID") or "").strip()
        article_elem = medline.find("Article")
        if article_elem is None:
            return None

        # Titles may contain inline markup (<i>, <sub>, <sup>, ...);
        # findtext() would stop at the first child element, so gather the
        # text of the whole subtree instead.
        title_elem = article_elem.find("ArticleTitle")
        title = "".join(title_elem.itertext()).strip() if title_elem is not None else ""
        if not title or not pmid:
            return None

        # Same inline-markup concern as the title: use itertext() so that
        # abstract sections are not truncated at the first nested tag.
        abstract_chunks = (
            "".join(e.itertext()).strip()
            for e in article_elem.findall(".//AbstractText")
        )
        abstract = " ".join(chunk for chunk in abstract_chunks if chunk) or None

        author_names = []
        for a in article_elem.findall(".//Author"):
            last = a.findtext("LastName") or ""
            first = a.findtext("ForeName") or ""
            if last:
                author_names.append(f"{last} {first}".strip())
        authors = ", ".join(author_names) if author_names else None

        pub_date_elem = article_elem.find(".//Journal/JournalIssue/PubDate")

        mesh = medline.findall(".//MeshHeading/DescriptorName")
        topics = [m.text for m in mesh if m.text][:10]

        doi = None
        # Look only at the article's own id list. A descendant search
        # (".//ArticleIdList") would also match the id lists of cited
        # references under <ReferenceList> and could return a cited
        # paper's DOI when this article has none.
        for id_elem in article.findall("PubmedData/ArticleIdList/ArticleId"):
            candidate = (id_elem.text or "").strip()
            if id_elem.get("IdType") == "doi" and candidate:
                doi = candidate
                break
        if doi is None:
            # Fallback: the DOI may also be recorded on the Article itself.
            for eloc in article_elem.findall("ELocationID"):
                candidate = (eloc.text or "").strip()
                if eloc.get("EIdType") == "doi" and candidate:
                    doi = candidate
                    break

        return paper_dict(
            title=title,
            abstract=abstract,
            authors=authors,
            publication_date=_parse_pubmed_date(pub_date_elem),
            source="pubmed",
            source_id=pmid,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
            doi=doi,
            topics=topics,
        )
    except Exception as exc:
        # Best effort: one malformed record must not abort a whole batch.
        logger.warning("Skipping malformed PubMed article: %s", exc)
        return None


def fetch_pubmed(query: str = "machine learning", max_results: int = 20) -> list[dict]:
    """Fetch papers from PubMed via NCBI E-utilities.

    Performs an esearch request for `query` (sorted by publication date,
    limited to `max_results` ids), then a single efetch request for all
    returned ids, and parses every `<PubmedArticle>` in the response.

    Returns the list of parsed paper dicts; articles the parser rejects are
    dropped. An empty list is returned when the query is blank, when
    `max_results` is not positive, when the search yields no ids, or when
    either HTTP request / response decoding fails (failures are logged as
    warnings, not raised).
    """
    # Nothing can come back for a blank query or a non-positive limit, so
    # skip both network round trips.
    if not query or not query.strip():
        return []
    if max_results is not None and max_results <= 0:
        return []

    search_params = {
        "db": "pubmed",
        "term": query,
        "retmax": max_results,
        "retmode": "json",
        "sort": "pub_date",
    }
    try:
        resp = _http.get(_ESEARCH_URL, params=search_params, timeout=15)
        resp.raise_for_status()
        id_list = resp.json().get("esearchresult", {}).get("idlist", [])
    except Exception as exc:
        logger.warning("PubMed search failed: %s", exc)
        return []

    if not id_list:
        return []

    # Pause between the two requests to stay under the unauthenticated
    # E-utilities request rate.
    time.sleep(0.4)

    fetch_params = {
        "db": "pubmed",
        "id": ",".join(id_list),
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=30)
        resp.raise_for_status()
        root = ElementTree.fromstring(resp.content)
    except Exception as exc:
        logger.warning("PubMed fetch failed: %s", exc)
        return []

    papers = []
    for article in root.findall(".//PubmedArticle"):
        parsed = _parse_pubmed_article(article)
        if parsed:
            papers.append(parsed)
    return papers


def fetch_pubmed_by_id(pmid: str) -> dict | None:
    """Fetch a single PubMed article by its PMID."""
    fetch_params = {
        "db": "pubmed",
        "id": pmid,
        "retmode": "xml",
        "rettype": "abstract",
    }
    try:
        resp = _http.get(_EFETCH_URL, params=fetch_params, timeout=15)
        resp.raise_for_status()
        root = ElementTree.fromstring(resp.content)
    except Exception as exc:
        logger.warning("PubMed get_by_id failed for %s: %s", pmid, exc)
        return None

    article = root.find(".//PubmedArticle")
    if article is None:
        return None
    return _parse_pubmed_article(article)


class PubmedSource(PaperSource):
    """Paper source backed by NCBI PubMed (E-utilities): keyword search and PMID lookup."""

    name = "pubmed"

    def search(
        self,
        query: str,
        max_results: int = 20,
        filters: dict | None = None,
    ) -> list[dict]:
        """Search PubMed for `query` and return up to `max_results` paper dicts.

        `filters` is accepted but not used by this source.
        """
        papers = fetch_pubmed(query=query, max_results=max_results)
        return papers

    def get_by_id(self, identifier: str) -> dict | None:
        """Look up one article, treating `identifier` as a PMID."""
        paper = fetch_pubmed_by_id(identifier)
        return paper

    def supports(self, capability: Capability) -> bool:
        """Report that no optional capability is supported."""
        # elink could provide references via E-utilities, but that is not
        # exposed today, so every capability query answers False.
        return False

    def rate_limit_hint(self) -> RateLimitHint:
        """Describe the request pacing to use for E-utilities."""
        hint = RateLimitHint(
            # 0.34 s between calls, i.e. 3 req/s without an API key
            min_interval_seconds=0.34,
            daily_quota=None,
            notes="NCBI E-utilities: 3 req/s without API key, 10 req/s with NCBI_API_KEY",
        )
        return hint