mpkato committed on
Commit
85cb39c
·
verified ·
1 Parent(s): 72324cd

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +264 -66
app.py CHANGED
@@ -14,41 +14,15 @@ from transformers import pipeline
14
 
15
  MODEL_ID = os.environ.get("CHEMU_MODEL_ID", "mpkato/chemu-biobert-ner")
16
 
17
- TITLE = "ChEMU NER (BioBERT)"
18
- DESCRIPTION = """\
19
- Fine-tuned **BioBERT** for extracting reaction-step entities from
20
- chemical patents, trained on the [ChEMU 2020 Task 1]\
21
- (https://chemu-patent-ie.github.io/) corpus.
22
-
23
- Paste any chemical patent snippet below and the model will highlight
24
- the 10 entity types (reactants, catalysts, solvents, products,
25
- conditions, yields, labels).
26
-
27
- **Held-out dev F1 (exact match, micro): \u2248 0.95**
28
- """
29
-
30
- ENTITY_GUIDE = """\
31
- | Label | Meaning | Examples |
32
- |---|---|---|
33
- | **STARTING_MATERIAL** | reactant that provides the core skeleton | `aniline`, `benzyl bromide` |
34
- | **REAGENT_CATALYST** | reagent / catalyst / base / oxidant / reductant | `sodium hydride`, `DIPEA` |
35
- | **REACTION_PRODUCT** | target product of the reaction | `tert-butyl 2-(4-pyridyl)pyrrolidine-1-carboxylate` |
36
- | **SOLVENT** | reaction or extraction medium | `THF`, `dioxane`, `acetonitrile` |
37
- | **OTHER_COMPOUND** | auxiliary: brines, drying agents, washes, by-products | `brine`, `celite`, `ethyl acetate` |
38
- | **TEMPERATURE** | reaction temperature or range | `50 \u00b0C`, `room temperature` |
39
- | **TIME** | elapsed reaction time | `2 h`, `overnight`, `30 min` |
40
- | **YIELD_PERCENT** | yield expressed as percentage | `56%`, `quantitative` |
41
- | **YIELD_OTHER** | yield expressed as mass or moles | `1.30 g`, `2.5 mmol` |
42
- | **EXAMPLE_LABEL** | numeric/identifier labels for compounds or examples | `Example 5`, `(1)`, `14` |
43
- """
44
 
45
  EXAMPLES = [
46
- [
47
- "Under blue LED light, N-Boc-pyrrolidine was coupled with "
48
- "4-cyanopyridine in acetonitrile using [Ru(bpy)\u2083]Cl\u2082 "
49
- "as the photocatalyst and DIPEA as the reductant to afford "
50
- "tert-butyl 2-(4-pyridyl)pyrrolidine-1-carboxylate."
51
- ],
52
  [
53
  "Step 1. 4-chloro-2-fluorobenzoic acid (5.0 g, 12.3 mmol) was "
54
  "dissolved in dioxane (40 mL) at room temperature for 2 h."
@@ -60,12 +34,204 @@ EXAMPLES = [
60
  ],
61
  ]
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  def _load_pipeline():
65
  return pipeline(
66
  "token-classification",
67
  model=MODEL_ID,
68
  aggregation_strategy="simple",
 
69
  )
70
 
71
 
@@ -73,17 +239,10 @@ NER = _load_pipeline()
73
 
74
 
75
  def extract(text: str):
76
- """Run the NER model and return a list of (text, label) segments.
77
-
78
- Gradio's `HighlightedText` accepts a list of tuples where `label=None`
79
- means un-highlighted plain text.
80
- """
81
  if not text:
82
  return []
83
  result = NER(text)
84
- # `aggregation_strategy="simple"` merges adjacent subwords into entity
85
- # chunks with `start`, `end`, `entity_group` fields. We walk the text
86
- # and emit plain / highlighted segments in order.
87
  spans: list[tuple[str, str | None]] = []
88
  cursor = 0
89
  for ent in result:
@@ -97,36 +256,75 @@ def extract(text: str):
97
  return spans
98
 
99
 
100
- with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as demo:
101
- gr.Markdown(f"# {TITLE}")
102
- gr.Markdown(DESCRIPTION)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  with gr.Row():
104
- with gr.Column(scale=2):
105
- text_in = gr.Textbox(
106
- label="Chemical patent text",
107
- lines=8,
108
- placeholder="Paste a reaction description here...",
109
- )
110
- extract_btn = gr.Button("Extract entities", variant="primary")
111
- highlighted = gr.HighlightedText(
112
- label="Detected entities",
113
- combine_adjacent=True,
114
- show_legend=True,
115
- )
116
- gr.Examples(examples=EXAMPLES, inputs=[text_in])
117
- with gr.Column(scale=1):
118
- gr.Markdown("### Entity legend")
119
- gr.Markdown(ENTITY_GUIDE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
 
121
  extract_btn.click(extract, inputs=[text_in], outputs=[highlighted])
 
122
  text_in.submit(extract, inputs=[text_in], outputs=[highlighted])
123
 
124
- gr.Markdown(
125
- "---\n"
126
- "Model: [`mpkato/chemu-biobert-ner`](https://huggingface.co/mpkato/chemu-biobert-ner) \n"
127
- "Training data: ChEMU 2020 NER corpus (CC BY-NC 3.0), for **non-commercial research use only**. \n"
128
- "Base encoder: [`dmis-lab/biobert-base-cased-v1.2`](https://huggingface.co/dmis-lab/biobert-base-cased-v1.2)"
129
- )
130
 
131
 
132
  if __name__ == "__main__":
 
14
 
15
  MODEL_ID = os.environ.get("CHEMU_MODEL_ID", "mpkato/chemu-biobert-ner")
16
 
17
+ DEFAULT_TEXT = (
18
+ "Under blue LED light, N-Boc-pyrrolidine was coupled with "
19
+ "4-cyanopyridine in acetonitrile using [Ru(bpy)\u2083]Cl\u2082 "
20
+ "as the photocatalyst and DIPEA as the reductant to afford "
21
+ "tert-butyl 2-(4-pyridyl)pyrrolidine-1-carboxylate."
22
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  EXAMPLES = [
25
+ [DEFAULT_TEXT],
 
 
 
 
 
26
  [
27
  "Step 1. 4-chloro-2-fluorobenzoic acid (5.0 g, 12.3 mmol) was "
28
  "dissolved in dioxane (40 mL) at room temperature for 2 h."
 
34
  ],
35
  ]
36
 
37
# Color palette for the 10 entity types: one light tint per label, with
# every pair kept clearly distinguishable on a light background. The
# highlight color is what ties a span to its legend entry, so no two
# labels may share near-identical tints.
COLOR_MAP = {
    "STARTING_MATERIAL": "#BBDEFB",  # blue
    "REAGENT_CATALYST": "#FFE0B2",  # orange
    "REACTION_PRODUCT": "#C8E6C9",  # green
    "SOLVENT": "#E1BEE7",  # purple
    "OTHER_COMPOUND": "#E0E0E0",  # gray
    "TEMPERATURE": "#FFCDD2",  # red
    "TIME": "#FFF59D",  # yellow
    "YIELD_PERCENT": "#B2DFDB",  # teal
    # Brown rather than the previous cyan (#B3E5FC), which was almost
    # indistinguishable from the STARTING_MATERIAL blue.
    "YIELD_OTHER": "#D7CCC8",  # brown
    "EXAMPLE_LABEL": "#F8BBD0",  # pink
}
51
+
52
+ # Held-out dev F1 per type (from the training run)
53
+ PER_TYPE_METRICS = [
54
+ ["STARTING_MATERIAL", 0.8881, 413],
55
+ ["REAGENT_CATALYST", 0.9005, 289],
56
+ ["REACTION_PRODUCT", 0.9553, 506],
57
+ ["SOLVENT", 0.9545, 250],
58
+ ["OTHER_COMPOUND", 0.9689, 1080],
59
+ ["TEMPERATURE", 0.9813, 346],
60
+ ["TIME", 0.9862, 252],
61
+ ["YIELD_PERCENT", 1.0000, 228],
62
+ ["YIELD_OTHER", 0.9867, 261],
63
+ ["EXAMPLE_LABEL", 0.9862, 218],
64
+ ]
65
+
66
+ ENTITY_DESCRIPTIONS = {
67
+ "STARTING_MATERIAL": ("Reactant providing the core skeleton",
68
+ "aniline, benzyl bromide, N-Boc-pyrrolidine"),
69
+ "REAGENT_CATALYST": ("Reagent, catalyst, base, oxidant, reductant",
70
+ "sodium hydride, DIPEA, [Ru(bpy)₃]Cl₂"),
71
+ "REACTION_PRODUCT": ("Target product of the reaction",
72
+ "tert-butyl 2-(4-pyridyl)pyrrolidine-1-carboxylate"),
73
+ "SOLVENT": ("Reaction or extraction medium",
74
+ "THF, dioxane, acetonitrile"),
75
+ "OTHER_COMPOUND": ("Auxiliary: brine, drying agent, wash, by-product",
76
+ "brine, celite, ethyl acetate"),
77
+ "TEMPERATURE": ("Reaction temperature (or range)",
78
+ "50 °C, room temperature, −78 °C"),
79
+ "TIME": ("Elapsed reaction time",
80
+ "2 h, overnight, 30 min"),
81
+ "YIELD_PERCENT": ("Yield as a percentage",
82
+ "56%, 85%, quantitative"),
83
+ "YIELD_OTHER": ("Yield expressed as mass or moles",
84
+ "1.30 g, 2.5 mmol"),
85
+ "EXAMPLE_LABEL": ("Numeric identifier for a compound or example",
86
+ "Example 5, (1), 14"),
87
+ }
88
+
89
+
90
def _legend_html() -> str:
    """Render the entity legend as one HTML grid.

    Emits one ``legend-row`` per entry of ``ENTITY_DESCRIPTIONS``, in that
    dict's iteration order. Each row holds a chip whose background comes
    from ``COLOR_MAP`` plus the entry's description and example text.
    The values are interpolated as-is (no HTML escaping).
    """

    def _row(label: str, desc: str, examples: str) -> str:
        # A label missing from COLOR_MAP raises KeyError here.
        color = COLOR_MAP[label]
        return f"""
        <div class="legend-row">
          <span class="legend-chip" style="background:{color}">{label}</span>
          <div class="legend-body">
            <div class="legend-desc">{desc}</div>
            <div class="legend-examples"><em>e.g.</em> {examples}</div>
          </div>
        </div>
        """

    body = "".join(
        _row(label, desc, examples)
        for label, (desc, examples) in ENTITY_DESCRIPTIONS.items()
    )
    return '<div class="legend-grid">' + body + "</div>"
106
+
107
+
108
+ CUSTOM_CSS = """
109
+ .gradio-container {
110
+ max-width: 1100px !important;
111
+ margin: 0 auto !important;
112
+ }
113
+
114
+ #header-block {
115
+ background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
116
+ color: #ffffff;
117
+ padding: 32px 28px;
118
+ border-radius: 18px;
119
+ text-align: center;
120
+ box-shadow: 0 8px 24px rgba(30, 60, 114, 0.20);
121
+ margin-bottom: 24px;
122
+ }
123
+ #header-block h1 {
124
+ color: #ffffff;
125
+ margin: 0 0 8px 0;
126
+ font-size: 2.2rem;
127
+ font-weight: 700;
128
+ }
129
+ #header-block p {
130
+ color: rgba(255, 255, 255, 0.92);
131
+ margin: 4px 0;
132
+ font-size: 1.0rem;
133
+ }
134
+ #header-block .chip-row {
135
+ margin-top: 14px;
136
+ }
137
+ #header-block .chip {
138
+ display: inline-block;
139
+ padding: 6px 14px;
140
+ margin: 4px 4px 0 4px;
141
+ background: rgba(255, 255, 255, 0.18);
142
+ border: 1px solid rgba(255, 255, 255, 0.35);
143
+ border-radius: 999px;
144
+ font-size: 0.9rem;
145
+ font-weight: 500;
146
+ }
147
+
148
+ .section-title {
149
+ color: #1e3c72;
150
+ font-size: 1.25rem;
151
+ font-weight: 700;
152
+ margin: 24px 0 8px 0;
153
+ }
154
+
155
+ .legend-grid {
156
+ display: grid;
157
+ grid-template-columns: repeat(2, 1fr);
158
+ gap: 12px 24px;
159
+ padding: 16px 4px;
160
+ }
161
+ @media (max-width: 700px) {
162
+ .legend-grid { grid-template-columns: 1fr; }
163
+ }
164
+ .legend-row {
165
+ display: flex;
166
+ align-items: flex-start;
167
+ gap: 12px;
168
+ }
169
+ .legend-chip {
170
+ display: inline-block;
171
+ min-width: 160px;
172
+ padding: 6px 12px;
173
+ border-radius: 8px;
174
+ font-family: ui-monospace, Menlo, Consolas, monospace;
175
+ font-size: 0.82rem;
176
+ font-weight: 700;
177
+ text-align: center;
178
+ color: #1a1a1a;
179
+ flex-shrink: 0;
180
+ }
181
+ .legend-body {
182
+ font-size: 0.9rem;
183
+ line-height: 1.4;
184
+ }
185
+ .legend-desc { color: #1a1a1a; }
186
+ .legend-examples { color: #5f6368; font-size: 0.82rem; margin-top: 2px; }
187
+
188
+ #footer-block {
189
+ margin-top: 32px;
190
+ padding: 18px;
191
+ background: #f5f7fb;
192
+ border-radius: 12px;
193
+ color: #455a64;
194
+ text-align: center;
195
+ font-size: 0.88rem;
196
+ }
197
+ #footer-block a { color: #1e3c72; text-decoration: none; font-weight: 600; }
198
+ #footer-block a:hover { text-decoration: underline; }
199
+ """
200
+
201
+ HEADER_HTML = """
202
+ <div id="header-block">
203
+ <h1>⚗️ ChEMU NER Demo</h1>
204
+ <p>Named-entity extraction for chemical reaction descriptions in patents</p>
205
+ <div class="chip-row">
206
+ <span class="chip">BioBERT fine-tune</span>
207
+ <span class="chip">held-out dev F1 = 0.9585</span>
208
+ <span class="chip">10 entity types</span>
209
+ <span class="chip">CC BY-NC 3.0</span>
210
+ </div>
211
+ </div>
212
+ """
213
+
214
+ FOOTER_HTML = """
215
+ <div id="footer-block">
216
+ Model: <a href="https://huggingface.co/mpkato/chemu-biobert-ner"
217
+ target="_blank">mpkato/chemu-biobert-ner</a> &nbsp;|&nbsp;
218
+ Base: <a href="https://huggingface.co/dmis-lab/biobert-base-cased-v1.2"
219
+ target="_blank">dmis-lab/biobert-base-cased-v1.2</a> &nbsp;|&nbsp;
220
+ Data: <a href="https://chemu-patent-ie.github.io/"
221
+ target="_blank">ChEMU 2020 Task 1</a><br>
222
+ Training data is licensed under
223
+ <strong>CC BY-NC 3.0</strong> &mdash; this model and demo are
224
+ released for <strong>non-commercial research use only</strong>.
225
+ </div>
226
+ """
227
+
228
 
229
  def _load_pipeline():
230
  return pipeline(
231
  "token-classification",
232
  model=MODEL_ID,
233
  aggregation_strategy="simple",
234
+ stride=64,
235
  )
236
 
237
 
 
239
 
240
 
241
  def extract(text: str):
242
+ """Run the NER model and return a list of (text, label) segments."""
 
 
 
 
243
  if not text:
244
  return []
245
  result = NER(text)
 
 
 
246
  spans: list[tuple[str, str | None]] = []
247
  cursor = 0
248
  for ent in result:
 
256
  return spans
257
 
258
 
259
+ with gr.Blocks(
260
+ title="ChEMU NER Demo",
261
+ theme=gr.themes.Soft(
262
+ primary_hue="indigo",
263
+ secondary_hue="blue",
264
+ ),
265
+ css=CUSTOM_CSS,
266
+ ) as demo:
267
+ gr.HTML(HEADER_HTML)
268
+
269
+ gr.HTML('<div class="section-title">🧪 Reaction description</div>')
270
+ text_in = gr.Textbox(
271
+ label="",
272
+ lines=6,
273
+ value=DEFAULT_TEXT,
274
+ placeholder="Paste a chemical reaction description here...",
275
+ show_label=False,
276
+ )
277
  with gr.Row():
278
+ extract_btn = gr.Button(
279
+ "⚡ Extract entities",
280
+ variant="primary",
281
+ size="lg",
282
+ scale=3,
283
+ )
284
+ clear_btn = gr.Button("Clear", variant="secondary", scale=1)
285
+
286
+ gr.HTML('<div class="section-title">🔍 Extracted entities</div>')
287
+ highlighted = gr.HighlightedText(
288
+ label="",
289
+ combine_adjacent=True,
290
+ show_legend=False,
291
+ color_map=COLOR_MAP,
292
+ show_label=False,
293
+ )
294
+
295
+ gr.HTML('<div class="section-title">📋 Quick examples</div>')
296
+ gr.Examples(
297
+ examples=EXAMPLES,
298
+ inputs=[text_in],
299
+ label="",
300
+ examples_per_page=3,
301
+ )
302
+
303
+ with gr.Accordion("📊 Held-out dev performance (ChEMU 2020)", open=False):
304
+ gr.Dataframe(
305
+ headers=["Entity type", "F1", "support"],
306
+ value=[[t, f"{f1:.4f}", n] for t, f1, n in PER_TYPE_METRICS],
307
+ interactive=False,
308
+ wrap=True,
309
+ )
310
+ gr.Markdown(
311
+ "**Overall micro-F1 = 0.9585** on 225 held-out dev documents "
312
+ "(3,843 entities). For reference, the official BANNER baseline "
313
+ "scores 0.8893."
314
+ )
315
+
316
+ gr.HTML('<div class="section-title">📖 Entity type legend</div>')
317
+ gr.HTML(_legend_html())
318
+
319
+ gr.HTML(FOOTER_HTML)
320
 
321
  extract_btn.click(extract, inputs=[text_in], outputs=[highlighted])
322
+ clear_btn.click(lambda: ("", []), outputs=[text_in, highlighted])
323
  text_in.submit(extract, inputs=[text_in], outputs=[highlighted])
324
 
325
+ # Run inference once at load so the user sees a highlighted result
326
+ # as soon as the Space boots.
327
+ demo.load(extract, inputs=[text_in], outputs=[highlighted])
 
 
 
328
 
329
 
330
  if __name__ == "__main__":