Saracasm committed on
Commit
f98ebda
·
1 Parent(s): 86bce6d

Phase 6: UI polish complete - all 4 stages, SVG architecture diagrams, audio recorder fix, confidence explainer, honest overfitting analysis, light mode overrides

Browse files
Files changed (1) hide show
  1. app/app.py +1273 -65
app/app.py CHANGED
@@ -201,6 +201,15 @@ def make_wavefake_plot():
201
  # Prediction handler
202
  # ============================================================
203
 
 
 
 
 
 
 
 
 
 
204
  def predict_audio(audio_path):
205
  if audio_path is None:
206
  empty_badge = """
@@ -229,6 +238,16 @@ def predict_audio(audio_path):
229
  spoof_pct = result["spoof_probability"] * 100
230
  bona_pct = result["bonafide_probability"] * 100
231
 
 
 
 
 
 
 
 
 
 
 
232
  if pred == "spoof":
233
  badge_class = "result-card-spoof"
234
  icon = "⚠"
@@ -257,6 +276,32 @@ def predict_audio(audio_path):
257
  <div class='confidence-bar-track'>
258
  <div class='confidence-bar-fill' style='width: {confidence:.1f}%;'></div>
259
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  </div>
261
  <div class='result-card-probs'>
262
  <div class='prob-row'>
@@ -280,6 +325,8 @@ def predict_audio(audio_path):
280
  <span>{result['n_windows']} windows</span>
281
  <span class='meta-dot'>·</span>
282
  <span>{elapsed_ms:.0f}ms on CPU</span>
 
 
283
  </div>
284
  </div>
285
  """
@@ -1047,6 +1094,699 @@ body::before {
1047
  margin-top: 0.25rem !important;
1048
  }
1049
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1050
  """
1051
 
1052
 
@@ -1177,20 +1917,61 @@ with gr.Blocks(
1177
  with gr.Row(equal_height=False):
1178
  with gr.Column(scale=1):
1179
  gr.HTML("<div class='step-label'><span class='step-number'>1</span> Provide audio</div>")
1180
- audio_input = gr.Audio(
1181
- sources=["upload", "microphone"],
1182
- type="filepath",
1183
- label="",
1184
- elem_classes=["audio-input-styled"],
1185
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1186
 
1187
  gr.HTML("<div class='step-label' style='margin-top: 1.25rem;'><span class='step-number'>2</span> Run the detector</div>")
1188
  analyze_btn = gr.Button("Analyze audio →", variant="primary", size="lg", elem_classes=["analyze-button"])
1189
 
1190
  gr.HTML("<div class='step-label' style='margin-top: 1.5rem;'>Or try an example</div>")
 
 
 
 
 
 
 
 
 
 
 
 
1191
  gr.Examples(
1192
  examples=EXAMPLE_FILES,
1193
- inputs=audio_input,
1194
  label="",
1195
  )
1196
 
@@ -1213,8 +1994,8 @@ with gr.Blocks(
1213
  raw_output = gr.JSON(label=None)
1214
 
1215
  analyze_btn.click(
1216
- fn=predict_audio,
1217
- inputs=audio_input,
1218
  outputs=[badge_output, details_output, plot_output, raw_output],
1219
  )
1220
 
@@ -1223,66 +2004,335 @@ with gr.Blocks(
1223
  # TAB 3: PERFORMANCE
1224
  # ============================================================
1225
  with gr.Tab("Performance", id=2):
1226
- gr.Markdown("### Headline results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1227
 
1228
  with gr.Row():
1229
  gr.HTML("""
1230
- <div class='metric-card'>
1231
- <div class='metric-value' style='color:#16a34a;'>5.55%</div>
1232
- <div class='metric-label'><b>ASVspoof 2019 LA</b><br/>(unseen attacks A07-A19)</div>
 
 
1233
  </div>
1234
  """)
1235
  gr.HTML("""
1236
- <div class='metric-card'>
1237
- <div class='metric-value' style='color:#7c3aed;'>9.09%</div>
1238
- <div class='metric-label'><b>ASVspoof 2021 LA</b><br/>(codec-degraded audio)</div>
 
 
1239
  </div>
1240
  """)
1241
  gr.HTML("""
1242
- <div class='metric-card'>
1243
- <div class='metric-value' style='color:#dc2626;'>26.33%</div>
1244
- <div class='metric-label'><b>WaveFake</b><br/>(novel vocoder pipelines)</div>
 
 
1245
  </div>
1246
  """)
1247
 
1248
- gr.Markdown("""
1249
- #### Comparison to published baselines
 
 
 
 
 
1250
 
1251
- | System | 2019 LA EER | 2021 LA EER |
1252
- |---|---|---|
1253
- | Official LFCC-GMM baseline | 8.09% | 25.56% |
1254
- | Official CQCC-GMM baseline | 9.57% | 19.30% |
1255
- | Official LFCC-LCNN baseline | – | 9.26% |
1256
- | Official RawNet2 baseline | – | 9.50% |
1257
- | **This work (Wav2Vec 2.0)** | **5.55%** | **9.09%** |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1258
 
1259
- Our model outperforms LFCC-GMM on 2019 LA by 2.54 pp and matches the strongest neural
1260
- baselines (LFCC-LCNN, RawNet2) on 2021 LA — without any codec-specific training augmentation.
 
 
 
 
 
 
 
 
1261
  """)
 
 
 
1262
 
1263
- gr.Markdown("---")
1264
- gr.Markdown("### Performance by audio codec (ASVspoof 2021 LA)")
1265
- gr.Markdown("Real-world speech goes through codecs (compression for transmission). The model handles modern codecs well but struggles with aggressive cellular compression.")
1266
- gr.Plot(value=make_per_codec_plot(), label=None)
 
 
 
 
 
 
 
 
 
 
1267
 
1268
- gr.Markdown("---")
1269
- gr.Markdown("### Performance by attack type (ASVspoof 2019 LA eval)")
1270
- gr.Markdown("13 different synthesis methods (A07-A19), all unseen during training. A10 is the model's persistent weakness across both datasets.")
1271
- gr.Plot(value=make_per_attack_plot(), label=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1272
 
1273
- gr.Markdown("---")
1274
- gr.Markdown("### The WaveFake story (honest negative result)")
1275
- gr.Markdown("""
1276
- On WaveFake the model performs significantly worse — particularly on LJSpeech-based vocoders
1277
- (22-34% EER). This is because WaveFake tests pure neural vocoder synthesis, while the model
1278
- was trained on ASVspoof's mix of TTS + voice conversion attacks. **The model has learned
1279
- ASVspoof-specific synthesis artifacts but not universal vocoder detection.**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1280
 
1281
- JSUT (Japanese) numbers look artificially good because the bonafide examples are English LJSpeech —
1282
- the model is detecting language/domain, not actual spoofing artifacts. The LJSpeech-based numbers
1283
- are the methodologically meaningful results.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1284
  """)
1285
- gr.Plot(value=make_wavefake_plot(), label=None)
1286
 
1287
 
1288
  # ============================================================
@@ -1293,27 +2343,185 @@ with gr.Blocks(
1293
  gr.Markdown("## Architecture")
1294
 
1295
  gr.HTML("""
1296
- <div style="background:#1f2937;color:#e5e7eb;padding:1.5rem;border-radius:0.5rem;font-family:monospace;font-size:0.95rem;line-height:1.7;">
1297
- <div style="text-align:center;color:#a78bfa;font-weight:600;margin-bottom:0.5rem;">Pipeline</div>
1298
- raw waveform (16 kHz, 4 sec, 64,000 samples)<br>
1299
- &nbsp;&nbsp;&nbsp;&nbsp;|<br>
1300
- &nbsp;&nbsp;&nbsp;&nbsp;v<br>
1301
- <span style="color:#fbbf24;">Wav2Vec 2.0 Base backbone (95M params, 12 transformer layers)</span><br>
1302
- &nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;Stage 1: fully frozen<br>
1303
- &nbsp;&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;Stage 2: top 2 layers + final LayerNorm unfrozen (~14M trainable)<br>
1304
- &nbsp;&nbsp;&nbsp;&nbsp;v<br>
1305
- mean pooling over time<br>
1306
- &nbsp;&nbsp;&nbsp;&nbsp;|<br>
1307
- &nbsp;&nbsp;&nbsp;&nbsp;v<br>
1308
- <span style="color:#34d399;">linear classification head (768 -> 2)</span><br>
1309
- &nbsp;&nbsp;&nbsp;&nbsp;|<br>
1310
- &nbsp;&nbsp;&nbsp;&nbsp;v<br>
1311
- softmax -> P(spoof), P(bonafide)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1312
  </div>
1313
  """)
1314
 
1315
  gr.Markdown("## Two-stage training rationale")
1316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1317
  with gr.Row():
1318
  gr.HTML("""
1319
  <div class='stage-card'>
 
201
  # Prediction handler
202
  # ============================================================
203
 
204
def predict_audio_router(upload_path, record_path):
    """
    Dispatch to ``predict_audio`` using whichever audio input is populated.

    There are two audio inputs (upload tab and record tab). The upload
    path wins whenever it is not ``None``; otherwise the recorded path is
    forwarded unchanged, which may itself be ``None`` when neither input
    has a value.
    """
    # Guard clause: an uploaded file takes precedence if both are set.
    if upload_path is not None:
        return predict_audio(upload_path)
    return predict_audio(record_path)
211
+
212
+
213
  def predict_audio(audio_path):
214
  if audio_path is None:
215
  empty_badge = """
 
238
  spoof_pct = result["spoof_probability"] * 100
239
  bona_pct = result["bonafide_probability"] * 100
240
 
241
+ # Plain-language hint about difficulty based on confidence
242
+ if confidence >= 97:
243
+ difficulty_hint = "clear case"
244
+ elif confidence >= 80:
245
+ difficulty_hint = "moderately confident"
246
+ elif confidence >= 65:
247
+ difficulty_hint = "borderline"
248
+ else:
249
+ difficulty_hint = "uncertain — interpret with caution"
250
+
251
  if pred == "spoof":
252
  badge_class = "result-card-spoof"
253
  icon = "⚠"
 
276
  <div class='confidence-bar-track'>
277
  <div class='confidence-bar-fill' style='width: {confidence:.1f}%;'></div>
278
  </div>
279
+ <details class='confidence-explainer'>
280
+ <summary>What does this number mean?</summary>
281
+ <div class='confidence-explainer-body'>
282
+ <p>
283
+ <strong>Confidence is how much probability the model puts behind its
284
+ prediction.</strong> If it says "Likely synthetic" at 66%, it means the model
285
+ sees a 66% chance this audio is synthetic and a 34% chance it's authentic.
286
+ That IS the answer — the prediction label is just the side with more probability.
287
+ </p>
288
+ <p>
289
+ <strong>High confidence does not always mean the model is right.</strong>
290
+ On the example clips below, the model is 100% confident on the easy ones and
291
+ less confident on the harder ones — that's expected. But it can also be 100%
292
+ confident and <em>wrong</em>, especially on attack types it struggles with
293
+ (like A10, the hardest example). When a deepfake is made by a method the
294
+ model hasn't learned to detect, it may see no spoofing signal at all and
295
+ confidently call it authentic.
296
+ </p>
297
+ <p>
298
+ <strong>Bottom line:</strong> treat any single prediction as one piece of
299
+ evidence, not a definitive answer. High confidence means the model sees
300
+ strong signal — but it can't detect what it hasn't been trained to detect.
301
+ Try the examples in order (easy → hardest) to see how confidence varies.
302
+ </p>
303
+ </div>
304
+ </details>
305
  </div>
306
  <div class='result-card-probs'>
307
  <div class='prob-row'>
 
325
  <span>{result['n_windows']} windows</span>
326
  <span class='meta-dot'>·</span>
327
  <span>{elapsed_ms:.0f}ms on CPU</span>
328
+ <span class='meta-dot'>·</span>
329
+ <span class='meta-difficulty'>{difficulty_hint}</span>
330
  </div>
331
  </div>
332
  """
 
1094
  margin-top: 0.25rem !important;
1095
  }
1096
 
1097
+
1098
+ /* Input tabs (Upload file / Record mic) — smaller, segmented control feel */
1099
+ .input-tabs > .tab-nav {
1100
+ border-bottom: none !important;
1101
+ margin-bottom: 0.5rem !important;
1102
+ background: var(--background-fill-secondary, rgba(124, 58, 237, 0.04));
1103
+ border-radius: 0.5rem;
1104
+ padding: 0.25rem;
1105
+ width: fit-content;
1106
+ }
1107
+ .input-tabs > .tab-nav button {
1108
+ font-size: 0.85rem !important;
1109
+ font-weight: 500 !important;
1110
+ padding: 0.45rem 0.9rem !important;
1111
+ border-radius: 0.4rem !important;
1112
+ border: none !important;
1113
+ background: transparent !important;
1114
+ color: var(--body-text-color-subdued, #6b7280) !important;
1115
+ transition: all 0.15s ease !important;
1116
+ }
1117
+ .input-tabs > .tab-nav button:hover {
1118
+ background: var(--background-fill-primary, rgba(124, 58, 237, 0.08)) !important;
1119
+ color: var(--body-text-color, #111827) !important;
1120
+ }
1121
+ .input-tabs > .tab-nav button.selected {
1122
+ background: var(--background-fill-primary, white) !important;
1123
+ color: var(--brand-purple-400) !important;
1124
+ border-bottom: none !important;
1125
+ box-shadow: 0 1px 3px rgba(0,0,0,0.08);
1126
+ }
1127
+
1128
+
1129
+ /* Recording instructions banner */
1130
+ .record-instructions {
1131
+ display: flex;
1132
+ align-items: flex-start;
1133
+ gap: 0.75rem;
1134
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.08) 0%, rgba(236, 72, 153, 0.08) 100%);
1135
+ border: 1px solid rgba(124, 58, 237, 0.2);
1136
+ border-radius: 0.625rem;
1137
+ padding: 0.75rem 1rem;
1138
+ margin: 0.25rem 0 0.75rem 0;
1139
+ font-size: 0.85rem;
1140
+ line-height: 1.5;
1141
+ color: var(--body-text-color, #4b5563);
1142
+ }
1143
+ .record-instructions-icon {
1144
+ font-size: 1.25rem;
1145
+ flex-shrink: 0;
1146
+ line-height: 1.4;
1147
+ }
1148
+ .record-instructions-text {
1149
+ flex: 1;
1150
+ opacity: 0.9;
1151
+ }
1152
+ .record-instructions-text strong {
1153
+ color: var(--body-text-color, #111827);
1154
+ font-weight: 600;
1155
+ }
1156
+
1157
+ /* Force record waveform area to have visible height */
1158
+ .audio-record-styled .waveform-container,
1159
+ .audio-record-styled audio {
1160
+ min-height: 80px !important;
1161
+ }
1162
+
1163
+
1164
+ /* ============================================================
1165
+ STAGE 4: PERFORMANCE TAB POLISH
1166
+ ============================================================ */
1167
+
1168
+ /* Subsection header (smaller than section header) */
1169
+ .subsection-header {
1170
+ text-align: left;
1171
+ margin: 2rem 0 1rem 0;
1172
+ }
1173
+ .subsection-eyebrow {
1174
+ display: block;
1175
+ font-size: 0.75rem;
1176
+ font-weight: 600;
1177
+ color: var(--brand-purple-400);
1178
+ text-transform: uppercase;
1179
+ letter-spacing: 0.1em;
1180
+ margin-bottom: 0.4rem;
1181
+ }
1182
+ .subsection-title {
1183
+ font-size: 1.4rem !important;
1184
+ font-weight: 700 !important;
1185
+ letter-spacing: -0.015em !important;
1186
+ color: var(--body-text-color, #111827);
1187
+ margin: 0 0 0.5rem 0 !important;
1188
+ }
1189
+ .subsection-caption {
1190
+ font-size: 0.95rem !important;
1191
+ color: var(--body-text-color, #4b5563) !important;
1192
+ opacity: 0.85;
1193
+ line-height: 1.6 !important;
1194
+ margin: 0 !important;
1195
+ max-width: 780px;
1196
+ }
1197
+
1198
+ /* Performance metric cards (replaces older .metric-card on this tab) */
1199
+ .perf-metric-card {
1200
+ background: var(--background-fill-secondary, rgba(124, 58, 237, 0.04));
1201
+ border: 1px solid var(--border-color-primary, rgba(124, 58, 237, 0.15));
1202
+ border-radius: 1rem;
1203
+ padding: 1.5rem 1.25rem;
1204
+ text-align: center;
1205
+ transition: transform 0.2s ease, box-shadow 0.2s ease, border-color 0.2s ease;
1206
+ position: relative;
1207
+ overflow: hidden;
1208
+ }
1209
+ .perf-metric-card::before {
1210
+ content: '';
1211
+ position: absolute;
1212
+ top: 0; left: 0; right: 0;
1213
+ height: 3px;
1214
+ }
1215
+ .perf-metric-card:hover {
1216
+ transform: translateY(-3px);
1217
+ box-shadow: 0 16px 32px -12px rgba(124, 58, 237, 0.2);
1218
+ }
1219
+ .perf-metric-good::before { background: linear-gradient(90deg, transparent, #10b981, transparent); }
1220
+ .perf-metric-mid::before { background: linear-gradient(90deg, transparent, #a78bfa, transparent); }
1221
+ .perf-metric-warn::before { background: linear-gradient(90deg, transparent, #f59e0b, transparent); }
1222
+
1223
+ .perf-metric-good:hover { border-color: rgba(16, 185, 129, 0.4) !important; }
1224
+ .perf-metric-mid:hover { border-color: rgba(167, 139, 250, 0.5) !important; }
1225
+ .perf-metric-warn:hover { border-color: rgba(245, 158, 11, 0.4) !important; }
1226
+
1227
+ .perf-metric-tag {
1228
+ display: inline-block;
1229
+ font-size: 0.7rem;
1230
+ font-weight: 600;
1231
+ text-transform: uppercase;
1232
+ letter-spacing: 0.08em;
1233
+ padding: 0.25rem 0.6rem;
1234
+ border-radius: 999px;
1235
+ margin-bottom: 0.85rem;
1236
+ }
1237
+ .perf-metric-good .perf-metric-tag {
1238
+ background: rgba(16, 185, 129, 0.12);
1239
+ color: #10b981;
1240
+ border: 1px solid rgba(16, 185, 129, 0.25);
1241
+ }
1242
+ .perf-metric-mid .perf-metric-tag {
1243
+ background: rgba(167, 139, 250, 0.12);
1244
+ color: var(--brand-purple-400);
1245
+ border: 1px solid rgba(167, 139, 250, 0.3);
1246
+ }
1247
+ .perf-metric-warn .perf-metric-tag {
1248
+ background: rgba(245, 158, 11, 0.12);
1249
+ color: #f59e0b;
1250
+ border: 1px solid rgba(245, 158, 11, 0.3);
1251
+ }
1252
+
1253
+ .perf-metric-value {
1254
+ font-size: 2.75rem;
1255
+ font-weight: 800;
1256
+ letter-spacing: -0.03em;
1257
+ line-height: 1.05;
1258
+ margin-bottom: 0.5rem;
1259
+ font-variant-numeric: tabular-nums;
1260
+ }
1261
+ .perf-metric-good .perf-metric-value { color: #10b981; }
1262
+ .perf-metric-mid .perf-metric-value { color: var(--brand-purple-400); }
1263
+ .perf-metric-warn .perf-metric-value { color: #f59e0b; }
1264
+
1265
+ .perf-metric-name {
1266
+ font-size: 0.95rem;
1267
+ font-weight: 600;
1268
+ color: var(--body-text-color, #111827);
1269
+ letter-spacing: -0.01em;
1270
+ margin-bottom: 0.25rem;
1271
+ }
1272
+ .perf-metric-detail {
1273
+ font-size: 0.8rem;
1274
+ color: var(--body-text-color-subdued, #6b7280);
1275
+ opacity: 0.8;
1276
+ }
1277
+
1278
+ /* Comparison table */
1279
+ .comparison-table-wrap {
1280
+ background: var(--background-fill-secondary, rgba(124, 58, 237, 0.04));
1281
+ border: 1px solid var(--border-color-primary, rgba(124, 58, 237, 0.15));
1282
+ border-radius: 1rem;
1283
+ padding: 1.5rem;
1284
+ margin: 1rem 0;
1285
+ overflow: hidden;
1286
+ }
1287
+ .comparison-table {
1288
+ width: 100%;
1289
+ border-collapse: collapse;
1290
+ font-variant-numeric: tabular-nums;
1291
+ }
1292
+ .comparison-table thead {
1293
+ border-bottom: 2px solid var(--border-color-primary, rgba(124, 58, 237, 0.2));
1294
+ }
1295
+ .comparison-table th {
1296
+ padding: 0.75rem 0.5rem;
1297
+ text-align: left;
1298
+ font-size: 0.78rem;
1299
+ font-weight: 600;
1300
+ text-transform: uppercase;
1301
+ letter-spacing: 0.06em;
1302
+ color: var(--body-text-color-subdued, #6b7280);
1303
+ }
1304
+ .comparison-table th:nth-child(2),
1305
+ .comparison-table th:nth-child(3) {
1306
+ text-align: right;
1307
+ }
1308
+ .comparison-table td {
1309
+ padding: 0.75rem 0.5rem;
1310
+ font-size: 0.95rem;
1311
+ color: var(--body-text-color, #111827);
1312
+ border-bottom: 1px solid var(--border-color-primary, rgba(0,0,0,0.05));
1313
+ }
1314
+ .comparison-table td:nth-child(2),
1315
+ .comparison-table td:nth-child(3) {
1316
+ text-align: right;
1317
+ font-weight: 500;
1318
+ }
1319
+ .comparison-table tbody tr:last-child td {
1320
+ border-bottom: none;
1321
+ }
1322
+ .comparison-row-highlight td {
1323
+ background: linear-gradient(90deg, rgba(124, 58, 237, 0.08) 0%, rgba(236, 72, 153, 0.08) 100%);
1324
+ border-bottom: none !important;
1325
+ color: var(--body-text-color, #111827) !important;
1326
+ }
1327
+ .comparison-row-highlight td:first-child {
1328
+ border-radius: 0.5rem 0 0 0.5rem;
1329
+ }
1330
+ .comparison-row-highlight td:last-child {
1331
+ border-radius: 0 0.5rem 0.5rem 0;
1332
+ }
1333
+ .comparison-caption {
1334
+ margin: 1.25rem 0 0 0 !important;
1335
+ font-size: 0.9rem !important;
1336
+ color: var(--body-text-color, #4b5563) !important;
1337
+ opacity: 0.85;
1338
+ line-height: 1.6 !important;
1339
+ }
1340
+
1341
+ /* Chart wrapper — subtle frame around each plot */
1342
+ .chart-wrap {
1343
+ background: var(--background-fill-secondary, rgba(124, 58, 237, 0.03));
1344
+ border: 1px solid var(--border-color-primary, rgba(124, 58, 237, 0.12));
1345
+ border-radius: 1rem !important;
1346
+ padding: 1rem !important;
1347
+ margin-top: 0.5rem;
1348
+ }
1349
+
1350
+
1351
+ /* ============================================================
1352
+ LIGHT MODE OVERRIDES
1353
+ Card backgrounds and borders are tuned for dark mode (low alpha
1354
+ over dark bg). On light backgrounds, those tints become invisible.
1355
+ This block bumps alpha + uses solid neutrals only when NOT in dark mode.
1356
+ ============================================================ */
1357
+ body:not(.dark) .perf-metric-card {
1358
+ background: #ffffff !important;
1359
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1360
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1361
+ }
1362
+ body:not(.dark) .perf-metric-card:hover {
1363
+ box-shadow: 0 16px 32px -12px rgba(124, 58, 237, 0.18);
1364
+ }
1365
+
1366
+ body:not(.dark) .comparison-table-wrap {
1367
+ background: #ffffff !important;
1368
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1369
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1370
+ }
1371
+ body:not(.dark) .comparison-table td {
1372
+ border-bottom: 1px solid rgba(0, 0, 0, 0.06) !important;
1373
+ }
1374
+
1375
+ body:not(.dark) .chart-wrap {
1376
+ background: #ffffff !important;
1377
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1378
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1379
+ }
1380
+
1381
+ body:not(.dark) .context-card-v2 {
1382
+ background: #ffffff !important;
1383
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1384
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1385
+ }
1386
+
1387
+ body:not(.dark) .stage-card {
1388
+ background: #ffffff !important;
1389
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1390
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1391
+ }
1392
+
1393
+ body:not(.dark) .result-placeholder {
1394
+ background: rgba(124, 58, 237, 0.025) !important;
1395
+ border-color: rgba(124, 58, 237, 0.25) !important;
1396
+ }
1397
+
1398
+ body:not(.dark) .result-card-bonafide {
1399
+ background: linear-gradient(135deg, rgba(16, 185, 129, 0.06) 0%, #ffffff 100%) !important;
1400
+ border-color: rgba(16, 185, 129, 0.35) !important;
1401
+ }
1402
+ body:not(.dark) .result-card-spoof {
1403
+ background: linear-gradient(135deg, rgba(239, 68, 68, 0.06) 0%, #ffffff 100%) !important;
1404
+ border-color: rgba(239, 68, 68, 0.35) !important;
1405
+ }
1406
+
1407
+ body:not(.dark) .result-card-probs {
1408
+ background: rgba(124, 58, 237, 0.03) !important;
1409
+ }
1410
+
1411
+ body:not(.dark) .input-tabs > .tab-nav {
1412
+ background: rgba(124, 58, 237, 0.05) !important;
1413
+ }
1414
+ body:not(.dark) .input-tabs > .tab-nav button.selected {
1415
+ background: #ffffff !important;
1416
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
1417
+ }
1418
+
1419
+ body:not(.dark) .record-instructions {
1420
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.05) 0%, rgba(236, 72, 153, 0.05) 100%) !important;
1421
+ border-color: rgba(124, 58, 237, 0.25) !important;
1422
+ }
1423
+
1424
+ body:not(.dark) .cta-section-v2 {
1425
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.06) 0%, rgba(236, 72, 153, 0.06) 100%) !important;
1426
+ border-color: rgba(124, 58, 237, 0.25) !important;
1427
+ }
1428
+
1429
+ body:not(.dark) .hero-eyebrow {
1430
+ background: rgba(124, 58, 237, 0.08) !important;
1431
+ border-color: rgba(124, 58, 237, 0.3) !important;
1432
+ }
1433
+
1434
+
1435
+ /* ============================================================
1436
+ ARCHITECTURE DIAGRAMS — entrance animation, no pulsing
1437
+ ============================================================ */
1438
+ .arch-diagram-wrap {
1439
+ margin: 1.5rem 0;
1440
+ padding: 1.5rem;
1441
+ background: var(--background-fill-secondary, rgba(124, 58, 237, 0.03));
1442
+ border: 1px solid var(--border-color-primary, rgba(124, 58, 237, 0.15));
1443
+ border-radius: 1rem;
1444
+ color: var(--body-text-color, #111827);
1445
+ overflow: hidden;
1446
+ }
1447
+ body:not(.dark) .arch-diagram-wrap {
1448
+ background: #ffffff !important;
1449
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1450
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1451
+ }
1452
+ .arch-svg {
1453
+ display: block;
1454
+ max-width: 100%;
1455
+ height: auto;
1456
+ }
1457
+
1458
+ /* Entrance animation — each element fades in + slides down slightly */
1459
+ @keyframes archDrawIn {
1460
+ from {
1461
+ opacity: 0;
1462
+ transform: translateY(8px);
1463
+ }
1464
+ to {
1465
+ opacity: 1;
1466
+ transform: translateY(0);
1467
+ }
1468
+ }
1469
+ .arch-anim {
1470
+ animation: archDrawIn 0.5s ease-out both;
1471
+ transform-origin: center;
1472
+ }
1473
+
1474
+ /* Hover effect — boxes brighten slightly */
1475
+ .arch-svg rect {
1476
+ transition: opacity 0.2s ease, fill-opacity 0.2s ease;
1477
+ }
1478
+ .arch-svg g:hover rect {
1479
+ fill-opacity: 1;
1480
+ }
1481
+
1482
+ /* Reduce motion — respect user preference */
1483
+ @media (prefers-reduced-motion: reduce) {
1484
+ .arch-anim {
1485
+ animation: none;
1486
+ opacity: 1;
1487
+ transform: none;
1488
+ }
1489
+ }
1490
+
1491
+
1492
+ /* ============================================================
1493
+ PLAIN-LANGUAGE OVERFITTING SECTION
1494
+ ============================================================ */
1495
+ .plain-card {
1496
+ background: var(--background-fill-secondary, rgba(124, 58, 237, 0.03));
1497
+ border: 1px solid var(--border-color-primary, rgba(124, 58, 237, 0.15));
1498
+ border-radius: 1rem;
1499
+ padding: 2rem 1.75rem;
1500
+ margin: 1rem 0;
1501
+ }
1502
+ body:not(.dark) .plain-card {
1503
+ background: #ffffff !important;
1504
+ border: 1px solid rgba(124, 58, 237, 0.18) !important;
1505
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.04);
1506
+ }
1507
+
1508
+ .plain-card-eyebrow {
1509
+ display: inline-block;
1510
+ font-size: 0.7rem;
1511
+ font-weight: 700;
1512
+ color: var(--brand-purple-400);
1513
+ text-transform: uppercase;
1514
+ letter-spacing: 0.12em;
1515
+ background: rgba(124, 58, 237, 0.1);
1516
+ padding: 0.25rem 0.6rem;
1517
+ border-radius: 999px;
1518
+ margin-bottom: 0.75rem;
1519
+ }
1520
+ .plain-card-title {
1521
+ font-size: 1.35rem !important;
1522
+ font-weight: 700 !important;
1523
+ letter-spacing: -0.015em !important;
1524
+ color: var(--body-text-color, #111827);
1525
+ margin-bottom: 1.25rem !important;
1526
+ line-height: 1.3;
1527
+ }
1528
+ .plain-card-body {
1529
+ font-size: 1rem !important;
1530
+ line-height: 1.7 !important;
1531
+ color: var(--body-text-color, #374151) !important;
1532
+ margin: 0 0 1rem 0 !important;
1533
+ }
1534
+ .plain-card-body:last-child {
1535
+ margin-bottom: 0 !important;
1536
+ }
1537
+ .plain-card-body strong {
1538
+ color: var(--body-text-color, #111827);
1539
+ font-weight: 600;
1540
+ }
1541
+
1542
+ .analogy-diagram-wrap {
1543
+ margin: 1.5rem 0;
1544
+ padding: 1rem;
1545
+ background: rgba(124, 58, 237, 0.025);
1546
+ border-radius: 0.75rem;
1547
+ border: 1px solid rgba(124, 58, 237, 0.08);
1548
+ }
1549
+ body:not(.dark) .analogy-diagram-wrap {
1550
+ background: rgba(124, 58, 237, 0.02) !important;
1551
+ border-color: rgba(124, 58, 237, 0.1) !important;
1552
+ }
1553
+
1554
+ /* Takeaway grid in Part 3 */
1555
+ .takeaway-grid {
1556
+ display: grid;
1557
+ grid-template-columns: 1fr 1fr;
1558
+ gap: 1rem;
1559
+ margin: 1rem 0 1.25rem 0;
1560
+ }
1561
+ @media (max-width: 768px) {
1562
+ .takeaway-grid { grid-template-columns: 1fr; }
1563
+ }
1564
+ .takeaway-item {
1565
+ display: flex;
1566
+ align-items: flex-start;
1567
+ gap: 0.85rem;
1568
+ padding: 1rem 1.1rem;
1569
+ border-radius: 0.75rem;
1570
+ border: 1px solid;
1571
+ }
1572
+ .takeaway-good {
1573
+ background: rgba(16, 185, 129, 0.06);
1574
+ border-color: rgba(16, 185, 129, 0.25);
1575
+ }
1576
+ .takeaway-warn {
1577
+ background: rgba(245, 158, 11, 0.06);
1578
+ border-color: rgba(245, 158, 11, 0.25);
1579
+ }
1580
+ body:not(.dark) .takeaway-good {
1581
+ background: rgba(16, 185, 129, 0.04) !important;
1582
+ border-color: rgba(16, 185, 129, 0.3) !important;
1583
+ }
1584
+ body:not(.dark) .takeaway-warn {
1585
+ background: rgba(245, 158, 11, 0.04) !important;
1586
+ border-color: rgba(245, 158, 11, 0.3) !important;
1587
+ }
1588
+ .takeaway-icon {
1589
+ width: 28px;
1590
+ height: 28px;
1591
+ border-radius: 50%;
1592
+ display: flex;
1593
+ align-items: center;
1594
+ justify-content: center;
1595
+ font-size: 0.95rem;
1596
+ font-weight: 700;
1597
+ flex-shrink: 0;
1598
+ }
1599
+ .takeaway-good .takeaway-icon {
1600
+ background: rgba(16, 185, 129, 0.18);
1601
+ color: #10b981;
1602
+ border: 1px solid rgba(16, 185, 129, 0.35);
1603
+ }
1604
+ .takeaway-warn .takeaway-icon {
1605
+ background: rgba(245, 158, 11, 0.18);
1606
+ color: #f59e0b;
1607
+ border: 1px solid rgba(245, 158, 11, 0.35);
1608
+ }
1609
+ .takeaway-body {
1610
+ font-size: 0.94rem;
1611
+ line-height: 1.55;
1612
+ color: var(--body-text-color, #374151);
1613
+ }
1614
+ .takeaway-body strong {
1615
+ color: var(--body-text-color, #111827);
1616
+ font-weight: 600;
1617
+ }
1618
+
1619
+ /* Bottom quote */
1620
+ .plain-card-bottom-quote {
1621
+ margin-top: 1.5rem;
1622
+ padding: 1.25rem 1.5rem;
1623
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.08) 0%, rgba(236, 72, 153, 0.08) 100%);
1624
+ border-left: 3px solid var(--brand-purple-400);
1625
+ border-radius: 0 0.75rem 0.75rem 0;
1626
+ font-style: italic;
1627
+ font-size: 0.98rem;
1628
+ line-height: 1.6;
1629
+ color: var(--body-text-color, #374151);
1630
+ }
1631
+ body:not(.dark) .plain-card-bottom-quote {
1632
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.06) 0%, rgba(236, 72, 153, 0.06) 100%) !important;
1633
+ }
1634
+
1635
+
1636
+ /* ============================================================
1637
+ CONFIDENCE EXPLAINER (inside result card)
1638
+ ============================================================ */
1639
+ .confidence-explainer {
1640
+ margin-top: 0.75rem;
1641
+ border-radius: 0.5rem;
1642
+ background: rgba(124, 58, 237, 0.04);
1643
+ border: 1px solid rgba(124, 58, 237, 0.12);
1644
+ overflow: hidden;
1645
+ }
1646
+ body:not(.dark) .confidence-explainer {
1647
+ background: rgba(124, 58, 237, 0.025) !important;
1648
+ border-color: rgba(124, 58, 237, 0.18) !important;
1649
+ }
1650
+
1651
+ .confidence-explainer summary {
1652
+ cursor: pointer;
1653
+ padding: 0.65rem 0.9rem;
1654
+ font-size: 0.82rem;
1655
+ font-weight: 600;
1656
+ color: var(--brand-purple-400);
1657
+ list-style: none;
1658
+ user-select: none;
1659
+ display: flex;
1660
+ align-items: center;
1661
+ gap: 0.4rem;
1662
+ transition: background 0.15s ease;
1663
+ }
1664
+ .confidence-explainer summary::-webkit-details-marker {
1665
+ display: none;
1666
+ }
1667
+ .confidence-explainer summary::before {
1668
+ content: '▸';
1669
+ font-size: 0.7rem;
1670
+ transition: transform 0.2s ease;
1671
+ opacity: 0.7;
1672
+ }
1673
+ .confidence-explainer[open] summary::before {
1674
+ transform: rotate(90deg);
1675
+ }
1676
+ .confidence-explainer summary:hover {
1677
+ background: rgba(124, 58, 237, 0.06);
1678
+ }
1679
+
1680
+ .confidence-explainer-body {
1681
+ padding: 0 1rem 0.9rem 1rem;
1682
+ border-top: 1px solid rgba(124, 58, 237, 0.1);
1683
+ margin-top: 0.1rem;
1684
+ }
1685
+ .confidence-explainer-body p {
1686
+ margin: 0.85rem 0 0 0;
1687
+ font-size: 0.84rem;
1688
+ line-height: 1.6;
1689
+ color: var(--body-text-color, #374151);
1690
+ opacity: 0.92;
1691
+ }
1692
+ .confidence-explainer-body p:first-child {
1693
+ margin-top: 0.85rem;
1694
+ }
1695
+ .confidence-explainer-body strong {
1696
+ color: var(--body-text-color, #111827);
1697
+ font-weight: 600;
1698
+ opacity: 1;
1699
+ }
1700
+
1701
+
1702
+ /* Difficulty hint in result card meta line */
1703
+ .meta-difficulty {
1704
+ font-style: italic;
1705
+ opacity: 0.85;
1706
+ }
1707
+
1708
+
1709
+ /* Verdict callout — direct answer */
1710
+ .verdict-callout {
1711
+ background: linear-gradient(135deg, rgba(245, 158, 11, 0.12) 0%, rgba(245, 158, 11, 0.05) 100%);
1712
+ border-left: 4px solid #f59e0b;
1713
+ border-radius: 0 0.75rem 0.75rem 0;
1714
+ padding: 1.25rem 1.5rem;
1715
+ margin: 1rem 0 1.25rem 0;
1716
+ }
1717
+ body:not(.dark) .verdict-callout {
1718
+ background: linear-gradient(135deg, rgba(245, 158, 11, 0.08) 0%, rgba(245, 158, 11, 0.03) 100%) !important;
1719
+ }
1720
+ .verdict-line {
1721
+ margin: 0 !important;
1722
+ font-size: 1.05rem;
1723
+ line-height: 1.55;
1724
+ color: var(--body-text-color, #111827);
1725
+ }
1726
+ .verdict-line strong {
1727
+ color: var(--body-text-color, #111827);
1728
+ font-weight: 600;
1729
+ }
1730
+
1731
+ /* Aim callout — what the project is for */
1732
+ .aim-callout {
1733
+ margin-top: 1.5rem;
1734
+ padding: 1.5rem 1.6rem;
1735
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.08) 0%, rgba(236, 72, 153, 0.08) 100%);
1736
+ border: 1px solid rgba(124, 58, 237, 0.25);
1737
+ border-radius: 0.875rem;
1738
+ }
1739
+ body:not(.dark) .aim-callout {
1740
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.05) 0%, rgba(236, 72, 153, 0.05) 100%) !important;
1741
+ border-color: rgba(124, 58, 237, 0.3) !important;
1742
+ }
1743
+ .aim-eyebrow {
1744
+ font-size: 0.72rem;
1745
+ font-weight: 700;
1746
+ color: var(--brand-purple-400);
1747
+ text-transform: uppercase;
1748
+ letter-spacing: 0.1em;
1749
+ margin-bottom: 0.85rem;
1750
+ }
1751
+ .aim-body {
1752
+ margin: 0 0 0.85rem 0 !important;
1753
+ font-size: 0.96rem;
1754
+ line-height: 1.65;
1755
+ color: var(--body-text-color, #374151);
1756
+ }
1757
+ .aim-body:last-child {
1758
+ margin-bottom: 0 !important;
1759
+ }
1760
+ .aim-body strong {
1761
+ color: var(--body-text-color, #111827);
1762
+ font-weight: 600;
1763
+ }
1764
+
1765
+ /* Note above example clips in Detector tab */
1766
+ .examples-note {
1767
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.05) 0%, rgba(236, 72, 153, 0.05) 100%);
1768
+ border: 1px solid rgba(124, 58, 237, 0.18);
1769
+ border-radius: 0.625rem;
1770
+ padding: 0.85rem 1rem;
1771
+ font-size: 0.83rem;
1772
+ line-height: 1.55;
1773
+ color: var(--body-text-color, #374151);
1774
+ margin: 0.25rem 0 0.85rem 0;
1775
+ }
1776
+ body:not(.dark) .examples-note {
1777
+ background: linear-gradient(135deg, rgba(124, 58, 237, 0.04) 0%, rgba(236, 72, 153, 0.04) 100%) !important;
1778
+ border-color: rgba(124, 58, 237, 0.22) !important;
1779
+ }
1780
+ .examples-note strong {
1781
+ color: var(--body-text-color, #111827);
1782
+ font-weight: 600;
1783
+ }
1784
+
1785
+
1786
+ body:not(.dark) .verdict-callout {
1787
+ background: linear-gradient(135deg, rgba(16, 185, 129, 0.08) 0%, rgba(16, 185, 129, 0.03) 100%) !important;
1788
+ }
1789
+
1790
  """
1791
 
1792
 
 
1917
  with gr.Row(equal_height=False):
1918
  with gr.Column(scale=1):
1919
  gr.HTML("<div class='step-label'><span class='step-number'>1</span> Provide audio</div>")
1920
+
1921
+ with gr.Tabs(elem_classes=["input-tabs"]):
1922
+ with gr.Tab("Upload file"):
1923
+ audio_upload = gr.Audio(
1924
+ sources=["upload"],
1925
+ type="filepath",
1926
+ label="",
1927
+ elem_classes=["audio-input-styled"],
1928
+ )
1929
+ with gr.Tab("Record mic"):
1930
+ gr.HTML("""
1931
+ <div class='record-instructions'>
1932
+ <div class='record-instructions-icon'>🎤</div>
1933
+ <div class='record-instructions-text'>
1934
+ <strong>Click the record button below</strong>, speak for 3 to 10 seconds, then click stop.
1935
+ A live waveform will show your audio being captured.
1936
+ </div>
1937
+ </div>
1938
+ """)
1939
+ audio_record = gr.Audio(
1940
+ sources=["microphone"],
1941
+ type="filepath",
1942
+ label="",
1943
+ format="wav",
1944
+ show_download_button=True,
1945
+ waveform_options=gr.WaveformOptions(
1946
+ waveform_color="#a78bfa",
1947
+ waveform_progress_color="#ec4899",
1948
+ show_recording_waveform=True,
1949
+ show_controls=True,
1950
+ skip_length=2,
1951
+ sample_rate=16000,
1952
+ ),
1953
+ elem_classes=["audio-input-styled", "audio-record-styled"],
1954
+ )
1955
 
1956
  gr.HTML("<div class='step-label' style='margin-top: 1.25rem;'><span class='step-number'>2</span> Run the detector</div>")
1957
  analyze_btn = gr.Button("Analyze audio →", variant="primary", size="lg", elem_classes=["analyze-button"])
1958
 
1959
  gr.HTML("<div class='step-label' style='margin-top: 1.5rem;'>Or try an example</div>")
1960
+ gr.HTML("""
1961
+ <div class='examples-note'>
1962
+ <strong>Try all 5 examples in order</strong> — they go from easiest to hardest.
1963
+ You'll see the model handle easy cases confidently, become uncertain on medium
1964
+ ones, and <strong>get the hardest one (A10) completely wrong</strong>. Why?
1965
+ A10 uses Tacotron 2 + WaveRNN — a system so advanced that even human listeners
1966
+ can't tell its output from real speech. The acoustic features literally overlap
1967
+ with authentic speech, leaving our model (and any acoustic-feature-based
1968
+ detector) with no signal to detect. We included this example so you can see
1969
+ where the limits are, not just where it succeeds.
1970
+ </div>
1971
+ """)
1972
  gr.Examples(
1973
  examples=EXAMPLE_FILES,
1974
+ inputs=audio_upload,
1975
  label="",
1976
  )
1977
 
 
1994
  raw_output = gr.JSON(label=None)
1995
 
1996
  analyze_btn.click(
1997
+ fn=predict_audio_router,
1998
+ inputs=[audio_upload, audio_record],
1999
  outputs=[badge_output, details_output, plot_output, raw_output],
2000
  )
2001
 
 
2004
  # TAB 3: PERFORMANCE
2005
  # ============================================================
2006
  with gr.Tab("Performance", id=2):
2007
+ # Section header
2008
+ gr.HTML("""
2009
+ <div class='section-header' style='margin-top: 1rem;'>
2010
+ <div class='section-eyebrow'>Evaluation</div>
2011
+ <div class='section-title'>How well does the model actually perform?</div>
2012
+ <p class='detector-intro'>
2013
+ Three datasets, two regimes (in-domain and out-of-domain), and full transparency about
2014
+ where the model wins and where it struggles. Results are reported as Equal Error Rate (EER) —
2015
+ lower is better.
2016
+ </p>
2017
+ </div>
2018
+ """)
2019
+
2020
+ # Headline metric cards
2021
+ gr.HTML("""
2022
+ <div class='subsection-header'>
2023
+ <span class='subsection-eyebrow'>Headline results</span>
2024
+ <div class='subsection-title'>Three benchmarks at a glance</div>
2025
+ </div>
2026
+ """)
2027
 
2028
  with gr.Row():
2029
  gr.HTML("""
2030
+ <div class='perf-metric-card perf-metric-good'>
2031
+ <div class='perf-metric-tag'>In-domain</div>
2032
+ <div class='perf-metric-value'>5.55%</div>
2033
+ <div class='perf-metric-name'>ASVspoof 2019 LA</div>
2034
+ <div class='perf-metric-detail'>Unseen attacks A07–A19</div>
2035
  </div>
2036
  """)
2037
  gr.HTML("""
2038
+ <div class='perf-metric-card perf-metric-mid'>
2039
+ <div class='perf-metric-tag'>Cross-dataset</div>
2040
+ <div class='perf-metric-value'>9.09%</div>
2041
+ <div class='perf-metric-name'>ASVspoof 2021 LA</div>
2042
+ <div class='perf-metric-detail'>Codec-degraded audio</div>
2043
  </div>
2044
  """)
2045
  gr.HTML("""
2046
+ <div class='perf-metric-card perf-metric-warn'>
2047
+ <div class='perf-metric-tag'>Out-of-domain</div>
2048
+ <div class='perf-metric-value'>26.33%</div>
2049
+ <div class='perf-metric-name'>WaveFake</div>
2050
+ <div class='perf-metric-detail'>Novel vocoder pipelines</div>
2051
  </div>
2052
  """)
2053
 
2054
+ # Baseline comparison
2055
+ gr.HTML("""
2056
+ <div class='subsection-header' style='margin-top: 2.5rem;'>
2057
+ <span class='subsection-eyebrow'>Benchmark comparison</span>
2058
+ <div class='subsection-title'>How we compare to published baselines</div>
2059
+ </div>
2060
+ """)
2061
 
2062
+ gr.HTML("""
2063
+ <div class='comparison-table-wrap'>
2064
+ <table class='comparison-table'>
2065
+ <thead>
2066
+ <tr>
2067
+ <th>System</th>
2068
+ <th>2019 LA EER</th>
2069
+ <th>2021 LA EER</th>
2070
+ </tr>
2071
+ </thead>
2072
+ <tbody>
2073
+ <tr><td>Official LFCC-GMM baseline</td><td>8.09%</td><td>25.56%</td></tr>
2074
+ <tr><td>Official CQCC-GMM baseline</td><td>9.57%</td><td>19.30%</td></tr>
2075
+ <tr><td>Official LFCC-LCNN baseline</td><td>—</td><td>9.26%</td></tr>
2076
+ <tr><td>Official RawNet2 baseline</td><td>—</td><td>9.50%</td></tr>
2077
+ <tr class='comparison-row-highlight'>
2078
+ <td><strong>This work (Wav2Vec 2.0)</strong></td>
2079
+ <td><strong>5.55%</strong></td>
2080
+ <td><strong>9.09%</strong></td>
2081
+ </tr>
2082
+ </tbody>
2083
+ </table>
2084
+ <p class='comparison-caption'>
2085
+ Outperforms LFCC-GMM on 2019 LA by 2.54 pp and matches the strongest neural baselines
2086
+ (LFCC-LCNN, RawNet2) on 2021 LA — without any codec-specific training augmentation.
2087
+ </p>
2088
+ </div>
2089
+ """)
2090
 
2091
+ # Per-codec analysis
2092
+ gr.HTML("""
2093
+ <div class='subsection-header' style='margin-top: 3rem;'>
2094
+ <span class='subsection-eyebrow'>Codec robustness</span>
2095
+ <div class='subsection-title'>Performance by audio codec (ASVspoof 2021 LA)</div>
2096
+ <p class='subsection-caption'>
2097
+ Real-world speech goes through codecs for transmission. The model handles modern codecs
2098
+ well but struggles with aggressive cellular compression.
2099
+ </p>
2100
+ </div>
2101
  """)
2102
+ with gr.Row():
2103
+ with gr.Column(elem_classes=["chart-wrap"]):
2104
+ gr.Plot(value=make_per_codec_plot(), label=None)
2105
 
2106
+ # Per-attack analysis
2107
+ gr.HTML("""
2108
+ <div class='subsection-header' style='margin-top: 3rem;'>
2109
+ <span class='subsection-eyebrow'>Attack-family robustness</span>
2110
+ <div class='subsection-title'>Performance by attack type (ASVspoof 2019 LA eval)</div>
2111
+ <p class='subsection-caption'>
2112
+ 13 different synthesis methods (A07–A19), all unseen during training. A10 is the
2113
+ model's persistent weakness across both 2019 and 2021 evaluations.
2114
+ </p>
2115
+ </div>
2116
+ """)
2117
+ with gr.Row():
2118
+ with gr.Column(elem_classes=["chart-wrap"]):
2119
+ gr.Plot(value=make_per_attack_plot(), label=None)
2120
 
2121
+ # WaveFake story
2122
+ gr.HTML("""
2123
+ <div class='subsection-header' style='margin-top: 3rem;'>
2124
+ <span class='subsection-eyebrow'>Out-of-domain limits</span>
2125
+ <div class='subsection-title'>The WaveFake story — an honest negative result</div>
2126
+ <p class='subsection-caption'>
2127
+ On WaveFake the model performs significantly worse, particularly on LJSpeech-based
2128
+ vocoders (22–34% EER). WaveFake tests pure neural vocoder synthesis, while the model
2129
+ was trained on ASVspoof's mix of TTS and voice-conversion attacks.
2130
+ <br><br>
2131
+ <strong>Interpretation:</strong> the model has learned ASVspoof-specific synthesis
2132
+ artifacts, not universal vocoder detection. JSUT (Japanese) numbers look artificially
2133
+ good because the bonafide examples are English LJSpeech — the model is partly detecting
2134
+ language and domain, not spoofing artifacts. The LJSpeech-based numbers are the
2135
+ methodologically meaningful results.
2136
+ </p>
2137
+ </div>
2138
+ """)
2139
+ with gr.Row():
2140
+ with gr.Column(elem_classes=["chart-wrap"]):
2141
+ gr.Plot(value=make_wavefake_plot(), label=None)
2142
 
2143
+ # ============================================================
2144
+ # Is the model overfit? (honest analysis)
2145
+ gr.HTML("""
2146
+ <div class='subsection-header' style='margin-top: 4rem;'>
2147
+ <span class='subsection-eyebrow'>Plain-language analysis</span>
2148
+ <div class='subsection-title'>So is our model overfit?</div>
2149
+ <p class='subsection-caption'>
2150
+ A fair question to ask of any deep learning model. We'll explain what overfitting is,
2151
+ walk through what our numbers show, and give you a straight answer.
2152
+ </p>
2153
+ </div>
2154
+
2155
+ <!-- PART 1: What is overfitting? -->
2156
+ <div class='plain-card'>
2157
+ <div class='plain-card-eyebrow'>Part 1</div>
2158
+ <div class='plain-card-title'>What is overfitting?</div>
2159
+ <p class='plain-card-body'>
2160
+ Overfitting is when a model <strong>memorises specific examples</strong> instead of
2161
+ <strong>learning general patterns</strong>. It is sometimes called "rote learning": the model gets very good at
2162
+ recognising things that look like its training data, but anything that looks even slightly
2163
+ different feels wrong to it and it gets confused.
2164
+ </p>
2165
+ <p class='plain-card-body'>
2166
+ A good model learns the underlying signal. A deepfake detector should learn what makes a synthetic
2167
+ voice sound synthetic — patterns that show up across many different fake-voice methods, not
2168
+ just the specific ones it studied. If it only recognises fake voices that look exactly like the
2169
+ ones it trained on, it has overfit.
2170
+ </p>
2171
+ <p class='plain-card-body'>
2172
+ The way you spot overfitting is to test the model on examples it has never seen — and ideally on
2173
+ examples that are <em>quite different</em> from what it trained on. If performance drops gracefully,
2174
+ the model is generalising. If it falls off a cliff, the model has overfit.
2175
+ </p>
2176
+ </div>
2177
 
2178
+ <!-- PART 2: Where does our model land? -->
2179
+ <div class='plain-card' style='margin-top: 1.5rem;'>
2180
+ <div class='plain-card-eyebrow'>Part 2</div>
2181
+ <div class='plain-card-title'>Where does our model actually land?</div>
2182
+ <p class='plain-card-body'>
2183
+ We tested the detector on four progressively harder challenges. Each step further from what it
2184
+ trained on tells us how well it generalises.
2185
+ </p>
2186
+
2187
+ <div class='analogy-diagram-wrap'>
2188
+ <svg width="100%" viewBox="0 0 680 360" role="img" xmlns="http://www.w3.org/2000/svg" class='arch-svg'>
2189
+ <title>Model performance across four difficulty levels</title>
2190
+ <desc>The detector's error rate increases as the test data moves further from what the model trained on.</desc>
2191
+ <text x="40" y="32" font-size="12" fill="currentColor" opacity="0.65" font-weight="500">EASIER →→→→→→→→→→→→→→→→→→→→→→→→→→→ HARDER</text>
2192
+ <line x1="40" y1="200" x2="640" y2="200" stroke="currentColor" stroke-width="0.8" opacity="0.4"/>
2193
+
2194
+ <g class='arch-anim' style='animation-delay: 0.15s;'>
2195
+ <circle cx="115" cy="200" r="9" fill="#10b981" stroke="#10b981" stroke-width="0.6"/>
2196
+ <text x="115" y="186" text-anchor="middle" font-size="11" fill="#10b981" font-weight="700">0.69%</text>
2197
+ <text x="115" y="172" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.6">error rate</text>
2198
+ <text x="115" y="225" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">Familiar voices</text>
2199
+ <text x="115" y="244" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">Same examples it</text>
2200
+ <text x="115" y="258" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">trained on</text>
2201
+ <text x="115" y="285" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.55">~1 wrong in 145</text>
2202
+ </g>
2203
+ <g class='arch-anim' style='animation-delay: 0.35s;'>
2204
+ <circle cx="290" cy="200" r="11" fill="#10b981" stroke="#10b981" stroke-width="0.6" opacity="0.85"/>
2205
+ <text x="290" y="186" text-anchor="middle" font-size="11" fill="#10b981" font-weight="700">5.55%</text>
2206
+ <text x="290" y="172" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.6">error rate</text>
2207
+ <text x="290" y="225" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">New fakes,</text>
2208
+ <text x="290" y="240" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">same style</text>
2209
+ <text x="290" y="258" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">13 fake-voice methods</text>
2210
+ <text x="290" y="272" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">it had never heard</text>
2211
+ <text x="290" y="299" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.55">~1 wrong in 18</text>
2212
+ </g>
2213
+ <g class='arch-anim' style='animation-delay: 0.55s;'>
2214
+ <circle cx="465" cy="200" r="13" fill="#f59e0b" stroke="#f59e0b" stroke-width="0.6" opacity="0.9"/>
2215
+ <text x="465" y="186" text-anchor="middle" font-size="11" fill="#f59e0b" font-weight="700">9.09%</text>
2216
+ <text x="465" y="172" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.6">error rate</text>
2217
+ <text x="465" y="225" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">Phone-quality</text>
2218
+ <text x="465" y="240" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">audio</text>
2219
+ <text x="465" y="258" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">Compressed audio</text>
2220
+ <text x="465" y="272" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">like real phone calls</text>
2221
+ <text x="465" y="299" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.55">~1 wrong in 11</text>
2222
+ </g>
2223
+ <g class='arch-anim' style='animation-delay: 0.75s;'>
2224
+ <circle cx="615" cy="200" r="17" fill="#ef4444" stroke="#ef4444" stroke-width="0.6" opacity="0.9"/>
2225
+ <text x="615" y="186" text-anchor="middle" font-size="11" fill="#ef4444" font-weight="700">26.33%</text>
2226
+ <text x="615" y="172" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.6">error rate</text>
2227
+ <text x="615" y="232" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">Brand new</text>
2228
+ <text x="615" y="247" text-anchor="middle" font-size="12" fill="currentColor" font-weight="600">fake-voice tech</text>
2229
+ <text x="615" y="265" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">Made by a method</text>
2230
+ <text x="615" y="279" text-anchor="middle" font-size="11" fill="currentColor" opacity="0.7">it never studied</text>
2231
+ <text x="615" y="306" text-anchor="middle" font-size="10" fill="currentColor" opacity="0.55">~1 wrong in 4</text>
2232
+ </g>
2233
+ <line x1="40" y1="335" x2="640" y2="335" stroke="currentColor" stroke-width="0.4" opacity="0.2"/>
2234
+ </svg>
2235
+ </div>
2236
+
2237
+ <p class='plain-card-body'>
2238
+ Two things to notice. First — the model degrades gradually, not catastrophically. It doesn't go
2239
+ from 0.69% to 50% (which would mean random guessing on anything new). That tells us it
2240
+ <strong>did learn real patterns</strong>, not just memorise specific clips.
2241
+ </p>
2242
+ <p class='plain-card-body'>
2243
+ Second — there's still a <strong>big gap</strong>. Going from 0.69% on familiar territory to 26.33%
2244
+ on brand new fake-voice technology is a 38× jump. That's not catastrophic, but it's also not
2245
+ great. The model clearly learned features that matter for the kinds of fake voices it studied —
2246
+ and those features don't fully transfer to fake voices made by methods it has never seen.
2247
+ </p>
2248
+ </div>
2249
+
2250
+ <!-- PART 3: The honest verdict -->
2251
+ <div class='plain-card' style='margin-top: 1.5rem;'>
2252
+ <div class='plain-card-eyebrow'>Part 3</div>
2253
+ <div class='plain-card-title'>The honest verdict</div>
2254
+
2255
+ <div class='verdict-callout'>
2256
+ <p class='verdict-line'>
2257
+ <strong>The honest answer: it's a mix.</strong> The model learned real patterns
2258
+ and generalises to most unseen attacks — but it has a genuine blind spot, and
2259
+ its confidence can be dangerously high even when it's wrong.
2260
+ </p>
2261
+ </div>
2262
+
2263
+ <p class='plain-card-body'>
2264
+ <strong>What works well:</strong> When tested on 13 fake-voice methods it had never
2265
+ seen during training, it achieved a 5.55% error rate — roughly 94 out of 100 predictions
2266
+ correct on completely new fakes. It becomes appropriately uncertain on medium-difficulty
2267
+ attacks (66% confidence on A07). And it handles noisy, real-world audio without
2268
+ false-alarming (93.7% confidence on a noisy real voice). These are signs of a model
2269
+ that learned real anti-spoofing patterns, not just memorised its training data.
2270
+ </p>
2271
+ <p class='plain-card-body'>
2272
+ <strong>What doesn't work:</strong> Two real problems. First, the model has a
2273
+ <strong>complete blind spot for A10 attacks</strong> — it classifies the hardest
2274
+ spoof example as "100% authentic," completely wrong. But there's a specific reason:
2275
+ A10 is a Tacotron 2 + WaveRNN system whose output is so natural that even <strong>human
2276
+ listeners cannot distinguish it from real speech</strong>. The ASVspoof 2019 paper
2277
+ itself confirms that A10's acoustic features literally overlap with authentic speech
2278
+ in feature space. Since our model relies on acoustic representations (Wav2Vec 2.0
2279
+ features), it faces the same fundamental limit human ears do — there's no acoustic
2280
+ signal to detect.
2281
+ </p>
2282
+ <p class='plain-card-body'>
2283
+ Second, on the WaveFake dataset (modern neural vocoders like MelGAN
2284
+ and HiFi-GAN — the same technology used in real-world voice cloning today), the error
2285
+ rate jumps to 26.33%. These vocoders produce different artifacts from what the model
2286
+ trained on. Since our project's goal is detecting AI voice cloning broadly, this is
2287
+ a real coverage gap.
2288
+ </p>
2289
+ <p class='plain-card-body'>
2290
+ <strong>What this means:</strong> The model is not classically "overfit" in the sense of
2291
+ having memorised its training data — the 5.55% result on unseen attacks proves that. But
2292
+ it does have <strong>limited coverage</strong>: it learned to detect certain types of
2293
+ synthesis artifacts (the ones present in ASVspoof) and is blind to others (A10, neural
2294
+ vocoders). For the project's stated goal of detecting AI voice cloning broadly, this is
2295
+ a meaningful gap.
2296
+ </p>
2297
+
2298
+ <div class='aim-callout'>
2299
+ <div class='aim-eyebrow'>What our project actually demonstrates</div>
2300
+ <p class='aim-body'>
2301
+ <strong>1. Wav2Vec 2.0 features work for deepfake detection.</strong> Pretrained speech
2302
+ representations carry strong anti-spoofing signal. With minimal fine-tuning (15% of the
2303
+ model), we match or beat published neural baselines on the standard ASVspoof benchmarks.
2304
+ This validates the transfer-learning approach.
2305
+ </p>
2306
+ <p class='aim-body'>
2307
+ <strong>2. Single-corpus training has real limits — and we measured exactly where.</strong>
2308
+ The A10 blind spot reveals a fundamental challenge: when a synthesis system produces
2309
+ speech that is acoustically indistinguishable from real speech (even to humans),
2310
+ acoustic-feature-based detection reaches its theoretical limit. The WaveFake results
2311
+ show that cross-family generalization requires cross-family training data. Both findings
2312
+ are concrete, measured, and reproducible.
2313
+ </p>
2314
+ <p class='aim-body'>
2315
+ <strong>3. The path forward is clear.</strong> Universal AI voice cloning detection
2316
+ requires multi-corpus, multi-family training — combining ASVspoof, WaveFake, and newer
2317
+ datasets covering the latest synthesis methods. This project establishes the baseline
2318
+ that such future work would build on, with measured evidence showing exactly where the
2319
+ current approach succeeds and where it falls short.
2320
+ </p>
2321
+ <p class='aim-body'>
2322
+ We chose to include the failures (A10, WaveFake) rather than hide them because honest
2323
+ evaluation is more valuable than inflated numbers. A detector that reports 5.55% EER
2324
+ with known blind spots is more useful than one that reports 5.55% EER and pretends it
2325
+ works on everything.
2326
+ </p>
2327
+ </div>
2328
+
2329
+ <div class='plain-card-bottom-quote'>
2330
+ "Treat this as a research demonstration of how Wav2Vec features behave for deepfake detection,
2331
+ not a security tool. If you need to verify whether a real-world recording is a deepfake, no
2332
+ single model — including this one — should be trusted as the final answer."
2333
+ </div>
2334
+ </div>
2335
  """)
 
2336
 
2337
 
2338
  # ============================================================
 
2343
  gr.Markdown("## Architecture")
2344
 
2345
  gr.HTML("""
2346
+ <div class='arch-diagram-wrap'>
2347
+ <svg width="100%" viewBox="0 0 680 380" role="img" xmlns="http://www.w3.org/2000/svg" class='arch-svg'>
2348
+ <title>Wav2Vec 2.0 architecture for deepfake detection</title>
2349
+ <desc>Raw waveform feeds into a CNN feature encoder, then a 12-layer transformer stack, mean-pooled into a 768-dim embedding, then a linear classifier produces spoof and bonafide probabilities.</desc>
2350
+ <defs>
2351
+ <marker id="arch-arrow" viewBox="0 0 10 10" refX="8" refY="5" markerWidth="6" markerHeight="6" orient="auto-start-reverse">
2352
+ <path d="M2 1L8 5L2 9" fill="none" stroke="context-stroke" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"/>
2353
+ </marker>
2354
+ <linearGradient id="arch-brand-grad" x1="0" y1="0" x2="1" y2="1">
2355
+ <stop offset="0%" stop-color="#a78bfa" stop-opacity="0.9"/>
2356
+ <stop offset="100%" stop-color="#ec4899" stop-opacity="0.9"/>
2357
+ </linearGradient>
2358
+ </defs>
2359
+
2360
+ <g class='arch-anim' style='animation-delay: 0.05s;'>
2361
+ <path d="M30 165 L30 195 L34 188 L38 192 L42 175 L46 200 L50 168 L54 195 L58 178 L62 192 L66 170 L70 198 L74 165 L78 200 L82 172 L86 195 L86 165 Z" fill="#a78bfa" fill-opacity="0.4" stroke="#a78bfa" stroke-width="0.6"/>
2362
+ <text x="58" y="225" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.85">Waveform</text>
2363
+ <text x="58" y="240" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">16 kHz · 4 s</text>
2364
+ </g>
2365
+
2366
+ <line x1="92" y1="180" x2="118" y2="180" stroke="currentColor" stroke-width="1.5" opacity="0.45" marker-end="url(#arch-arrow)" class='arch-anim' style='animation-delay: 0.2s;'/>
2367
+
2368
+ <g class='arch-anim' style='animation-delay: 0.3s;'>
2369
+ <rect x="118" y="142" width="22" height="76" rx="3" fill="#a78bfa" fill-opacity="0.18" stroke="#a78bfa" stroke-width="0.6"/>
2370
+ <rect x="142" y="148" width="22" height="64" rx="3" fill="#a78bfa" fill-opacity="0.28" stroke="#a78bfa" stroke-width="0.6"/>
2371
+ <rect x="166" y="154" width="22" height="52" rx="3" fill="#a78bfa" fill-opacity="0.4" stroke="#a78bfa" stroke-width="0.6"/>
2372
+ <rect x="190" y="160" width="22" height="40" rx="3" fill="#a78bfa" fill-opacity="0.55" stroke="#a78bfa" stroke-width="0.6"/>
2373
+ <text x="165" y="232" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.85">CNN encoder</text>
2374
+ <text x="165" y="247" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">7 conv layers</text>
2375
+ </g>
2376
+
2377
+ <line x1="218" y1="180" x2="244" y2="180" stroke="currentColor" stroke-width="1.5" opacity="0.45" marker-end="url(#arch-arrow)" class='arch-anim' style='animation-delay: 0.45s;'/>
2378
+
2379
+ <g class='arch-anim' style='animation-delay: 0.55s;'>
2380
+ <rect x="248" y="62" width="180" height="20" rx="3" fill="#7c3aed" fill-opacity="0.85" stroke="#7c3aed" stroke-width="0.5"/>
2381
+ <text x="338" y="76" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">LayerNorm</text>
2382
+
2383
+ <rect x="248" y="86" width="180" height="22" rx="3" fill="#a78bfa" fill-opacity="0.95" stroke="#a78bfa" stroke-width="0.5"/>
2384
+ <text x="338" y="101" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Layer 12</text>
2385
+
2386
+ <rect x="248" y="112" width="180" height="22" rx="3" fill="#a78bfa" fill-opacity="0.9" stroke="#a78bfa" stroke-width="0.5"/>
2387
+ <text x="338" y="127" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Layer 11</text>
2388
+
2389
+ <rect x="248" y="138" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2390
+ <rect x="248" y="156" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2391
+ <rect x="248" y="174" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2392
+ <rect x="248" y="192" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2393
+ <rect x="248" y="210" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2394
+ <text x="338" y="220" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">Layers 10 — 6</text>
2395
+
2396
+ <rect x="248" y="228" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2397
+ <rect x="248" y="246" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2398
+ <rect x="248" y="264" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2399
+ <rect x="248" y="282" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2400
+ <rect x="248" y="300" width="180" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2401
+ <text x="338" y="310" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">Layers 5 — 1</text>
2402
+
2403
+ <line x1="248" y1="62" x2="248" y2="314" stroke="#7c3aed" stroke-width="0.5" opacity="0.4"/>
2404
+ <line x1="428" y1="62" x2="428" y2="314" stroke="#7c3aed" stroke-width="0.5" opacity="0.4"/>
2405
+
2406
+ <text x="338" y="335" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.75" font-weight="500">Transformer stack · 12 layers · 95M params</text>
2407
+ </g>
2408
+
2409
+ <line x1="436" y1="180" x2="462" y2="180" stroke="currentColor" stroke-width="1.5" opacity="0.45" marker-end="url(#arch-arrow)" class='arch-anim' style='animation-delay: 0.75s;'/>
2410
+
2411
+ <g class='arch-anim' style='animation-delay: 0.85s;'>
2412
+ <ellipse cx="486" cy="180" rx="22" ry="36" fill="#10b981" fill-opacity="0.22" stroke="#10b981" stroke-width="0.6"/>
2413
+ <text x="486" y="178" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.85">Mean</text>
2414
+ <text x="486" y="192" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.85">pool</text>
2415
+ <text x="486" y="232" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">768-dim</text>
2416
+ </g>
2417
+
2418
+ <line x1="510" y1="180" x2="536" y2="180" stroke="currentColor" stroke-width="1.5" opacity="0.45" marker-end="url(#arch-arrow)" class='arch-anim' style='animation-delay: 1.0s;'/>
2419
+
2420
+ <g class='arch-anim' style='animation-delay: 1.1s;'>
2421
+ <rect x="540" y="156" width="100" height="48" rx="6" fill="url(#arch-brand-grad)" stroke="#7c3aed" stroke-width="0.6"/>
2422
+ <text x="590" y="174" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Linear</text>
2423
+ <text x="590" y="190" text-anchor="middle" font-size="12" fill="#ffffff">768 → 2</text>
2424
+ <text x="590" y="225" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">P(spoof)</text>
2425
+ <text x="590" y="240" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">P(bonafide)</text>
2426
+ </g>
2427
+
2428
+ <g transform="translate(40, 290)" class='arch-anim' style='animation-delay: 1.3s;'>
2429
+ <rect x="0" y="0" width="14" height="14" rx="3" fill="#a78bfa" fill-opacity="0.9"/>
2430
+ <text x="20" y="11" font-size="12" fill="currentColor" opacity="0.7">Trainable in Stage 2 (14M params)</text>
2431
+ <rect x="0" y="22" width="14" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32"/>
2432
+ <text x="20" y="33" font-size="12" fill="currentColor" opacity="0.7">Frozen (81M params)</text>
2433
+ </g>
2434
+ </svg>
2435
  </div>
2436
  """)
2437
 
2438
  gr.Markdown("## Two-stage training rationale")
2439
 
2440
+ gr.HTML("""
2441
+ <div class='arch-diagram-wrap'>
2442
+ <svg width="100%" viewBox="0 0 680 460" role="img" xmlns="http://www.w3.org/2000/svg" class='arch-svg'>
2443
+ <title>Two-stage fine-tuning strategy</title>
2444
+ <desc>Stage 1 trains only the linear classification head with all transformer layers frozen, achieving 10.09% dev EER. Stage 2 also unfreezes the top 2 transformer layers and the final LayerNorm, achieving 0.69% dev EER.</desc>
2445
+ <defs>
2446
+ <linearGradient id="ft-head-grad" x1="0" y1="0" x2="1" y2="1">
2447
+ <stop offset="0%" stop-color="#a78bfa" stop-opacity="0.95"/>
2448
+ <stop offset="100%" stop-color="#ec4899" stop-opacity="0.95"/>
2449
+ </linearGradient>
2450
+ </defs>
2451
+
2452
+ <text x="170" y="32" text-anchor="middle" font-size="14" fill="currentColor" font-weight="500" class='arch-anim' style='animation-delay: 0.05s;'>Stage 1: head only</text>
2453
+ <text x="170" y="50" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.65" class='arch-anim' style='animation-delay: 0.1s;'>1,538 trainable params</text>
2454
+
2455
+ <g class='arch-anim' style='animation-delay: 0.2s;'>
2456
+ <rect x="100" y="70" width="140" height="20" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2457
+ <text x="170" y="84" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">LayerNorm</text>
2458
+ <rect x="100" y="94" width="140" height="20" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2459
+ <text x="170" y="108" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">Layer 12</text>
2460
+ <rect x="100" y="118" width="140" height="20" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2461
+ <text x="170" y="132" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">Layer 11</text>
2462
+ <rect x="100" y="142" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2463
+ <rect x="100" y="160" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2464
+ <rect x="100" y="178" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2465
+ <rect x="100" y="196" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2466
+ <rect x="100" y="214" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2467
+ <text x="170" y="224" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.55">Layers 10 — 6</text>
2468
+ <rect x="100" y="232" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2469
+ <rect x="100" y="250" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2470
+ <rect x="100" y="268" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2471
+ <rect x="100" y="286" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2472
+ <rect x="100" y="304" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2473
+ <text x="170" y="314" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.55">Layers 5 — 1</text>
2474
+ <rect x="100" y="322" width="140" height="20" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2475
+ <text x="170" y="336" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">CNN encoder</text>
2476
+ </g>
2477
+
2478
+ <g class='arch-anim' style='animation-delay: 0.4s;'>
2479
+ <rect x="100" y="354" width="140" height="28" rx="4" fill="url(#ft-head-grad)" stroke="#7c3aed" stroke-width="0.6"/>
2480
+ <text x="170" y="372" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Linear head</text>
2481
+ </g>
2482
+
2483
+ <text x="170" y="408" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.7" class='arch-anim' style='animation-delay: 0.5s;'>Dev EER</text>
2484
+ <text x="170" y="434" text-anchor="middle" font-size="22" fill="#a78bfa" font-weight="700" class='arch-anim' style='animation-delay: 0.55s;'>10.09%</text>
2485
+
2486
+ <line x1="320" y1="60" x2="320" y2="430" stroke="#9ca3af" stroke-width="0.4" stroke-dasharray="4 4" opacity="0.5"/>
2487
+
2488
+ <text x="510" y="32" text-anchor="middle" font-size="14" fill="currentColor" font-weight="500" class='arch-anim' style='animation-delay: 0.6s;'>Stage 2: top 2 layers + head</text>
2489
+ <text x="510" y="50" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.65" class='arch-anim' style='animation-delay: 0.65s;'>14.18M trainable params</text>
2490
+
2491
+ <g class='arch-anim' style='animation-delay: 0.75s;'>
2492
+ <rect x="440" y="70" width="140" height="20" rx="3" fill="#a78bfa" fill-opacity="0.95" stroke="#a78bfa" stroke-width="0.6"/>
2493
+ <text x="510" y="84" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">LayerNorm</text>
2494
+ <rect x="440" y="94" width="140" height="20" rx="3" fill="#a78bfa" fill-opacity="0.95" stroke="#a78bfa" stroke-width="0.6"/>
2495
+ <text x="510" y="108" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Layer 12</text>
2496
+ <rect x="440" y="118" width="140" height="20" rx="3" fill="#a78bfa" fill-opacity="0.95" stroke="#a78bfa" stroke-width="0.6"/>
2497
+ <text x="510" y="132" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Layer 11</text>
2498
+ <rect x="440" y="142" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2499
+ <rect x="440" y="160" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2500
+ <rect x="440" y="178" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2501
+ <rect x="440" y="196" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2502
+ <rect x="440" y="214" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2503
+ <text x="510" y="224" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.55">Layers 10 — 6</text>
2504
+ <rect x="440" y="232" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2505
+ <rect x="440" y="250" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2506
+ <rect x="440" y="268" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2507
+ <rect x="440" y="286" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2508
+ <rect x="440" y="304" width="140" height="14" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2509
+ <text x="510" y="314" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.55">Layers 5 — 1</text>
2510
+ <rect x="440" y="322" width="140" height="20" rx="3" fill="#9ca3af" fill-opacity="0.32" stroke="#9ca3af" stroke-width="0.4"/>
2511
+ <text x="510" y="336" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.6">CNN encoder</text>
2512
+ </g>
2513
+
2514
+ <g class='arch-anim' style='animation-delay: 0.95s;'>
2515
+ <rect x="440" y="354" width="140" height="28" rx="4" fill="url(#ft-head-grad)" stroke="#7c3aed" stroke-width="0.6"/>
2516
+ <text x="510" y="372" text-anchor="middle" font-size="12" fill="#ffffff" font-weight="500">Linear head</text>
2517
+ </g>
2518
+
2519
+ <text x="510" y="408" text-anchor="middle" font-size="12" fill="currentColor" opacity="0.7" class='arch-anim' style='animation-delay: 1.05s;'>Dev EER</text>
2520
+ <text x="510" y="434" text-anchor="middle" font-size="22" fill="#10b981" font-weight="700" class='arch-anim' style='animation-delay: 1.1s;'>0.69%</text>
2521
+ </svg>
2522
+ </div>
2523
+ """)
2524
+
2525
  with gr.Row():
2526
  gr.HTML("""
2527
  <div class='stage-card'>