Storytelling pass via taste-skill: plain-language story beats per section, panel analogy in the thesis, and zero em-dashes (kill all 48 visible dash tells)
Browse files- index.html +31 -30
index.html
CHANGED
|
@@ -4,7 +4,7 @@
|
|
| 4 |
<meta charset="utf-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
<meta name="color-scheme" content="dark">
|
| 7 |
-
<title>Combining LLMs Rarely Beats the Single Best Model
|
| 8 |
<meta name="description" content="Interactive companion to the paper “Combining LLMs Rarely Beats the Single Best Model: A Provable Co-Failure Ceiling Across 67 Frontier Models” (Josef Chen, KAIKAKU). The deployment-relevant object is β = P(all models wrong); a one-sample certificate bounds every router, vote, and cascade.">
|
| 9 |
<meta property="og:title" content="Combining LLMs Rarely Beats the Single Best Model">
|
| 10 |
<meta property="og:description" content="β = P(all models wrong) is the ceiling. An interactive companion to the paper.">
|
|
@@ -348,19 +348,20 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 348 |
<span class="hd-beta num" id="heroBeta" data-ticker="0.052">0.052</span>
|
| 349 |
<span class="hd-foot-l"><span class="gb">β</span> = P(ALL MODELS WRONG) · MATH-500<br>17 OF 330 QUERIES · 67 MODELS, 21 PROVIDERS</span>
|
| 350 |
</div>
|
| 351 |
-
<div class="hd-hint">move the cursor
|
| 352 |
</div>
|
| 353 |
</header>
|
| 354 |
|
| 355 |
<section id="thesis" class="wrap">
|
| 356 |
<div class="sechead">
|
| 357 |
-
<div class="h-main"><span class="secno num">01</span><h2>
|
| 358 |
-
<p>
|
| 359 |
</div>
|
| 360 |
<div class="track">
|
| 361 |
<div class="prose">
|
| 362 |
-
<p>
|
| 363 |
-
<p>
|
|
|
|
| 364 |
</div>
|
| 365 |
<aside class="rail" aria-label="Section 1 notes: definitions and the pairwise blind spot">
|
| 366 |
<span class="rk">DEFINITION</span><br>β = P(all m wrong)<br>ceiling = 1 − β
|
|
@@ -373,8 +374,8 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 373 |
<!-- 1. CERTIFICATE -->
|
| 374 |
<section id="certificate" class="wrap">
|
| 375 |
<div class="sechead">
|
| 376 |
-
<div class="h-main"><span class="secno num">02</span><h2>
|
| 377 |
-
<p>
|
| 378 |
</div>
|
| 379 |
<div class="track">
|
| 380 |
<div class="fig">
|
|
@@ -388,12 +389,12 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 388 |
<svg id="certChart" viewBox="0 0 720 96" role="img" aria-label="Accuracy axis showing single-best upper bound, certified ceiling, and the gain between them"></svg>
|
| 389 |
<div id="certMobile" role="img" aria-label="Certificate, stacked for narrow screens: single-best upper bound, certified ceiling, and the gain between them as a bar"></div>
|
| 390 |
<div class="readout" aria-live="polite">
|
| 391 |
-
<div class="r"><span class="rv beta" id="oBeta">
|
| 392 |
-
<div class="r"><span class="rv" id="oCeil">
|
| 393 |
-
<div class="r"><span class="rv" id="oGain">
|
| 394 |
</div>
|
| 395 |
<div class="verdict" id="oVerdict" role="status" aria-live="polite">Verdict pending</div>
|
| 396 |
-
<div class="cap"><b>Defaults are the paper's MATH-500 run</b> (K=17, n=330, 67 models
|
| 397 |
</div>
|
| 398 |
<aside class="rail" aria-label="Figure 1 notes: inputs, output formula, and scope">
|
| 399 |
<span class="rk">FIG 1 · INPUTS</span><br>K / n · A_single-best · overhead
|
|
@@ -408,8 +409,8 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 408 |
<!-- 2. POOL-SIZE DIVERGENCE -->
|
| 409 |
<section id="divergence" class="wrap">
|
| 410 |
<div class="sechead">
|
| 411 |
-
<div class="h-main"><span class="secno num">03</span><h2>
|
| 412 |
-
<p>
|
| 413 |
</div>
|
| 414 |
<div class="track">
|
| 415 |
<div class="fig">
|
|
@@ -419,10 +420,10 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 419 |
</div>
|
| 420 |
<div class="panx"><svg id="divChart" viewBox="0 0 720 300" role="img" aria-label="Tetrachoric underpricing ratio rising with pool size"></svg></div>
|
| 421 |
<p class="panhint">scroll to pan ↔</p>
|
| 422 |
-
<div class="cap">Empirical β over the tetrachoric single-factor prediction
|
| 423 |
</div>
|
| 424 |
<aside class="rail" aria-label="Figure 2 notes: sampling, baseline, and robustness checks">
|
| 425 |
-
<span class="rk">FIG 2 · SAMPLING</span><br>random k-subsets<br>resampled w/o replacement<br>band = 5
|
| 426 |
<span class="rsep"></span>
|
| 427 |
<span class="rk">REFERENCE</span><br>1× = ρ-exact (neutral)
|
| 428 |
<span class="rsep"></span>
|
|
@@ -434,22 +435,22 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 434 |
<!-- 3. FOUR-DOMAIN REGIME -->
|
| 435 |
<section id="regime" class="wrap">
|
| 436 |
<div class="sechead">
|
| 437 |
-
<div class="h-main"><span class="secno num">04</span><h2>Two regimes
|
| 438 |
-
<p>
|
| 439 |
</div>
|
| 440 |
<div class="domains" id="domainGrid"></div>
|
| 441 |
-
<div class="cap" style="max-width:60ch;margin-top:22px;color:var(--dim);font-size:13px">Co-failure (β > 0, with the same Pearson-trap and full-Σ residual) holds on two mathematics benchmarks <span class="num">and</span> execution-graded competitive code
|
| 442 |
</section>
|
| 443 |
|
| 444 |
<!-- 5. THE FLIP -->
|
| 445 |
<section id="flip" class="wrap">
|
| 446 |
<div class="sechead">
|
| 447 |
-
<div class="h-main"><span class="secno num">05</span><h2>
|
| 448 |
-
<p>
|
| 449 |
</div>
|
| 450 |
<div class="track">
|
| 451 |
<div class="fig">
|
| 452 |
-
<div class="figlbl"><span class="eyebrow">FIG 3 · Content-controlled format flip</span><span class="figsrc" id="flip-judge">5-judge panel · κ 0.73
|
| 453 |
<div class="controls">
|
| 454 |
<div class="seg" role="group" aria-label="Question format">
|
| 455 |
<button id="segMC" aria-pressed="true">Multiple-choice</button>
|
|
@@ -463,12 +464,12 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 463 |
</div>
|
| 464 |
<div class="panx"><svg id="flipChart" viewBox="0 0 720 132" role="img" aria-label="Each cell is one GPQA-Diamond item; under open-ended format a block of all-models-wrong items lights up at the left where multiple-choice had none"></svg></div>
|
| 465 |
<p class="panhint">scroll to pan ↔</p>
|
| 466 |
-
<div class="cap">Each cell is one of the <b>79 GPQA-Diamond items</b>, content held fixed. Toggle only the <b>format</b> and a co-failure block opens at the left edge
|
| 467 |
</div>
|
| 468 |
<aside class="rail" aria-label="Figure 3 notes: sample, grading, and the Pearson trap">
|
| 469 |
<span class="rk">FIG 3 · SAMPLE</span><br>GPQA-Diamond items<br>open-ended n = 79<br>k = 10 all-wrong
|
| 470 |
<span class="rsep"></span>
|
| 471 |
-
<span class="rk">GRADING</span><br>5 LLM judges<br>inter-rater κ 0.73
|
| 472 |
<span class="rsep"></span>
|
| 473 |
<span class="rk">PEARSON TRAP</span><br>same Σ residual<br>holds under full copula
|
| 474 |
</aside>
|
|
@@ -478,8 +479,8 @@ footer a:hover{border-bottom-color:var(--text)}
|
|
| 478 |
<!-- THE POOL -->
|
| 479 |
<section id="pool" class="wrap">
|
| 480 |
<div class="sechead">
|
| 481 |
-
<div class="h-main"><span class="secno num">06</span><h2>The
|
| 482 |
-
<p>Every number
|
| 483 |
</div>
|
| 484 |
<div class="poolstat"><span class="beta num">67</span> models · <span class="beta num">21</span> providers · priced live · temperature 0 · one 67×67 co-failure matrix</div>
|
| 485 |
<div id="poolGrid" class="poolgrid"></div>
|
|
@@ -534,15 +535,15 @@ function cert(){
|
|
| 534 |
if(certGain<ov){
|
| 535 |
v.className='verdict stop';
|
| 536 |
v.textContent = ceil<asb
|
| 537 |
-
? 'Even the certified ceiling '+fmt(ceil,3)+' sits below single-best '+fmt(asb,3)+'
|
| 538 |
-
: 'No policy can pay for itself
|
| 539 |
} else {
|
| 540 |
-
v.className='verdict go'; v.textContent='Certified max gain '+fmt(certGain,3)+' over single-best exceeds overhead
|
| 541 |
}
|
| 542 |
// source eyebrow tracks whether the instrument still shows the paper's default run
|
| 543 |
const isDefault=(K===CERT_DEF.K&&n===CERT_DEF.n&&Math.abs(asb-CERT_DEF.asb)<1e-9&&Math.abs(ov-CERT_DEF.ov)<1e-9);
|
| 544 |
certSrc.textContent=isDefault?'MATH-500 default':'custom inputs';
|
| 545 |
-
certChart.setAttribute('aria-label',`Certificate: single-best ${pct(asb)}, certified ceiling ${pct(ceil)}, max gain ${(certGain>=0?'+':'')+pct(certGain)}
|
| 546 |
drawCert(asb,ceil,certGain,ov); // render only; reuses the math above, alters no value
|
| 547 |
}
|
| 548 |
/* certificate strip: a DATA-DRIVEN accuracy window (not a wasteful [0,1] axis). The single-best
|
|
|
|
| 4 |
<meta charset="utf-8">
|
| 5 |
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 6 |
<meta name="color-scheme" content="dark">
|
| 7 |
+
<title>Combining LLMs Rarely Beats the Single Best Model: the co-failure ceiling</title>
|
| 8 |
<meta name="description" content="Interactive companion to the paper “Combining LLMs Rarely Beats the Single Best Model: A Provable Co-Failure Ceiling Across 67 Frontier Models” (Josef Chen, KAIKAKU). The deployment-relevant object is β = P(all models wrong); a one-sample certificate bounds every router, vote, and cascade.">
|
| 9 |
<meta property="og:title" content="Combining LLMs Rarely Beats the Single Best Model">
|
| 10 |
<meta property="og:description" content="β = P(all models wrong) is the ceiling. An interactive companion to the paper.">
|
|
|
|
| 348 |
<span class="hd-beta num" id="heroBeta" data-ticker="0.052">0.052</span>
|
| 349 |
<span class="hd-foot-l"><span class="gb">β</span> = P(ALL MODELS WRONG) · MATH-500<br>17 OF 330 QUERIES · 67 MODELS, 21 PROVIDERS</span>
|
| 350 |
</div>
|
| 351 |
+
<div class="hd-hint">move the cursor, the field scatters</div>
|
| 352 |
</div>
|
| 353 |
</header>
|
| 354 |
|
| 355 |
<section id="thesis" class="wrap">
|
| 356 |
<div class="sechead">
|
| 357 |
+
<div class="h-main"><span class="secno num">01</span><h2>If they are all wrong, no vote can win.</h2></div>
|
| 358 |
+
<p>Routing, voting, and cascading all hand back one model's answer. So your ceiling is set by how often every model is wrong at once. Call that β. It is not how often models agree (ρ).</p>
|
| 359 |
</div>
|
| 360 |
<div class="track">
|
| 361 |
<div class="prose">
|
| 362 |
+
<p>Picture a panel of experts where you can only return one expert's answer. Choosing the best one helps, right up until a question lands on a blind spot they all share. Then no rule wins, because the right answer was never in the room.</p>
|
| 363 |
+
<p>That is the ceiling, and it is exact. Give a query to a pool of <span class="num">m</span> models. If <em>every</em> one is wrong, no selection policy (router, weighted vote, cascade, debate) can be right, since each returns one member's answer. Accuracy is capped at <span class="num">1−<span class="gb">β</span></span>, where <span class="beta num"><span class="gb">β</span> = P(all m wrong)</span>.</p>
|
| 364 |
+
<p>The field reports pairwise correlation ρ instead, and ρ is provably blind to β. You can hold the entire pairwise law fixed and still move β, a Fréchet-class fact we make exact in the paper. A single-factor copula calibrated on ρ <em>underprices</em> the co-failure tail, a bias that grows with pool size, driven by a common-mode atom that no pairwise number represents.</p>
|
| 365 |
</div>
|
| 366 |
<aside class="rail" aria-label="Section 1 notes: definitions and the pairwise blind spot">
|
| 367 |
<span class="rk">DEFINITION</span><br>β = P(all m wrong)<br>ceiling = 1 − β
|
|
|
|
| 374 |
<!-- 1. CERTIFICATE -->
|
| 375 |
<section id="certificate" class="wrap">
|
| 376 |
<div class="sechead">
|
| 377 |
+
<div class="h-main"><span class="secno num">02</span><h2>Know your ceiling before you build it.</h2></div>
|
| 378 |
+
<p>Grade the models once on a held-out set and count the questions all of them missed. That count alone caps what any router could add. No training, no cost. Move the inputs and watch the ceiling.</p>
|
| 379 |
</div>
|
| 380 |
<div class="track">
|
| 381 |
<div class="fig">
|
|
|
|
| 389 |
<svg id="certChart" viewBox="0 0 720 96" role="img" aria-label="Accuracy axis showing single-best upper bound, certified ceiling, and the gain between them"></svg>
|
| 390 |
<div id="certMobile" role="img" aria-label="Certificate, stacked for narrow screens: single-best upper bound, certified ceiling, and the gain between them as a bar"></div>
|
| 391 |
<div class="readout" aria-live="polite">
|
| 392 |
+
<div class="r"><span class="rv beta" id="oBeta">·</span><span class="rl"><span class="gb">β</span>̂ = K/n</span></div>
|
| 393 |
+
<div class="r"><span class="rv" id="oCeil">·</span><span class="rl">certified ceiling 1−<span class="gb">β</span>_lo (95% CP)</span></div>
|
| 394 |
+
<div class="r"><span class="rv" id="oGain">·</span><span class="rl">certified max gain</span></div>
|
| 395 |
</div>
|
| 396 |
<div class="verdict" id="oVerdict" role="status" aria-live="polite">Verdict pending</div>
|
| 397 |
+
<div class="cap"><b>Defaults are the paper's MATH-500 run</b> (K=17, n=330, 67 models, β̂ = <span class="num" id="capBeta">·</span>). The Clopper-Pearson lower bound on β turns the count K/n into a certified ceiling <span class="num">1−β_lo</span> on achievable accuracy, the most any router, vote, or cascade could reach. Subtracting your measured single-best accuracy gives an upper bound on the gain, with ≥95% coverage on β from one labelled sample and no router trained. The point-estimate ceiling is <span class="num">1−β̂ = 0.948</span>. The <b>certified</b> <span class="num">1−β_lo</span> above is deliberately wider.</div>
|
| 398 |
</div>
|
| 399 |
<aside class="rail" aria-label="Figure 1 notes: inputs, output formula, and scope">
|
| 400 |
<span class="rk">FIG 1 · INPUTS</span><br>K / n · A_single-best · overhead
|
|
|
|
| 409 |
<!-- 2. POOL-SIZE DIVERGENCE -->
|
| 410 |
<section id="divergence" class="wrap">
|
| 411 |
<div class="sechead">
|
| 412 |
+
<div class="h-main"><span class="secno num">03</span><h2>Add more models and the gap widens.</h2></div>
|
| 413 |
+
<p>The usual estimate reads joint failure off pairwise agreement. It runs low, and runs lower as the pool grows, because the models share blind spots that no pair reveals. Drag the slider to see it open up.</p>
|
| 414 |
</div>
|
| 415 |
<div class="track">
|
| 416 |
<div class="fig">
|
|
|
|
| 420 |
</div>
|
| 421 |
<div class="panx"><svg id="divChart" viewBox="0 0 720 300" role="img" aria-label="Tetrachoric underpricing ratio rising with pool size"></svg></div>
|
| 422 |
<p class="panhint">scroll to pan ↔</p>
|
| 423 |
+
<div class="cap">Empirical β over the tetrachoric single-factor prediction: the median across random k-model subsets, with a 5-95% percentile band. At the full pool <span class="num" id="kReadout">k=67</span> the residual is <span class="num beta" id="ratioReadout">·</span>: a common-mode atom, not a calibration artifact. Robustness checks are in the rail.</div>
|
| 424 |
</div>
|
| 425 |
<aside class="rail" aria-label="Figure 2 notes: sampling, baseline, and robustness checks">
|
| 426 |
+
<span class="rk">FIG 2 · SAMPLING</span><br>random k-subsets<br>resampled w/o replacement<br>band = 5-95% pctile
|
| 427 |
<span class="rsep"></span>
|
| 428 |
<span class="rk">REFERENCE</span><br>1× = ρ-exact (neutral)
|
| 429 |
<span class="rsep"></span>
|
|
|
|
| 435 |
<!-- 3. FOUR-DOMAIN REGIME -->
|
| 436 |
<section id="regime" class="wrap">
|
| 437 |
<div class="sechead">
|
| 438 |
+
<div class="h-main"><span class="secno num">04</span><h2>Two regimes, and the task picks one.</h2></div>
|
| 439 |
+
<p>On open-ended math and code, every model trips on some of the same questions, so the ceiling bites. On multiple-choice, someone always lands the answer, so β is near zero and combining only breaks ties.</p>
|
| 440 |
</div>
|
| 441 |
<div class="domains" id="domainGrid"></div>
|
| 442 |
+
<div class="cap" style="max-width:60ch;margin-top:22px;color:var(--dim);font-size:13px">Co-failure (β > 0, with the same Pearson-trap and full-Σ residual) holds on two mathematics benchmarks <span class="num">and</span> execution-graded competitive code, and inverts on multiple-choice science. The lever is open-endedness, not subject matter.</div>
|
| 443 |
</section>
|
| 444 |
|
| 445 |
<!-- 4. THE FLIP -->
|
| 446 |
<section id="flip" class="wrap">
|
| 447 |
<div class="sechead">
|
| 448 |
+
<div class="h-main"><span class="secno num">05</span><h2>Same questions, new format, the ceiling appears.</h2></div>
|
| 449 |
+
<p>Take hard science questions. As multiple-choice, models can guess or eliminate, so someone is always right. Remove the options, make them answer cold, and 10 of 79 now stump every model at once.</p>
|
| 450 |
</div>
|
| 451 |
<div class="track">
|
| 452 |
<div class="fig">
|
| 453 |
+
<div class="figlbl"><span class="eyebrow">FIG 3 · Content-controlled format flip</span><span class="figsrc" id="flip-judge">5-judge panel · κ 0.73-0.92</span></div>
|
| 454 |
<div class="controls">
|
| 455 |
<div class="seg" role="group" aria-label="Question format">
|
| 456 |
<button id="segMC" aria-pressed="true">Multiple-choice</button>
|
|
|
|
| 464 |
</div>
|
| 465 |
<div class="panx"><svg id="flipChart" viewBox="0 0 720 132" role="img" aria-label="Each cell is one GPQA-Diamond item; under open-ended format a block of all-models-wrong items lights up at the left where multiple-choice had none"></svg></div>
|
| 466 |
<p class="panhint">scroll to pan ↔</p>
|
| 467 |
+
<div class="cap">Each cell is one of the <b>79 GPQA-Diamond items</b>, content held fixed. Toggle only the <b>format</b> and a co-failure block opens at the left edge. β goes from ~0 (multiple-choice) to <span class="num beta">0.127</span>, 10 of 79 items where <b>every</b> model is wrong. The tail stays positive under every judge rule (majority 0.127, unanimous 0.241, lenient 0.038), so it is not a grading knob.</div>
|
| 468 |
</div>
|
| 469 |
<aside class="rail" aria-label="Figure 3 notes: sample, grading, and the Pearson trap">
|
| 470 |
<span class="rk">FIG 3 · SAMPLE</span><br>GPQA-Diamond items<br>open-ended n = 79<br>K = 10 all-wrong
|
| 471 |
<span class="rsep"></span>
|
| 472 |
+
<span class="rk">GRADING</span><br>5 LLM judges<br>inter-rater κ 0.73-0.92<br>not human-adjudicated
|
| 473 |
<span class="rsep"></span>
|
| 474 |
<span class="rk">PEARSON TRAP</span><br>same Σ residual<br>holds under full copula
|
| 475 |
</aside>
|
|
|
|
| 479 |
<!-- THE POOL -->
|
| 480 |
<section id="pool" class="wrap">
|
| 481 |
<div class="sechead">
|
| 482 |
+
<div class="h-main"><span class="secno num">06</span><h2>The cast: 67 frontier models, 21 providers.</h2></div>
|
| 483 |
+
<p>Every number here recomputes live over one 2026 OpenRouter pool, from $30/Mtok flagships down to $0.03/Mtok open weights. The roster, the matrices, the grading, and the code are all released so you can rerun everything.</p>
|
| 484 |
</div>
|
| 485 |
<div class="poolstat"><span class="beta num">67</span> models · <span class="beta num">21</span> providers · priced live · temperature 0 · one 67×67 co-failure matrix</div>
|
| 486 |
<div id="poolGrid" class="poolgrid"></div>
|
|
|
|
| 535 |
if(certGain<ov){
|
| 536 |
v.className='verdict stop';
|
| 537 |
v.textContent = ceil<asb
|
| 538 |
+
? 'Even the certified ceiling '+fmt(ceil,3)+' sits below single-best '+fmt(asb,3)+'. No selection policy over this pool can beat the best single model.'
|
| 539 |
+
: 'No policy can pay for itself. Certified max gain '+fmt(certGain,3)+' is below overhead '+fmt(ov,3)+'. Skip orchestration.';
|
| 540 |
} else {
|
| 541 |
+
v.className='verdict go'; v.textContent='Certified max gain '+fmt(certGain,3)+' over single-best exceeds overhead. A router is worth evaluating.';
|
| 542 |
}
|
| 543 |
// source eyebrow tracks whether the instrument still shows the paper's default run
|
| 544 |
const isDefault=(K===CERT_DEF.K&&n===CERT_DEF.n&&Math.abs(asb-CERT_DEF.asb)<1e-9&&Math.abs(ov-CERT_DEF.ov)<1e-9);
|
| 545 |
certSrc.textContent=isDefault?'MATH-500 default':'custom inputs';
|
| 546 |
+
certChart.setAttribute('aria-label',`Certificate: single-best ${pct(asb)}, certified ceiling ${pct(ceil)}, max gain ${(certGain>=0?'+':'')+pct(certGain)}, verdict ${certGain<ov?'stop':'go'}`);
|
| 547 |
drawCert(asb,ceil,certGain,ov); // render only; reuses the math above, alters no value
|
| 548 |
}
|
| 549 |
/* certificate strip: a DATA-DRIVEN accuracy window (not a wasteful [0,1] axis). The single-best
|