JerrettDavis committed on
Commit
d1eb7ff
·
1 Parent(s): 189bff3

Harden anthropic cache-mode replay stability

Browse files
benchmarks/claude_session_branch_compare.py ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Compare Claude session mode simulations across two git refs."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import os
9
+ import shutil
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ from dataclasses import asdict, dataclass
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ if __package__ in {None, ""}:
18
+ sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
19
+
20
+ from benchmarks.claude_session_mode_benchmark import (
21
+ IMPACT_DIRECTION,
22
+ OUTPUT_JSON,
23
+ PROXY_MODE_CACHE,
24
+ PROXY_MODE_TOKEN,
25
+ format_currency,
26
+ )
27
+
28
+ DEFAULT_OUTPUT_DIR = Path("benchmark_results") / "branch_compare"
29
+
30
+
31
@dataclass
class BranchResult:
    """Benchmark output captured for a single git ref.

    Instances are created by keyword in ``_load_branch_result`` and are
    serialised with ``dataclasses.asdict`` when the comparison JSON is written.
    """

    # Git ref the benchmark ran against, exactly as supplied by the caller.
    ref: str
    # Display label used in the reports and for the per-branch output folder.
    label: str
    # Result of ``git rev-parse`` for ``ref``.
    commit: str
    # Commit subject line (``git show -s --format=%s``).
    summary: str
    # "dataset" section of the benchmark JSON payload.
    dataset: dict[str, Any]
    # "observed" section of the benchmark JSON payload.
    observed: dict[str, Any]
    # "summaries" section of the payload: one metrics mapping per mode name.
    summaries: dict[str, dict[str, Any]]
    # "winners" section of the payload: winning mode name per winner metric.
    winners: dict[str, str]
    # Directory (as a string) that holds this branch's benchmark output.
    output_dir: str
42
+
43
+
44
def parse_args() -> argparse.Namespace:
    """Parse the command-line options for a two-ref comparison run.

    Returns:
        The namespace produced by ``argparse`` from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(description=__doc__)

    # Options without help text, registered in the order they appear in --help.
    plain_options: tuple[tuple[str, dict[str, Any]], ...] = (
        ("--left-ref", {"default": "upstream/main"}),
        ("--right-ref", {"default": "HEAD"}),
        ("--left-label", {"default": "main"}),
        ("--right-label", {"default": "pr"}),
        ("--root", {"type": Path, "default": Path.home() / ".claude" / "projects"}),
        ("--output-dir", {"type": Path, "default": DEFAULT_OUTPUT_DIR}),
        ("--max-sessions", {"type": int, "default": None}),
        ("--recent-turns-per-session", {"type": int, "default": None}),
        ("--cache-ttl-minutes", {"type": int, "default": 5}),
        ("--cache-write-multiplier", {"type": float, "default": 1.25}),
        ("--workers", {"type": int, "default": 1}),
    )
    for flag, settings in plain_options:
        cli.add_argument(flag, **settings)

    cli.add_argument(
        "--python",
        default=sys.executable,
        help="Python executable to use inside each worktree.",
    )
    cli.add_argument(
        "--keep-worktrees",
        action="store_true",
        help="Do not remove temporary worktrees after the comparison run.",
    )
    return cli.parse_args()
68
+
69
+
70
def _run_git(args: list[str], cwd: Path) -> str:
    """Run ``git`` with ``args`` inside ``cwd`` and return its trimmed stdout.

    Raises:
        subprocess.CalledProcessError: if git exits with a non-zero status.
    """
    git_command = ["git"]
    git_command.extend(args)
    result = subprocess.run(
        git_command,
        cwd=cwd,
        check=True,
        capture_output=True,
        text=True,
    )
    return result.stdout.strip()
79
+
80
+
81
def _ref_slug(ref: str) -> str:
    """Turn a ref or label into a lowercase, filesystem-friendly slug.

    Every non-alphanumeric character becomes ``-``; leading and trailing
    dashes are trimmed. If nothing is left, the literal ``"ref"`` is returned.
    """
    dashed = "".join([ch if ch.isalnum() else "-" for ch in ref])
    slug = dashed.strip("-").lower()
    if not slug:
        return "ref"
    return slug
83
+
84
+
85
def _branch_output_dir(base: Path, label: str) -> Path:
    """Return the per-branch output directory: ``base`` joined with the label's slug."""
    folder_name = _ref_slug(label)
    return base.joinpath(folder_name)
87
+
88
+
89
def _comparison_paths(base: Path) -> tuple[Path, Path, Path]:
    """Return the (markdown, json, html) comparison report paths under ``base``."""
    markdown_report = base / "claude_session_branch_compare.md"
    json_report = base / "claude_session_branch_compare.json"
    html_report = base / "claude_session_branch_compare.html"
    return markdown_report, json_report, html_report
95
+
96
+
97
def _mode_metric(branch: BranchResult, mode: str, field: str) -> float:
    """Return one metric of ``branch.summaries[mode]`` as a float.

    Three field names are derived rather than read directly:

    * ``no_cache_total_cost_usd`` - the stored value when present; otherwise
      paid input cost + 10x the cache read cost + paid output cost.
    * ``prompt_window_with_cache`` - the forwarded input tokens.
    * ``prompt_window_without_cache_reads`` - forwarded input tokens minus
      cache read tokens.

    Any other name is looked up in the summary as-is.

    Raises:
        KeyError: if the mode or a required summary key is missing.
    """
    summary = branch.summaries[mode]

    if field == "no_cache_total_cost_usd":
        if "no_cache_total_cost_usd" in summary:
            return float(summary["no_cache_total_cost_usd"])
        # Summary has no precomputed value: rebuild it from the cost parts.
        rebuilt = (
            float(summary["paid_input_cost_usd"])
            + (float(summary["cache_read_cost_usd"]) * 10.0)
            + float(summary["paid_output_cost_usd"])
        )
        return float(rebuilt)

    if field == "prompt_window_with_cache":
        return float(summary["forwarded_input_tokens"])

    if field == "prompt_window_without_cache_reads":
        forwarded = float(summary["forwarded_input_tokens"])
        cached_reads = float(summary["cache_read_tokens"])
        return float(forwarded - cached_reads)

    # float() already maps booleans to 0.0 / 1.0, so no special case is needed.
    return float(summary[field])
117
+
118
+
119
def _delta(left: float, right: float) -> float:
    """Return how much ``right`` exceeds ``left`` (negative when it is smaller)."""
    change = right - left
    return change
121
+
122
+
123
def _classify_delta(field: str, delta: float) -> str:
    """Label a right-minus-left delta as ``no_change``, ``assist`` or ``harm``.

    The preferred direction for ``field`` comes from ``IMPACT_DIRECTION``.
    Fields without an entry are treated as "same", so any change beyond the
    tolerance is reported as ``harm``.
    """
    if abs(delta) <= 1e-9:
        return "no_change"

    preferred = IMPACT_DIRECTION.get(field, "same")
    if preferred == "lower":
        improved = delta < 0
    elif preferred == "higher":
        improved = delta > 0
    else:
        improved = False
    return "assist" if improved else "harm"
133
+
134
+
135
def _build_benchmark_command(
    python_executable: str,
    script_path: Path,
    root: Path,
    output_dir: Path,
    max_sessions: int | None,
    recent_turns_per_session: int | None,
    cache_ttl_minutes: int,
    cache_write_multiplier: float,
    workers: int,
) -> list[str]:
    """Assemble the argv list that runs the session-mode benchmark script.

    The two sampling limits are appended only when they are not ``None``.
    """
    always_passed = (
        ("--root", root),
        ("--output-dir", output_dir),
        ("--cache-ttl-minutes", cache_ttl_minutes),
        ("--cache-write-multiplier", cache_write_multiplier),
        ("--workers", workers),
    )
    passed_when_set = (
        ("--max-sessions", max_sessions),
        ("--recent-turns-per-session", recent_turns_per_session),
    )

    argv = [python_executable, str(script_path)]
    for flag, value in always_passed:
        argv += [flag, str(value)]
    for flag, value in passed_when_set:
        if value is not None:
            argv += [flag, str(value)]
    return argv
165
+
166
+
167
def _load_branch_result(
    repo_root: Path,
    ref: str,
    label: str,
    branch_output_dir: Path,
) -> BranchResult:
    """Build a ``BranchResult`` from a finished benchmark run.

    Reads the benchmark JSON (``OUTPUT_JSON``) from ``branch_output_dir`` and
    asks git, run in ``repo_root``, for the commit hash and subject of ``ref``.
    """
    report_file = branch_output_dir / OUTPUT_JSON
    payload = json.loads(report_file.read_text(encoding="utf-8"))

    commit_hash = _run_git(["rev-parse", ref], repo_root)
    subject = _run_git(["show", "-s", "--format=%s", ref], repo_root)

    # These payload sections map one-to-one onto BranchResult fields.
    sections = {
        name: payload[name]
        for name in ("dataset", "observed", "summaries", "winners")
    }
    return BranchResult(
        ref=ref,
        label=label,
        commit=commit_hash,
        summary=subject,
        output_dir=str(branch_output_dir),
        **sections,
    )
187
+
188
+
189
def _run_branch_benchmark(
    repo_root: Path,
    ref: str,
    label: str,
    args: argparse.Namespace,
    worktree_root: Path,
) -> BranchResult:
    """Check out ``ref`` into a detached worktree, benchmark it, and load the result.

    The benchmark script is taken from ``repo_root`` and run with the worktree
    as its working directory. The worktree comes first on ``PYTHONPATH``,
    followed by ``repo_root`` and any existing ``PYTHONPATH`` value.

    Args:
        repo_root: Repository that owns the worktrees and the benchmark script.
        ref: Git ref to check out.
        label: Display label; its slug names the worktree and output folders.
        args: Parsed command-line options (see ``parse_args``).
        worktree_root: Directory under which the worktree is created.

    Returns:
        The ``BranchResult`` loaded from the benchmark's JSON output.

    Raises:
        subprocess.CalledProcessError: if adding the worktree or running the
            benchmark fails.
    """
    worktree_dir = worktree_root / _ref_slug(label)
    branch_output_dir = _branch_output_dir(args.output_dir, label)
    branch_output_dir.mkdir(parents=True, exist_ok=True)
    if worktree_dir.exists():
        shutil.rmtree(worktree_dir)
    _run_git(["worktree", "add", "--detach", str(worktree_dir), ref], repo_root)
    try:
        command = _build_benchmark_command(
            python_executable=args.python,
            script_path=repo_root / "benchmarks" / "claude_session_mode_benchmark.py",
            root=args.root,
            output_dir=branch_output_dir,
            max_sessions=args.max_sessions,
            recent_turns_per_session=args.recent_turns_per_session,
            cache_ttl_minutes=args.cache_ttl_minutes,
            cache_write_multiplier=args.cache_write_multiplier,
            workers=args.workers,
        )
        env = os.environ.copy()
        env["PYTHONPATH"] = os.pathsep.join(
            [str(worktree_dir), str(repo_root), env.get("PYTHONPATH", "")]
        ).rstrip(os.pathsep)
        subprocess.run(command, cwd=worktree_dir, check=True, env=env)
        return _load_branch_result(repo_root, ref, label, branch_output_dir)
    finally:
        if not args.keep_worktrees:
            # Cleanup is best-effort: raising here would replace a benchmark
            # failure from the try-body as the reported error, so a failed
            # removal is only reported as a warning.
            cleanup = subprocess.run(
                ["git", "worktree", "remove", "--force", str(worktree_dir)],
                cwd=repo_root,
                check=False,
            )
            if cleanup.returncode != 0:
                print(
                    f"warning: could not remove worktree {worktree_dir} "
                    f"(git exited with {cleanup.returncode})",
                    file=sys.stderr,
                )
227
+
228
+
229
def _winner_line(metric: str, left: BranchResult, right: BranchResult) -> str:
    """Return one markdown bullet describing which mode each branch picks for ``metric``."""
    left_pick = left.winners[metric]
    right_pick = right.winners[metric]
    if left_pick != right_pick:
        return (
            f"- {metric}: `{left.label}` picks `{left_pick}`, "
            f"`{right.label}` picks `{right_pick}`"
        )
    return f"- {metric}: both pick `{left_pick}`"
237
+
238
+
239
def _build_six_way_rows(
    left: BranchResult, right: BranchResult
) -> list[dict[str, str | float | int]]:
    """Build one row per (branch, mode) pair for the mode matrix.

    Rows are ordered left branch first, then right, each in the mode order
    baseline, token, cache. Each row holds the mode's raw totals, its deltas
    against the same branch's baseline mode, and whether the mode is that
    branch's total-cost winner.
    """

    def against_baseline(branch: BranchResult, mode: str, field: str) -> float:
        # Difference between this mode and the same branch's baseline mode.
        return _mode_metric(branch, mode, field) - _mode_metric(branch, "baseline", field)

    matrix: list[dict[str, str | float | int]] = []
    for branch in (left, right):
        for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
            totals = branch.summaries[mode]

            # Deltas are computed before the raw totals are read.
            cost_change = against_baseline(branch, mode, "total_cost_usd")
            window_change = int(against_baseline(branch, mode, "prompt_window_with_cache"))
            read_change = int(against_baseline(branch, mode, "cache_read_tokens"))
            write_change = int(against_baseline(branch, mode, "cache_write_tokens"))
            paid_input_change = int(against_baseline(branch, mode, "regular_input_tokens"))

            row: dict[str, str | float | int] = {"branch": branch.label, "mode": mode}
            for token_field in (
                "forwarded_input_tokens",
                "cache_read_tokens",
                "cache_write_tokens",
                "regular_input_tokens",
                "output_tokens",
            ):
                row[token_field] = int(totals[token_field])
            row["total_cost_usd"] = float(totals["total_cost_usd"])
            row["cost_delta_vs_branch_baseline"] = cost_change
            row["window_delta_vs_branch_baseline"] = window_change
            row["cache_read_delta_vs_branch_baseline"] = read_change
            row["cache_write_delta_vs_branch_baseline"] = write_change
            row["paid_input_delta_vs_branch_baseline"] = paid_input_change
            row["is_branch_winner"] = "yes" if branch.winners["total_cost"] == mode else "no"
            matrix.append(row)
    return matrix
284
+
285
+
286
def build_compare_markdown(left: BranchResult, right: BranchResult) -> str:
    """Render the branch comparison as a markdown document.

    Sections, in order: branches, dataset (taken from ``right``), winner
    comparison, the six-way mode matrix, and the per-mode metric deltas
    (``right`` minus ``left``) with their classification.
    """

    def as_count(value: float) -> str:
        # Whole-number rendering with thousands separators.
        return f"{int(value):,}"

    def matrix_line(row: dict[str, str | float | int]) -> str:
        cells = [
            str(row["branch"]),
            str(row["mode"]),
            f"{int(row['forwarded_input_tokens']):,}",
            f"{int(row['cache_read_tokens']):,}",
            f"{int(row['cache_write_tokens']):,}",
            f"{int(row['regular_input_tokens']):,}",
            f"{int(row['output_tokens']):,}",
            format_currency(float(row["total_cost_usd"])),
            format_currency(float(row["cost_delta_vs_branch_baseline"])),
            f"{int(row['window_delta_vs_branch_baseline']):,}",
            str(row["is_branch_winner"]),
        ]
        return "| " + " | ".join(cells) + " |"

    matrix_rows = _build_six_way_rows(left, right)

    lines = ["# Claude Session Branch Comparison", "", "## Branches", ""]
    for branch in (left, right):
        lines.append(
            f"- {branch.label}: `{branch.ref}` @ `{branch.commit[:12]}` - {branch.summary}"
        )

    lines += ["", "## Dataset", ""]
    lines.append(f"- Projects: {right.dataset['projects']}")
    lines.append(f"- Sessions: {right.dataset['sessions']}")
    lines.append(f"- Requests: {right.dataset['requests']}")
    lines.append(f"- Sampled requests: {right.dataset.get('sampled_requests', 0)}")
    lines.append(f"- Sampling: {right.dataset.get('sampling_note', 'Full sessions')}")

    lines += ["", "## Winner Comparison", ""]
    for winner_metric in (
        "total_cost",
        "no_cache_total_cost",
        "window_with_cache",
        "window_without_cache_reads",
    ):
        lines.append(_winner_line(winner_metric, left, right))

    lines += [
        "",
        "## Six-Way Mode Matrix",
        "",
        "| Branch | Mode | Forwarded Input | Cache Read | Cache Write | Paid Input | Paid Output | Total Cost | Cost Δ vs Branch Baseline | Window Δ vs Branch Baseline | Winner |",
        "| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |",
    ]
    lines += [matrix_line(row) for row in matrix_rows]

    lines += [
        "",
        "## Mode Deltas",
        "",
        f"| Mode | Metric | {left.label} | {right.label} | Delta ({right.label} - {left.label}) | Classification |",
        "| --- | --- | ---: | ---: | ---: | --- |",
    ]

    # (summary field, column label, value formatter)
    metric_specs = (
        ("total_cost_usd", "Total Cost", format_currency),
        ("no_cache_total_cost_usd", "No-Cache Total Cost", format_currency),
        ("forwarded_input_tokens", "Forwarded Input Tokens", as_count),
        ("cache_read_tokens", "Cache Read Tokens", as_count),
        ("cache_write_tokens", "Cache Write Tokens", as_count),
        ("cache_bust_turns", "Cache Bust Turns", as_count),
        ("ttl_expiry_turns", "TTL Expiry Turns", as_count),
        ("prompt_window_with_cache", "Window With Cache", as_count),
        ("prompt_window_without_cache_reads", "Window Without Cache Reads", as_count),
    )
    for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
        for field, label, render in metric_specs:
            left_value = _mode_metric(left, mode, field)
            right_value = _mode_metric(right, mode, field)
            change = _delta(left_value, right_value)
            if "cost" in field:
                change_text = format_currency(change)
            else:
                change_text = f"{int(change):,}"
            verdict = _classify_delta(field, change)
            lines.append(
                f"| {mode} | {label} | {render(left_value)} | {render(right_value)} | {change_text} | {verdict} |"
            )
    return "\n".join(lines)
367
+
368
+
369
def build_compare_html(left: BranchResult, right: BranchResult) -> str:
    """Render the branch comparison as a standalone HTML page.

    The page has one summary card per branch, the six-way mode matrix, and
    the per-mode metric deltas (``right`` minus ``left``) with their
    classification.

    Labels, refs, commit hashes, commit subjects, mode names and winner names
    come from the command line, git and the benchmark JSON, so they are
    HTML-escaped before being placed in the markup. Numeric cells are produced
    by the formatters and need no escaping.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    def text(value: object) -> str:
        # Escape arbitrary text for use as HTML element content.
        return escape(str(value))

    six_way_rows = []
    for row in _build_six_way_rows(left, right):
        six_way_rows.append(
            "<tr>"
            f"<td>{text(row['branch'])}</td>"
            f"<td><span class='pill'>{text(row['mode'])}</span></td>"
            f"<td>{int(row['forwarded_input_tokens']):,}</td>"
            f"<td>{int(row['cache_read_tokens']):,}</td>"
            f"<td>{int(row['cache_write_tokens']):,}</td>"
            f"<td>{int(row['regular_input_tokens']):,}</td>"
            f"<td>{int(row['output_tokens']):,}</td>"
            f"<td>{format_currency(float(row['total_cost_usd']))}</td>"
            f"<td>{format_currency(float(row['cost_delta_vs_branch_baseline']))}</td>"
            f"<td>{int(row['window_delta_vs_branch_baseline']):,}</td>"
            f"<td>{text(row['is_branch_winner'])}</td>"
            "</tr>"
        )
    cards = []
    for branch in (left, right):
        cards.append(
            "<div class='card'>"
            f"<div class='eyebrow'>{text(branch.label)}</div>"
            f"<h2>{text(branch.ref)}</h2>"
            f"<p><code>{text(branch.commit[:12])}</code></p>"
            f"<p>{text(branch.summary)}</p>"
            "<div class='winner-grid'>"
            f"<div><span>Total Cost</span><strong>{text(branch.winners['total_cost'])}</strong></div>"
            f"<div><span>No Cache</span><strong>{text(branch.winners['no_cache_total_cost'])}</strong></div>"
            f"<div><span>Window + Cache</span><strong>{text(branch.winners['window_with_cache'])}</strong></div>"
            "<div><span>Window - Reads</span>"
            f"<strong>{text(branch.winners['window_without_cache_reads'])}</strong></div>"
            "</div>"
            "</div>"
        )
    rows = []
    for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
        for field, label in (
            ("total_cost_usd", "Total Cost"),
            ("no_cache_total_cost_usd", "No-Cache Total Cost"),
            ("forwarded_input_tokens", "Forwarded Input Tokens"),
            ("cache_read_tokens", "Cache Read Tokens"),
            ("cache_write_tokens", "Cache Write Tokens"),
            ("cache_bust_turns", "Cache Bust Turns"),
            ("prompt_window_with_cache", "Window With Cache"),
            ("prompt_window_without_cache_reads", "Window Without Cache Reads"),
        ):
            left_value = _mode_metric(left, mode, field)
            right_value = _mode_metric(right, mode, field)
            delta = _delta(left_value, right_value)
            is_cost = "cost" in field
            formatter = format_currency if is_cost else (lambda v: f"{int(v):,}")
            delta_text = format_currency(delta) if is_cost else f"{int(delta):,}"
            # CSS class follows the sign of the delta: "pos" above zero,
            # "neg" below zero, "neutral" at exactly zero.
            delta_class = "pos" if delta > 0 else "neg" if delta < 0 else "neutral"
            classification = _classify_delta(field, delta)
            rows.append(
                "<tr>"
                f"<td><span class='pill'>{text(mode)}</span></td>"
                f"<td>{text(label)}</td>"
                f"<td>{formatter(left_value)}</td>"
                f"<td>{formatter(right_value)}</td>"
                f"<td class='{delta_class}'>{delta_text}</td>"
                f"<td>{text(classification)}</td>"
                "</tr>"
            )
    return f"""<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Claude Session Branch Comparison</title>
  <style>
    :root {{
      --bg: #f8fafc;
      --fg: #020617;
      --muted: #475569;
      --card: #ffffff;
      --line: #e2e8f0;
      --soft: #f1f5f9;
      --accent: #0f172a;
      --accent-soft: #e2e8f0;
      --good: #166534;
      --bad: #991b1b;
      --shadow: 0 10px 30px rgba(15, 23, 42, 0.08);
      --radius: 16px;
      --font: "Geist", "Segoe UI", system-ui, sans-serif;
    }}
    * {{ box-sizing: border-box; }}
    body {{ margin: 0; background: var(--bg); color: var(--fg); font-family: var(--font); }}
    .shell {{ max-width: 1280px; margin: 0 auto; padding: 32px 16px 56px; }}
    .hero, .card {{
      background: var(--card);
      border: 1px solid var(--line);
      border-radius: var(--radius);
      box-shadow: var(--shadow);
    }}
    .hero {{ padding: 24px; }}
    .eyebrow {{ color: var(--muted); font-size: 12px; font-weight: 600; text-transform: uppercase; letter-spacing: .08em; }}
    h1, h2 {{ margin: 0; letter-spacing: -0.03em; }}
    p {{ color: var(--muted); line-height: 1.5; }}
    .grid {{ display: grid; gap: 16px; margin-top: 16px; }}
    .two {{ grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); }}
    .card {{ padding: 20px; }}
    .winner-grid {{ display: grid; grid-template-columns: repeat(2, minmax(0, 1fr)); gap: 12px; margin-top: 16px; }}
    .winner-grid span {{ display: block; color: var(--muted); font-size: 12px; }}
    .winner-grid strong {{ display: block; margin-top: 4px; font-size: 16px; }}
    .table-card {{ margin-top: 16px; padding: 0; overflow: hidden; }}
    .table-wrap {{ overflow-x: auto; }}
    table {{ width: 100%; border-collapse: collapse; }}
    th, td {{ padding: 12px 14px; border-bottom: 1px solid var(--line); text-align: left; white-space: nowrap; }}
    th {{ background: var(--soft); color: var(--muted); font-size: 12px; text-transform: uppercase; letter-spacing: .04em; }}
    .pill {{
      display: inline-flex; align-items: center; border-radius: 999px; padding: 4px 10px;
      background: var(--accent-soft); color: var(--accent); font-size: 12px; font-weight: 600;
    }}
    .pos {{ color: var(--bad); font-weight: 600; }}
    .neg {{ color: var(--good); font-weight: 600; }}
    .neutral {{ color: var(--muted); }}
    code {{ font-family: ui-monospace, SFMono-Regular, Consolas, monospace; }}
  </style>
</head>
<body>
  <div class="shell">
    <section class="hero">
      <div class="eyebrow">Branch Comparison</div>
      <h1>Claude Session Mode Simulation</h1>
      <p>Same local Claude transcript corpus. Same simulation knobs. Two git refs. This report isolates code-level behavior changes between the branches.</p>
      <div class="grid two">
        {"".join(cards)}
      </div>
    </section>
    <section class="card table-card">
      <div class="table-wrap">
        <table>
          <thead>
            <tr>
              <th>Branch</th>
              <th>Mode</th>
              <th>Forwarded Input</th>
              <th>Cache Read</th>
              <th>Cache Write</th>
              <th>Paid Input</th>
              <th>Paid Output</th>
              <th>Total Cost</th>
              <th>Cost Δ vs Branch Baseline</th>
              <th>Window Δ vs Branch Baseline</th>
              <th>Winner</th>
            </tr>
          </thead>
          <tbody>
            {"".join(six_way_rows)}
          </tbody>
        </table>
      </div>
    </section>
    <section class="card table-card">
      <div class="table-wrap">
        <table>
          <thead>
            <tr>
              <th>Mode</th>
              <th>Metric</th>
              <th>{text(left.label)}</th>
              <th>{text(right.label)}</th>
              <th>Delta</th>
              <th>Classification</th>
            </tr>
          </thead>
          <tbody>
            {"".join(rows)}
          </tbody>
        </table>
      </div>
    </section>
  </div>
</body>
</html>"""
546
+
547
+
548
def write_compare_report(
    output_dir: Path,
    left: BranchResult,
    right: BranchResult,
) -> tuple[Path, Path, Path]:
    """Write the markdown, HTML and JSON comparison reports into ``output_dir``.

    The directory is created when missing. Files are written in the order
    markdown, HTML, JSON.

    Returns:
        The (markdown, json, html) report paths.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    report_paths = _comparison_paths(output_dir)
    markdown_file, json_file, html_file = report_paths

    markdown_file.write_text(build_compare_markdown(left, right), encoding="utf-8")
    html_file.write_text(build_compare_html(left, right), encoding="utf-8")

    document = dict(
        left=asdict(left),
        right=asdict(right),
        left_winners=left.winners,
        right_winners=right.winners,
    )
    json_file.write_text(json.dumps(document, indent=2), encoding="utf-8")
    return markdown_file, json_file, html_file
565
+
566
+
567
def main() -> int:
    """Benchmark both refs in temporary worktrees and write the comparison reports.

    A relative ``--output-dir`` is anchored at the repository root (the parent
    of this file's directory). A relative ``--root`` is resolved with
    ``Path.resolve()``. The temporary worktree folder is deleted afterwards
    unless ``--keep-worktrees`` was given.

    Returns:
        ``0`` after the reports have been written.
    """
    args = parse_args()
    repo_root = Path(__file__).resolve().parents[1]

    # Normalise both paths to absolute ones before they reach the subprocesses.
    if not args.output_dir.is_absolute():
        args.output_dir = (repo_root / args.output_dir).resolve()
    if not args.root.is_absolute():
        args.root = args.root.resolve()
    args.output_dir.mkdir(parents=True, exist_ok=True)

    worktree_root = Path(tempfile.mkdtemp(prefix="headroom-branch-compare-"))
    try:
        left = _run_branch_benchmark(
            repo_root, args.left_ref, args.left_label, args, worktree_root
        )
        right = _run_branch_benchmark(
            repo_root, args.right_ref, args.right_label, args, worktree_root
        )
        md_path, json_path, html_path = write_compare_report(args.output_dir, left, right)
        print(f"Compared {left.label} ({left.ref}) vs {right.label} ({right.ref})")
        print(f"Markdown report: {md_path}")
        print(f"JSON report: {json_path}")
        print(f"HTML report: {html_path}")
    finally:
        if not args.keep_worktrees:
            shutil.rmtree(worktree_root, ignore_errors=True)
        else:
            print(f"Retained worktrees under {worktree_root}")
    return 0
592
+
593
+
594
# Script entry point: exit with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
benchmarks/claude_session_mode_benchmark.py CHANGED
@@ -20,11 +20,16 @@ from headroom.cache.prefix_tracker import PrefixCacheTracker
20
  from headroom.pricing.litellm_pricing import get_model_pricing
21
  from headroom.proxy.handlers.anthropic import AnthropicHandlerMixin
22
  from headroom.proxy.models import ProxyConfig
23
- from headroom.proxy.modes import PROXY_MODE_CACHE, PROXY_MODE_TOKEN
24
  from headroom.proxy.server import HeadroomProxy
25
  from headroom.tokenizers import get_tokenizer
26
  from headroom.utils import extract_user_query
27
 
 
 
 
 
 
 
28
  DEFAULT_ROOT = Path.home() / ".claude" / "projects"
29
  DEFAULT_OUTPUT_DIR = Path("benchmark_results")
30
  DEFAULT_CACHE_TTL_MINUTES = 5
@@ -96,6 +101,9 @@ class ModeSummary:
96
  cache_eligible_turns: int = 0
97
  cache_bust_turns: int = 0
98
  ttl_expiry_turns: int = 0
 
 
 
99
  turns: list[TurnMetrics] = field(default_factory=list)
100
 
101
  @property
@@ -136,6 +144,24 @@ class DatasetSummary:
136
  sampling_note: str = ""
137
 
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  @dataclass
140
  class ObservedSummary:
141
  sessions: int = 0
@@ -221,6 +247,9 @@ def _mode_summary_from_dict(data: dict[str, Any]) -> ModeSummary:
221
  cache_eligible_turns=data.get("cache_eligible_turns", 0),
222
  cache_bust_turns=data.get("cache_bust_turns", 0),
223
  ttl_expiry_turns=data.get("ttl_expiry_turns", 0),
 
 
 
224
  turns=turns,
225
  )
226
  return summary
@@ -618,6 +647,133 @@ def _common_prefix_tokens(
618
  return common
619
 
620
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
621
  def _make_proxy(mode: str) -> HeadroomProxy:
622
  cfg = ProxyConfig(
623
  mode=mode,
@@ -655,25 +811,47 @@ def _apply_mode_to_messages(
655
  assert proxy is not None
656
  assert prefix_tracker is not None
657
  if mode == PROXY_MODE_CACHE:
658
- delta = AnthropicHandlerMixin._extract_cache_stable_delta(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
659
  messages,
660
  previous_original_messages,
661
  previous_forwarded_messages,
662
  )
663
- if delta is None:
664
- return copy.deepcopy(messages)
665
- stable_forwarded_prefix, delta_messages = delta
666
- if not delta_messages:
667
- return stable_forwarded_prefix
668
- context_limit = proxy.anthropic_provider.get_context_limit(model)
669
- result = proxy.anthropic_pipeline.apply(
670
- messages=delta_messages,
671
- model=model,
672
- model_limit=context_limit,
673
- context=extract_user_query(delta_messages),
674
- frozen_message_count=0,
675
- )
676
- return stable_forwarded_prefix + result.messages
 
677
 
678
  frozen_message_count = prefix_tracker.get_frozen_message_count()
679
 
@@ -842,6 +1020,9 @@ def _merge_mode_summary(target: ModeSummary, source: ModeSummary) -> None:
842
  target.cache_eligible_turns += source.cache_eligible_turns
843
  target.cache_bust_turns += source.cache_bust_turns
844
  target.ttl_expiry_turns += source.ttl_expiry_turns
 
 
 
845
 
846
 
847
  def _disable_headroom_benchmark_logging() -> None:
@@ -914,6 +1095,32 @@ def _write_checkpoint_by_session_id(
914
  path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
915
 
916
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
917
  def _simulate_single_replay_mode(
918
  replay: SessionReplay,
919
  mode: str,
@@ -938,6 +1145,7 @@ def _simulate_single_replay_mode(
938
  for turn in replay.turns:
939
  tokenizer = get_tokenizer(turn.model)
940
  turn_input_token_total = sum(tokenizer.count_message(msg) for msg in turn.input_messages)
 
941
  conversation.extend(turn.input_messages)
942
  raw_input_tokens = conversation_token_total + turn_input_token_total
943
  forwarded = _apply_mode_to_messages(
@@ -950,6 +1158,17 @@ def _simulate_single_replay_mode(
950
  previous_original_messages=previous_original_context,
951
  previous_forwarded_messages=previous_forwarded_context,
952
  )
 
 
 
 
 
 
 
 
 
 
 
953
  if pending is not None:
954
  _apply_turn_metrics(
955
  pending.summary,
@@ -968,7 +1187,8 @@ def _simulate_single_replay_mode(
968
  previous_timestamp = pending.turn.timestamp
969
 
970
  if prefix_tracker is not None:
971
- prefix_tracker.update_from_response(
 
972
  cache_read_tokens=0,
973
  cache_write_tokens=0,
974
  messages=forwarded,
@@ -1231,12 +1451,60 @@ def determine_winners(summaries: dict[str, ModeSummary]) -> dict[str, str]:
1231
  }
1232
 
1233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1234
  def format_currency(value: float) -> str:
1235
  return f"${value:,.2f}"
1236
 
1237
 
1238
  def print_console_report(dataset: DatasetSummary, summaries: dict[str, ModeSummary]) -> None:
1239
  winners = determine_winners(summaries)
 
1240
  print("Claude session mode simulation")
1241
  print(
1242
  f"Dataset: {dataset.projects} projects, {dataset.sessions} sessions, "
@@ -1245,7 +1513,7 @@ def print_console_report(dataset: DatasetSummary, summaries: dict[str, ModeSumma
1245
  print(f"Sampling: {dataset.sampling_note}")
1246
  print()
1247
  print(
1248
- "mode raw_tok cache_tok cache_read cache_write paid_in paid_out busts ttl_exp total_cost no_cache"
1249
  )
1250
  for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
1251
  summary = summaries[mode]
@@ -1254,6 +1522,7 @@ def print_console_report(dataset: DatasetSummary, summaries: dict[str, ModeSumma
1254
  f"{summary.cache_read_tokens:>11,} {summary.cache_write_tokens:>12,} "
1255
  f"{summary.regular_input_tokens:>10,} {summary.output_tokens:>12,} "
1256
  f"{summary.cache_bust_turns:>7,} {summary.ttl_expiry_turns:>9,} "
 
1257
  f"{format_currency(summary.total_cost_usd):>11} "
1258
  f"{format_currency(summary.no_cache_total_cost_usd):>11}"
1259
  )
@@ -1265,6 +1534,26 @@ def print_console_report(dataset: DatasetSummary, summaries: dict[str, ModeSumma
1265
  "Winner if cache read tokens do not count against window: "
1266
  f"{winners['window_without_cache_reads']}"
1267
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
 
1269
 
1270
  def print_observed_console_report(observed: ObservedSummary) -> None:
@@ -1288,6 +1577,7 @@ def build_report_markdown(
1288
  summaries: dict[str, ModeSummary],
1289
  ) -> str:
1290
  winners = determine_winners(summaries)
 
1291
  model_lines = "\n".join(f"- `{model}`: {count}" for model, count in dataset.models.items())
1292
  rows = []
1293
  for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
@@ -1311,12 +1601,36 @@ def build_report_markdown(
1311
  format_currency(summary.no_cache_total_cost_usd),
1312
  f"{summary.cache_bust_turns:,}",
1313
  f"{summary.ttl_expiry_turns:,}",
 
 
 
1314
  f"{summary.prompt_window_with_cache:,}",
1315
  f"{summary.prompt_window_without_cache_reads:,}",
1316
  ]
1317
  )
1318
  + " |"
1319
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1320
  return "\n".join(
1321
  [
1322
  "# Claude Session Mode Simulation",
@@ -1351,10 +1665,16 @@ def build_report_markdown(
1351
  "",
1352
  "## Summary",
1353
  "",
1354
- "| Mode | Raw Tokens | Cache Tokens | Cache Read | Cache Write | Paid Input Tokens | Paid Output Tokens | Paid Input Cost | Cache Read Cost | Cache Write Cost | Paid Output Cost | Total Cost | No-Cache Total Cost | Cache Bust Turns | TTL Expiry Turns | Window Tokens (Cache Counted) | Window Tokens (Cache Reads Excluded) |",
1355
- "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
1356
  *rows,
1357
  "",
 
 
 
 
 
 
1358
  "## Winners",
1359
  "",
1360
  f"- Total cost winner: `{winners['total_cost']}`",
@@ -1372,6 +1692,7 @@ def build_report_html(
1372
  summaries: dict[str, ModeSummary],
1373
  ) -> str:
1374
  winners = determine_winners(summaries)
 
1375
  model_items = "".join(
1376
  f"<li><code>{model}</code><span>{count:,}</span></li>"
1377
  for model, count in dataset.models.items()
@@ -1390,12 +1711,42 @@ def build_report_html(
1390
  f"<td>{summary.output_tokens:,}</td>"
1391
  f"<td>{summary.cache_bust_turns:,}</td>"
1392
  f"<td>{summary.ttl_expiry_turns:,}</td>"
 
 
 
1393
  f"<td>{format_currency(summary.total_cost_usd)}</td>"
1394
  f"<td>{format_currency(summary.no_cache_total_cost_usd)}</td>"
1395
  f"<td>{summary.prompt_window_with_cache:,}</td>"
1396
  f"<td>{summary.prompt_window_without_cache_reads:,}</td>"
1397
  "</tr>"
1398
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1399
  return f"""<!doctype html>
1400
  <html lang="en">
1401
  <head>
@@ -1517,7 +1868,7 @@ def build_report_html(
1517
  <table>
1518
  <thead>
1519
  <tr>
1520
- <th>Mode</th><th>Raw Tokens</th><th>Cache Tokens</th><th>Cache Read</th><th>Cache Write</th><th>Paid Input</th><th>Paid Output</th><th>Cache Busts</th><th>TTL Expiry</th><th>Total Cost</th><th>No-Cache Cost</th><th>Window With Cache</th><th>Window Without Cache Reads</th>
1521
  </tr>
1522
  </thead>
1523
  <tbody>
@@ -1526,6 +1877,21 @@ def build_report_html(
1526
  </table>
1527
  </div>
1528
  </section>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1529
  </div>
1530
  </body>
1531
  </html>"""
@@ -1548,6 +1914,7 @@ def write_report(
1548
  "observed": asdict(observed),
1549
  "summaries": {mode: asdict(summary) for mode, summary in summaries.items()},
1550
  "winners": determine_winners(summaries),
 
1551
  }
1552
  json_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
1553
  return md_path, json_path, html_path
 
20
  from headroom.pricing.litellm_pricing import get_model_pricing
21
  from headroom.proxy.handlers.anthropic import AnthropicHandlerMixin
22
  from headroom.proxy.models import ProxyConfig
 
23
  from headroom.proxy.server import HeadroomProxy
24
  from headroom.tokenizers import get_tokenizer
25
  from headroom.utils import extract_user_query
26
 
27
+ try:
28
+ from headroom.proxy.modes import PROXY_MODE_CACHE, PROXY_MODE_TOKEN
29
+ except ImportError:
30
+ PROXY_MODE_CACHE = "cache"
31
+ PROXY_MODE_TOKEN = "token"
32
+
33
  DEFAULT_ROOT = Path.home() / ".claude" / "projects"
34
  DEFAULT_OUTPUT_DIR = Path("benchmark_results")
35
  DEFAULT_CACHE_TTL_MINUTES = 5
 
101
  cache_eligible_turns: int = 0
102
  cache_bust_turns: int = 0
103
  ttl_expiry_turns: int = 0
104
+ rewrite_turns: int = 0
105
+ retroactive_rewrite_turns: int = 0
106
+ latest_turn_only_rewrite_turns: int = 0
107
  turns: list[TurnMetrics] = field(default_factory=list)
108
 
109
  @property
 
144
  sampling_note: str = ""
145
 
146
 
147
# Preferred direction of change for every metric compared against the baseline.
# "lower"  -> a smaller candidate value is classified as an improvement,
# "higher" -> a larger candidate value is classified as an improvement,
# "same"   -> any deviation beyond tolerance is classified as harmful.
# Insertion order is significant: it drives the order of the impact report.
IMPACT_DIRECTION = dict(
    forwarded_input_tokens="lower",
    cache_read_tokens="higher",
    cache_write_tokens="lower",
    regular_input_tokens="lower",
    output_tokens="same",
    total_cost_usd="lower",
    no_cache_total_cost_usd="lower",
    prompt_window_with_cache="lower",
    prompt_window_without_cache_reads="lower",
    cache_bust_turns="lower",
    ttl_expiry_turns="lower",
    rewrite_turns="lower",
    retroactive_rewrite_turns="lower",
    latest_turn_only_rewrite_turns="lower",
)
163
+
164
+
165
  @dataclass
166
  class ObservedSummary:
167
  sessions: int = 0
 
247
  cache_eligible_turns=data.get("cache_eligible_turns", 0),
248
  cache_bust_turns=data.get("cache_bust_turns", 0),
249
  ttl_expiry_turns=data.get("ttl_expiry_turns", 0),
250
+ rewrite_turns=data.get("rewrite_turns", 0),
251
+ retroactive_rewrite_turns=data.get("retroactive_rewrite_turns", 0),
252
+ latest_turn_only_rewrite_turns=data.get("latest_turn_only_rewrite_turns", 0),
253
  turns=turns,
254
  )
255
  return summary
 
647
  return common
648
 
649
 
650
+ def _rewrite_scope(
651
+ original_messages: list[dict[str, Any]],
652
+ forwarded_messages: list[dict[str, Any]],
653
+ *,
654
+ stable_prefix_message_count: int,
655
+ ) -> tuple[bool, bool]:
656
+ if original_messages == forwarded_messages:
657
+ return False, False
658
+ stable_count = min(
659
+ stable_prefix_message_count,
660
+ len(original_messages),
661
+ len(forwarded_messages),
662
+ )
663
+ retroactive = False
664
+ if len(forwarded_messages) < stable_prefix_message_count:
665
+ retroactive = True
666
+ elif stable_count > 0 and forwarded_messages[:stable_count] != original_messages[:stable_count]:
667
+ retroactive = True
668
+ return True, retroactive
669
+
670
+
671
+ def _extract_cache_stable_delta(
672
+ current_messages: list[dict[str, Any]],
673
+ previous_original_messages: list[dict[str, Any]] | None,
674
+ previous_forwarded_messages: list[dict[str, Any]] | None,
675
+ ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]] | None:
676
+ if previous_original_messages is None or previous_forwarded_messages is None:
677
+ return None
678
+ if len(current_messages) < len(previous_original_messages):
679
+ return None
680
+ stable_count = len(previous_original_messages)
681
+ if current_messages[:stable_count] != previous_original_messages:
682
+ return None
683
+ return (
684
+ copy.deepcopy(previous_forwarded_messages),
685
+ copy.deepcopy(current_messages[stable_count:]),
686
+ )
687
+
688
+
689
+ def _extract_cache_stable_last_message_suffix(
690
+ current_messages: list[dict[str, Any]],
691
+ previous_original_messages: list[dict[str, Any]] | None,
692
+ previous_forwarded_messages: list[dict[str, Any]] | None,
693
+ ) -> tuple[list[dict[str, Any]], dict[str, Any], list[dict[str, Any]]] | None:
694
+ if not previous_original_messages or previous_forwarded_messages is None:
695
+ return None
696
+ if (
697
+ len(current_messages) != len(previous_original_messages)
698
+ or len(previous_forwarded_messages) != len(previous_original_messages)
699
+ or not current_messages
700
+ ):
701
+ return None
702
+ prefix_len = len(current_messages) - 1
703
+ if prefix_len > 0 and current_messages[:prefix_len] != previous_original_messages[:prefix_len]:
704
+ return None
705
+
706
+ current_last = current_messages[-1]
707
+ previous_original_last = previous_original_messages[-1]
708
+ previous_forwarded_last = previous_forwarded_messages[-1]
709
+ if current_last.get("role") != previous_original_last.get("role") or current_last.get(
710
+ "role"
711
+ ) != previous_forwarded_last.get("role"):
712
+ return None
713
+
714
+ current_content = current_last.get("content")
715
+ previous_original_content = previous_original_last.get("content")
716
+ previous_forwarded_content = previous_forwarded_last.get("content")
717
+
718
+ if (
719
+ isinstance(current_content, str)
720
+ and isinstance(previous_original_content, str)
721
+ and isinstance(previous_forwarded_content, str)
722
+ and current_content.startswith(previous_original_content)
723
+ ):
724
+ suffix = current_content[len(previous_original_content) :]
725
+ delta_messages = []
726
+ if suffix:
727
+ delta_messages = [{**copy.deepcopy(current_last), "content": suffix}]
728
+ return (
729
+ copy.deepcopy(previous_forwarded_messages[:-1]),
730
+ copy.deepcopy(previous_forwarded_last),
731
+ delta_messages,
732
+ )
733
+
734
+ if (
735
+ isinstance(current_content, list)
736
+ and isinstance(previous_original_content, list)
737
+ and isinstance(previous_forwarded_content, list)
738
+ and len(current_content) >= len(previous_original_content)
739
+ and current_content[: len(previous_original_content)] == previous_original_content
740
+ ):
741
+ delta_blocks = copy.deepcopy(current_content[len(previous_original_content) :])
742
+ delta_messages = []
743
+ if delta_blocks:
744
+ delta_messages = [{**copy.deepcopy(current_last), "content": delta_blocks}]
745
+ return (
746
+ copy.deepcopy(previous_forwarded_messages[:-1]),
747
+ copy.deepcopy(previous_forwarded_last),
748
+ delta_messages,
749
+ )
750
+ return None
751
+
752
+
753
+ def _merge_appended_message_delta(
754
+ previous_forwarded_message: dict[str, Any],
755
+ delta_forwarded_message: dict[str, Any] | None,
756
+ ) -> dict[str, Any] | None:
757
+ if delta_forwarded_message is None:
758
+ return copy.deepcopy(previous_forwarded_message)
759
+ if previous_forwarded_message.get("role") != delta_forwarded_message.get("role"):
760
+ return None
761
+
762
+ previous_content = previous_forwarded_message.get("content")
763
+ delta_content = delta_forwarded_message.get("content")
764
+ if isinstance(previous_content, str) and isinstance(delta_content, str):
765
+ return {
766
+ **copy.deepcopy(previous_forwarded_message),
767
+ "content": previous_content + delta_content,
768
+ }
769
+ if isinstance(previous_content, list) and isinstance(delta_content, list):
770
+ return {
771
+ **copy.deepcopy(previous_forwarded_message),
772
+ "content": copy.deepcopy(previous_content) + copy.deepcopy(delta_content),
773
+ }
774
+ return None
775
+
776
+
777
  def _make_proxy(mode: str) -> HeadroomProxy:
778
  cfg = ProxyConfig(
779
  mode=mode,
 
811
  assert proxy is not None
812
  assert prefix_tracker is not None
813
  if mode == PROXY_MODE_CACHE:
814
+ supports_delta_replay = hasattr(
815
+ AnthropicHandlerMixin, "_extract_cache_stable_last_message_suffix"
816
+ )
817
+ if not supports_delta_replay:
818
+ frozen_message_count = prefix_tracker.get_frozen_message_count()
819
+ context_limit = proxy.anthropic_provider.get_context_limit(model)
820
+ result = proxy.anthropic_pipeline.apply(
821
+ messages=copy.deepcopy(messages),
822
+ model=model,
823
+ model_limit=context_limit,
824
+ context=extract_user_query(messages),
825
+ frozen_message_count=frozen_message_count,
826
+ )
827
+ if hasattr(AnthropicHandlerMixin, "_restore_frozen_prefix"):
828
+ result.messages, _ = AnthropicHandlerMixin._restore_frozen_prefix(
829
+ messages,
830
+ result.messages,
831
+ frozen_message_count=frozen_message_count,
832
+ )
833
+ return result.messages
834
+
835
+ delta = _extract_cache_stable_delta(
836
  messages,
837
  previous_original_messages,
838
  previous_forwarded_messages,
839
  )
840
+ if delta is not None:
841
+ stable_forwarded_prefix, delta_messages = delta
842
+ if not delta_messages:
843
+ return stable_forwarded_prefix
844
+ context_limit = proxy.anthropic_provider.get_context_limit(model)
845
+ result = proxy.anthropic_pipeline.apply(
846
+ messages=delta_messages,
847
+ model=model,
848
+ model_limit=context_limit,
849
+ context=extract_user_query(delta_messages),
850
+ frozen_message_count=0,
851
+ )
852
+ return stable_forwarded_prefix + result.messages
853
+
854
+ return copy.deepcopy(messages)
855
 
856
  frozen_message_count = prefix_tracker.get_frozen_message_count()
857
 
 
1020
  target.cache_eligible_turns += source.cache_eligible_turns
1021
  target.cache_bust_turns += source.cache_bust_turns
1022
  target.ttl_expiry_turns += source.ttl_expiry_turns
1023
+ target.rewrite_turns += source.rewrite_turns
1024
+ target.retroactive_rewrite_turns += source.retroactive_rewrite_turns
1025
+ target.latest_turn_only_rewrite_turns += source.latest_turn_only_rewrite_turns
1026
 
1027
 
1028
  def _disable_headroom_benchmark_logging() -> None:
 
1095
  path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
1096
 
1097
 
1098
+ def _update_prefix_tracker(
1099
+ prefix_tracker: PrefixCacheTracker,
1100
+ *,
1101
+ cache_read_tokens: int,
1102
+ cache_write_tokens: int,
1103
+ messages: list[dict[str, Any]],
1104
+ message_token_counts: list[int],
1105
+ original_messages: list[dict[str, Any]] | None = None,
1106
+ ) -> None:
1107
+ try:
1108
+ prefix_tracker.update_from_response(
1109
+ cache_read_tokens=cache_read_tokens,
1110
+ cache_write_tokens=cache_write_tokens,
1111
+ messages=messages,
1112
+ message_token_counts=message_token_counts,
1113
+ original_messages=original_messages,
1114
+ )
1115
+ except TypeError:
1116
+ prefix_tracker.update_from_response(
1117
+ cache_read_tokens=cache_read_tokens,
1118
+ cache_write_tokens=cache_write_tokens,
1119
+ messages=messages,
1120
+ message_token_counts=message_token_counts,
1121
+ )
1122
+
1123
+
1124
  def _simulate_single_replay_mode(
1125
  replay: SessionReplay,
1126
  mode: str,
 
1145
  for turn in replay.turns:
1146
  tokenizer = get_tokenizer(turn.model)
1147
  turn_input_token_total = sum(tokenizer.count_message(msg) for msg in turn.input_messages)
1148
+ prior_context_message_count = len(conversation)
1149
  conversation.extend(turn.input_messages)
1150
  raw_input_tokens = conversation_token_total + turn_input_token_total
1151
  forwarded = _apply_mode_to_messages(
 
1158
  previous_original_messages=previous_original_context,
1159
  previous_forwarded_messages=previous_forwarded_context,
1160
  )
1161
+ rewrite, retroactive_rewrite = _rewrite_scope(
1162
+ conversation,
1163
+ forwarded,
1164
+ stable_prefix_message_count=prior_context_message_count,
1165
+ )
1166
+ if rewrite:
1167
+ summary.rewrite_turns += 1
1168
+ if retroactive_rewrite:
1169
+ summary.retroactive_rewrite_turns += 1
1170
+ else:
1171
+ summary.latest_turn_only_rewrite_turns += 1
1172
  if pending is not None:
1173
  _apply_turn_metrics(
1174
  pending.summary,
 
1187
  previous_timestamp = pending.turn.timestamp
1188
 
1189
  if prefix_tracker is not None:
1190
+ _update_prefix_tracker(
1191
+ prefix_tracker,
1192
  cache_read_tokens=0,
1193
  cache_write_tokens=0,
1194
  messages=forwarded,
 
1451
  }
1452
 
1453
 
1454
+ def _metric_value(summary: ModeSummary, field: str) -> float:
1455
+ value = getattr(summary, field)
1456
+ return float(value)
1457
+
1458
+
1459
def classify_metric_impact(
    baseline: ModeSummary,
    candidate: ModeSummary,
    field: str,
) -> dict[str, float | str]:
    """Classify how ``candidate`` moved one metric relative to ``baseline``.

    The delta is ``candidate - baseline``. Deltas within an absolute tolerance
    of 1e-9 are "no_change". Otherwise the preferred direction registered in
    ``IMPACT_DIRECTION`` decides between "assist" and "harm"; for metrics
    that should stay the same, a deviation beyond tolerance is "harm".

    Returns a dict with the keys ``baseline``, ``candidate``, ``delta``,
    ``impact`` and ``direction``.
    """
    tolerance = 1e-9
    before = _metric_value(baseline, field)
    after = _metric_value(candidate, field)
    change = after - before
    preferred = IMPACT_DIRECTION[field]

    # Whether the observed change goes the preferred way, per direction.
    improved = {"lower": change < 0, "higher": change > 0}
    if abs(change) <= tolerance:
        verdict = "no_change"
    elif preferred in improved:
        verdict = "assist" if improved[preferred] else "harm"
    elif abs(change) > tolerance:
        verdict = "harm"
    else:
        # Only reachable for a NaN delta on a "same"-direction metric.
        verdict = "no_change"

    return {
        "baseline": before,
        "candidate": after,
        "delta": change,
        "impact": verdict,
        "direction": preferred,
    }
1486
+
1487
+
1488
def summarize_mode_impact_vs_baseline(
    summaries: dict[str, ModeSummary],
) -> dict[str, dict[str, dict[str, float | str]]]:
    """Classify every tracked metric of the token and cache modes vs baseline.

    The result maps each proxy mode to ``{metric: classification}``, where the
    metrics are the keys of ``IMPACT_DIRECTION`` and each classification is
    the dict produced by ``classify_metric_impact``.
    """
    reference = summaries["baseline"]

    def _impacts_for(candidate: ModeSummary) -> dict[str, dict[str, float | str]]:
        return {
            metric: classify_metric_impact(reference, candidate, metric)
            for metric in IMPACT_DIRECTION
        }

    return {
        mode: _impacts_for(summaries[mode])
        for mode in (PROXY_MODE_TOKEN, PROXY_MODE_CACHE)
    }
1499
+
1500
+
1501
def format_currency(value: float) -> str:
    """Format ``value`` as US dollars with thousands separators and 2 decimals.

    Negative amounts (for example cost deltas against the baseline) are
    rendered with the sign in front of the currency symbol, ``-$1,234.50``,
    instead of ``$-1,234.50``. Non-negative amounts are unchanged.
    """
    amount = f"{value:,.2f}"
    if amount.startswith("-"):
        return f"-${amount[1:]}"
    return f"${amount}"
1503
 
1504
 
1505
  def print_console_report(dataset: DatasetSummary, summaries: dict[str, ModeSummary]) -> None:
1506
  winners = determine_winners(summaries)
1507
+ impacts = summarize_mode_impact_vs_baseline(summaries)
1508
  print("Claude session mode simulation")
1509
  print(
1510
  f"Dataset: {dataset.projects} projects, {dataset.sessions} sessions, "
 
1513
  print(f"Sampling: {dataset.sampling_note}")
1514
  print()
1515
  print(
1516
+ "mode raw_tok cache_tok cache_read cache_write paid_in paid_out busts ttl_exp rewrite retro_rw total_cost no_cache"
1517
  )
1518
  for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
1519
  summary = summaries[mode]
 
1522
  f"{summary.cache_read_tokens:>11,} {summary.cache_write_tokens:>12,} "
1523
  f"{summary.regular_input_tokens:>10,} {summary.output_tokens:>12,} "
1524
  f"{summary.cache_bust_turns:>7,} {summary.ttl_expiry_turns:>9,} "
1525
+ f"{summary.rewrite_turns:>9,} {summary.retroactive_rewrite_turns:>10,} "
1526
  f"{format_currency(summary.total_cost_usd):>11} "
1527
  f"{format_currency(summary.no_cache_total_cost_usd):>11}"
1528
  )
 
1534
  "Winner if cache read tokens do not count against window: "
1535
  f"{winners['window_without_cache_reads']}"
1536
  )
1537
+ print()
1538
+ print("Impact vs baseline")
1539
+ for mode in (PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
1540
+ impact = impacts[mode]
1541
+ print(
1542
+ f"{mode}: total_cost={impact['total_cost_usd']['impact']} "
1543
+ f"({format_currency(impact['total_cost_usd']['delta'])}), "
1544
+ f"cache_read={impact['cache_read_tokens']['impact']} "
1545
+ f"({int(impact['cache_read_tokens']['delta']):,}), "
1546
+ f"cache_write={impact['cache_write_tokens']['impact']} "
1547
+ f"({int(impact['cache_write_tokens']['delta']):,}), "
1548
+ f"paid_input={impact['regular_input_tokens']['impact']} "
1549
+ f"({int(impact['regular_input_tokens']['delta']):,}), "
1550
+ f"rewrite={impact['rewrite_turns']['impact']} "
1551
+ f"({int(impact['rewrite_turns']['delta']):,}), "
1552
+ f"retro_rw={impact['retroactive_rewrite_turns']['impact']} "
1553
+ f"({int(impact['retroactive_rewrite_turns']['delta']):,}), "
1554
+ f"window={impact['prompt_window_with_cache']['impact']} "
1555
+ f"({int(impact['prompt_window_with_cache']['delta']):,})"
1556
+ )
1557
 
1558
 
1559
  def print_observed_console_report(observed: ObservedSummary) -> None:
 
1577
  summaries: dict[str, ModeSummary],
1578
  ) -> str:
1579
  winners = determine_winners(summaries)
1580
+ impacts = summarize_mode_impact_vs_baseline(summaries)
1581
  model_lines = "\n".join(f"- `{model}`: {count}" for model, count in dataset.models.items())
1582
  rows = []
1583
  for mode in ("baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
 
1601
  format_currency(summary.no_cache_total_cost_usd),
1602
  f"{summary.cache_bust_turns:,}",
1603
  f"{summary.ttl_expiry_turns:,}",
1604
+ f"{summary.rewrite_turns:,}",
1605
+ f"{summary.retroactive_rewrite_turns:,}",
1606
+ f"{summary.latest_turn_only_rewrite_turns:,}",
1607
  f"{summary.prompt_window_with_cache:,}",
1608
  f"{summary.prompt_window_without_cache_reads:,}",
1609
  ]
1610
  )
1611
  + " |"
1612
  )
1613
+ impact_rows = []
1614
+ for mode in (PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
1615
+ for metric_key, label in (
1616
+ ("total_cost_usd", "Total Cost"),
1617
+ ("cache_read_tokens", "Cache Read Tokens"),
1618
+ ("cache_write_tokens", "Cache Write Tokens"),
1619
+ ("regular_input_tokens", "Paid Input Tokens"),
1620
+ ("output_tokens", "Paid Output Tokens"),
1621
+ ("prompt_window_with_cache", "Window With Cache"),
1622
+ ("prompt_window_without_cache_reads", "Window Without Cache Reads"),
1623
+ ("cache_bust_turns", "Cache Bust Turns"),
1624
+ ("rewrite_turns", "Rewrite Turns"),
1625
+ ("retroactive_rewrite_turns", "Retroactive Rewrite Turns"),
1626
+ ("latest_turn_only_rewrite_turns", "Latest-Turn-Only Rewrite Turns"),
1627
+ ):
1628
+ impact = impacts[mode][metric_key]
1629
+ delta = impact["delta"]
1630
+ delta_text = format_currency(delta) if "cost" in metric_key else f"{int(delta):,}"
1631
+ impact_rows.append(
1632
+ f"| {mode} | {label} | {impact['impact']} | {delta_text} | {impact['direction']} |"
1633
+ )
1634
  return "\n".join(
1635
  [
1636
  "# Claude Session Mode Simulation",
 
1665
  "",
1666
  "## Summary",
1667
  "",
1668
+ "| Mode | Raw Tokens | Cache Tokens | Cache Read | Cache Write | Paid Input Tokens | Paid Output Tokens | Paid Input Cost | Cache Read Cost | Cache Write Cost | Paid Output Cost | Total Cost | No-Cache Total Cost | Cache Bust Turns | TTL Expiry Turns | Rewrite Turns | Retroactive Rewrite Turns | Latest-Turn-Only Rewrite Turns | Window Tokens (Cache Counted) | Window Tokens (Cache Reads Excluded) |",
1669
+ "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
1670
  *rows,
1671
  "",
1672
+ "## Impact vs Baseline",
1673
+ "",
1674
+ "| Mode | Metric | Classification | Delta | Better Direction |",
1675
+ "| --- | --- | --- | ---: | --- |",
1676
+ *impact_rows,
1677
+ "",
1678
  "## Winners",
1679
  "",
1680
  f"- Total cost winner: `{winners['total_cost']}`",
 
1692
  summaries: dict[str, ModeSummary],
1693
  ) -> str:
1694
  winners = determine_winners(summaries)
1695
+ impacts = summarize_mode_impact_vs_baseline(summaries)
1696
  model_items = "".join(
1697
  f"<li><code>{model}</code><span>{count:,}</span></li>"
1698
  for model, count in dataset.models.items()
 
1711
  f"<td>{summary.output_tokens:,}</td>"
1712
  f"<td>{summary.cache_bust_turns:,}</td>"
1713
  f"<td>{summary.ttl_expiry_turns:,}</td>"
1714
+ f"<td>{summary.rewrite_turns:,}</td>"
1715
+ f"<td>{summary.retroactive_rewrite_turns:,}</td>"
1716
+ f"<td>{summary.latest_turn_only_rewrite_turns:,}</td>"
1717
  f"<td>{format_currency(summary.total_cost_usd)}</td>"
1718
  f"<td>{format_currency(summary.no_cache_total_cost_usd)}</td>"
1719
  f"<td>{summary.prompt_window_with_cache:,}</td>"
1720
  f"<td>{summary.prompt_window_without_cache_reads:,}</td>"
1721
  "</tr>"
1722
  )
1723
+ impact_rows = []
1724
+ for mode in (PROXY_MODE_TOKEN, PROXY_MODE_CACHE):
1725
+ for metric_key, label in (
1726
+ ("total_cost_usd", "Total Cost"),
1727
+ ("cache_read_tokens", "Cache Read Tokens"),
1728
+ ("cache_write_tokens", "Cache Write Tokens"),
1729
+ ("regular_input_tokens", "Paid Input Tokens"),
1730
+ ("output_tokens", "Paid Output Tokens"),
1731
+ ("prompt_window_with_cache", "Window With Cache"),
1732
+ ("prompt_window_without_cache_reads", "Window Without Cache Reads"),
1733
+ ("cache_bust_turns", "Cache Bust Turns"),
1734
+ ("rewrite_turns", "Rewrite Turns"),
1735
+ ("retroactive_rewrite_turns", "Retroactive Rewrite Turns"),
1736
+ ("latest_turn_only_rewrite_turns", "Latest-Turn-Only Rewrite Turns"),
1737
+ ):
1738
+ impact = impacts[mode][metric_key]
1739
+ delta = impact["delta"]
1740
+ delta_text = format_currency(delta) if "cost" in metric_key else f"{int(delta):,}"
1741
+ impact_rows.append(
1742
+ "<tr>"
1743
+ f"<td><span class='badge'>{mode}</span></td>"
1744
+ f"<td>{label}</td>"
1745
+ f"<td>{impact['impact']}</td>"
1746
+ f"<td>{delta_text}</td>"
1747
+ f"<td>{impact['direction']}</td>"
1748
+ "</tr>"
1749
+ )
1750
  return f"""<!doctype html>
1751
  <html lang="en">
1752
  <head>
 
1868
  <table>
1869
  <thead>
1870
  <tr>
1871
+ <th>Mode</th><th>Raw Tokens</th><th>Cache Tokens</th><th>Cache Read</th><th>Cache Write</th><th>Paid Input</th><th>Paid Output</th><th>Cache Busts</th><th>TTL Expiry</th><th>Rewrite Turns</th><th>Retroactive Rewrites</th><th>Latest-Turn-Only Rewrites</th><th>Total Cost</th><th>No-Cache Cost</th><th>Window With Cache</th><th>Window Without Cache Reads</th>
1872
  </tr>
1873
  </thead>
1874
  <tbody>
 
1877
  </table>
1878
  </div>
1879
  </section>
1880
+ <section class="section card">
1881
+ <h2>Impact vs Baseline</h2>
1882
+ <div class="table-wrap">
1883
+ <table>
1884
+ <thead>
1885
+ <tr>
1886
+ <th>Mode</th><th>Metric</th><th>Classification</th><th>Delta</th><th>Better Direction</th>
1887
+ </tr>
1888
+ </thead>
1889
+ <tbody>
1890
+ {"".join(impact_rows)}
1891
+ </tbody>
1892
+ </table>
1893
+ </div>
1894
+ </section>
1895
  </div>
1896
  </body>
1897
  </html>"""
 
1914
  "observed": asdict(observed),
1915
  "summaries": {mode: asdict(summary) for mode, summary in summaries.items()},
1916
  "winners": determine_winners(summaries),
1917
+ "impact_vs_baseline": summarize_mode_impact_vs_baseline(summaries),
1918
  }
1919
  json_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")
1920
  return md_path, json_path, html_path
docs/benchmarks.md CHANGED
@@ -209,6 +209,9 @@ python benchmarks/proxy_mode_benchmark.py --turns 12 --show-real-harness
209
 
210
  # Replay local Claude Code transcripts (no API calls)
211
  python benchmarks/claude_session_mode_benchmark.py --workers 1
 
 
 
212
  ```
213
 
214
  This benchmark compares `token` vs `cache` proxy modes on the same synthetic conversation:
@@ -218,6 +221,13 @@ This benchmark compares `token` vs `cache` proxy modes on the same synthetic con
218
 
219
  `--show-real-harness` prints optional steps for running the same comparison with Claude Code, but does not call APIs by default.
220
 
 
 
 
 
 
 
 
221
  The Claude session benchmark replays local transcript data from `~/.claude/projects`
222
  through `baseline`, `token`, and `cache` modes. It estimates raw tokens, cache
223
  read/write tokens, paid input/output costs, and prompt-window winners under two
 
209
 
210
  # Replay local Claude Code transcripts (no API calls)
211
  python benchmarks/claude_session_mode_benchmark.py --workers 1
212
+
213
+ # Compare two refs on the same local Claude transcript corpus
214
+ python benchmarks/claude_session_branch_compare.py --left-ref upstream/main --right-ref HEAD --recent-turns-per-session 200 --workers 1
215
  ```
216
 
217
  This benchmark compares `token` vs `cache` proxy modes on the same synthetic conversation:
 
221
 
222
  `--show-real-harness` prints optional steps for running the same comparison with Claude Code, but does not call APIs by default.
223
 
224
+ `claude_session_branch_compare.py` runs the real local session replay benchmark twice, once per git ref, in isolated worktrees. It writes:
225
+
226
+ - per-ref replay outputs under `benchmark_results/branch_compare/<label>/`
227
+ - a combined comparison report under `benchmark_results/branch_compare/`
228
+
229
+ Use it when you want a clean PR-vs-`main` comparison on the same transcript slice.
230
+
231
  The Claude session benchmark replays local transcript data from `~/.claude/projects`
232
  through `baseline`, `token`, and `cache` modes. It estimates raw tokens, cache
233
  read/write tokens, paid input/output costs, and prompt-window winners under two
headroom/proxy/handlers/anthropic.py CHANGED
@@ -179,6 +179,100 @@ class AnthropicHandlerMixin:
179
  copy.deepcopy(current_messages[prefix_len:]),
180
  )
181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  @staticmethod
183
  def _assistant_message_from_response_json(
184
  resp_json: dict[str, Any] | None,
@@ -482,10 +576,7 @@ class AnthropicHandlerMixin:
482
  previous_original_messages,
483
  previous_forwarded_messages,
484
  )
485
- if delta is None:
486
- optimized_messages = messages
487
- optimized_tokens = original_tokens
488
- else:
489
  stable_forwarded_prefix, delta_messages = delta
490
  if delta_messages:
491
  result = await asyncio.wait_for(
@@ -508,6 +599,13 @@ class AnthropicHandlerMixin:
508
  else:
509
  optimized_messages = stable_forwarded_prefix
510
  optimized_tokens = tokenizer.count_messages(optimized_messages)
 
 
 
 
 
 
 
511
 
512
  if result and result.waste_signals:
513
  waste_signals_dict = result.waste_signals.to_dict()
 
179
  copy.deepcopy(current_messages[prefix_len:]),
180
  )
181
 
182
+ @staticmethod
183
+ def _extract_cache_stable_last_message_suffix(
184
+ current_messages: list[dict[str, Any]],
185
+ previous_original_messages: list[dict[str, Any]] | None,
186
+ previous_forwarded_messages: list[dict[str, Any]] | None,
187
+ ) -> tuple[list[dict[str, Any]], dict[str, Any], list[dict[str, Any]]] | None:
188
+ """Return append-only delta when only the latest message grew in place."""
189
+ if not previous_original_messages or previous_forwarded_messages is None:
190
+ return None
191
+ if (
192
+ len(current_messages) != len(previous_original_messages)
193
+ or len(previous_forwarded_messages) != len(previous_original_messages)
194
+ or not current_messages
195
+ ):
196
+ return None
197
+
198
+ prefix_len = len(current_messages) - 1
199
+ if (
200
+ prefix_len > 0
201
+ and current_messages[:prefix_len] != previous_original_messages[:prefix_len]
202
+ ):
203
+ return None
204
+
205
+ current_last = current_messages[-1]
206
+ previous_original_last = previous_original_messages[-1]
207
+ previous_forwarded_last = previous_forwarded_messages[-1]
208
+ if current_last.get("role") != previous_original_last.get("role") or current_last.get(
209
+ "role"
210
+ ) != previous_forwarded_last.get("role"):
211
+ return None
212
+
213
+ current_content = current_last.get("content")
214
+ previous_original_content = previous_original_last.get("content")
215
+ previous_forwarded_content = previous_forwarded_last.get("content")
216
+
217
+ if (
218
+ isinstance(current_content, str)
219
+ and isinstance(previous_original_content, str)
220
+ and isinstance(previous_forwarded_content, str)
221
+ and current_content.startswith(previous_original_content)
222
+ ):
223
+ suffix = current_content[len(previous_original_content) :]
224
+ delta_messages = []
225
+ if suffix:
226
+ delta_messages = [{**copy.deepcopy(current_last), "content": suffix}]
227
+ return (
228
+ copy.deepcopy(previous_forwarded_messages[:-1]),
229
+ copy.deepcopy(previous_forwarded_last),
230
+ delta_messages,
231
+ )
232
+
233
+ if (
234
+ isinstance(current_content, list)
235
+ and isinstance(previous_original_content, list)
236
+ and isinstance(previous_forwarded_content, list)
237
+ and len(current_content) >= len(previous_original_content)
238
+ and current_content[: len(previous_original_content)] == previous_original_content
239
+ ):
240
+ delta_blocks = copy.deepcopy(current_content[len(previous_original_content) :])
241
+ delta_messages = []
242
+ if delta_blocks:
243
+ delta_messages = [{**copy.deepcopy(current_last), "content": delta_blocks}]
244
+ return (
245
+ copy.deepcopy(previous_forwarded_messages[:-1]),
246
+ copy.deepcopy(previous_forwarded_last),
247
+ delta_messages,
248
+ )
249
+ return None
250
+
251
+ @staticmethod
252
+ def _merge_appended_message_delta(
253
+ previous_forwarded_message: dict[str, Any],
254
+ delta_forwarded_message: dict[str, Any] | None,
255
+ ) -> dict[str, Any] | None:
256
+ """Merge a compressed suffix back into the prior forwarded message."""
257
+ if delta_forwarded_message is None:
258
+ return copy.deepcopy(previous_forwarded_message)
259
+ if previous_forwarded_message.get("role") != delta_forwarded_message.get("role"):
260
+ return None
261
+
262
+ previous_content = previous_forwarded_message.get("content")
263
+ delta_content = delta_forwarded_message.get("content")
264
+ if isinstance(previous_content, str) and isinstance(delta_content, str):
265
+ return {
266
+ **copy.deepcopy(previous_forwarded_message),
267
+ "content": previous_content + delta_content,
268
+ }
269
+ if isinstance(previous_content, list) and isinstance(delta_content, list):
270
+ return {
271
+ **copy.deepcopy(previous_forwarded_message),
272
+ "content": copy.deepcopy(previous_content) + copy.deepcopy(delta_content),
273
+ }
274
+ return None
275
+
276
  @staticmethod
277
  def _assistant_message_from_response_json(
278
  resp_json: dict[str, Any] | None,
 
576
  previous_original_messages,
577
  previous_forwarded_messages,
578
  )
579
+ if delta is not None:
 
 
 
580
  stable_forwarded_prefix, delta_messages = delta
581
  if delta_messages:
582
  result = await asyncio.wait_for(
 
599
  else:
600
  optimized_messages = stable_forwarded_prefix
601
  optimized_tokens = tokenizer.count_messages(optimized_messages)
602
+ else:
603
+ # Conservative rule for cache mode:
604
+ # only replay exact stable message-prefix extensions.
605
+ # In-message append rewriting is deferred until we can
606
+ # prove it is perfectly replayable across future turns.
607
+ optimized_messages = messages
608
+ optimized_tokens = original_tokens
609
 
610
  if result and result.waste_signals:
611
  waste_signals_dict = result.waste_signals.to_dict()
tests/test_claude_session_branch_compare.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ from benchmarks.claude_session_branch_compare import (
8
+ BranchResult,
9
+ _build_benchmark_command,
10
+ _build_six_way_rows,
11
+ _classify_delta,
12
+ _ref_slug,
13
+ build_compare_markdown,
14
+ write_compare_report,
15
+ )
16
+
17
+
18
def _branch(label: str, ref: str, commit: str, total_cost: float) -> BranchResult:
    """Build a synthetic ``BranchResult`` fixture for the comparison tests.

    The three mode summaries ("baseline", "token", "cache") use fixed token
    counts; only the cost fields are derived from ``total_cost``. The token
    mode's ``total_cost_usd`` equals ``total_cost`` exactly.
    """

    def _mode_row(
        mode: str,
        cost: float,
        no_cache_cost: float,
        forwarded: int,
        cache_read: int,
        cache_write: int,
        regular: int,
        busts: int,
    ) -> dict:
        # In this fixture the prompt-window figures mirror the forwarded and
        # regular input token counts of the same mode.
        return {
            "mode": mode,
            "total_cost_usd": cost,
            "no_cache_total_cost_usd": no_cache_cost,
            "forwarded_input_tokens": forwarded,
            "cache_read_tokens": cache_read,
            "cache_write_tokens": cache_write,
            "regular_input_tokens": regular,
            "output_tokens": 120,
            "cache_bust_turns": busts,
            "ttl_expiry_turns": 2,
            "prompt_window_with_cache": forwarded,
            "prompt_window_without_cache_reads": regular,
        }

    per_mode = {
        "baseline": _mode_row(
            "baseline", total_cost + 1.0, total_cost + 5.0, 1_200, 800, 200, 400, 1
        ),
        "token": _mode_row("token", total_cost, total_cost + 3.0, 900, 700, 150, 200, 4),
        "cache": _mode_row(
            "cache", total_cost + 0.5, total_cost + 4.0, 950, 760, 180, 190, 1
        ),
    }
    dataset_info = {
        "projects": 3,
        "sessions": 7,
        "requests": 80,
        "sampled_requests": 80,
        "sampling_note": "Most recent 10 turns per session",
    }
    winner_by_metric = {
        "total_cost": "token",
        "no_cache_total_cost": "token",
        "window_with_cache": "token",
        "window_without_cache_reads": "cache",
    }
    return BranchResult(
        ref=ref,
        label=label,
        commit=commit,
        summary=f"{label} summary",
        dataset=dataset_info,
        observed={"cache_ratio_pct": 97.0},
        summaries=per_mode,
        winners=winner_by_metric,
        output_dir=f"benchmark_results/{label}",
    )
85
+
86
+
87
def test_ref_slug_normalizes_refs() -> None:
    """Slashes and dots in a git ref are expected to become dashes in its slug."""
    expected_slug_by_ref = {
        "upstream/main": "upstream-main",
        "feature/cache.fix": "feature-cache-fix",
    }
    for raw_ref, expected_slug in expected_slug_by_ref.items():
        assert _ref_slug(raw_ref) == expected_slug
90
+
91
+
92
def test_build_benchmark_command_includes_knobs() -> None:
    """The benchmark command starts with the interpreter and carries the tuning flags."""
    knobs = {
        "python_executable": sys.executable,
        "script_path": Path("benchmarks") / "claude_session_mode_benchmark.py",
        "root": Path.home() / ".claude" / "projects",
        "output_dir": Path("benchmark_results") / "pr",
        "max_sessions": 5,
        "recent_turns_per_session": 200,
        "cache_ttl_minutes": 5,
        "cache_write_multiplier": 1.25,
        "workers": 1,
    }

    argv = _build_benchmark_command(**knobs)

    assert argv[0] == sys.executable
    for flag in ("--max-sessions", "--recent-turns-per-session", "--workers"):
        assert flag in argv
109
+
110
+
111
def test_build_compare_markdown_surfaces_branch_deltas() -> None:
    """The markdown report shows the pr-minus-main delta and omits main-only phrasing."""
    main_branch = _branch("main", "upstream/main", "abc123456789", 12.0)
    pr_branch = _branch("pr", "HEAD", "def987654321", 11.0)

    rendered = build_compare_markdown(main_branch, pr_branch)

    must_appear = (
        "Claude Session Branch Comparison",
        "Delta (pr - main)",
        "| token | Total Cost | $12.00 | $11.00 | $-1.00 |",
    )
    must_not_appear = ("`main`", "main picks")
    for fragment in must_appear:
        assert fragment in rendered
    for fragment in must_not_appear:
        assert fragment not in rendered
122
+
123
+
124
def test_write_compare_report_persists_payload(tmp_path: Path) -> None:
    """Writing a report creates the markdown and HTML files plus a readable JSON payload."""
    main_branch = _branch("main", "upstream/main", "abc123456789", 12.0)
    pr_branch = _branch("pr", "HEAD", "def987654321", 11.0)

    report_paths = write_compare_report(tmp_path, main_branch, pr_branch)
    md_path, json_path, html_path = report_paths

    for written in (md_path, html_path):
        assert written.exists()

    # The JSON payload is parsed back to confirm both branches round-trip.
    raw_json = json_path.read_text(encoding="utf-8")
    payload = json.loads(raw_json)
    assert payload["left"]["ref"] == "upstream/main"
    assert payload["right"]["label"] == "pr"
    assert payload["right_winners"]["total_cost"] == "token"
136
+
137
+
138
def test_branch_delta_classification_uses_metric_direction() -> None:
    """The same sign of delta is an assist or a harm depending on the metric."""
    cases = (
        ("total_cost_usd", -1.0, "assist"),
        ("cache_read_tokens", 10.0, "assist"),
        ("cache_write_tokens", 5.0, "harm"),
        ("output_tokens", 0.0, "no_change"),
    )
    for metric, delta, expected_label in cases:
        assert _classify_delta(metric, delta) == expected_label
143
+
144
+
145
def test_six_way_rows_cover_both_branches_and_modes() -> None:
    """Two branches times three modes yield six rows, led by main/baseline."""

    def _is_row_for(row, branch: str, mode: str) -> bool:  # noqa: ANN001
        return row["branch"] == branch and row["mode"] == mode

    main_branch = _branch("main", "upstream/main", "abc123456789", 12.0)
    pr_branch = _branch("pr", "HEAD", "def987654321", 11.0)

    rows = _build_six_way_rows(main_branch, pr_branch)

    assert len(rows) == 6
    first_row = rows[0]
    assert first_row["branch"] == "main"
    assert first_row["mode"] == "baseline"
    assert any(_is_row_for(row, "pr", "token") for row in rows)
    assert any(
        _is_row_for(row, "main", "token")
        and row["paid_input_delta_vs_branch_baseline"] == -200
        for row in rows
    )
tests/test_claude_session_mode_benchmark.py CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
5
  import json
6
  from datetime import datetime
7
  from pathlib import Path
 
8
 
9
  from benchmarks.claude_session_mode_benchmark import (
10
  PROXY_MODE_CACHE,
@@ -12,13 +13,18 @@ from benchmarks.claude_session_mode_benchmark import (
12
  ModeSummary,
13
  ReplayTurn,
14
  SessionReplay,
 
 
 
15
  _write_checkpoint_by_session_id,
16
  build_dataset_and_observed_from_files,
 
17
  decode_project_key,
18
  determine_winners,
19
  load_session_replay,
20
  resolve_checkpoint_dir,
21
  simulate_replays,
 
22
  summarize_observed_usage,
23
  trim_replay_to_recent_turns,
24
  )
@@ -147,6 +153,8 @@ def test_simulation_and_winner_logic() -> None:
147
  assert summaries["baseline"].cache_bust_turns == 0
148
  assert summaries[PROXY_MODE_CACHE].cache_bust_turns == 0
149
  assert summaries[PROXY_MODE_TOKEN].cache_bust_turns >= 0
 
 
150
 
151
  winners = determine_winners(summaries)
152
  assert winners["total_cost"] in {"baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE}
@@ -334,3 +342,183 @@ def test_resolve_checkpoint_dir_namespaces_sampling_mode() -> None:
334
  assert (
335
  resolve_checkpoint_dir(base, recent_turns_per_session=200).name == "v4__ttl_5m__recent_200"
336
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import json
6
  from datetime import datetime
7
  from pathlib import Path
8
+ from types import SimpleNamespace
9
 
10
  from benchmarks.claude_session_mode_benchmark import (
11
  PROXY_MODE_CACHE,
 
13
  ModeSummary,
14
  ReplayTurn,
15
  SessionReplay,
16
+ _extract_cache_stable_last_message_suffix,
17
+ _merge_appended_message_delta,
18
+ _rewrite_scope,
19
  _write_checkpoint_by_session_id,
20
  build_dataset_and_observed_from_files,
21
+ classify_metric_impact,
22
  decode_project_key,
23
  determine_winners,
24
  load_session_replay,
25
  resolve_checkpoint_dir,
26
  simulate_replays,
27
+ summarize_mode_impact_vs_baseline,
28
  summarize_observed_usage,
29
  trim_replay_to_recent_turns,
30
  )
 
153
  assert summaries["baseline"].cache_bust_turns == 0
154
  assert summaries[PROXY_MODE_CACHE].cache_bust_turns == 0
155
  assert summaries[PROXY_MODE_TOKEN].cache_bust_turns >= 0
156
+ assert summaries[PROXY_MODE_TOKEN].rewrite_turns >= 0
157
+ assert summaries[PROXY_MODE_CACHE].rewrite_turns >= 0
158
 
159
  winners = determine_winners(summaries)
160
  assert winners["total_cost"] in {"baseline", PROXY_MODE_TOKEN, PROXY_MODE_CACHE}
 
342
  assert (
343
  resolve_checkpoint_dir(base, recent_turns_per_session=200).name == "v4__ttl_5m__recent_200"
344
  )
345
+
346
+
347
def test_cache_suffix_helpers_support_append_only_text_growth() -> None:
    """Exercise suffix extraction, then merging, for a string message that only grew."""
    current_messages = [{"role": "user", "content": "prefix + raw suffix"}]
    previous_original = [{"role": "user", "content": "prefix"}]
    previous_forwarded = [{"role": "user", "content": "COMPRESSED_PREFIX"}]

    extracted = _extract_cache_stable_last_message_suffix(
        current_messages,
        previous_original,
        previous_forwarded,
    )

    assert extracted is not None
    stable_prefix, stable_last_message, delta_messages = extracted
    assert stable_prefix == []
    assert stable_last_message == {"role": "user", "content": "COMPRESSED_PREFIX"}
    assert delta_messages == [{"role": "user", "content": " + raw suffix"}]

    # Merge a (pretend) compressed suffix back onto the stable last message.
    compressed_delta = {"role": "user", "content": " + COMPRESSED_SUFFIX"}
    merged = _merge_appended_message_delta(stable_last_message, compressed_delta)
    assert merged == {"role": "user", "content": "COMPRESSED_PREFIX + COMPRESSED_SUFFIX"}
365
+
366
+
367
def test_mode_impact_classification_marks_assist_harm_and_no_change() -> None:
    """Per-metric impact vs. baseline is labelled assist, harm, or no_change."""

    def _summary(
        mode: str,
        forwarded: int,
        cache_read: int,
        cache_write: int,
        regular: int,
        cost: float,
    ) -> ModeSummary:
        # Output tokens are held at 5 for every mode in this test.
        return ModeSummary(
            mode=mode,
            forwarded_input_tokens=forwarded,
            cache_read_tokens=cache_read,
            cache_write_tokens=cache_write,
            regular_input_tokens=regular,
            output_tokens=5,
            total_cost_usd=cost,
        )

    baseline = _summary("baseline", 100, 50, 10, 40, 1.0)
    token = _summary(PROXY_MODE_TOKEN, 80, 70, 8, 30, 0.8)
    cache = _summary(PROXY_MODE_CACHE, 120, 45, 15, 60, 1.2)

    single_metric_cases = (
        (token, "forwarded_input_tokens", "assist"),
        (token, "cache_read_tokens", "assist"),
        (cache, "total_cost_usd", "harm"),
        (token, "output_tokens", "no_change"),
    )
    for candidate, metric, expected_impact in single_metric_cases:
        verdict = classify_metric_impact(baseline, candidate, metric)
        assert verdict["impact"] == expected_impact

    summaries_by_mode = {
        "baseline": baseline,
        PROXY_MODE_TOKEN: token,
        PROXY_MODE_CACHE: cache,
    }
    impacts = summarize_mode_impact_vs_baseline(summaries_by_mode)
    assert impacts[PROXY_MODE_TOKEN]["total_cost_usd"]["impact"] == "assist"
    assert impacts[PROXY_MODE_CACHE]["cache_write_tokens"]["impact"] == "harm"
406
+
407
+
408
def test_rewrite_scope_distinguishes_retroactive_from_latest_turn_only() -> None:
    """A rewrite is retroactive only when a message inside the stable prefix changed."""
    original_messages = [
        {"role": "user", "content": "prefix"},
        {"role": "user", "content": "new raw"},
    ]

    # Only the newest message differs: a rewrite, but not a retroactive one.
    latest_only_forwarded = [
        {"role": "user", "content": "prefix"},
        {"role": "user", "content": "new compressed"},
    ]
    rewrite, retroactive = _rewrite_scope(
        original_messages,
        latest_only_forwarded,
        stable_prefix_message_count=1,
    )
    assert rewrite is True
    assert retroactive is False

    # The first (prefix) message also differs: the rewrite is retroactive.
    prefix_changed_forwarded = [
        {"role": "user", "content": "compressed prefix"},
        {"role": "user", "content": "new compressed"},
    ]
    rewrite, retroactive = _rewrite_scope(
        [dict(message) for message in original_messages],
        prefix_changed_forwarded,
        stable_prefix_message_count=1,
    )
    assert rewrite is True
    assert retroactive is True
427
+
428
+
429
def test_synthetic_token_mode_busts_cache_while_cache_mode_stays_stable(monkeypatch) -> None:
    """Replay a two-turn synthetic session through a fake history-rewriting pipeline.

    The fake pipeline replaces ``tool_result`` block content once it is given
    more than two messages. The test expects token mode to record one cache
    bust and at least one retroactive rewrite, and cache mode to record none.
    """

    def _is_tool_result(block) -> bool:  # noqa: ANN001
        return isinstance(block, dict) and block.get("type") == "tool_result"

    class _FakeProvider:
        @staticmethod
        def get_context_limit(model: str) -> int:
            return 200_000

    class _FakePipeline:
        @staticmethod
        def apply(messages, **kwargs):  # noqa: ANN001
            rewrite_history = len(messages) > 2

            def _maybe_compress(message):  # noqa: ANN001
                blocks = message.get("content")
                if not (
                    rewrite_history
                    and isinstance(blocks, list)
                    and any(_is_tool_result(block) for block in blocks)
                ):
                    # Untouched messages are passed through as the same object.
                    return message
                squeezed = [
                    {**block, "content": "[compressed-tool-result]"}
                    if _is_tool_result(block)
                    else block
                    for block in blocks
                ]
                return {**message, "content": squeezed}

            return SimpleNamespace(messages=[_maybe_compress(message) for message in messages])

    class _FakeProxy:
        def __init__(self) -> None:
            self.config = SimpleNamespace(image_optimize=False)
            self.anthropic_provider = _FakeProvider()
            self.anthropic_pipeline = _FakePipeline()

    monkeypatch.setattr(
        "benchmarks.claude_session_mode_benchmark._make_proxy",
        lambda mode: _FakeProxy(),
    )

    shared_turn_fields = {
        "session_id": "synth-bust",
        "project_key": "C--git-synth",
        "decoded_project_path": r"C:\git\synth",
        "model": "claude-sonnet-4-6",
    }
    tool_blob = "X" * 800

    first_turn = ReplayTurn(
        request_id="r1",
        timestamp=datetime.fromisoformat("2026-03-13T01:00:00+00:00"),
        input_messages=[
            {"role": "user", "content": "Summarize this tool output"},
            {
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": "tool-1",
                        "content": tool_blob,
                    }
                ],
            },
        ],
        assistant_message={"role": "assistant", "content": "ok"},
        output_tokens=10,
        **shared_turn_fields,
    )
    second_turn = ReplayTurn(
        request_id="r2",
        timestamp=datetime.fromisoformat("2026-03-13T01:02:00+00:00"),
        input_messages=[{"role": "user", "content": "What changed?"}],
        assistant_message={"role": "assistant", "content": "done"},
        output_tokens=12,
        **shared_turn_fields,
    )
    replay = SessionReplay(
        session_id="synth-bust",
        project_key="C--git-synth",
        decoded_project_path=r"C:\git\synth",
        turns=[first_turn, second_turn],
    )

    _, summaries = simulate_replays([replay], cache_ttl_minutes=5)

    token_summary = summaries[PROXY_MODE_TOKEN]
    cache_summary = summaries[PROXY_MODE_CACHE]

    assert token_summary.cache_bust_turns == 1
    assert token_summary.rewrite_turns >= 1
    assert token_summary.retroactive_rewrite_turns >= 1
    assert cache_summary.cache_bust_turns == 0
    assert cache_summary.retroactive_rewrite_turns == 0
tests/test_proxy_anthropic_cache_stability.py CHANGED
@@ -897,3 +897,78 @@ def test_cache_mode_reuses_prior_forwarded_prefix_and_compresses_only_new_suffix
897
  {"role": "assistant", "content": "turn2-assistant"},
898
  {"role": "user", "content": "COMPRESSED_TURN3"},
899
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
897
  {"role": "assistant", "content": "turn2-assistant"},
898
  {"role": "user", "content": "COMPRESSED_TURN3"},
899
  ]
900
+
901
+
902
def test_cache_mode_skips_same_message_append_rewrite_to_preserve_stability() -> None:
    """Cache mode forwards an in-message append unchanged and never calls the pipeline.

    The tracker remembers a one-message conversation; the new request extends
    that same message's text. The test expects the pipeline's ``apply`` to be
    left uncalled and the upstream body to carry the raw incoming message.
    """
    captured = {"calls": []}
    upstream_reply = {
        "id": "msg_cache_suffix",
        "type": "message",
        "role": "assistant",
        "content": [{"type": "text", "text": "ok"}],
        "usage": {
            "input_tokens": 80,
            "output_tokens": 3,
            "cache_read_input_tokens": 0,
            "cache_creation_input_tokens": 0,
        },
    }

    with _make_proxy_client() as client:
        proxy = client.app.state.proxy
        proxy.config.optimize = True
        proxy.config.mode = "cache"
        proxy.config.image_optimize = False

        tracker = _FakePrefixTracker(frozen_count=0)
        tracker._last_original_messages = [
            {"role": "user", "content": "shared-prefix"},
        ]
        tracker._last_forwarded_messages = [
            {"role": "user", "content": "COMPRESSED_PREFIX"},
        ]

        def _last_original():
            return tracker._last_original_messages.copy()

        def _last_forwarded():
            return tracker._last_forwarded_messages.copy()

        tracker.get_last_original_messages = _last_original
        tracker.get_last_forwarded_messages = _last_forwarded

        def _fixed_session_id(request, model, messages):  # noqa: ANN001
            return "stable-session"

        def _fixed_tracker(session_id, provider):  # noqa: ANN001
            return tracker

        proxy.session_tracker_store.compute_session_id = _fixed_session_id
        proxy.session_tracker_store.get_or_create = _fixed_tracker

        def _fake_apply(**kwargs):
            # Record any invocation; the test expects this list to stay empty.
            captured["calls"].append(kwargs["messages"])
            return SimpleNamespace(
                messages=[{"role": "user", "content": " + COMPRESSED_SUFFIX"}],
                transforms_applied=["fake:suffix"],
                timing={},
                tokens_before=20,
                tokens_after=10,
                waste_signals=None,
            )

        proxy.anthropic_pipeline.apply = _fake_apply

        async def _fake_retry(method, url, headers, body, stream=False):  # noqa: ANN001
            captured["body"] = body
            return httpx.Response(200, json=upstream_reply)

        proxy._retry_request = _fake_retry

        request_payload = {
            "model": "claude-sonnet-4-6",
            "max_tokens": 64,
            "messages": [
                {"role": "user", "content": "shared-prefix + raw suffix"},
            ],
        }
        response = client.post(
            "/v1/messages",
            headers={"x-api-key": "test-key", "anthropic-version": "2023-06-01"},
            json=request_payload,
        )

    assert response.status_code == 200
    assert captured["calls"] == []
    assert captured["body"]["messages"] == [
        {"role": "user", "content": "shared-prefix + raw suffix"},
    ]
+ ]