chopratejas committed on
Commit
0e85550
·
1 Parent(s): 7e6087f

fix: dashboard metrics, TTFB tracking, eager LLMLingua loading, and multi-provider consistency

Browse files

The dashboard was showing wildly incorrect metrics (99.5% savings, 3ms overhead)
due to using the Anthropic API's non-cached input_tokens instead of optimized_tokens,
and dividing overhead by the total request count instead of the optimized-only count.

Key fixes:
- Use optimized_tokens (what we sent) for dashboard aggregation, not the API's
input_tokens, which excludes the cached portion
- Track overhead_count separately from latency_count for correct averages
- Add TTFB (time to first byte) measurement, replace full stream latency in UI
- Eager-load LLMLingua model at proxy startup (eliminates 5.9s first-request delay)
- Simplify CostTracker to token-based accounting with counterfactual cost display
- Add two-tier compression cache to ContentRouter (skip set + result cache)
- Fix compression pinning to detect both CCR and ReadLifecycle markers
- Clamp tokens_saved to max(0, ...) across all provider paths
- Add per-transform timing instrumentation to pipeline
- Guard against over-aggressive code compression (<5% ratio)
- Fix ReadLifecycle partial read supersede logic (_read_covers range check)
- Disable CacheAligner and compress_superseded by default
- Fix all pre-existing mypy errors (CompressionCache return types)
- Fix test mocks to accept **kwargs for cache token parameters

headroom/config.py CHANGED
@@ -78,7 +78,7 @@ class CacheAlignerConfig:
78
  SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
79
  """
80
 
81
- enabled: bool = True
82
 
83
  # === Phase 1: DynamicContentDetector Integration ===
84
  # When True, uses the full DynamicContentDetector with 15+ patterns
@@ -397,7 +397,7 @@ class ReadLifecycleConfig:
397
 
398
  enabled: bool = True # On by default: stale/superseded Reads are provably safe to compress
399
  compress_stale: bool = True # Replace Reads of files that were later edited
400
- compress_superseded: bool = True # Replace Reads of files that were later re-Read
401
  min_size_bytes: int = 512 # Skip tiny Read outputs (not worth the overhead)
402
 
403
 
@@ -702,11 +702,13 @@ class TransformResult:
702
  warnings: list[str] = field(default_factory=list)
703
  diff_artifact: DiffArtifact | None = None # Populated if generate_diff_artifact=True
704
  cache_metrics: CachePrefixMetrics | None = None # Populated by CacheAligner
 
 
705
 
706
 
707
  @dataclass
708
  class TransformDiff:
709
- """Diff info for a single transform (for debugging)."""
710
 
711
  transform_name: str
712
  tokens_before: int
@@ -715,6 +717,7 @@ class TransformDiff:
715
  items_removed: int = 0
716
  items_kept: int = 0
717
  details: str = "" # Human-readable description of what changed
 
718
 
719
 
720
  @dataclass
 
78
  SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
79
  """
80
 
81
+ enabled: bool = False # Disabled by default — prefix stability gains are marginal in practice
82
 
83
  # === Phase 1: DynamicContentDetector Integration ===
84
  # When True, uses the full DynamicContentDetector with 15+ patterns
 
397
 
398
  enabled: bool = True # On by default: stale/superseded Reads are provably safe to compress
399
  compress_stale: bool = True # Replace Reads of files that were later edited
400
+ compress_superseded: bool = False # Disabled: busts Anthropic prompt cache prefix
401
  min_size_bytes: int = 512 # Skip tiny Read outputs (not worth the overhead)
402
 
403
 
 
702
  warnings: list[str] = field(default_factory=list)
703
  diff_artifact: DiffArtifact | None = None # Populated if generate_diff_artifact=True
704
  cache_metrics: CachePrefixMetrics | None = None # Populated by CacheAligner
705
+ timing: dict[str, float] = field(default_factory=dict) # transform_name → ms
706
+ waste_signals: WasteSignals | None = None # Detected waste in original messages
707
 
708
 
709
  @dataclass
710
  class TransformDiff:
711
+ """Diff info for a single transform (for debugging/perf)."""
712
 
713
  transform_name: str
714
  tokens_before: int
 
717
  items_removed: int = 0
718
  items_kept: int = 0
719
  details: str = "" # Human-readable description of what changed
720
+ duration_ms: float = 0.0 # Wall-clock time for this transform
721
 
722
 
723
  @dataclass
headroom/dashboard/templates/dashboard.html CHANGED
@@ -25,6 +25,8 @@
25
  body { background: #0f0f0f; }
26
  .sparkline { stroke: #22d3ee; stroke-width: 1.5; fill: none; }
27
  .sparkline-area { fill: url(#sparkline-gradient); }
 
 
28
  @keyframes pulse-subtle { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } }
29
  .pulse-live { animation: pulse-subtle 2s ease-in-out infinite; }
30
  </style>
@@ -52,27 +54,20 @@
52
  </header>
53
 
54
  <main class="p-6 max-w-7xl mx-auto">
55
- <!-- Hero Metrics -->
56
  <div class="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
57
- <!-- Requests -->
58
  <div class="bg-surface rounded-lg p-4 border border-border">
59
- <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Requests</div>
60
- <div class="text-3xl font-light tabular-nums" x-text="formatNumber(stats.requests?.total || 0)"></div>
61
- <div class="mt-2 h-8">
62
- <svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
63
- <defs>
64
- <linearGradient id="sparkline-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
65
- <stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.3"/>
66
- <stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
67
- </linearGradient>
68
- </defs>
69
- <path class="sparkline-area" :d="getSparklineArea(requestHistory)"></path>
70
- <path class="sparkline" :d="getSparkline(requestHistory)"></path>
71
- </svg>
72
  </div>
73
  </div>
74
 
75
- <!-- Tokens Saved -->
76
  <div class="bg-surface rounded-lg p-4 border border-border">
77
  <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Tokens Saved</div>
78
  <div class="flex items-baseline gap-2">
@@ -81,32 +76,120 @@
81
  </div>
82
  <div class="mt-2 h-8">
83
  <svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
 
 
 
 
 
 
84
  <path class="sparkline-area" :d="getSparklineArea(savingsHistory)"></path>
85
  <path class="sparkline" :d="getSparkline(savingsHistory)"></path>
86
  </svg>
87
  </div>
88
  </div>
89
 
90
- <!-- Cost Saved -->
91
  <div class="bg-surface rounded-lg p-4 border border-border">
92
- <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Cost Saved</div>
93
  <div class="flex items-baseline gap-2">
94
- <span class="text-3xl font-light tabular-nums text-accent" x-text="'$' + formatCost(stats.cost?.total_savings_usd || 0)"></span>
95
- </div>
96
- <div class="mt-3 text-xs text-gray-500">
97
- vs $<span x-text="formatCost(stats.cost?.total_cost_usd || 0)"></span> spent
98
  </div>
 
99
  </div>
100
 
101
  <!-- Headroom Overhead -->
102
  <div class="bg-surface rounded-lg p-4 border border-border">
103
- <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Headroom Overhead</div>
104
  <div class="flex items-baseline gap-2">
105
  <span class="text-3xl font-light tabular-nums" x-text="(stats.overhead?.average_ms || 0).toFixed(0) + 'ms'"></span>
106
  </div>
107
- <div class="mt-3 text-xs text-gray-500">
108
- Avg <span x-text="((stats.latency?.average_ms || 0) / 1000).toFixed(1)"></span>s total response time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  </div>
111
  </div>
112
 
@@ -117,21 +200,17 @@
117
  <div class="text-sm font-medium mb-4 text-gray-300">Token Usage</div>
118
  <div class="space-y-3">
119
  <div class="flex justify-between items-center">
120
- <span class="text-sm text-gray-400">Input Tokens</span>
121
- <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
122
  </div>
123
  <div class="flex justify-between items-center">
124
- <span class="text-sm text-gray-400">Output Tokens</span>
125
- <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.output || 0)"></span>
126
  </div>
127
  <div class="border-t border-border my-2"></div>
128
  <div class="flex justify-between items-center">
129
- <span class="text-sm text-gray-400">Original Size</span>
130
- <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.total_before_compression || 0)"></span>
131
- </div>
132
- <div class="flex justify-between items-center">
133
- <span class="text-sm text-gray-400">After Compression</span>
134
- <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
135
  </div>
136
  </div>
137
  </div>
@@ -171,55 +250,171 @@
171
  <span class="font-mono text-sm" x-text="(stats.overhead?.min_ms || 0).toFixed(0) + ' - ' + (stats.overhead?.max_ms || 0).toFixed(0) + 'ms'"></span>
172
  </div>
173
  <div class="flex justify-between items-center">
174
- <span class="text-sm text-gray-400">Total Response Time</span>
175
- <span class="font-mono text-sm" x-text="((stats.latency?.average_ms || 0) / 1000).toFixed(1) + 's avg'"></span>
 
 
 
 
176
  </div>
177
  <div class="flex justify-between items-center">
178
  <span class="text-sm text-gray-400">Failed Requests</span>
179
  <span class="font-mono text-sm" x-text="stats.requests?.failed || 0"></span>
180
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  </div>
182
  </div>
183
  </div>
184
 
185
- <!-- Recent Requests Table -->
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  <div class="bg-surface rounded-lg border border-border overflow-hidden">
187
  <div class="px-4 py-3 border-b border-border flex justify-between items-center">
188
  <span class="text-sm font-medium text-gray-300">Recent Requests</span>
189
- <span class="text-xs text-gray-500">Last 10</span>
190
  </div>
191
  <div class="overflow-x-auto">
192
  <table class="w-full text-sm">
193
  <thead>
194
  <tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
 
195
  <th class="px-4 py-3 font-medium">Time</th>
196
  <th class="px-4 py-3 font-medium">Model</th>
197
  <th class="px-4 py-3 font-medium text-right">Input</th>
198
  <th class="px-4 py-3 font-medium text-right">Output</th>
199
  <th class="px-4 py-3 font-medium text-right">Saved</th>
200
- <th class="px-4 py-3 font-medium text-right">Cost</th>
201
  <th class="px-4 py-3 font-medium text-right">Latency</th>
202
  </tr>
203
  </thead>
204
  <tbody class="divide-y divide-border">
205
  <template x-for="req in (stats.recent_requests || [])" :key="req.request_id">
206
- <tr class="hover:bg-border/30 transition-colors">
207
- <td class="px-4 py-3 font-mono text-gray-400" x-text="formatTime(req.timestamp)"></td>
208
- <td class="px-4 py-3">
209
- <span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(req.model)"></span>
210
- </td>
211
- <td class="px-4 py-3 text-right font-mono" x-text="formatNumber(req.input_tokens_optimized)"></td>
212
- <td class="px-4 py-3 text-right font-mono" x-text="formatNumber(req.output_tokens || 0)"></td>
213
- <td class="px-4 py-3 text-right">
214
- <span class="text-accent font-mono" x-text="req.savings_percent.toFixed(0) + '%'"></span>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  </td>
216
- <td class="px-4 py-3 text-right font-mono text-gray-400" x-text="'$' + (req.estimated_cost_usd || 0).toFixed(4)"></td>
217
- <td class="px-4 py-3 text-right font-mono text-gray-400" x-text="(req.total_latency_ms || 0).toFixed(0) + 'ms'"></td>
218
  </tr>
219
  </template>
220
  <template x-if="(stats.recent_requests || []).length === 0">
221
  <tr>
222
- <td colspan="7" class="px-4 py-8 text-center text-gray-500 italic">
223
  No requests yet. Start using the proxy to see activity here.
224
  </td>
225
  </tr>
@@ -229,23 +424,6 @@
229
  </div>
230
  </div>
231
 
232
- <!-- Budget Bar (if configured) -->
233
- <template x-if="stats.cost?.budget_limit_usd">
234
- <div class="mt-6 bg-surface rounded-lg p-4 border border-border">
235
- <div class="flex justify-between items-center mb-2">
236
- <span class="text-sm text-gray-400">Budget (<span x-text="stats.cost?.budget_period || 'daily'"></span>)</span>
237
- <span class="font-mono text-sm">
238
- $<span x-text="formatCost(stats.cost?.period_cost_usd || 0)"></span>
239
- / $<span x-text="formatCost(stats.cost?.budget_limit_usd || 0)"></span>
240
- </span>
241
- </div>
242
- <div class="w-full h-2 bg-border rounded-full overflow-hidden">
243
- <div class="h-full rounded-full transition-all duration-500"
244
- :class="getBudgetPercent() > 90 ? 'bg-red-400' : getBudgetPercent() > 70 ? 'bg-amber-400' : 'bg-accent'"
245
- :style="'width: ' + Math.min(getBudgetPercent(), 100) + '%'"></div>
246
- </div>
247
- </div>
248
- </template>
249
  </main>
250
 
251
  <!-- Footer -->
@@ -269,6 +447,7 @@
269
  lastUpdate: 'never',
270
  requestHistory: [],
271
  savingsHistory: [],
 
272
  pollInterval: null,
273
 
274
  async init() {
@@ -310,14 +489,21 @@
310
  }
311
  },
312
 
 
 
313
  formatNumber(n) {
314
  if (n >= 1000000) return (n / 1000000).toFixed(1) + 'M';
315
  if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
316
  return n.toString();
317
  },
318
 
319
- formatCost(n) {
320
- return n.toFixed(2);
 
 
 
 
 
321
  },
322
 
323
  formatTime(ts) {
@@ -332,23 +518,123 @@
332
 
333
  truncateModel(model) {
334
  if (!model) return '-';
335
- // Remove provider prefix and version suffix for display
336
  return model.replace(/^(anthropic\.|openai\.|bedrock\/)/, '')
337
  .replace(/-\d{8}$/, '')
338
  .substring(0, 20);
339
  },
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  getProviderPercent(count) {
342
  const total = this.stats.requests?.total || 1;
343
  return Math.min((count / total) * 100, 100);
344
  },
345
 
346
- getBudgetPercent() {
347
- const limit = this.stats.cost?.budget_limit_usd || 1;
348
- const used = this.stats.cost?.period_cost_usd || 0;
349
- return (used / limit) * 100;
350
- },
351
-
352
  getSparkline(data) {
353
  if (!data || data.length < 2) return '';
354
  const min = Math.min(...data);
@@ -369,7 +655,30 @@
369
  const line = this.getSparkline(data);
370
  if (!line) return '';
371
  return line + ` L100,32 L0,32 Z`;
372
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  };
374
  }
375
  </script>
 
25
  body { background: #0f0f0f; }
26
  .sparkline { stroke: #22d3ee; stroke-width: 1.5; fill: none; }
27
  .sparkline-area { fill: url(#sparkline-gradient); }
28
+ .trend-line { stroke: #22d3ee; stroke-width: 2; fill: none; }
29
+ .trend-area { fill: url(#trend-gradient); }
30
  @keyframes pulse-subtle { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } }
31
  .pulse-live { animation: pulse-subtle 2s ease-in-out infinite; }
32
  </style>
 
54
  </header>
55
 
56
  <main class="p-6 max-w-7xl mx-auto">
57
+ <!-- Hero Metrics (reordered: Savings $ -> Tokens Saved % -> Quality Confidence -> Overhead) -->
58
  <div class="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
59
+ <!-- Savings ($) - Lead with dollars -->
60
  <div class="bg-surface rounded-lg p-4 border border-border">
61
+ <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Savings</div>
62
+ <div class="flex items-baseline gap-2">
63
+ <span class="text-3xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.savings_usd || 0)"></span>
64
+ </div>
65
+ <div class="mt-2 text-xs text-gray-500">
66
+ <span x-text="formatNumber(stats.requests?.total || 0)"></span> requests processed
 
 
 
 
 
 
 
67
  </div>
68
  </div>
69
 
70
+ <!-- Tokens Saved (%) -->
71
  <div class="bg-surface rounded-lg p-4 border border-border">
72
  <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Tokens Saved</div>
73
  <div class="flex items-baseline gap-2">
 
76
  </div>
77
  <div class="mt-2 h-8">
78
  <svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
79
+ <defs>
80
+ <linearGradient id="sparkline-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
81
+ <stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.3"/>
82
+ <stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
83
+ </linearGradient>
84
+ </defs>
85
  <path class="sparkline-area" :d="getSparklineArea(savingsHistory)"></path>
86
  <path class="sparkline" :d="getSparkline(savingsHistory)"></path>
87
  </svg>
88
  </div>
89
  </div>
90
 
91
+ <!-- Quality Confidence -->
92
  <div class="bg-surface rounded-lg p-4 border border-border">
93
+ <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Compression Quality</div>
94
  <div class="flex items-baseline gap-2">
95
+ <span class="w-3 h-3 rounded-full mt-1" :class="confidenceColor"></span>
96
+ <span class="text-3xl font-light tabular-nums" :class="confidenceTextColor" x-text="confidenceLabel"></span>
 
 
97
  </div>
98
+ <div class="mt-2 text-xs text-gray-500" x-text="confidenceDetail"></div>
99
  </div>
100
 
101
  <!-- Headroom Overhead -->
102
  <div class="bg-surface rounded-lg p-4 border border-border">
103
+ <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Overhead</div>
104
  <div class="flex items-baseline gap-2">
105
  <span class="text-3xl font-light tabular-nums" x-text="(stats.overhead?.average_ms || 0).toFixed(0) + 'ms'"></span>
106
  </div>
107
+ <div class="mt-2 text-xs text-gray-500">
108
+ TTFB <span x-text="((stats.ttfb?.average_ms || 0) / 1000).toFixed(2)"></span>s avg
109
+ </div>
110
+ </div>
111
+ </div>
112
+
113
+ <!-- "Without Headroom" Counterfactual -->
114
+ <template x-if="(stats.cost?.cost_without_headroom_usd || 0) > 0">
115
+ <div class="bg-surface rounded-lg p-4 border border-border mb-6">
116
+ <div class="flex items-center justify-between">
117
+ <div>
118
+ <div class="text-sm font-medium text-gray-300 mb-2">Without Headroom</div>
119
+ <div class="flex items-center gap-8">
120
+ <div>
121
+ <div class="text-xs text-gray-500 mb-1">Input cost</div>
122
+ <div class="text-2xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.cost_with_headroom_usd || 0)"></div>
123
+ </div>
124
+ <div class="text-gray-500 text-2xl font-light">vs</div>
125
+ <div>
126
+ <div class="text-xs text-gray-500 mb-1">Input cost without Headroom</div>
127
+ <div class="text-2xl font-light tabular-nums text-red-400 line-through decoration-red-400/50" x-text="'$' + formatCurrency(stats.cost?.cost_without_headroom_usd || 0)"></div>
128
+ </div>
129
+ <div class="ml-auto text-right">
130
+ <div class="text-xs text-gray-500 mb-1">Total saved</div>
131
+ <div class="text-2xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.savings_usd || 0)"></div>
132
+ </div>
133
+ </div>
134
+ </div>
135
+ </div>
136
+ </div>
137
+ </template>
138
+
139
+ <!-- New: Waste Signal Breakdown + Cumulative Savings Trend -->
140
+ <div class="grid grid-cols-1 lg:grid-cols-2 gap-4 mb-6">
141
+ <!-- What Headroom Removed -->
142
+ <div class="bg-surface rounded-lg p-4 border border-border">
143
+ <div class="text-sm font-medium mb-4 text-gray-300">What Headroom Removed</div>
144
+ <template x-if="Object.keys(stats.waste_signals || {}).length > 0">
145
+ <div class="space-y-3">
146
+ <template x-for="[signal, tokens] in sortedWasteSignals" :key="signal">
147
+ <div>
148
+ <div class="flex justify-between items-center mb-1">
149
+ <span class="text-sm text-gray-400" x-text="wasteSignalLabel(signal)"></span>
150
+ <span class="font-mono text-sm text-accent" x-text="formatNumber(tokens) + ' tokens'"></span>
151
+ </div>
152
+ <div class="w-full h-2 bg-border rounded-full overflow-hidden">
153
+ <div class="h-full rounded-full transition-all duration-500"
154
+ :class="wasteSignalColor(signal)"
155
+ :style="'width: ' + getWastePercent(tokens) + '%'"></div>
156
+ </div>
157
+ </div>
158
+ </template>
159
+ </div>
160
+ </template>
161
+ <template x-if="Object.keys(stats.waste_signals || {}).length === 0">
162
+ <div class="text-sm text-gray-500 italic py-8 text-center">
163
+ No waste signals detected yet. Data appears after requests are processed.
164
+ </div>
165
+ </template>
166
+ </div>
167
+
168
+ <!-- Cumulative Savings Trend -->
169
+ <div class="bg-surface rounded-lg p-4 border border-border">
170
+ <div class="flex justify-between items-center mb-4">
171
+ <span class="text-sm font-medium text-gray-300">Savings Over Time</span>
172
+ <span class="text-xs text-gray-500 font-mono" x-text="formatNumber(stats.tokens?.saved || 0) + ' tokens total'"></span>
173
  </div>
174
+ <template x-if="(stats.savings_history || []).length >= 2">
175
+ <div class="h-32">
176
+ <svg class="w-full h-full" viewBox="0 0 200 64" preserveAspectRatio="none">
177
+ <defs>
178
+ <linearGradient id="trend-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
179
+ <stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.2"/>
180
+ <stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
181
+ </linearGradient>
182
+ </defs>
183
+ <path class="trend-area" :d="getTrendArea(stats.savings_history)"></path>
184
+ <path class="trend-line" :d="getTrendLine(stats.savings_history)"></path>
185
+ </svg>
186
+ </div>
187
+ </template>
188
+ <template x-if="(stats.savings_history || []).length < 2">
189
+ <div class="h-32 flex items-center justify-center text-sm text-gray-500 italic">
190
+ Trend data will appear after multiple requests.
191
+ </div>
192
+ </template>
193
  </div>
194
  </div>
195
 
 
200
  <div class="text-sm font-medium mb-4 text-gray-300">Token Usage</div>
201
  <div class="space-y-3">
202
  <div class="flex justify-between items-center">
203
+ <span class="text-sm text-gray-400">Before Compression</span>
204
+ <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.total_before_compression || 0)"></span>
205
  </div>
206
  <div class="flex justify-between items-center">
207
+ <span class="text-sm text-gray-400">After Compression (sent)</span>
208
+ <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
209
  </div>
210
  <div class="border-t border-border my-2"></div>
211
  <div class="flex justify-between items-center">
212
+ <span class="text-sm text-gray-400">Output Tokens</span>
213
+ <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.output || 0)"></span>
 
 
 
 
214
  </div>
215
  </div>
216
  </div>
 
250
  <span class="font-mono text-sm" x-text="(stats.overhead?.min_ms || 0).toFixed(0) + ' - ' + (stats.overhead?.max_ms || 0).toFixed(0) + 'ms'"></span>
251
  </div>
252
  <div class="flex justify-between items-center">
253
+ <span class="text-sm text-gray-400">TTFB (upstream)</span>
254
+ <span class="font-mono text-sm" x-text="((stats.ttfb?.average_ms || 0) / 1000).toFixed(2) + 's avg'"></span>
255
+ </div>
256
+ <div class="flex justify-between items-center">
257
+ <span class="text-sm text-gray-400">TTFB Range</span>
258
+ <span class="font-mono text-sm" x-text="((stats.ttfb?.min_ms || 0) / 1000).toFixed(2) + ' - ' + ((stats.ttfb?.max_ms || 0) / 1000).toFixed(2) + 's'"></span>
259
  </div>
260
  <div class="flex justify-between items-center">
261
  <span class="text-sm text-gray-400">Failed Requests</span>
262
  <span class="font-mono text-sm" x-text="stats.requests?.failed || 0"></span>
263
  </div>
264
+ <!-- Per-transform timing breakdown -->
265
+ <template x-if="Object.keys(stats.pipeline_timing || {}).length > 0">
266
+ <div>
267
+ <div class="border-t border-border my-2"></div>
268
+ <div class="text-xs text-gray-500 uppercase tracking-wide mb-2">Pipeline Breakdown</div>
269
+ <template x-for="[name, t] in Object.entries(stats.pipeline_timing || {})" :key="name">
270
+ <div class="flex justify-between items-center mb-1">
271
+ <span class="text-xs text-gray-400 font-mono truncate mr-2" x-text="name"></span>
272
+ <span class="text-xs font-mono whitespace-nowrap"
273
+ :class="t.average_ms > 100 ? 'text-amber-400' : t.average_ms > 50 ? 'text-yellow-400' : 'text-gray-400'"
274
+ x-text="t.average_ms.toFixed(0) + 'ms avg / ' + t.max_ms.toFixed(0) + 'ms max'"></span>
275
+ </div>
276
+ </template>
277
+ </div>
278
+ </template>
279
  </div>
280
  </div>
281
  </div>
282
 
283
+ <!-- Per-Model Savings Breakdown -->
284
+ <template x-if="Object.keys(stats.cost?.per_model || {}).length > 0">
285
+ <div class="bg-surface rounded-lg border border-border overflow-hidden mb-6">
286
+ <div class="px-4 py-3 border-b border-border flex justify-between items-center">
287
+ <span class="text-sm font-medium text-gray-300">Per-Model Token Savings</span>
288
+ <span class="text-xs text-gray-500">Exact tokens saved per model</span>
289
+ </div>
290
+ <div class="overflow-x-auto">
291
+ <table class="w-full text-sm">
292
+ <thead>
293
+ <tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
294
+ <th class="px-4 py-3 font-medium">Model</th>
295
+ <th class="px-4 py-3 font-medium text-right">Requests</th>
296
+ <th class="px-4 py-3 font-medium text-right">Tokens Saved</th>
297
+ <th class="px-4 py-3 font-medium text-right">Tokens Sent</th>
298
+ <th class="px-4 py-3 font-medium text-right">Reduction</th>
299
+ </tr>
300
+ </thead>
301
+ <tbody class="divide-y divide-border">
302
+ <template x-for="[model, info] in Object.entries(stats.cost?.per_model || {})" :key="model">
303
+ <tr class="hover:bg-border/30 transition-colors">
304
+ <td class="px-4 py-3">
305
+ <span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(model)"></span>
306
+ </td>
307
+ <td class="px-4 py-3 text-right font-mono" x-text="info.requests"></td>
308
+ <td class="px-4 py-3 text-right font-mono text-accent" x-text="formatNumber(info.tokens_saved)"></td>
309
+ <td class="px-4 py-3 text-right font-mono" x-text="formatNumber(info.tokens_sent)"></td>
310
+ <td class="px-4 py-3 text-right">
311
+ <span class="text-accent font-mono" x-text="info.reduction_pct.toFixed(1) + '%'"></span>
312
+ </td>
313
+ </tr>
314
+ </template>
315
+ </tbody>
316
+ </table>
317
+ </div>
318
+ </div>
319
+ </template>
320
+
321
+ <!-- Recent Requests Table (with expandable rows) -->
322
  <div class="bg-surface rounded-lg border border-border overflow-hidden">
323
  <div class="px-4 py-3 border-b border-border flex justify-between items-center">
324
  <span class="text-sm font-medium text-gray-300">Recent Requests</span>
325
+ <span class="text-xs text-gray-500">Last 10 &mdash; click row to expand</span>
326
  </div>
327
  <div class="overflow-x-auto">
328
  <table class="w-full text-sm">
329
  <thead>
330
  <tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
331
+ <th class="px-4 py-3 font-medium w-6"></th>
332
  <th class="px-4 py-3 font-medium">Time</th>
333
  <th class="px-4 py-3 font-medium">Model</th>
334
  <th class="px-4 py-3 font-medium text-right">Input</th>
335
  <th class="px-4 py-3 font-medium text-right">Output</th>
336
  <th class="px-4 py-3 font-medium text-right">Saved</th>
337
+ <th class="px-4 py-3 font-medium text-right">Quality</th>
338
  <th class="px-4 py-3 font-medium text-right">Latency</th>
339
  </tr>
340
  </thead>
341
  <tbody class="divide-y divide-border">
342
  <template x-for="req in (stats.recent_requests || [])" :key="req.request_id">
343
+ <tr>
344
+ <td colspan="8" class="p-0">
345
+ <div class="cursor-pointer" @click="toggleExpanded(req.request_id)">
346
+ <div class="flex hover:bg-border/30 transition-colors">
347
+ <div class="px-4 py-3 w-6 text-gray-500">
348
+ <span x-text="expandedRows[req.request_id] ? '-' : '+'"></span>
349
+ </div>
350
+ <div class="px-4 py-3 font-mono text-gray-400 flex-1" x-text="formatTime(req.timestamp)"></div>
351
+ <div class="px-4 py-3 flex-1">
352
+ <span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(req.model)"></span>
353
+ </div>
354
+ <div class="px-4 py-3 text-right font-mono flex-1" x-text="formatNumber(req.input_tokens_optimized)"></div>
355
+ <div class="px-4 py-3 text-right font-mono flex-1" x-text="formatNumber(req.output_tokens || 0)"></div>
356
+ <div class="px-4 py-3 text-right flex-1">
357
+ <span class="text-accent font-mono" x-text="req.savings_percent.toFixed(0) + '%'"></span>
358
+ </div>
359
+ <div class="px-4 py-3 text-right flex-1">
360
+ <span class="w-2 h-2 rounded-full inline-block" :class="getRequestConfidenceColor(req)"></span>
361
+ </div>
362
+ <div class="px-4 py-3 text-right font-mono text-gray-400 flex-1" x-text="(req.total_latency_ms || 0).toFixed(0) + 'ms'"></div>
363
+ </div>
364
+ </div>
365
+ <!-- Expanded detail row -->
366
+ <template x-if="expandedRows[req.request_id]">
367
+ <div class="px-8 py-4 bg-[#151515] border-t border-border">
368
+ <div class="grid grid-cols-2 lg:grid-cols-4 gap-4 text-xs">
369
+ <div>
370
+ <div class="text-gray-500 uppercase tracking-wide mb-1">Original Tokens</div>
371
+ <div class="font-mono" x-text="formatNumber(req.input_tokens_original)"></div>
372
+ </div>
373
+ <div>
374
+ <div class="text-gray-500 uppercase tracking-wide mb-1">Compressed Tokens</div>
375
+ <div class="font-mono" x-text="formatNumber(req.input_tokens_optimized)"></div>
376
+ </div>
377
+ <div>
378
+ <div class="text-gray-500 uppercase tracking-wide mb-1">Tokens Removed</div>
379
+ <div class="font-mono text-accent" x-text="formatNumber(req.tokens_saved)"></div>
380
+ </div>
381
+ <div>
382
+ <div class="text-gray-500 uppercase tracking-wide mb-1">Optimization Time</div>
383
+ <div class="font-mono" x-text="(req.optimization_latency_ms || 0).toFixed(0) + 'ms'"></div>
384
+ </div>
385
+ </div>
386
+ <!-- Transforms Applied -->
387
+ <template x-if="(req.transforms_applied || []).length > 0">
388
+ <div class="mt-3">
389
+ <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Transforms Applied</div>
390
+ <div class="flex flex-wrap gap-1">
391
+ <template x-for="t in req.transforms_applied" :key="t">
392
+ <span class="px-2 py-0.5 bg-border rounded text-xs font-mono" x-text="t"></span>
393
+ </template>
394
+ </div>
395
+ </div>
396
+ </template>
397
+ <!-- Waste Signals for this request -->
398
+ <template x-if="req.waste_signals && Object.keys(req.waste_signals).length > 0">
399
+ <div class="mt-3">
400
+ <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Waste Detected</div>
401
+ <div class="flex flex-wrap gap-2">
402
+ <template x-for="[signal, tokens] in Object.entries(req.waste_signals).filter(([,v]) => v > 0)" :key="signal">
403
+ <span class="px-2 py-0.5 rounded text-xs font-mono"
404
+ :class="wasteSignalBadgeColor(signal)"
405
+ x-text="wasteSignalLabel(signal) + ': ' + formatNumber(tokens)"></span>
406
+ </template>
407
+ </div>
408
+ </div>
409
+ </template>
410
+ </div>
411
+ </template>
412
  </td>
 
 
413
  </tr>
414
  </template>
415
  <template x-if="(stats.recent_requests || []).length === 0">
416
  <tr>
417
+ <td colspan="8" class="px-4 py-8 text-center text-gray-500 italic">
418
  No requests yet. Start using the proxy to see activity here.
419
  </td>
420
  </tr>
 
424
  </div>
425
  </div>
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  </main>
428
 
429
  <!-- Footer -->
 
447
  lastUpdate: 'never',
448
  requestHistory: [],
449
  savingsHistory: [],
450
+ expandedRows: {},
451
  pollInterval: null,
452
 
453
  async init() {
 
489
  }
490
  },
491
 
492
+ // --- Formatting ---
493
+
494
  formatNumber(n) {
495
  if (n >= 1000000) return (n / 1000000).toFixed(1) + 'M';
496
  if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
497
  return n.toString();
498
  },
499
 
500
+ formatCurrency(n) {
501
+ if (n < 0) return '-' + this.formatCurrency(-n);
502
+ if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
503
+ if (n >= 1) return n.toFixed(2);
504
+ if (n >= 0.01) return n.toFixed(3);
505
+ if (n > 0) return n.toFixed(4);
506
+ return '0.00';
507
  },
508
 
509
  formatTime(ts) {
 
518
 
519
  truncateModel(model) {
520
  if (!model) return '-';
 
521
  return model.replace(/^(anthropic\.|openai\.|bedrock\/)/, '')
522
  .replace(/-\d{8}$/, '')
523
  .substring(0, 20);
524
  },
525
 
526
+ // --- Waste Signals ---
527
+
528
+ wasteSignalLabel(signal) {
529
+ const labels = {
530
+ json_bloat: 'JSON Bloat',
531
+ html_noise: 'HTML Noise',
532
+ base64: 'Base64 Blobs',
533
+ whitespace: 'Whitespace',
534
+ dynamic_date: 'Dynamic Dates',
535
+ repetition: 'Repetition',
536
+ };
537
+ return labels[signal] || signal;
538
+ },
539
+
540
+ wasteSignalColor(signal) {
541
+ const colors = {
542
+ json_bloat: 'bg-amber-500',
543
+ html_noise: 'bg-orange-500',
544
+ base64: 'bg-red-500',
545
+ whitespace: 'bg-blue-500',
546
+ dynamic_date: 'bg-purple-500',
547
+ repetition: 'bg-pink-500',
548
+ };
549
+ return colors[signal] || 'bg-gray-500';
550
+ },
551
+
552
+ wasteSignalBadgeColor(signal) {
553
+ const colors = {
554
+ json_bloat: 'bg-amber-500/20 text-amber-400',
555
+ html_noise: 'bg-orange-500/20 text-orange-400',
556
+ base64: 'bg-red-500/20 text-red-400',
557
+ whitespace: 'bg-blue-500/20 text-blue-400',
558
+ dynamic_date: 'bg-purple-500/20 text-purple-400',
559
+ repetition: 'bg-pink-500/20 text-pink-400',
560
+ };
561
+ return colors[signal] || 'bg-gray-500/20 text-gray-400';
562
+ },
563
+
564
+ get sortedWasteSignals() {
565
+ const signals = this.stats.waste_signals || {};
566
+ return Object.entries(signals)
567
+ .filter(([, v]) => v > 0)
568
+ .sort((a, b) => b[1] - a[1]);
569
+ },
570
+
571
+ getWastePercent(tokens) {
572
+ const signals = this.stats.waste_signals || {};
573
+ const max = Math.max(...Object.values(signals), 1);
574
+ return Math.min((tokens / max) * 100, 100);
575
+ },
576
+
577
+ // --- Compression Confidence ---
578
+
579
+ get confidenceLevel() {
580
+ const saved = this.stats.tokens?.saved || 0;
581
+ if (saved === 0) return 'none';
582
+ const signals = this.stats.waste_signals || {};
583
+ const totalWaste = Object.values(signals).reduce((a, b) => a + b, 0);
584
+ if (totalWaste === 0) return 'unknown';
585
+ const wasteRatio = totalWaste / saved;
586
+ if (wasteRatio >= 0.7) return 'high';
587
+ if (wasteRatio >= 0.3) return 'medium';
588
+ return 'low';
589
+ },
590
+
591
+ get confidenceColor() {
592
+ const c = { high: 'bg-emerald-400', medium: 'bg-yellow-400', low: 'bg-red-400', none: 'bg-gray-500', unknown: 'bg-gray-500' };
593
+ return c[this.confidenceLevel];
594
+ },
595
+
596
+ get confidenceTextColor() {
597
+ const c = { high: 'text-emerald-400', medium: 'text-yellow-400', low: 'text-red-400', none: 'text-gray-500', unknown: 'text-gray-500' };
598
+ return c[this.confidenceLevel];
599
+ },
600
+
601
+ get confidenceLabel() {
602
+ const l = { high: 'High', medium: 'Medium', low: 'Low', none: '-', unknown: '-' };
603
+ return l[this.confidenceLevel];
604
+ },
605
+
606
+ get confidenceDetail() {
607
+ const saved = this.stats.tokens?.saved || 0;
608
+ if (saved === 0) return 'No compression yet';
609
+ const signals = this.stats.waste_signals || {};
610
+ const totalWaste = Object.values(signals).reduce((a, b) => a + b, 0);
611
+ if (totalWaste === 0) return 'No waste signals detected';
612
+ const pct = Math.round((totalWaste / saved) * 100);
613
+ return pct + '% of removed tokens were identified waste';
614
+ },
615
+
616
+ getRequestConfidenceColor(req) {
617
+ if (!req.waste_signals || req.tokens_saved === 0) return 'bg-gray-500';
618
+ const totalWaste = Object.values(req.waste_signals).reduce((a, b) => a + b, 0);
619
+ const ratio = totalWaste / req.tokens_saved;
620
+ if (ratio >= 0.7) return 'bg-emerald-400';
621
+ if (ratio >= 0.3) return 'bg-yellow-400';
622
+ return 'bg-red-400';
623
+ },
624
+
625
+ // --- Expandable Rows ---
626
+
627
+ toggleExpanded(id) {
628
+ this.expandedRows[id] = !this.expandedRows[id];
629
+ },
630
+
631
+ // --- Charts ---
632
+
633
  getProviderPercent(count) {
634
  const total = this.stats.requests?.total || 1;
635
  return Math.min((count / total) * 100, 100);
636
  },
637
 
 
 
 
 
 
 
638
  getSparkline(data) {
639
  if (!data || data.length < 2) return '';
640
  const min = Math.min(...data);
 
655
  const line = this.getSparkline(data);
656
  if (!line) return '';
657
  return line + ` L100,32 L0,32 Z`;
658
+ },
659
+
660
+ getTrendLine(history) {
661
+ if (!history || history.length < 2) return '';
662
+ const values = history.map(h => h[1]);
663
+ const min = Math.min(...values);
664
+ const max = Math.max(...values);
665
+ const range = max - min || 1;
666
+
667
+ const points = values.map((v, i) => {
668
+ const x = (i / (values.length - 1)) * 200;
669
+ const y = 60 - ((v - min) / range) * 56;
670
+ return `${x},${y}`;
671
+ });
672
+
673
+ return 'M' + points.join(' L');
674
+ },
675
+
676
+ getTrendArea(history) {
677
+ if (!history || history.length < 2) return '';
678
+ const line = this.getTrendLine(history);
679
+ if (!line) return '';
680
+ return line + ` L200,64 L0,64 Z`;
681
+ },
682
  };
683
  }
684
  </script>
headroom/perf/analyzer.py CHANGED
@@ -2,14 +2,21 @@
2
 
3
  Parses PERF log lines from ~/.headroom/logs/proxy.log* and produces
4
  actionable reports on token savings, cache efficiency, and transform impact.
 
 
 
 
5
  """
6
 
7
  from __future__ import annotations
8
 
 
9
  import re
10
  from dataclasses import dataclass, field
11
  from pathlib import Path
12
 
 
 
13
  LOG_DIR = Path.home() / ".headroom" / "logs"
14
 
15
  # Matches: 2026-03-07 13:38:31,009 - headroom.proxy - INFO - [hr_...] PERF model=... ...
@@ -38,6 +45,90 @@ _TOIN_RE = re.compile(
38
  )
39
 
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def _parse_kv(kv_str: str) -> dict[str, str]:
42
  """Parse key=value pairs from a PERF log line.
43
 
@@ -276,16 +367,35 @@ def format_report(report: PerfReport) -> str:
276
  total_saved = sum(r.tokens_saved for r in records)
277
  pct = (total_saved / total_before * 100) if total_before > 0 else 0
278
 
279
- models = {r.model for r in records}
280
  lines.append(f"Requests: {len(records)}")
281
- lines.append(f"Models: {', '.join(sorted(models))}")
282
- lines.append(
283
- f"Tokens: {total_before:,} input -> {total_after:,} after transforms "
284
- f"({pct:.1f}% reduction)"
285
- )
286
  lines.append(f"Total saved: {total_saved:,} tokens")
287
  lines.append("")
288
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  # Cache analysis
290
  cache_records = [r for r in records if (r.cache_read + r.cache_write) > 0]
291
  if cache_records:
 
2
 
3
  Parses PERF log lines from ~/.headroom/logs/proxy.log* and produces
4
  actionable reports on token savings, cache efficiency, and transform impact.
5
+
6
+ Cost accounting is **cache-aware**: saved tokens that would have been served
7
+ from the provider's prompt cache are valued at cache_read price (~10% for
8
+ Anthropic), not the full input price. This prevents overstating dollar savings.
9
  """
10
 
11
  from __future__ import annotations
12
 
13
+ import logging
14
  import re
15
  from dataclasses import dataclass, field
16
  from pathlib import Path
17
 
18
+ log = logging.getLogger(__name__)
19
+
20
  LOG_DIR = Path.home() / ".headroom" / "logs"
21
 
22
  # Matches: 2026-03-07 13:38:31,009 - headroom.proxy - INFO - [hr_...] PERF model=... ...
 
45
  )
46
 
47
 
48
+ # ---------------------------------------------------------------------------
49
+ # Cache-aware pricing via LiteLLM
50
+ # ---------------------------------------------------------------------------
51
+
52
+ # LiteLLM already knows per-token costs for 100+ models including
53
+ # cache_read and cache_creation pricing. We call it directly instead
54
+ # of maintaining our own pricing tables.
55
+
56
+ try:
57
+ import litellm as _litellm
58
+
59
+ _LITELLM_AVAILABLE = True
60
+ except ImportError:
61
+ _LITELLM_AVAILABLE = False
62
+
63
+ # Cache resolved model names (e.g. "claude-opus-4-6" → "anthropic/claude-opus-4-6")
64
+ _resolved_model_cache: dict[str, str] = {}
65
+
66
+
67
def _resolve_model(model: str) -> str:
    """Map *model* to a key present in LiteLLM's ``model_cost`` table.

    The name is tried as-is first, then with each known provider prefix.
    If nothing matches (or LiteLLM is not installed) the name is returned
    unchanged. Every outcome is memoised in ``_resolved_model_cache``.

    TODO: Duplicated with CostTracker._resolve_litellm_model in proxy/server.py.
    Extract to shared utility.
    """
    cached = _resolved_model_cache.get(model)
    if cached is not None:
        return cached

    resolved = model
    if _LITELLM_AVAILABLE and model not in _litellm.model_cost:
        candidates = (
            f"{prefix}{model}"
            for prefix in ("anthropic/", "openai/", "google/", "mistral/", "deepseek/")
        )
        # First prefixed name LiteLLM knows about; otherwise keep the bare name.
        resolved = next(
            (name for name in candidates if name in _litellm.model_cost),
            model,
        )

    _resolved_model_cache[model] = resolved
    return resolved
94
+
95
+
96
def _litellm_cost(
    model: str,
    prompt_tokens: int,
    cache_read_tokens: int = 0,
    cache_write_tokens: int = 0,
) -> float | None:
    """Compute input cost via litellm.cost_per_token (cache-aware).

    Args:
        model: Model name; resolved through ``_resolve_model`` before lookup.
        prompt_tokens: Passed to LiteLLM as ``prompt_tokens``.
        cache_read_tokens: Passed as ``cache_read_input_tokens``.
        cache_write_tokens: Passed as ``cache_creation_input_tokens``.

    Returns:
        The input-side cost reported by LiteLLM as a float (USD), or None when
        LiteLLM is not installed or the lookup raises (e.g. unknown model).
    """
    if not _LITELLM_AVAILABLE:
        return None
    resolved = _resolve_model(model)
    try:
        # completion_tokens=0: only the input side of the returned pair is used.
        input_cost, _ = _litellm.cost_per_token(
            model=resolved,
            prompt_tokens=prompt_tokens,
            completion_tokens=0,
            cache_read_input_tokens=cache_read_tokens,
            cache_creation_input_tokens=cache_write_tokens,
        )
        return float(input_cost)
    except Exception:
        # Best-effort pricing: callers treat None as "price unknown". Record the
        # reason at debug level so missing prices can still be diagnosed.
        log.debug("LiteLLM cost lookup failed for model %r", resolved, exc_info=True)
        return None
120
+
121
+
122
def _get_list_price(model: str) -> float | None:
    """Return the list input price per 1M tokens for *model*.

    Returns None when LiteLLM is not installed, or when the resolved model has
    no (or a zero) ``input_cost_per_token`` entry in ``litellm.model_cost``.
    """
    if not _LITELLM_AVAILABLE:
        return None

    pricing = _litellm.model_cost.get(_resolve_model(model), {})
    per_token = pricing.get("input_cost_per_token")
    if not per_token:
        return None
    # Scale the per-token price to a per-million-token price.
    return per_token * 1_000_000
130
+
131
+
132
  def _parse_kv(kv_str: str) -> dict[str, str]:
133
  """Parse key=value pairs from a PERF log line.
134
 
 
367
  total_saved = sum(r.tokens_saved for r in records)
368
  pct = (total_saved / total_before * 100) if total_before > 0 else 0
369
 
 
370
  lines.append(f"Requests: {len(records)}")
371
+ lines.append(f"Tokens: {total_before:,} -> {total_after:,} ({pct:.1f}% reduction)")
 
 
 
 
372
  lines.append(f"Total saved: {total_saved:,} tokens")
373
  lines.append("")
374
 
375
+ # Per-model breakdown with list prices
376
+ by_model: dict[str, list[PerfRecord]] = {}
377
+ for r in records:
378
+ by_model.setdefault(r.model, []).append(r)
379
+
380
+ lines.append("Per-Model Breakdown")
381
+ lines.append("-" * 40)
382
+ for model, model_recs in sorted(by_model.items()):
383
+ m_saved = sum(r.tokens_saved for r in model_recs)
384
+ m_before = sum(r.tokens_before for r in model_recs)
385
+ m_pct = (m_saved / m_before * 100) if m_before > 0 else 0
386
+ list_price = _get_list_price(model)
387
+ price_str = f"${list_price:.2f}/MTok" if list_price else "unknown"
388
+ est_str = (
389
+ f" ~${m_saved * list_price / 1_000_000:.2f} at list price" if list_price else ""
390
+ )
391
+ lines.append(
392
+ f" {model}: {len(model_recs)} reqs, "
393
+ f"{m_saved:,} tokens saved ({m_pct:.0f}%), "
394
+ f"list price {price_str}{est_str}"
395
+ )
396
+ lines.append(" * Actual bill savings depend on provider caching behavior")
397
+ lines.append("")
398
+
399
  # Cache analysis
400
  cache_records = [r for r in records if (r.cache_read + r.cache_write) > 0]
401
  if cache_records:
headroom/proxy/server.py CHANGED
@@ -216,10 +216,6 @@ class RequestLog:
216
  tokens_saved: int
217
  savings_percent: float
218
 
219
- # Cost
220
- estimated_cost_usd: float | None
221
- estimated_savings_usd: float | None
222
-
223
  # Performance
224
  optimization_latency_ms: float
225
  total_latency_ms: float | None
@@ -229,6 +225,9 @@ class RequestLog:
229
  cache_hit: bool
230
  transforms_applied: list[str]
231
 
 
 
 
232
  # Request/Response (optional, for debugging)
233
  request_messages: list[dict] | None = None
234
  response_content: str | None = None
@@ -601,10 +600,13 @@ class CostTracker:
601
 
602
  # Cost tracking - using deque for efficient left-side removal
603
  self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
604
- self._total_cost_usd: float = 0
605
- self._total_savings_usd: float = 0
606
  self._last_prune_time: datetime = datetime.now()
607
 
 
 
 
 
 
608
  # Cache resolved model names to avoid repeated litellm lookups.
609
  # This is critical: litellm.cost_per_token() is synchronous and can block
610
  # the async event loop if it triggers I/O (lazy model info download).
@@ -667,83 +669,34 @@ class CostTracker:
667
  ) -> float | None:
668
  """Estimate cost in USD using LiteLLM's pricing database.
669
 
 
 
 
670
  Args:
671
  model: Model name for pricing lookup
672
- input_tokens: Input tokens sent to API (does NOT include cache_read, which is served from cache)
673
  output_tokens: Output tokens
674
- cache_read_tokens: Tokens read from cache (charged at ~10% of input rate)
675
- cache_write_tokens: Tokens written to cache - this is a SUBSET of input_tokens (charged at ~125% of input rate)
676
  """
677
  if not LITELLM_AVAILABLE:
678
  logger.warning("LiteLLM not available - cannot calculate costs")
679
  return None
680
 
681
  try:
682
- # Resolve model name (adds provider prefix if needed, e.g. claude-opus-4-6 → anthropic/claude-opus-4-6)
683
  resolved_model = self._resolve_litellm_model(model)
684
 
685
- # cost_per_token returns (total_input_cost, total_output_cost) for the given token counts
686
- # Despite the name, it returns total cost not per-token cost
687
-
688
- # Anthropic's token semantics (all three are SEPARATE, not overlapping):
689
- # - input_tokens: tokens sent that are NOT cached (neither read nor written)
690
- # - cache_read_input_tokens: tokens served from existing cache
691
- # - cache_creation_input_tokens: tokens being written to cache
692
- # Total billable = input_tokens + cache_read + cache_write (each at different rates)
693
- regular_input = input_tokens # Don't subtract cache_write, they're separate
694
-
695
- # Get cost for regular (non-cached) input tokens
696
- input_cost, _ = litellm.cost_per_token(
697
- model=resolved_model,
698
- prompt_tokens=regular_input,
699
- completion_tokens=0,
700
- )
701
-
702
- # Get cost for output tokens
703
- _, output_cost = litellm.cost_per_token(
704
  model=resolved_model,
705
- prompt_tokens=0,
706
  completion_tokens=output_tokens,
 
 
707
  )
708
 
709
- # Get model info for cache pricing
710
- model_info: dict[str, Any] = {}
711
- try:
712
- model_info = dict(litellm.get_model_info(resolved_model))
713
- except Exception:
714
- pass
715
-
716
- # Calculate cache read cost (typically 10% of input price)
717
- cache_read_cost = 0.0
718
- if cache_read_tokens > 0:
719
- cache_read_cost_per_token = model_info.get("cache_read_input_token_cost")
720
- if cache_read_cost_per_token:
721
- cache_read_cost = cache_read_tokens * cache_read_cost_per_token
722
- else:
723
- # Fallback: most providers charge ~10% of input price for cache reads
724
- cache_read_full_cost, _ = litellm.cost_per_token(
725
- model=resolved_model,
726
- prompt_tokens=cache_read_tokens,
727
- completion_tokens=0,
728
- )
729
- cache_read_cost = cache_read_full_cost * 0.1
730
-
731
- # Calculate cache write cost (typically 125% of input price)
732
- cache_write_cost = 0.0
733
- if cache_write_tokens > 0:
734
- cache_write_cost_per_token = model_info.get("cache_creation_input_token_cost")
735
- if cache_write_cost_per_token:
736
- cache_write_cost = cache_write_tokens * cache_write_cost_per_token
737
- else:
738
- # Fallback: most providers charge ~125% of input price for cache writes
739
- cache_write_full_cost, _ = litellm.cost_per_token(
740
- model=resolved_model,
741
- prompt_tokens=cache_write_tokens,
742
- completion_tokens=0,
743
- )
744
- cache_write_cost = cache_write_full_cost * 1.25
745
-
746
- total_cost = input_cost + cache_read_cost + cache_write_cost + output_cost
747
  return float(total_cost) if total_cost > 0 else None
748
 
749
  except Exception as e:
@@ -769,16 +722,13 @@ class CostTracker:
769
  while self._costs and self._costs[0][0] < cutoff:
770
  self._costs.popleft()
771
 
772
- def record_cost(self, cost_usd: float):
773
- """Record a cost. Periodically prunes old entries."""
774
- self._costs.append((datetime.now(), cost_usd))
775
- self._total_cost_usd += cost_usd
776
- # Periodically prune old costs to prevent memory growth
777
- self._prune_old_costs()
778
-
779
- def record_savings(self, savings_usd: float):
780
- """Record savings from optimization."""
781
- self._total_savings_usd += savings_usd
782
 
783
  def get_period_cost(self) -> float:
784
  """Get cost for current budget period."""
@@ -802,17 +752,55 @@ class CostTracker:
802
  remaining = self.budget_limit_usd - period_cost
803
  return remaining > 0, max(0, remaining)
804
 
 
 
 
 
 
 
 
 
 
 
 
 
805
  def stats(self) -> dict:
806
- """Get cost statistics."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
  return {
808
- "total_cost_usd": round(self._total_cost_usd, 4),
809
- "total_savings_usd": round(self._total_savings_usd, 4),
810
- "period_cost_usd": round(self.get_period_cost(), 4),
811
- "budget_limit_usd": self.budget_limit_usd,
812
- "budget_period": self.budget_period,
813
- "budget_remaining_usd": round(self.check_budget()[1], 4)
814
- if self.budget_limit_usd
815
- else None,
816
  }
817
 
818
 
@@ -845,9 +833,24 @@ class PrometheusMetrics:
845
  self.overhead_sum_ms = 0.0
846
  self.overhead_min_ms = float("inf")
847
  self.overhead_max_ms = 0.0
 
 
 
 
 
 
 
848
 
849
- self.cost_total_usd = 0.0
850
- self.savings_total_usd = 0.0
 
 
 
 
 
 
 
 
851
 
852
  self._lock = asyncio.Lock()
853
 
@@ -860,9 +863,10 @@ class PrometheusMetrics:
860
  tokens_saved: int,
861
  latency_ms: float,
862
  cached: bool = False,
863
- cost_usd: float = 0,
864
- savings_usd: float = 0,
865
  overhead_ms: float = 0,
 
 
 
866
  ):
867
  """Record metrics for a request."""
868
  async with self._lock:
@@ -887,9 +891,34 @@ class PrometheusMetrics:
887
  self.overhead_sum_ms += overhead_ms
888
  self.overhead_min_ms = min(self.overhead_min_ms, overhead_ms)
889
  self.overhead_max_ms = max(self.overhead_max_ms, overhead_ms)
890
-
891
- self.cost_total_usd += cost_usd
892
- self.savings_total_usd += savings_usd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
893
 
894
  async def record_rate_limited(self):
895
  async with self._lock:
@@ -934,14 +963,6 @@ class PrometheusMetrics:
934
  "# HELP headroom_latency_ms_sum Sum of request latencies",
935
  "# TYPE headroom_latency_ms_sum counter",
936
  f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
937
- "",
938
- "# HELP headroom_cost_usd_total Total cost in USD",
939
- "# TYPE headroom_cost_usd_total counter",
940
- f"headroom_cost_usd_total {self.cost_total_usd:.6f}",
941
- "",
942
- "# HELP headroom_savings_usd_total Total savings in USD",
943
- "# TYPE headroom_savings_usd_total counter",
944
- f"headroom_savings_usd_total {self.savings_total_usd:.6f}",
945
  ]
946
 
947
  # Per-provider metrics
@@ -1406,6 +1427,14 @@ class HeadroomProxy:
1406
  else:
1407
  logger.info("Smart Routing: DISABLED (legacy sequential mode)")
1408
 
 
 
 
 
 
 
 
 
1409
  # LLMLingua status with helpful hint
1410
  if self._llmlingua_status == "enabled":
1411
  logger.info(
@@ -1474,8 +1503,6 @@ class HeadroomProxy:
1474
  m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
1475
  ) * 100
1476
  logger.info(f"Token savings: {savings_pct:.1f}%")
1477
- logger.info(f"Total cost: ${m.cost_total_usd:.4f}")
1478
- logger.info(f"Total savings: ${m.savings_total_usd:.4f}")
1479
  if m.latency_count > 0:
1480
  avg_latency = m.latency_sum_ms / m.latency_count
1481
  logger.info(f"Avg latency: {avg_latency:.0f}ms")
@@ -1727,6 +1754,8 @@ class HeadroomProxy:
1727
 
1728
  # Apply optimization
1729
  transforms_applied = []
 
 
1730
  optimized_messages = messages
1731
  optimized_tokens = original_tokens
1732
 
@@ -1745,13 +1774,16 @@ class HeadroomProxy:
1745
  if result.messages != messages:
1746
  optimized_messages = result.messages
1747
  transforms_applied = result.transforms_applied
 
1748
  # Use pipeline's token counts for consistency with pipeline logs
1749
  original_tokens = result.tokens_before
1750
  optimized_tokens = result.tokens_after
 
 
1751
  except Exception as e:
1752
  logger.warning(f"Optimization failed: {e}")
1753
 
1754
- tokens_saved = original_tokens - optimized_tokens
1755
  optimization_latency = (time.time() - start_time) * 1000
1756
 
1757
  # Hook: post_compress — let hooks observe compression results
@@ -1933,6 +1965,7 @@ class HeadroomProxy:
1933
  transforms_applied,
1934
  tags,
1935
  optimization_latency,
 
1936
  )
1937
  else:
1938
  backend_response = await self.anthropic_backend.send_message(body, headers)
@@ -1957,22 +1990,11 @@ class HeadroomProxy:
1957
  latency_ms=total_latency,
1958
  cached=False,
1959
  overhead_ms=optimization_latency,
 
1960
  )
1961
 
1962
- cost_usd = None
1963
- savings_usd = None
1964
  if self.cost_tracker:
1965
- cost_usd = self.cost_tracker.estimate_cost(
1966
- model, optimized_tokens, output_tokens
1967
- )
1968
- original_cost = self.cost_tracker.estimate_cost(
1969
- model, original_tokens, output_tokens
1970
- )
1971
- if cost_usd:
1972
- self.cost_tracker.record_cost(cost_usd)
1973
- if cost_usd and original_cost:
1974
- savings_usd = original_cost - cost_usd
1975
- self.cost_tracker.record_savings(savings_usd)
1976
 
1977
  # Log request
1978
  if self.logger:
@@ -1989,8 +2011,6 @@ class HeadroomProxy:
1989
  savings_percent=(tokens_saved / original_tokens * 100)
1990
  if original_tokens > 0
1991
  else 0,
1992
- estimated_cost_usd=cost_usd,
1993
- estimated_savings_usd=savings_usd,
1994
  optimization_latency_ms=optimization_latency,
1995
  total_latency_ms=total_latency,
1996
  tags=tags,
@@ -2035,6 +2055,7 @@ class HeadroomProxy:
2035
  tags,
2036
  optimization_latency,
2037
  memory_user_id=memory_user_id,
 
2038
  )
2039
  else:
2040
  response = await self._retry_request("POST", url, headers, body)
@@ -2200,45 +2221,14 @@ class HeadroomProxy:
2200
 
2201
  total_latency = (time.time() - start_time) * 1000
2202
 
2203
- # Parse response for actual token counts from API
2204
- actual_input_tokens = optimized_tokens # fallback
2205
  output_tokens = 0
2206
- cache_read_tokens = 0
2207
- cache_write_tokens = 0
2208
  if resp_json:
2209
  usage = resp_json.get("usage", {})
2210
- actual_input_tokens = usage.get("input_tokens", optimized_tokens)
2211
  output_tokens = usage.get("output_tokens", 0)
2212
- # Anthropic returns cache_read_input_tokens for cached prompt tokens
2213
- # These are charged at 10% of the input price
2214
- cache_read_tokens = usage.get("cache_read_input_tokens", 0)
2215
- # Anthropic returns cache_creation_input_tokens for tokens written to cache
2216
- # These are charged at 125% of the input price
2217
- cache_write_tokens = usage.get("cache_creation_input_tokens", 0)
2218
-
2219
- # Calculate cost using actual API tokens with proper cache pricing
2220
- cost_usd = None
2221
- savings_usd = None
2222
  if self.cost_tracker:
2223
- cost_usd = self.cost_tracker.estimate_cost(
2224
- model,
2225
- actual_input_tokens,
2226
- output_tokens,
2227
- cache_read_tokens=cache_read_tokens,
2228
- cache_write_tokens=cache_write_tokens,
2229
- )
2230
- # original_cost: what it would have cost without compression
2231
- # Use only original_tokens at regular input rate — no cache params,
2232
- # since caching is orthogonal to compression savings
2233
- original_cost = self.cost_tracker.estimate_cost(
2234
- model,
2235
- original_tokens,
2236
- output_tokens,
2237
- )
2238
- if cost_usd and original_cost:
2239
- savings_usd = original_cost - cost_usd
2240
- self.cost_tracker.record_cost(cost_usd)
2241
- self.cost_tracker.record_savings(savings_usd)
2242
 
2243
  # Cache response
2244
  if self.cache and response.status_code == 200:
@@ -2250,17 +2240,18 @@ class HeadroomProxy:
2250
  tokens_saved=tokens_saved,
2251
  )
2252
 
2253
- # Record metrics with actual API tokens
 
2254
  await self.metrics.record_request(
2255
  provider="anthropic",
2256
  model=model,
2257
- input_tokens=actual_input_tokens,
2258
  output_tokens=output_tokens,
2259
  tokens_saved=tokens_saved,
2260
  latency_ms=total_latency,
2261
- cost_usd=cost_usd or 0,
2262
- savings_usd=savings_usd or 0,
2263
  overhead_ms=optimization_latency,
 
 
2264
  )
2265
 
2266
  # Log request
@@ -2278,13 +2269,12 @@ class HeadroomProxy:
2278
  savings_percent=(tokens_saved / original_tokens * 100)
2279
  if original_tokens > 0
2280
  else 0,
2281
- estimated_cost_usd=cost_usd,
2282
- estimated_savings_usd=savings_usd,
2283
  optimization_latency_ms=optimization_latency,
2284
  total_latency_ms=total_latency,
2285
  tags=tags,
2286
  cache_hit=cache_hit,
2287
  transforms_applied=transforms_applied,
 
2288
  request_messages=messages if self.config.log_full_messages else None,
2289
  )
2290
  )
@@ -2295,6 +2285,11 @@ class HeadroomProxy:
2295
  cr = resp_usage.get("cache_read_input_tokens", 0)
2296
  cw = resp_usage.get("cache_creation_input_tokens", 0)
2297
  chp = round(cr / (cr + cw) * 100) if (cr + cw) > 0 else 0
 
 
 
 
 
2298
  logger.info(
2299
  f"[{request_id}] PERF "
2300
  f"model={model} msgs={num_msgs} "
@@ -2303,6 +2298,7 @@ class HeadroomProxy:
2303
  f"cache_read={cr} cache_write={cw} cache_hit_pct={chp} "
2304
  f"opt_ms={optimization_latency:.0f} "
2305
  f"transforms={_summarize_transforms(transforms_applied)}"
 
2306
  )
2307
 
2308
  # Remove compression headers since httpx already decompressed the response
@@ -2427,6 +2423,7 @@ class HeadroomProxy:
2427
  total_optimized_tokens = 0
2428
  total_tokens_saved = 0
2429
  compressed_requests = []
 
2430
 
2431
  # Apply compression to each request in the batch
2432
  for batch_req in requests_list:
@@ -2450,12 +2447,13 @@ class HeadroomProxy:
2450
  )
2451
 
2452
  optimized_messages = result.messages
 
2453
  # Use pipeline's token counts for consistency with pipeline logs
2454
  original_tokens = result.tokens_before
2455
  optimized_tokens = result.tokens_after
2456
  total_original_tokens += original_tokens
2457
  total_optimized_tokens += optimized_tokens
2458
- tokens_saved = original_tokens - optimized_tokens
2459
  total_tokens_saved += tokens_saved
2460
 
2461
  # CCR Tool Injection: Inject retrieval tool if compression occurred
@@ -2519,6 +2517,8 @@ class HeadroomProxy:
2519
  output_tokens=0,
2520
  tokens_saved=total_tokens_saved,
2521
  latency_ms=optimization_latency,
 
 
2522
  )
2523
 
2524
  # Log compression stats
@@ -2857,6 +2857,7 @@ class HeadroomProxy:
2857
  total_optimized_tokens = 0
2858
  total_tokens_saved = 0
2859
  compressed_requests = []
 
2860
 
2861
  # Apply compression to each request in the batch
2862
  for idx, batch_req in enumerate(requests_list):
@@ -2897,12 +2898,13 @@ class HeadroomProxy:
2897
  )
2898
 
2899
  optimized_messages = result.messages
 
2900
  # Use pipeline's token counts for consistency with pipeline logs
2901
  original_tokens = result.tokens_before
2902
  optimized_tokens = result.tokens_after
2903
  total_original_tokens += original_tokens
2904
  total_optimized_tokens += optimized_tokens
2905
- tokens_saved = original_tokens - optimized_tokens
2906
  total_tokens_saved += tokens_saved
2907
 
2908
  # CCR Tool Injection: Inject retrieval tool if compression occurred
@@ -2993,6 +2995,8 @@ class HeadroomProxy:
2993
  output_tokens=0,
2994
  tokens_saved=total_tokens_saved,
2995
  latency_ms=optimization_latency,
 
 
2996
  )
2997
 
2998
  # Log compression stats
@@ -3697,6 +3701,7 @@ class HeadroomProxy:
3697
  tags: dict[str, str],
3698
  optimization_latency: float,
3699
  memory_user_id: str | None = None,
 
3700
  ) -> StreamingResponse:
3701
  """Stream response with metrics tracking and memory tool handling.
3702
 
@@ -3719,6 +3724,7 @@ class HeadroomProxy:
3719
  "cache_creation_input_tokens": 0,
3720
  "total_bytes": 0,
3721
  "sse_buffer": "", # Buffer for incomplete SSE events
 
3722
  }
3723
 
3724
  # Track if we need to handle memory tools
@@ -3740,6 +3746,10 @@ class HeadroomProxy:
3740
  "POST", url, json=body, headers=headers
3741
  ) as response:
3742
  async for chunk in response.aiter_bytes():
 
 
 
 
3743
  stream_state["total_bytes"] += len(chunk)
3744
 
3745
  # Buffer SSE data to handle chunks split across calls
@@ -3901,12 +3911,9 @@ class HeadroomProxy:
3901
  f"[{request_id}] No usage in stream, estimated {output_tokens} output tokens"
3902
  )
3903
 
3904
- # Use actual tokens from API if available, fallback to estimates
3905
- # Note: use 'is not None' instead of 'or' to handle 0 correctly
3906
- api_input_tokens = stream_state["input_tokens"]
3907
- total_input_tokens = (
3908
- api_input_tokens if api_input_tokens is not None else optimized_tokens
3909
- )
3910
  cache_read_tokens = stream_state["cache_read_input_tokens"]
3911
  cache_write_tokens = stream_state["cache_creation_input_tokens"]
3912
 
@@ -3928,54 +3935,19 @@ class HeadroomProxy:
3928
  f"transforms={_summarize_transforms(transforms_applied)}"
3929
  )
3930
 
3931
- # Normalize input tokens based on provider semantics:
3932
- # - Anthropic: input_tokens excludes cache_read (it's separate), pass as-is
3933
- # - OpenAI/Gemini: input_tokens includes cache_read (it's a subset), subtract it
3934
- if provider == "anthropic":
3935
- # Anthropic's input_tokens = non-cached tokens sent (excludes cache_read)
3936
- non_cached_input = total_input_tokens
3937
- else:
3938
- # OpenAI/Gemini's input_tokens = total (includes cache_read)
3939
- non_cached_input = total_input_tokens - cache_read_tokens
3940
-
3941
- # Calculate cost using actual API tokens with proper cache pricing
3942
- cost_usd = None
3943
- savings_usd = None
3944
  if self.cost_tracker:
3945
- cost_usd = self.cost_tracker.estimate_cost(
3946
- model,
3947
- non_cached_input,
3948
- output_tokens,
3949
- cache_read_tokens=cache_read_tokens,
3950
- cache_write_tokens=cache_write_tokens,
3951
- )
3952
- # For savings calculation, compare compression benefit using base token rates only
3953
- # (cache effects are Anthropic's feature, not Headroom's compression benefit)
3954
- compressed_base_cost = self.cost_tracker.estimate_cost(
3955
- model,
3956
- non_cached_input,
3957
- output_tokens,
3958
- )
3959
- original_base_cost = self.cost_tracker.estimate_cost(
3960
- model,
3961
- original_tokens,
3962
- output_tokens,
3963
- )
3964
- if cost_usd:
3965
- self.cost_tracker.record_cost(cost_usd)
3966
- if compressed_base_cost and original_base_cost:
3967
- savings_usd = original_base_cost - compressed_base_cost
3968
- self.cost_tracker.record_savings(max(0, savings_usd))
3969
 
3970
  await self.metrics.record_request(
3971
  provider=provider,
3972
  model=model,
3973
- input_tokens=total_input_tokens, # Record total for accurate tracking
3974
  output_tokens=output_tokens,
3975
  tokens_saved=tokens_saved,
3976
  latency_ms=total_latency,
3977
- cost_usd=cost_usd or 0,
3978
- savings_usd=savings_usd or 0,
 
3979
  )
3980
 
3981
  return StreamingResponse(
@@ -3996,6 +3968,7 @@ class HeadroomProxy:
3996
  transforms_applied: list[str],
3997
  tags: dict[str, str],
3998
  optimization_latency: float,
 
3999
  ) -> StreamingResponse:
4000
  """Stream response from Bedrock backend with metrics tracking.
4001
 
@@ -4007,6 +3980,7 @@ class HeadroomProxy:
4007
  stream_state: dict[str, Any] = {
4008
  "input_tokens": 0,
4009
  "output_tokens": 0,
 
4010
  }
4011
 
4012
  async def generate():
@@ -4014,6 +3988,10 @@ class HeadroomProxy:
4014
  assert self.anthropic_backend is not None
4015
 
4016
  async for event in self.anthropic_backend.stream_message(body, headers):
 
 
 
 
4017
  # Format as SSE
4018
  if event.raw_sse:
4019
  yield event.raw_sse.encode()
@@ -4060,22 +4038,12 @@ class HeadroomProxy:
4060
  latency_ms=total_latency,
4061
  cached=False,
4062
  overhead_ms=optimization_latency,
 
 
4063
  )
4064
 
4065
- cost_usd = None
4066
- savings_usd = None
4067
  if self.cost_tracker:
4068
- cost_usd = self.cost_tracker.estimate_cost(
4069
- model, optimized_tokens, output_tokens
4070
- )
4071
- original_cost = self.cost_tracker.estimate_cost(
4072
- model, original_tokens, output_tokens
4073
- )
4074
- if cost_usd:
4075
- self.cost_tracker.record_cost(cost_usd)
4076
- if cost_usd and original_cost:
4077
- savings_usd = original_cost - cost_usd
4078
- self.cost_tracker.record_savings(savings_usd)
4079
 
4080
  # Log request
4081
  if self.logger:
@@ -4092,8 +4060,6 @@ class HeadroomProxy:
4092
  savings_percent=(tokens_saved / original_tokens * 100)
4093
  if original_tokens > 0
4094
  else 0,
4095
- estimated_cost_usd=cost_usd,
4096
- estimated_savings_usd=savings_usd,
4097
  optimization_latency_ms=optimization_latency,
4098
  total_latency_ms=total_latency,
4099
  tags=tags,
@@ -4230,6 +4196,8 @@ class HeadroomProxy:
4230
 
4231
  # Optimization
4232
  transforms_applied = []
 
 
4233
  optimized_messages = messages
4234
  optimized_tokens = original_tokens
4235
 
@@ -4245,12 +4213,15 @@ class HeadroomProxy:
4245
  if result.messages != messages:
4246
  optimized_messages = result.messages
4247
  transforms_applied = result.transforms_applied
 
4248
  original_tokens = result.tokens_before
4249
  optimized_tokens = result.tokens_after
 
 
4250
  except Exception as e:
4251
  logger.warning(f"Optimization failed: {e}")
4252
 
4253
- tokens_saved = original_tokens - optimized_tokens
4254
  optimization_latency = (time.time() - start_time) * 1000
4255
 
4256
  # Hook: post_compress
@@ -4335,6 +4306,7 @@ class HeadroomProxy:
4335
  latency_ms=total_latency,
4336
  cached=False,
4337
  overhead_ms=optimization_latency,
 
4338
  )
4339
 
4340
  if tokens_saved > 0:
@@ -4385,6 +4357,7 @@ class HeadroomProxy:
4385
  transforms_applied,
4386
  tags,
4387
  optimization_latency,
 
4388
  )
4389
  else:
4390
  response = await self._retry_request("POST", url, headers, body)
@@ -4407,30 +4380,8 @@ class HeadroomProxy:
4407
  f"[{request_id}] Failed to extract cached tokens from OpenAI response: {e}"
4408
  )
4409
 
4410
- # For OpenAI, prompt_tokens is TOTAL (includes cached)
4411
- # Normalize to non-cached input for consistent cost calculation
4412
- non_cached_input = total_input_tokens - cache_read_tokens
4413
-
4414
- # Cost tracking using actual API tokens
4415
- cost_usd = savings_usd = None
4416
  if self.cost_tracker:
4417
- cost_usd = self.cost_tracker.estimate_cost(
4418
- model,
4419
- non_cached_input, # Pass non-cached portion
4420
- output_tokens,
4421
- cache_read_tokens=cache_read_tokens,
4422
- )
4423
- # original_cost: what it would have cost without compression
4424
- # No cache params — caching is orthogonal to compression savings
4425
- original_cost = self.cost_tracker.estimate_cost(
4426
- model,
4427
- original_tokens,
4428
- output_tokens,
4429
- )
4430
- if cost_usd and original_cost:
4431
- savings_usd = original_cost - cost_usd
4432
- self.cost_tracker.record_cost(cost_usd)
4433
- self.cost_tracker.record_savings(savings_usd)
4434
 
4435
  # Cache
4436
  if self.cache and response.status_code == 200:
@@ -4438,7 +4389,6 @@ class HeadroomProxy:
4438
  messages, model, response.content, dict(response.headers), tokens_saved
4439
  )
4440
 
4441
- # Metrics with actual API tokens (total, for accurate tracking)
4442
  await self.metrics.record_request(
4443
  provider="openai",
4444
  model=model,
@@ -4446,8 +4396,9 @@ class HeadroomProxy:
4446
  output_tokens=output_tokens,
4447
  tokens_saved=tokens_saved,
4448
  latency_ms=total_latency,
4449
- cost_usd=cost_usd or 0,
4450
- savings_usd=savings_usd or 0,
 
4451
  )
4452
 
4453
  if tokens_saved > 0:
@@ -4543,8 +4494,6 @@ class HeadroomProxy:
4543
  tokens_saved=0,
4544
  latency_ms=latency_ms,
4545
  cached=False,
4546
- cost_usd=0,
4547
- savings_usd=0,
4548
  )
4549
 
4550
  return Response(
@@ -5191,39 +5140,19 @@ class HeadroomProxy:
5191
 
5192
  total_input_tokens = original_tokens # fallback
5193
  output_tokens = 0
5194
- cache_read_tokens = 0
5195
  try:
5196
  resp_json = response.json()
5197
  usage = resp_json.get("usage", {})
5198
  total_input_tokens = usage.get("input_tokens", original_tokens)
5199
  output_tokens = usage.get("output_tokens", 0)
5200
- # OpenAI returns cached_tokens in prompt_tokens_details (or input_tokens_details)
5201
- prompt_details = usage.get(
5202
- "prompt_tokens_details", usage.get("input_tokens_details", {})
5203
- )
5204
- cache_read_tokens = prompt_details.get("cached_tokens", 0)
5205
  except (KeyError, TypeError, AttributeError) as e:
5206
  logger.debug(
5207
  f"[{request_id}] Failed to extract cached tokens from OpenAI passthrough response: {e}"
5208
  )
5209
 
5210
- # For OpenAI, input_tokens is TOTAL (includes cached)
5211
- # Normalize to non-cached input for consistent cost calculation
5212
- non_cached_input = total_input_tokens - cache_read_tokens
5213
-
5214
- # Cost tracking using actual API tokens
5215
- cost_usd = savings_usd = None
5216
  if self.cost_tracker:
5217
- cost_usd = self.cost_tracker.estimate_cost(
5218
- model,
5219
- non_cached_input, # Pass non-cached portion
5220
- output_tokens,
5221
- cache_read_tokens=cache_read_tokens,
5222
- )
5223
- if cost_usd:
5224
- self.cost_tracker.record_cost(cost_usd)
5225
 
5226
- # Metrics with actual API tokens (total, for accurate tracking)
5227
  await self.metrics.record_request(
5228
  provider="openai",
5229
  model=model,
@@ -5231,8 +5160,7 @@ class HeadroomProxy:
5231
  output_tokens=output_tokens,
5232
  tokens_saved=tokens_saved,
5233
  latency_ms=total_latency,
5234
- cost_usd=cost_usd or 0,
5235
- savings_usd=savings_usd or 0,
5236
  )
5237
 
5238
  logger.info(f"[{request_id}] /v1/responses {model}: {total_input_tokens:,} tokens")
@@ -5378,6 +5306,7 @@ class HeadroomProxy:
5378
 
5379
  # Optimization
5380
  transforms_applied: list[str] = []
 
5381
  optimized_messages = messages
5382
  optimized_tokens = original_tokens
5383
 
@@ -5396,10 +5325,12 @@ class HeadroomProxy:
5396
  # Use pipeline's token counts for consistency with pipeline logs
5397
  original_tokens = result.tokens_before
5398
  optimized_tokens = result.tokens_after
 
 
5399
  except Exception as e:
5400
  logger.warning(f"[{request_id}] Gemini optimization failed: {e}")
5401
 
5402
- tokens_saved = original_tokens - optimized_tokens
5403
  optimization_latency = (time.time() - start_time) * 1000
5404
 
5405
  # Query Echo: re-inject user's question after compressed tool outputs
@@ -5481,32 +5412,9 @@ class HeadroomProxy:
5481
  f"[{request_id}] Failed to extract cached tokens from Gemini response: {e}"
5482
  )
5483
 
5484
- # For Gemini, promptTokenCount is TOTAL (includes cached)
5485
- # Normalize to non-cached input for consistent cost calculation
5486
- non_cached_input = total_input_tokens - cache_read_tokens
5487
-
5488
- # Cost tracking using actual API tokens
5489
- cost_usd = savings_usd = None
5490
  if self.cost_tracker:
5491
- cost_usd = self.cost_tracker.estimate_cost(
5492
- model,
5493
- non_cached_input, # Pass non-cached portion
5494
- output_tokens,
5495
- cache_read_tokens=cache_read_tokens,
5496
- )
5497
- # original_cost: what it would have cost without compression
5498
- # No cache params — caching is orthogonal to compression savings
5499
- original_cost = self.cost_tracker.estimate_cost(
5500
- model,
5501
- original_tokens,
5502
- output_tokens,
5503
- )
5504
- if cost_usd and original_cost:
5505
- savings_usd = original_cost - cost_usd
5506
- self.cost_tracker.record_cost(cost_usd)
5507
- self.cost_tracker.record_savings(savings_usd)
5508
 
5509
- # Metrics with actual API tokens (total, for accurate tracking)
5510
  await self.metrics.record_request(
5511
  provider="gemini",
5512
  model=model,
@@ -5514,8 +5422,8 @@ class HeadroomProxy:
5514
  output_tokens=output_tokens,
5515
  tokens_saved=tokens_saved,
5516
  latency_ms=total_latency,
5517
- cost_usd=cost_usd or 0,
5518
- savings_usd=savings_usd or 0,
5519
  )
5520
 
5521
  if tokens_saved > 0:
@@ -5745,7 +5653,7 @@ class HeadroomProxy:
5745
  logger.debug(f"[{request_id}] Failed to parse Gemini token count response: {e}")
5746
 
5747
  # Track stats
5748
- tokens_saved = original_tokens - compressed_tokens if compressed_tokens > 0 else 0
5749
 
5750
  await self.metrics.record_request(
5751
  provider="gemini",
@@ -5754,8 +5662,6 @@ class HeadroomProxy:
5754
  output_tokens=0,
5755
  tokens_saved=tokens_saved,
5756
  latency_ms=total_latency,
5757
- cost_usd=0,
5758
- savings_usd=0,
5759
  )
5760
 
5761
  if tokens_saved > 0:
@@ -5937,16 +5843,23 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
5937
  )
5938
  max_latency_ms = round(m.latency_max_ms, 2) if m.latency_count > 0 else 0
5939
 
5940
- # Calculate Headroom overhead (optimization time only)
5941
  avg_overhead_ms = (
5942
- round(m.overhead_sum_ms / m.latency_count, 2) if m.latency_count > 0 else 0
5943
  )
5944
  min_overhead_ms = (
5945
  round(m.overhead_min_ms, 2)
5946
- if m.latency_count > 0 and m.overhead_min_ms != float("inf")
5947
  else 0
5948
  )
5949
- max_overhead_ms = round(m.overhead_max_ms, 2) if m.latency_count > 0 else 0
 
 
 
 
 
 
 
5950
 
5951
  # Get compression store stats
5952
  store = get_compression_store()
@@ -5995,6 +5908,25 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
5995
  "min_ms": min_overhead_ms,
5996
  "max_ms": max_overhead_ms,
5997
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5998
  "cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
5999
  "compression": {
6000
  "ccr_entries": compression_stats.get("entry_count", 0),
 
216
  tokens_saved: int
217
  savings_percent: float
218
 
 
 
 
 
219
  # Performance
220
  optimization_latency_ms: float
221
  total_latency_ms: float | None
 
225
  cache_hit: bool
226
  transforms_applied: list[str]
227
 
228
+ # Waste signals detected in original messages
229
+ waste_signals: dict[str, int] | None = None
230
+
231
  # Request/Response (optional, for debugging)
232
  request_messages: list[dict] | None = None
233
  response_content: str | None = None
 
600
 
601
  # Cost tracking - using deque for efficient left-side removal
602
  self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
 
 
603
  self._last_prune_time: datetime = datetime.now()
604
 
605
+ # Token savings per model (exact, no dollar estimation)
606
+ self._tokens_saved_by_model: dict[str, int] = {}
607
+ self._tokens_sent_by_model: dict[str, int] = {}
608
+ self._requests_by_model: dict[str, int] = {}
609
+
610
  # Cache resolved model names to avoid repeated litellm lookups.
611
  # This is critical: litellm.cost_per_token() is synchronous and can block
612
  # the async event loop if it triggers I/O (lazy model info download).
 
669
  ) -> float | None:
670
  """Estimate cost in USD using LiteLLM's pricing database.
671
 
672
+ LiteLLM natively handles cache_read and cache_creation pricing
673
+ for all providers (Anthropic, OpenAI, Google, etc.) in a single call.
674
+
675
  Args:
676
  model: Model name for pricing lookup
677
+ input_tokens: Non-cached input tokens (excludes cache_read)
678
  output_tokens: Output tokens
679
+ cache_read_tokens: Tokens served from cache (~10% of input rate)
680
+ cache_write_tokens: Tokens written to cache (~125% of input rate)
681
  """
682
  if not LITELLM_AVAILABLE:
683
  logger.warning("LiteLLM not available - cannot calculate costs")
684
  return None
685
 
686
  try:
 
687
  resolved_model = self._resolve_litellm_model(model)
688
 
689
+ # litellm.cost_per_token handles all token types natively:
690
+ # prompt_tokens at input rate, cache_read at ~10%, cache_creation at ~125%
691
+ input_cost, output_cost = litellm.cost_per_token(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
692
  model=resolved_model,
693
+ prompt_tokens=input_tokens,
694
  completion_tokens=output_tokens,
695
+ cache_read_input_tokens=cache_read_tokens,
696
+ cache_creation_input_tokens=cache_write_tokens,
697
  )
698
 
699
+ total_cost = input_cost + output_cost
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
  return float(total_cost) if total_cost > 0 else None
701
 
702
  except Exception as e:
 
722
  while self._costs and self._costs[0][0] < cutoff:
723
  self._costs.popleft()
724
 
725
def record_tokens(self, model: str, tokens_saved: int, tokens_sent: int):
    """Add one request's token counts to the per-model running tallies.

    The values are accumulated exactly as given; no estimation is applied.

    Args:
        model: Key under which the three per-model tallies are updated.
        tokens_saved: Amount added to the model's saved-token total.
        tokens_sent: Amount added to the model's sent-token total.
    """
    saved_totals = self._tokens_saved_by_model
    sent_totals = self._tokens_sent_by_model
    request_counts = self._requests_by_model

    # A model seen for the first time starts each tally from 0.
    for tally, delta in (
        (saved_totals, tokens_saved),
        (sent_totals, tokens_sent),
        (request_counts, 1),
    ):
        tally[model] = tally.get(model, 0) + delta
 
 
 
732
 
733
  def get_period_cost(self) -> float:
734
  """Get cost for current budget period."""
 
752
  remaining = self.budget_limit_usd - period_cost
753
  return remaining > 0, max(0, remaining)
754
 
755
def _get_list_price(self, model: str) -> float | None:
    """Return the list input price per 1M tokens for *model*, if known.

    The per-token input price is read from ``litellm.model_cost`` under the
    name produced by ``self._resolve_litellm_model(model)`` and scaled to
    one million tokens.

    Returns:
        The scaled price, or ``None`` when LiteLLM is unavailable, the model
        has no (or a zero) ``input_cost_per_token`` entry, or the lookup
        raises for any reason.
    """
    if not LITELLM_AVAILABLE:
        return None

    try:
        litellm_name = self._resolve_litellm_model(model)
        pricing = litellm.model_cost.get(litellm_name, {})
        per_token = pricing.get("input_cost_per_token")
        if not per_token:
            # Missing and zero prices are both reported as "no price".
            return None
        return per_token * 1_000_000
    except Exception:
        # Best-effort lookup: a pricing failure must not break the caller.
        return None
766
+
767
def stats(self) -> dict:
    """Summarize recorded token savings per model plus a cost comparison.

    Returns:
        A dict with:
        - ``total_tokens_saved``: sum of saved tokens over all models.
        - ``per_model``: one entry per model (sorted by name) holding
          ``requests``, ``tokens_saved``, ``tokens_sent`` and
          ``reduction_pct`` (saved as a percentage of saved + sent,
          rounded to one decimal; 0 when nothing was recorded).
        - ``cost_with_headroom_usd`` / ``cost_without_headroom_usd`` /
          ``savings_usd``: input-token cost of what was sent, of what would
          have been sent without savings, and their difference, each
          rounded to four decimals.
    """
    saved_by_model = self._tokens_saved_by_model
    sent_by_model = self._tokens_sent_by_model

    per_model = {}
    for name in sorted(saved_by_model):
        saved = saved_by_model[name]
        sent = sent_by_model.get(name, 0)
        processed = saved + sent
        if processed > 0:
            reduction = round(saved / processed * 100, 1)
        else:
            reduction = 0
        per_model[name] = {
            "requests": self._requests_by_model.get(name, 0),
            "tokens_saved": saved,
            "tokens_sent": sent,
            "reduction_pct": reduction,
        }
    total_saved = sum(entry["tokens_saved"] for entry in per_model.values())

    # Counterfactual: what the same traffic would have cost uncompressed.
    # Only the input-token list price is applied; output tokens and cache
    # pricing are left out because only input tokens are compressed.
    spent = 0.0
    counterfactual = 0.0
    for name, saved in saved_by_model.items():
        unit_price = self._get_list_price(name)
        if not unit_price:
            # Models without a known price contribute to neither total.
            continue
        sent = sent_by_model.get(name, 0)
        spent += (sent / 1_000_000) * unit_price
        counterfactual += ((saved + sent) / 1_000_000) * unit_price

    return {
        "total_tokens_saved": total_saved,
        "per_model": per_model,
        "cost_with_headroom_usd": round(spent, 4),
        "cost_without_headroom_usd": round(counterfactual, 4),
        "savings_usd": round(counterfactual - spent, 4),
    }
805
 
806
 
 
833
  self.overhead_sum_ms = 0.0
834
  self.overhead_min_ms = float("inf")
835
  self.overhead_max_ms = 0.0
836
+ self.overhead_count = 0
837
+
838
+ # Time to first byte (TTFB) from upstream — what the user actually feels
839
+ self.ttfb_sum_ms = 0.0
840
+ self.ttfb_min_ms = float("inf")
841
+ self.ttfb_max_ms = 0.0
842
+ self.ttfb_count = 0
843
 
844
+ # Per-transform timing (name → cumulative ms, count)
845
+ self.transform_timing_sum: dict[str, float] = defaultdict(float)
846
+ self.transform_timing_count: dict[str, int] = defaultdict(int)
847
+ self.transform_timing_max: dict[str, float] = defaultdict(float)
848
+
849
+ # Aggregate waste signals
850
+ self.waste_signals_total: dict[str, int] = defaultdict(int)
851
+
852
+ # Cumulative savings history (timestamp → cumulative tokens saved)
853
+ self.savings_history: list[tuple[str, int]] = []
854
 
855
  self._lock = asyncio.Lock()
856
 
 
863
  tokens_saved: int,
864
  latency_ms: float,
865
  cached: bool = False,
 
 
866
  overhead_ms: float = 0,
867
+ ttfb_ms: float = 0,
868
+ pipeline_timing: dict[str, float] | None = None,
869
+ waste_signals: dict[str, int] | None = None,
870
  ):
871
  """Record metrics for a request."""
872
  async with self._lock:
 
891
  self.overhead_sum_ms += overhead_ms
892
  self.overhead_min_ms = min(self.overhead_min_ms, overhead_ms)
893
  self.overhead_max_ms = max(self.overhead_max_ms, overhead_ms)
894
+ self.overhead_count += 1
895
+
896
+ # Track TTFB (time to first byte from upstream)
897
+ if ttfb_ms > 0:
898
+ self.ttfb_sum_ms += ttfb_ms
899
+ self.ttfb_min_ms = min(self.ttfb_min_ms, ttfb_ms)
900
+ self.ttfb_max_ms = max(self.ttfb_max_ms, ttfb_ms)
901
+ self.ttfb_count += 1
902
+
903
+ # Track per-transform timing
904
+ if pipeline_timing:
905
+ for name, ms in pipeline_timing.items():
906
+ self.transform_timing_sum[name] += ms
907
+ self.transform_timing_count[name] += 1
908
+ self.transform_timing_max[name] = max(self.transform_timing_max[name], ms)
909
+
910
+ # Track waste signals
911
+ if waste_signals:
912
+ for signal_name, token_count in waste_signals.items():
913
+ self.waste_signals_total[signal_name] += token_count
914
+
915
+ # Track cumulative savings history (record every request)
916
+ from datetime import datetime
917
+
918
+ self.savings_history.append((datetime.now().isoformat(), self.tokens_saved_total))
919
+ # Keep last 500 data points
920
+ if len(self.savings_history) > 500:
921
+ self.savings_history = self.savings_history[-500:]
922
 
923
  async def record_rate_limited(self):
924
  async with self._lock:
 
963
  "# HELP headroom_latency_ms_sum Sum of request latencies",
964
  "# TYPE headroom_latency_ms_sum counter",
965
  f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
 
 
 
 
 
 
 
 
966
  ]
967
 
968
  # Per-provider metrics
 
1427
  else:
1428
  logger.info("Smart Routing: DISABLED (legacy sequential mode)")
1429
 
1430
+ # Eagerly load LLMLingua model at startup (avoids 5s delay on first request)
1431
+ if self.config.llmlingua_enabled:
1432
+ for transform in self.anthropic_pipeline.transforms:
1433
+ if hasattr(transform, "eager_load_compressors"):
1434
+ transform.eager_load_compressors()
1435
+ self._llmlingua_status = "enabled"
1436
+ break
1437
+
1438
  # LLMLingua status with helpful hint
1439
  if self._llmlingua_status == "enabled":
1440
  logger.info(
 
1503
  m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
1504
  ) * 100
1505
  logger.info(f"Token savings: {savings_pct:.1f}%")
 
 
1506
  if m.latency_count > 0:
1507
  avg_latency = m.latency_sum_ms / m.latency_count
1508
  logger.info(f"Avg latency: {avg_latency:.0f}ms")
 
1754
 
1755
  # Apply optimization
1756
  transforms_applied = []
1757
+ pipeline_timing: dict[str, float] = {}
1758
+ waste_signals_dict: dict[str, int] | None = None
1759
  optimized_messages = messages
1760
  optimized_tokens = original_tokens
1761
 
 
1774
  if result.messages != messages:
1775
  optimized_messages = result.messages
1776
  transforms_applied = result.transforms_applied
1777
+ pipeline_timing = result.timing
1778
  # Use pipeline's token counts for consistency with pipeline logs
1779
  original_tokens = result.tokens_before
1780
  optimized_tokens = result.tokens_after
1781
+ if result.waste_signals:
1782
+ waste_signals_dict = result.waste_signals.to_dict()
1783
  except Exception as e:
1784
  logger.warning(f"Optimization failed: {e}")
1785
 
1786
+ tokens_saved = max(0, original_tokens - optimized_tokens)
1787
  optimization_latency = (time.time() - start_time) * 1000
1788
 
1789
  # Hook: post_compress — let hooks observe compression results
 
1965
  transforms_applied,
1966
  tags,
1967
  optimization_latency,
1968
+ pipeline_timing=pipeline_timing,
1969
  )
1970
  else:
1971
  backend_response = await self.anthropic_backend.send_message(body, headers)
 
1990
  latency_ms=total_latency,
1991
  cached=False,
1992
  overhead_ms=optimization_latency,
1993
+ pipeline_timing=pipeline_timing,
1994
  )
1995
 
 
 
1996
  if self.cost_tracker:
1997
+ self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
 
 
 
 
 
 
 
 
 
 
1998
 
1999
  # Log request
2000
  if self.logger:
 
2011
  savings_percent=(tokens_saved / original_tokens * 100)
2012
  if original_tokens > 0
2013
  else 0,
 
 
2014
  optimization_latency_ms=optimization_latency,
2015
  total_latency_ms=total_latency,
2016
  tags=tags,
 
2055
  tags,
2056
  optimization_latency,
2057
  memory_user_id=memory_user_id,
2058
+ pipeline_timing=pipeline_timing,
2059
  )
2060
  else:
2061
  response = await self._retry_request("POST", url, headers, body)
 
2221
 
2222
  total_latency = (time.time() - start_time) * 1000
2223
 
2224
+ # Parse response for output token count
 
2225
  output_tokens = 0
 
 
2226
  if resp_json:
2227
  usage = resp_json.get("usage", {})
 
2228
  output_tokens = usage.get("output_tokens", 0)
2229
+
 
 
 
 
 
 
 
 
 
2230
  if self.cost_tracker:
2231
+ self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2232
 
2233
  # Cache response
2234
  if self.cache and response.status_code == 200:
 
2240
  tokens_saved=tokens_saved,
2241
  )
2242
 
2243
+ # Record metrics use optimized_tokens (what we sent), not API's
2244
+ # input_tokens which is just the non-cached portion with prompt caching
2245
  await self.metrics.record_request(
2246
  provider="anthropic",
2247
  model=model,
2248
+ input_tokens=optimized_tokens,
2249
  output_tokens=output_tokens,
2250
  tokens_saved=tokens_saved,
2251
  latency_ms=total_latency,
 
 
2252
  overhead_ms=optimization_latency,
2253
+ pipeline_timing=pipeline_timing,
2254
+ waste_signals=waste_signals_dict,
2255
  )
2256
 
2257
  # Log request
 
2269
  savings_percent=(tokens_saved / original_tokens * 100)
2270
  if original_tokens > 0
2271
  else 0,
 
 
2272
  optimization_latency_ms=optimization_latency,
2273
  total_latency_ms=total_latency,
2274
  tags=tags,
2275
  cache_hit=cache_hit,
2276
  transforms_applied=transforms_applied,
2277
+ waste_signals=waste_signals_dict,
2278
  request_messages=messages if self.config.log_full_messages else None,
2279
  )
2280
  )
 
2285
  cr = resp_usage.get("cache_read_input_tokens", 0)
2286
  cw = resp_usage.get("cache_creation_input_tokens", 0)
2287
  chp = round(cr / (cr + cw) * 100) if (cr + cw) > 0 else 0
2288
+ timing_str = (
2289
+ " ".join(f"{k}={v:.0f}ms" for k, v in pipeline_timing.items())
2290
+ if pipeline_timing
2291
+ else ""
2292
+ )
2293
  logger.info(
2294
  f"[{request_id}] PERF "
2295
  f"model={model} msgs={num_msgs} "
 
2298
  f"cache_read={cr} cache_write={cw} cache_hit_pct={chp} "
2299
  f"opt_ms={optimization_latency:.0f} "
2300
  f"transforms={_summarize_transforms(transforms_applied)}"
2301
+ f"{' timing=' + timing_str if timing_str else ''}"
2302
  )
2303
 
2304
  # Remove compression headers since httpx already decompressed the response
 
2423
  total_optimized_tokens = 0
2424
  total_tokens_saved = 0
2425
  compressed_requests = []
2426
+ pipeline_timing: dict[str, float] = {}
2427
 
2428
  # Apply compression to each request in the batch
2429
  for batch_req in requests_list:
 
2447
  )
2448
 
2449
  optimized_messages = result.messages
2450
+ pipeline_timing = result.timing
2451
  # Use pipeline's token counts for consistency with pipeline logs
2452
  original_tokens = result.tokens_before
2453
  optimized_tokens = result.tokens_after
2454
  total_original_tokens += original_tokens
2455
  total_optimized_tokens += optimized_tokens
2456
+ tokens_saved = max(0, original_tokens - optimized_tokens)
2457
  total_tokens_saved += tokens_saved
2458
 
2459
  # CCR Tool Injection: Inject retrieval tool if compression occurred
 
2517
  output_tokens=0,
2518
  tokens_saved=total_tokens_saved,
2519
  latency_ms=optimization_latency,
2520
+ overhead_ms=optimization_latency,
2521
+ pipeline_timing=pipeline_timing,
2522
  )
2523
 
2524
  # Log compression stats
 
2857
  total_optimized_tokens = 0
2858
  total_tokens_saved = 0
2859
  compressed_requests = []
2860
+ pipeline_timing: dict[str, float] = {}
2861
 
2862
  # Apply compression to each request in the batch
2863
  for idx, batch_req in enumerate(requests_list):
 
2898
  )
2899
 
2900
  optimized_messages = result.messages
2901
+ pipeline_timing = result.timing
2902
  # Use pipeline's token counts for consistency with pipeline logs
2903
  original_tokens = result.tokens_before
2904
  optimized_tokens = result.tokens_after
2905
  total_original_tokens += original_tokens
2906
  total_optimized_tokens += optimized_tokens
2907
+ tokens_saved = max(0, original_tokens - optimized_tokens)
2908
  total_tokens_saved += tokens_saved
2909
 
2910
  # CCR Tool Injection: Inject retrieval tool if compression occurred
 
2995
  output_tokens=0,
2996
  tokens_saved=total_tokens_saved,
2997
  latency_ms=optimization_latency,
2998
+ overhead_ms=optimization_latency,
2999
+ pipeline_timing=pipeline_timing,
3000
  )
3001
 
3002
  # Log compression stats
 
3701
  tags: dict[str, str],
3702
  optimization_latency: float,
3703
  memory_user_id: str | None = None,
3704
+ pipeline_timing: dict[str, float] | None = None,
3705
  ) -> StreamingResponse:
3706
  """Stream response with metrics tracking and memory tool handling.
3707
 
 
3724
  "cache_creation_input_tokens": 0,
3725
  "total_bytes": 0,
3726
  "sse_buffer": "", # Buffer for incomplete SSE events
3727
+ "ttfb_ms": None, # Time to first byte from upstream
3728
  }
3729
 
3730
  # Track if we need to handle memory tools
 
3746
  "POST", url, json=body, headers=headers
3747
  ) as response:
3748
  async for chunk in response.aiter_bytes():
3749
+ # Record TTFB on first chunk
3750
+ if stream_state["ttfb_ms"] is None:
3751
+ stream_state["ttfb_ms"] = (time.time() - start_time) * 1000
3752
+
3753
  stream_state["total_bytes"] += len(chunk)
3754
 
3755
  # Buffer SSE data to handle chunks split across calls
 
3911
  f"[{request_id}] No usage in stream, estimated {output_tokens} output tokens"
3912
  )
3913
 
3914
+ # Use optimized_tokens for dashboard metrics (what we actually sent).
3915
+ # API's input_tokens is the non-cached portion only, which is
3916
+ # misleading for aggregation (often just 1 with prompt caching).
 
 
 
3917
  cache_read_tokens = stream_state["cache_read_input_tokens"]
3918
  cache_write_tokens = stream_state["cache_creation_input_tokens"]
3919
 
 
3935
  f"transforms={_summarize_transforms(transforms_applied)}"
3936
  )
3937
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3938
  if self.cost_tracker:
3939
+ self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3940
 
3941
  await self.metrics.record_request(
3942
  provider=provider,
3943
  model=model,
3944
+ input_tokens=optimized_tokens, # What we sent, not API's non-cached count
3945
  output_tokens=output_tokens,
3946
  tokens_saved=tokens_saved,
3947
  latency_ms=total_latency,
3948
+ overhead_ms=optimization_latency,
3949
+ ttfb_ms=stream_state["ttfb_ms"] or 0,
3950
+ pipeline_timing=pipeline_timing,
3951
  )
3952
 
3953
  return StreamingResponse(
 
3968
  transforms_applied: list[str],
3969
  tags: dict[str, str],
3970
  optimization_latency: float,
3971
+ pipeline_timing: dict[str, float] | None = None,
3972
  ) -> StreamingResponse:
3973
  """Stream response from Bedrock backend with metrics tracking.
3974
 
 
3980
  stream_state: dict[str, Any] = {
3981
  "input_tokens": 0,
3982
  "output_tokens": 0,
3983
+ "ttfb_ms": None,
3984
  }
3985
 
3986
  async def generate():
 
3988
  assert self.anthropic_backend is not None
3989
 
3990
  async for event in self.anthropic_backend.stream_message(body, headers):
3991
+ # Record TTFB on first event
3992
+ if stream_state["ttfb_ms"] is None:
3993
+ stream_state["ttfb_ms"] = (time.time() - start_time) * 1000
3994
+
3995
  # Format as SSE
3996
  if event.raw_sse:
3997
  yield event.raw_sse.encode()
 
4038
  latency_ms=total_latency,
4039
  cached=False,
4040
  overhead_ms=optimization_latency,
4041
+ ttfb_ms=stream_state["ttfb_ms"] or 0,
4042
+ pipeline_timing=pipeline_timing,
4043
  )
4044
 
 
 
4045
  if self.cost_tracker:
4046
+ self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
 
 
 
 
 
 
 
 
 
 
4047
 
4048
  # Log request
4049
  if self.logger:
 
4060
  savings_percent=(tokens_saved / original_tokens * 100)
4061
  if original_tokens > 0
4062
  else 0,
 
 
4063
  optimization_latency_ms=optimization_latency,
4064
  total_latency_ms=total_latency,
4065
  tags=tags,
 
4196
 
4197
  # Optimization
4198
  transforms_applied = []
4199
+ pipeline_timing: dict[str, float] = {}
4200
+ waste_signals_dict: dict[str, int] | None = None
4201
  optimized_messages = messages
4202
  optimized_tokens = original_tokens
4203
 
 
4213
  if result.messages != messages:
4214
  optimized_messages = result.messages
4215
  transforms_applied = result.transforms_applied
4216
+ pipeline_timing = result.timing
4217
  original_tokens = result.tokens_before
4218
  optimized_tokens = result.tokens_after
4219
+ if result.waste_signals:
4220
+ waste_signals_dict = result.waste_signals.to_dict()
4221
  except Exception as e:
4222
  logger.warning(f"Optimization failed: {e}")
4223
 
4224
+ tokens_saved = max(0, original_tokens - optimized_tokens)
4225
  optimization_latency = (time.time() - start_time) * 1000
4226
 
4227
  # Hook: post_compress
 
4306
  latency_ms=total_latency,
4307
  cached=False,
4308
  overhead_ms=optimization_latency,
4309
+ pipeline_timing=pipeline_timing,
4310
  )
4311
 
4312
  if tokens_saved > 0:
 
4357
  transforms_applied,
4358
  tags,
4359
  optimization_latency,
4360
+ pipeline_timing=pipeline_timing,
4361
  )
4362
  else:
4363
  response = await self._retry_request("POST", url, headers, body)
 
4380
  f"[{request_id}] Failed to extract cached tokens from OpenAI response: {e}"
4381
  )
4382
 
 
 
 
 
 
 
4383
  if self.cost_tracker:
4384
+ self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4385
 
4386
  # Cache
4387
  if self.cache and response.status_code == 200:
 
4389
  messages, model, response.content, dict(response.headers), tokens_saved
4390
  )
4391
 
 
4392
  await self.metrics.record_request(
4393
  provider="openai",
4394
  model=model,
 
4396
  output_tokens=output_tokens,
4397
  tokens_saved=tokens_saved,
4398
  latency_ms=total_latency,
4399
+ overhead_ms=optimization_latency,
4400
+ pipeline_timing=pipeline_timing,
4401
+ waste_signals=waste_signals_dict,
4402
  )
4403
 
4404
  if tokens_saved > 0:
 
4494
  tokens_saved=0,
4495
  latency_ms=latency_ms,
4496
  cached=False,
 
 
4497
  )
4498
 
4499
  return Response(
 
5140
 
5141
  total_input_tokens = original_tokens # fallback
5142
  output_tokens = 0
 
5143
  try:
5144
  resp_json = response.json()
5145
  usage = resp_json.get("usage", {})
5146
  total_input_tokens = usage.get("input_tokens", original_tokens)
5147
  output_tokens = usage.get("output_tokens", 0)
 
 
 
 
 
5148
  except (KeyError, TypeError, AttributeError) as e:
5149
  logger.debug(
5150
  f"[{request_id}] Failed to extract cached tokens from OpenAI passthrough response: {e}"
5151
  )
5152
 
 
 
 
 
 
 
5153
  if self.cost_tracker:
5154
+ self.cost_tracker.record_tokens(model, tokens_saved, total_input_tokens)
 
 
 
 
 
 
 
5155
 
 
5156
  await self.metrics.record_request(
5157
  provider="openai",
5158
  model=model,
 
5160
  output_tokens=output_tokens,
5161
  tokens_saved=tokens_saved,
5162
  latency_ms=total_latency,
5163
+ overhead_ms=optimization_latency,
 
5164
  )
5165
 
5166
  logger.info(f"[{request_id}] /v1/responses {model}: {total_input_tokens:,} tokens")
 
5306
 
5307
  # Optimization
5308
  transforms_applied: list[str] = []
5309
+ waste_signals_dict: dict[str, int] | None = None
5310
  optimized_messages = messages
5311
  optimized_tokens = original_tokens
5312
 
 
5325
  # Use pipeline's token counts for consistency with pipeline logs
5326
  original_tokens = result.tokens_before
5327
  optimized_tokens = result.tokens_after
5328
+ if result.waste_signals:
5329
+ waste_signals_dict = result.waste_signals.to_dict()
5330
  except Exception as e:
5331
  logger.warning(f"[{request_id}] Gemini optimization failed: {e}")
5332
 
5333
+ tokens_saved = max(0, original_tokens - optimized_tokens)
5334
  optimization_latency = (time.time() - start_time) * 1000
5335
 
5336
  # Query Echo: re-inject user's question after compressed tool outputs
 
5412
  f"[{request_id}] Failed to extract cached tokens from Gemini response: {e}"
5413
  )
5414
 
 
 
 
 
 
 
5415
  if self.cost_tracker:
5416
+ self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5417
 
 
5418
  await self.metrics.record_request(
5419
  provider="gemini",
5420
  model=model,
 
5422
  output_tokens=output_tokens,
5423
  tokens_saved=tokens_saved,
5424
  latency_ms=total_latency,
5425
+ overhead_ms=optimization_latency,
5426
+ waste_signals=waste_signals_dict,
5427
  )
5428
 
5429
  if tokens_saved > 0:
 
5653
  logger.debug(f"[{request_id}] Failed to parse Gemini token count response: {e}")
5654
 
5655
  # Track stats
5656
+ tokens_saved = max(0, original_tokens - compressed_tokens) if compressed_tokens > 0 else 0
5657
 
5658
  await self.metrics.record_request(
5659
  provider="gemini",
 
5662
  output_tokens=0,
5663
  tokens_saved=tokens_saved,
5664
  latency_ms=total_latency,
 
 
5665
  )
5666
 
5667
  if tokens_saved > 0:
 
5843
  )
5844
  max_latency_ms = round(m.latency_max_ms, 2) if m.latency_count > 0 else 0
5845
 
5846
+ # Calculate Headroom overhead (optimization time only, excludes pass-through requests)
5847
  avg_overhead_ms = (
5848
+ round(m.overhead_sum_ms / m.overhead_count, 2) if m.overhead_count > 0 else 0
5849
  )
5850
  min_overhead_ms = (
5851
  round(m.overhead_min_ms, 2)
5852
+ if m.overhead_count > 0 and m.overhead_min_ms != float("inf")
5853
  else 0
5854
  )
5855
+ max_overhead_ms = round(m.overhead_max_ms, 2) if m.overhead_count > 0 else 0
5856
+
5857
+ # Calculate TTFB (time to first byte)
5858
+ avg_ttfb_ms = round(m.ttfb_sum_ms / m.ttfb_count, 2) if m.ttfb_count > 0 else 0
5859
+ min_ttfb_ms = (
5860
+ round(m.ttfb_min_ms, 2) if m.ttfb_count > 0 and m.ttfb_min_ms != float("inf") else 0
5861
+ )
5862
+ max_ttfb_ms = round(m.ttfb_max_ms, 2) if m.ttfb_count > 0 else 0
5863
 
5864
  # Get compression store stats
5865
  store = get_compression_store()
 
5908
  "min_ms": min_overhead_ms,
5909
  "max_ms": max_overhead_ms,
5910
  },
5911
+ "ttfb": {
5912
+ "average_ms": avg_ttfb_ms,
5913
+ "min_ms": min_ttfb_ms,
5914
+ "max_ms": max_ttfb_ms,
5915
+ },
5916
+ "pipeline_timing": {
5917
+ name: {
5918
+ "average_ms": round(
5919
+ m.transform_timing_sum[name] / m.transform_timing_count[name], 2
5920
+ ),
5921
+ "max_ms": round(m.transform_timing_max[name], 2),
5922
+ "count": m.transform_timing_count[name],
5923
+ }
5924
+ for name in sorted(m.transform_timing_sum.keys())
5925
+ }
5926
+ if m.transform_timing_sum
5927
+ else {},
5928
+ "waste_signals": dict(m.waste_signals_total) if m.waste_signals_total else {},
5929
+ "savings_history": m.savings_history[-100:], # Last 100 data points
5930
  "cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
5931
  "compression": {
5932
  "ccr_entries": compression_stats.get("entry_count", 0),
headroom/transforms/code_compressor.py CHANGED
@@ -867,6 +867,25 @@ class CodeAwareCompressor(Transform):
867
 
868
  ratio = compressed_tokens / max(original_tokens, 1)
869
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
870
  # Store in CCR if significant compression
871
  cache_key = None
872
  if self.config.enable_ccr and ratio < 0.8:
 
867
 
868
  ratio = compressed_tokens / max(original_tokens, 1)
869
 
870
+ # Guard against over-aggressive compression (data loss).
871
+ # If AST extraction stripped content to <5% of original,
872
+ # the output is essentially empty — return original.
873
+ if ratio < 0.05:
874
+ logger.warning(
875
+ "Code compression too aggressive (ratio=%.3f), returning original",
876
+ ratio,
877
+ )
878
+ return CodeCompressionResult(
879
+ compressed=code,
880
+ original=code,
881
+ original_tokens=original_tokens,
882
+ compressed_tokens=original_tokens,
883
+ compression_ratio=1.0,
884
+ language=detected_lang,
885
+ language_confidence=confidence,
886
+ syntax_valid=True,
887
+ )
888
+
889
  # Store in CCR if significant compression
890
  cache_key = None
891
  if self.config.enable_ccr and ratio < 0.8:
headroom/transforms/content_router.py CHANGED
@@ -38,6 +38,7 @@ from __future__ import annotations
38
  import hashlib
39
  import logging
40
  import re
 
41
  from dataclasses import dataclass, field
42
  from enum import Enum
43
  from typing import Any
@@ -132,6 +133,112 @@ def _create_content_signature(
132
  return None
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  class CompressionStrategy(Enum):
136
  """Available compression strategies."""
137
 
@@ -533,6 +640,8 @@ class ContentRouter(Transform):
533
  # TOIN integration for cross-strategy learning
534
  self._toin: Any = None
535
 
 
 
536
  def _record_to_toin(
537
  self,
538
  strategy: CompressionStrategy,
@@ -1042,6 +1151,25 @@ class ContentRouter(Transform):
1042
  logger.debug("HTMLExtractor not available (install trafilatura)")
1043
  return self._html_extractor
1044
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1045
  def _get_llmlingua(self) -> Any:
1046
  """Get LLMLinguaCompressor (lazy load)."""
1047
  if self._llmlingua is None:
@@ -1269,6 +1397,7 @@ class ContentRouter(Transform):
1269
  transformed_messages: list[dict[str, Any]] = []
1270
  transforms_applied: list[str] = []
1271
  warnings: list[str] = []
 
1272
 
1273
  # Routing reason counters for summary logging
1274
  route_counts: dict[str, int] = {
@@ -1309,6 +1438,7 @@ class ContentRouter(Transform):
1309
  min_ratio=min_ratio,
1310
  read_protection_window=read_protection_window,
1311
  messages_from_end=messages_from_end,
 
1312
  )
1313
  transformed_messages.append(transformed_message)
1314
  route_counts["content_blocks"] += 1
@@ -1375,14 +1505,70 @@ class ContentRouter(Transform):
1375
  route_counts["analysis_ctx"] += 1
1376
  continue
1377
 
 
 
 
 
 
 
 
 
 
 
1378
  # Route and compress based on content detection
1379
  # Merge tool-specific bias with hook-provided bias (multiplicative)
1380
  msg_bias = bias if role == "tool" else 1.0
1381
  if i in hook_biases:
1382
  msg_bias *= hook_biases[i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1383
  result = self.compress(content, context=context, bias=msg_bias)
 
 
 
1384
 
1385
  if result.compression_ratio < min_ratio:
 
 
 
 
 
 
 
1386
  transformed_messages.append({**message, "content": result.compressed})
1387
  transforms_applied.append(
1388
  f"router:{result.strategy_used.value}:{result.compression_ratio:.2f}"
@@ -1391,6 +1577,8 @@ class ContentRouter(Transform):
1391
  f"{result.strategy_used.value}:{result.compression_ratio:.2f}"
1392
  )
1393
  else:
 
 
1394
  transformed_messages.append(message)
1395
  route_counts["ratio_too_high"] += 1
1396
 
@@ -1398,6 +1586,14 @@ class ContentRouter(Transform):
1398
  tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
1399
  )
1400
 
 
 
 
 
 
 
 
 
1401
  # Log routing summary
1402
  parts = []
1403
  if compressed_details:
@@ -1412,12 +1608,24 @@ class ContentRouter(Transform):
1412
  parts.append(f"{route_counts['recent_code']} protected (recent code)")
1413
  if route_counts["analysis_ctx"]:
1414
  parts.append(f"{route_counts['analysis_ctx']} protected (analysis ctx)")
 
 
1415
  if route_counts["ratio_too_high"]:
1416
  parts.append(f"{route_counts['ratio_too_high']} unchanged (ratio>={min_ratio:.2f})")
1417
  if route_counts["content_blocks"]:
1418
  parts.append(f"{route_counts['content_blocks']} content-block msgs")
1419
  if route_counts["non_string"]:
1420
  parts.append(f"{route_counts['non_string']} non-string")
 
 
 
 
 
 
 
 
 
 
1421
  if parts:
1422
  logger.info(
1423
  "content_router: %d msgs — %s",
@@ -1433,6 +1641,7 @@ class ContentRouter(Transform):
1433
  transforms_applied=all_transforms if all_transforms else ["router:noop"],
1434
  markers_inserted=lifecycle_ccr_hashes,
1435
  warnings=warnings,
 
1436
  )
1437
 
1438
  def _get_tool_bias(self, tool_name: str) -> float:
@@ -1469,6 +1678,7 @@ class ContentRouter(Transform):
1469
  min_ratio: float = 0.85,
1470
  read_protection_window: int = 8,
1471
  messages_from_end: int = 0,
 
1472
  ) -> dict[str, Any]:
1473
  """Process content blocks (Anthropic format) for tool_result compression.
1474
 
@@ -1523,9 +1733,70 @@ class ContentRouter(Transform):
1523
 
1524
  # Only process string content
1525
  if isinstance(tool_content, str) and len(tool_content) > 500:
1526
- # Compress using content detection (will auto-detect JSON arrays, etc.)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1527
  result = self.compress(tool_content, context=context, bias=bias)
 
 
 
 
1528
  if result.compression_ratio < min_ratio:
 
 
 
 
 
 
 
1529
  new_blocks.append({**block, "content": result.compressed})
1530
  transforms_applied.append(
1531
  f"router:tool_result:{result.strategy_used.value}"
@@ -1537,6 +1808,8 @@ class ContentRouter(Transform):
1537
  any_compressed = True
1538
  continue
1539
  else:
 
 
1540
  if route_counts is not None:
1541
  route_counts["ratio_too_high"] += 1
1542
  else:
 
38
  import hashlib
39
  import logging
40
  import re
41
+ import time
42
  from dataclasses import dataclass, field
43
  from enum import Enum
44
  from typing import Any
 
133
  return None
134
 
135
 
136
+ class CompressionCache:
137
+ """Two-tier compression cache with TTL.
138
+
139
+ Tier 1 (skip set): content hashes that won't compress — instant skip,
140
+ near-zero memory (just ints in a set).
141
+
142
+ Tier 2 (result cache): compressed results for content that DID compress —
143
+ reuse the compressed text on subsequent requests.
144
+
145
+ Entries expire after TTL (default 30min). No max-entries cap — TTL is the
146
+ natural bound. Memory grows proportional to compressible content × TTL,
147
+ which is bounded by session duration.
148
+
149
+ Uses in-process dict for ultra-fast lookups (~100ns). Could be backed
150
+ by memcached/Redis for multi-process deployments.
151
+ """
152
+
153
+ def __init__(self, ttl_seconds: int = 1800):
154
+ # Tier 2: compressed results {hash: (text, ratio, strategy, timestamp)}
155
+ self._results: dict[int, tuple[str, float, str, float]] = {}
156
+ # Tier 1: hashes of content that won't compress {hash: timestamp}
157
+ self._skip: dict[int, float] = {}
158
+ self._ttl_seconds = ttl_seconds
159
+ # Metrics
160
+ self._hits = 0
161
+ self._misses = 0
162
+ self._skip_hits = 0
163
+ self._evictions = 0
164
+ self._total_lookup_ns = 0
165
+ self._lookup_count = 0
166
+
167
+ def get(self, key: int) -> tuple[str, float, str] | None:
168
+ """Get cached compression result.
169
+
170
+ Returns (compressed_text, ratio, strategy) or None if not found/expired.
171
+ Use is_skipped() first to check if content is known non-compressible.
172
+ """
173
+ t0 = time.perf_counter_ns()
174
+ entry = self._results.get(key)
175
+ if entry is not None:
176
+ compressed, ratio, strategy, created_at = entry
177
+ if (time.time() - created_at) < self._ttl_seconds:
178
+ self._hits += 1
179
+ self._total_lookup_ns += time.perf_counter_ns() - t0
180
+ self._lookup_count += 1
181
+ return (compressed, ratio, strategy)
182
+ else:
183
+ del self._results[key]
184
+ self._evictions += 1
185
+ self._misses += 1
186
+ self._total_lookup_ns += time.perf_counter_ns() - t0
187
+ self._lookup_count += 1
188
+ return None
189
+
190
+ def is_skipped(self, key: int) -> bool:
191
+ """Check if content is known non-compressible (Tier 1)."""
192
+ ts = self._skip.get(key)
193
+ if ts is not None:
194
+ if (time.time() - ts) < self._ttl_seconds:
195
+ self._skip_hits += 1
196
+ return True
197
+ else:
198
+ del self._skip[key]
199
+ self._evictions += 1
200
+ return False
201
+
202
+ def put(self, key: int, compressed: str, ratio: float, strategy: str) -> None:
203
+ """Store a compressed result (Tier 2)."""
204
+ self._results[key] = (compressed, ratio, strategy, time.time())
205
+
206
+ def mark_skip(self, key: int) -> None:
207
+ """Mark content as non-compressible (Tier 1)."""
208
+ self._skip[key] = time.time()
209
+
210
+ def move_to_skip(self, key: int) -> None:
211
+ """Move a result to skip set (threshold tightened, no longer qualifies)."""
212
+ self._results.pop(key, None)
213
+ self._skip[key] = time.time()
214
+
215
+ @property
216
+ def size(self) -> int:
217
+ return len(self._results)
218
+
219
+ @property
220
+ def skip_size(self) -> int:
221
+ return len(self._skip)
222
+
223
+ @property
224
+ def stats(self) -> dict[str, int | float]:
225
+ avg_ns = self._total_lookup_ns / self._lookup_count if self._lookup_count else 0
226
+ return {
227
+ "cache_hits": self._hits,
228
+ "cache_skip_hits": self._skip_hits,
229
+ "cache_misses": self._misses,
230
+ "cache_evictions": self._evictions,
231
+ "cache_size": len(self._results),
232
+ "cache_skip_size": len(self._skip),
233
+ "cache_avg_lookup_ns": avg_ns,
234
+ }
235
+
236
+ def clear(self) -> None:
237
+ """Clear all entries (e.g., on session end)."""
238
+ self._results.clear()
239
+ self._skip.clear()
240
+
241
+
242
  class CompressionStrategy(Enum):
243
  """Available compression strategies."""
244
 
 
640
  # TOIN integration for cross-strategy learning
641
  self._toin: Any = None
642
 
643
+ self._cache = CompressionCache()
644
+
645
  def _record_to_toin(
646
  self,
647
  strategy: CompressionStrategy,
 
1151
  logger.debug("HTMLExtractor not available (install trafilatura)")
1152
  return self._html_extractor
1153
 
1154
def eager_load_compressors(self) -> None:
    """Warm up compressors ahead of the first request.

    Intended to be called during proxy startup so the LLMLingua model load
    (~5s) does not land on the first request. Does nothing when LLMLingua is
    disabled in the config or when no compressor instance is available.
    Any failure while loading is logged as a warning and swallowed, so
    startup continues without the pre-loaded model.
    """
    if not self.config.enable_llmlingua:
        return

    llmlingua = self._get_llmlingua()
    if not llmlingua:
        return

    # Best-effort: resolving the device and fetching the underlying model
    # is what actually triggers the load.
    try:
        from .llmlingua_compressor import _get_llmlingua_compressor

        target_device = llmlingua._resolve_device()
        _get_llmlingua_compressor(llmlingua.config.model_name, target_device)
    except Exception as e:
        logger.warning("Failed to pre-load LLMLingua model: %s", e)
    else:
        logger.info("LLMLingua model pre-loaded at startup")
1172
+
1173
  def _get_llmlingua(self) -> Any:
1174
  """Get LLMLinguaCompressor (lazy load)."""
1175
  if self._llmlingua is None:
 
1397
  transformed_messages: list[dict[str, Any]] = []
1398
  transforms_applied: list[str] = []
1399
  warnings: list[str] = []
1400
+ compressor_timing: dict[str, float] = {} # strategy → cumulative ms
1401
 
1402
  # Routing reason counters for summary logging
1403
  route_counts: dict[str, int] = {
 
1438
  min_ratio=min_ratio,
1439
  read_protection_window=read_protection_window,
1440
  messages_from_end=messages_from_end,
1441
+ compressor_timing=compressor_timing,
1442
  )
1443
  transformed_messages.append(transformed_message)
1444
  route_counts["content_blocks"] += 1
 
1505
  route_counts["analysis_ctx"] += 1
1506
  continue
1507
 
1508
+ # Compression pinning: if this message was already compressed
1509
+ # (contains a CCR retrieval marker), skip recompression.
1510
+ # Recompressing would change byte content and break provider
1511
+ # prefix caching with no meaningful further reduction.
1512
+ if "Retrieve more: hash=" in content or "Retrieve original: hash=" in content:
1513
+ transformed_messages.append(message)
1514
+ route_counts.setdefault("already_compressed", 0)
1515
+ route_counts["already_compressed"] += 1
1516
+ continue
1517
+
1518
  # Route and compress based on content detection
1519
  # Merge tool-specific bias with hook-provided bias (multiplicative)
1520
  msg_bias = bias if role == "tool" else 1.0
1521
  if i in hook_biases:
1522
  msg_bias *= hook_biases[i]
1523
+
1524
+ # Two-tier compression cache.
1525
+ # Tier 1 (skip): known won't-compress → instant skip.
1526
+ # Tier 2 (result): known compresses → reuse compressed text.
1527
+ content_key = hash(content)
1528
+
1529
+ # Tier 1: skip set — instant rejection
1530
+ if self._cache.is_skipped(content_key):
1531
+ transformed_messages.append(message)
1532
+ route_counts["ratio_too_high"] += 1
1533
+ route_counts.setdefault("cache_hit", 0)
1534
+ route_counts["cache_hit"] += 1
1535
+ continue
1536
+
1537
+ # Tier 2: result cache — reuse compressed output
1538
+ cached = self._cache.get(content_key)
1539
+ if cached is not None:
1540
+ cached_compressed, cached_ratio, cached_strategy = cached
1541
+ # Re-check ratio against current min_ratio (shifts with context pressure)
1542
+ if cached_ratio < min_ratio:
1543
+ transformed_messages.append({**message, "content": cached_compressed})
1544
+ transforms_applied.append(f"router:{cached_strategy}:{cached_ratio:.2f}")
1545
+ compressed_details.append(f"{cached_strategy}:{cached_ratio:.2f}")
1546
+ else:
1547
+ # Threshold tightened — no longer qualifies. Move to skip.
1548
+ self._cache.move_to_skip(content_key)
1549
+ transformed_messages.append(message)
1550
+ route_counts["ratio_too_high"] += 1
1551
+ route_counts.setdefault("cache_hit", 0)
1552
+ route_counts["cache_hit"] += 1
1553
+ continue
1554
+
1555
+ # Cache miss — run full compression
1556
+ route_counts.setdefault("cache_miss", 0)
1557
+ route_counts["cache_miss"] += 1
1558
+ t0 = time.perf_counter()
1559
  result = self.compress(content, context=context, bias=msg_bias)
1560
+ compress_ms = (time.perf_counter() - t0) * 1000
1561
+ strategy_key = f"compressor:{result.strategy_used.value}"
1562
+ compressor_timing[strategy_key] = compressor_timing.get(strategy_key, 0.0) + compress_ms
1563
 
1564
  if result.compression_ratio < min_ratio:
1565
+ # Compressed — store in result cache
1566
+ self._cache.put(
1567
+ content_key,
1568
+ result.compressed,
1569
+ result.compression_ratio,
1570
+ result.strategy_used.value,
1571
+ )
1572
  transformed_messages.append({**message, "content": result.compressed})
1573
  transforms_applied.append(
1574
  f"router:{result.strategy_used.value}:{result.compression_ratio:.2f}"
 
1577
  f"{result.strategy_used.value}:{result.compression_ratio:.2f}"
1578
  )
1579
  else:
1580
+ # Didn't compress — add to skip set
1581
+ self._cache.mark_skip(content_key)
1582
  transformed_messages.append(message)
1583
  route_counts["ratio_too_high"] += 1
1584
 
 
1586
  tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
1587
  )
1588
 
1589
+ # Add cache performance metrics to timing
1590
+ cache_stats = self._cache.stats
1591
+ compressor_timing["cache_hits"] = float(cache_stats["cache_hits"])
1592
+ compressor_timing["cache_skip_hits"] = float(cache_stats["cache_skip_hits"])
1593
+ compressor_timing["cache_size"] = float(cache_stats["cache_size"])
1594
+ compressor_timing["cache_skip_size"] = float(cache_stats["cache_skip_size"])
1595
+ compressor_timing["cache_avg_lookup_ns"] = cache_stats["cache_avg_lookup_ns"]
1596
+
1597
  # Log routing summary
1598
  parts = []
1599
  if compressed_details:
 
1608
  parts.append(f"{route_counts['recent_code']} protected (recent code)")
1609
  if route_counts["analysis_ctx"]:
1610
  parts.append(f"{route_counts['analysis_ctx']} protected (analysis ctx)")
1611
+ if route_counts.get("already_compressed"):
1612
+ parts.append(f"{route_counts['already_compressed']} pinned (already compressed)")
1613
  if route_counts["ratio_too_high"]:
1614
  parts.append(f"{route_counts['ratio_too_high']} unchanged (ratio>={min_ratio:.2f})")
1615
  if route_counts["content_blocks"]:
1616
  parts.append(f"{route_counts['content_blocks']} content-block msgs")
1617
  if route_counts["non_string"]:
1618
  parts.append(f"{route_counts['non_string']} non-string")
1619
+ if route_counts.get("cache_hit"):
1620
+ parts.append(f"{route_counts['cache_hit']} cache hits")
1621
+ if route_counts.get("cache_miss"):
1622
+ parts.append(f"{route_counts['cache_miss']} cache misses")
1623
+ cs = self._cache.stats
1624
+ if cs["cache_size"] > 0 or cs["cache_skip_size"] > 0:
1625
+ parts.append(
1626
+ f"cache[{cs['cache_size']} results, {cs['cache_skip_size']} skips, "
1627
+ f"{cs['cache_avg_lookup_ns']:.0f}ns avg]"
1628
+ )
1629
  if parts:
1630
  logger.info(
1631
  "content_router: %d msgs — %s",
 
1641
  transforms_applied=all_transforms if all_transforms else ["router:noop"],
1642
  markers_inserted=lifecycle_ccr_hashes,
1643
  warnings=warnings,
1644
+ timing=compressor_timing,
1645
  )
1646
 
1647
  def _get_tool_bias(self, tool_name: str) -> float:
 
1678
  min_ratio: float = 0.85,
1679
  read_protection_window: int = 8,
1680
  messages_from_end: int = 0,
1681
+ compressor_timing: dict[str, float] | None = None,
1682
  ) -> dict[str, Any]:
1683
  """Process content blocks (Anthropic format) for tool_result compression.
1684
 
 
1733
 
1734
  # Only process string content
1735
  if isinstance(tool_content, str) and len(tool_content) > 500:
1736
+ # Compression pinning: skip already-compressed content
1737
+ if (
1738
+ "Retrieve more: hash=" in tool_content
1739
+ or "Retrieve original: hash=" in tool_content
1740
+ ):
1741
+ new_blocks.append(block)
1742
+ if route_counts is not None:
1743
+ route_counts.setdefault("already_compressed", 0)
1744
+ route_counts["already_compressed"] += 1
1745
+ continue
1746
+
1747
+ # Two-tier compression cache
1748
+ content_key = hash(tool_content)
1749
+
1750
+ # Tier 1: skip set — instant rejection
1751
+ if self._cache.is_skipped(content_key):
1752
+ new_blocks.append(block)
1753
+ if route_counts is not None:
1754
+ route_counts["ratio_too_high"] += 1
1755
+ route_counts.setdefault("cache_hit", 0)
1756
+ route_counts["cache_hit"] += 1
1757
+ continue
1758
+
1759
+ # Tier 2: result cache — reuse compressed output
1760
+ cached = self._cache.get(content_key)
1761
+ if cached is not None:
1762
+ cached_compressed, cached_ratio, cached_strategy = cached
1763
+ if cached_ratio < min_ratio:
1764
+ new_blocks.append({**block, "content": cached_compressed})
1765
+ transforms_applied.append(f"router:tool_result:{cached_strategy}")
1766
+ if compressed_details is not None:
1767
+ compressed_details.append(
1768
+ f"tool:{cached_strategy}:{cached_ratio:.2f}"
1769
+ )
1770
+ any_compressed = True
1771
+ else:
1772
+ # Threshold tightened — move to skip
1773
+ self._cache.move_to_skip(content_key)
1774
+ new_blocks.append(block)
1775
+ if route_counts is not None:
1776
+ route_counts["ratio_too_high"] += 1
1777
+ if route_counts is not None:
1778
+ route_counts.setdefault("cache_hit", 0)
1779
+ route_counts["cache_hit"] += 1
1780
+ continue
1781
+
1782
+ # Cache miss — run full compression
1783
+ if route_counts is not None:
1784
+ route_counts.setdefault("cache_miss", 0)
1785
+ route_counts["cache_miss"] += 1
1786
+ t0 = time.perf_counter()
1787
  result = self.compress(tool_content, context=context, bias=bias)
1788
+ compress_ms = (time.perf_counter() - t0) * 1000
1789
+ if compressor_timing is not None:
1790
+ key = f"compressor:{result.strategy_used.value}"
1791
+ compressor_timing[key] = compressor_timing.get(key, 0.0) + compress_ms
1792
  if result.compression_ratio < min_ratio:
1793
+ # Compressed — store in result cache
1794
+ self._cache.put(
1795
+ content_key,
1796
+ result.compressed,
1797
+ result.compression_ratio,
1798
+ result.strategy_used.value,
1799
+ )
1800
  new_blocks.append({**block, "content": result.compressed})
1801
  transforms_applied.append(
1802
  f"router:tool_result:{result.strategy_used.value}"
 
1808
  any_compressed = True
1809
  continue
1810
  else:
1811
+ # Didn't compress — add to skip set
1812
+ self._cache.mark_skip(content_key)
1813
  if route_counts is not None:
1814
  route_counts["ratio_too_high"] += 1
1815
  else:
headroom/transforms/pipeline.py CHANGED
@@ -3,6 +3,7 @@
3
  from __future__ import annotations
4
 
5
  import logging
 
6
  from typing import TYPE_CHECKING, Any
7
 
8
  from ..config import (
@@ -14,6 +15,7 @@ from ..config import (
14
  ToolCrusherConfig,
15
  TransformDiff,
16
  TransformResult,
 
17
  )
18
  from ..tokenizer import Tokenizer
19
  from ..utils import deep_copy_messages
@@ -188,12 +190,14 @@ class TransformPipeline:
188
  all_transforms: list[str] = []
189
  all_markers: list[str] = []
190
  all_warnings: list[str] = []
 
191
 
192
  # Track transform diffs if enabled
193
  transform_diffs: list[TransformDiff] = []
194
  generate_diff = self.config.generate_diff_artifact
195
 
196
  current_messages = deep_copy_messages(messages)
 
197
 
198
  for transform in self.transforms:
199
  # Check if transform should run
@@ -203,8 +207,10 @@ class TransformPipeline:
203
  # Track tokens before this transform (for diff)
204
  tokens_before_transform = tokenizer.count_messages(current_messages)
205
 
206
- # Apply transform
 
207
  result = transform.apply(current_messages, tokenizer, **kwargs)
 
208
 
209
  # Update messages for next transform
210
  current_messages = result.messages
@@ -216,18 +222,24 @@ class TransformPipeline:
216
  all_transforms.extend(result.transforms_applied)
217
  all_markers.extend(result.markers_inserted)
218
  all_warnings.extend(result.warnings)
 
 
 
 
 
219
 
220
  # Log transform results
221
  if result.transforms_applied:
222
  logger.info(
223
- "Transform %s: %d -> %d tokens (saved %d)",
224
  transform.name,
225
  tokens_before_transform,
226
  tokens_after_transform,
227
  tokens_before_transform - tokens_after_transform,
 
228
  )
229
  else:
230
- logger.debug("Transform %s: no changes", transform.name)
231
 
232
  # Record diff if enabled
233
  if generate_diff:
@@ -240,24 +252,29 @@ class TransformPipeline:
240
  details=", ".join(result.transforms_applied)
241
  if result.transforms_applied
242
  else "",
 
243
  )
244
  )
245
 
246
  # Final token count
247
  tokens_after = tokenizer.count_messages(current_messages)
 
 
248
 
249
  # Log pipeline summary
250
  total_saved = tokens_before - tokens_after
 
251
  if total_saved > 0:
252
  logger.info(
253
- "Pipeline complete: %d -> %d tokens (saved %d, %.1f%% reduction)",
254
  tokens_before,
255
  tokens_after,
256
  total_saved,
257
  (total_saved / tokens_before * 100) if tokens_before > 0 else 0,
 
258
  )
259
  else:
260
- logger.debug("Pipeline complete: no token savings")
261
 
262
  # Build diff artifact if enabled
263
  diff_artifact = None
@@ -270,6 +287,18 @@ class TransformPipeline:
270
  transforms=transform_diffs,
271
  )
272
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  return TransformResult(
274
  messages=current_messages,
275
  tokens_before=tokens_before,
@@ -278,6 +307,8 @@ class TransformPipeline:
278
  markers_inserted=all_markers,
279
  warnings=all_warnings,
280
  diff_artifact=diff_artifact,
 
 
281
  )
282
 
283
  def simulate(
 
3
  from __future__ import annotations
4
 
5
  import logging
6
+ import time
7
  from typing import TYPE_CHECKING, Any
8
 
9
  from ..config import (
 
15
  ToolCrusherConfig,
16
  TransformDiff,
17
  TransformResult,
18
+ WasteSignals,
19
  )
20
  from ..tokenizer import Tokenizer
21
  from ..utils import deep_copy_messages
 
190
  all_transforms: list[str] = []
191
  all_markers: list[str] = []
192
  all_warnings: list[str] = []
193
+ all_timing: dict[str, float] = {} # transform_name → ms
194
 
195
  # Track transform diffs if enabled
196
  transform_diffs: list[TransformDiff] = []
197
  generate_diff = self.config.generate_diff_artifact
198
 
199
  current_messages = deep_copy_messages(messages)
200
+ pipeline_start = time.perf_counter()
201
 
202
  for transform in self.transforms:
203
  # Check if transform should run
 
207
  # Track tokens before this transform (for diff)
208
  tokens_before_transform = tokenizer.count_messages(current_messages)
209
 
210
+ # Time the transform
211
+ t0 = time.perf_counter()
212
  result = transform.apply(current_messages, tokenizer, **kwargs)
213
+ duration_ms = (time.perf_counter() - t0) * 1000
214
 
215
  # Update messages for next transform
216
  current_messages = result.messages
 
222
  all_transforms.extend(result.transforms_applied)
223
  all_markers.extend(result.markers_inserted)
224
  all_warnings.extend(result.warnings)
225
+ all_timing[transform.name] = duration_ms
226
+
227
+ # Merge sub-transform timing (e.g. ContentRouter's per-compressor breakdown)
228
+ if result.timing:
229
+ all_timing.update(result.timing)
230
 
231
  # Log transform results
232
  if result.transforms_applied:
233
  logger.info(
234
+ "Transform %s: %d -> %d tokens (saved %d) [%.1fms]",
235
  transform.name,
236
  tokens_before_transform,
237
  tokens_after_transform,
238
  tokens_before_transform - tokens_after_transform,
239
+ duration_ms,
240
  )
241
  else:
242
+ logger.debug("Transform %s: no changes [%.1fms]", transform.name, duration_ms)
243
 
244
  # Record diff if enabled
245
  if generate_diff:
 
252
  details=", ".join(result.transforms_applied)
253
  if result.transforms_applied
254
  else "",
255
+ duration_ms=duration_ms,
256
  )
257
  )
258
 
259
  # Final token count
260
  tokens_after = tokenizer.count_messages(current_messages)
261
+ pipeline_ms = (time.perf_counter() - pipeline_start) * 1000
262
+ all_timing["pipeline_total"] = pipeline_ms
263
 
264
  # Log pipeline summary
265
  total_saved = tokens_before - tokens_after
266
+ timing_parts = " ".join(f"{k}={v:.0f}ms" for k, v in all_timing.items())
267
  if total_saved > 0:
268
  logger.info(
269
+ "Pipeline complete: %d -> %d tokens (saved %d, %.1f%% reduction) [%s]",
270
  tokens_before,
271
  tokens_after,
272
  total_saved,
273
  (total_saved / tokens_before * 100) if tokens_before > 0 else 0,
274
+ timing_parts,
275
  )
276
  else:
277
+ logger.debug("Pipeline complete: no token savings [%s]", timing_parts)
278
 
279
  # Build diff artifact if enabled
280
  diff_artifact = None
 
287
  transforms=transform_diffs,
288
  )
289
 
290
+ # Detect waste signals in original messages (only when significant compression)
291
+ waste_signals: WasteSignals | None = None
292
+ if tokens_before > tokens_after and (tokens_before - tokens_after) > 100:
293
+ try:
294
+ from ..parser import parse_messages
295
+
296
+ _, _, waste_signals = parse_messages(messages, tokenizer)
297
+ if waste_signals.total() == 0:
298
+ waste_signals = None
299
+ except Exception:
300
+ pass
301
+
302
  return TransformResult(
303
  messages=current_messages,
304
  tokens_before=tokens_before,
 
307
  markers_inserted=all_markers,
308
  warnings=all_warnings,
309
  diff_artifact=diff_artifact,
310
+ timing=all_timing,
311
+ waste_signals=waste_signals,
312
  )
313
 
314
  def simulate(
headroom/transforms/read_lifecycle.py CHANGED
@@ -50,6 +50,8 @@ class FileOperation:
50
  file_path: str
51
  operation: str # "read" | "edit" | "write"
52
  content_size: int = 0 # Size of tool_result content (for reads only)
 
 
53
 
54
 
55
  @dataclass
@@ -116,13 +118,14 @@ class ReadLifecycleManager:
116
 
117
  def _build_tool_metadata(
118
  self, messages: list[dict[str, Any]]
119
- ) -> dict[str, tuple[str, str | None]]:
120
  """Build tool_call_id → (tool_name, file_path) mapping.
121
 
122
  Scans assistant messages for tool calls, extracts name and file_path
123
  from tool inputs. Handles both OpenAI and Anthropic formats.
124
  """
125
- metadata: dict[str, tuple[str, str | None]] = {}
 
126
 
127
  for msg in messages:
128
  if msg.get("role") != "assistant":
@@ -139,12 +142,16 @@ class ReadLifecycleManager:
139
  continue
140
 
141
  file_path = None
 
 
142
  try:
143
  args = json.loads(func.get("arguments", "{}"))
144
  file_path = args.get("file_path") or args.get("path")
 
 
145
  except (json.JSONDecodeError, TypeError):
146
  pass
147
- metadata[tc_id] = (name, file_path)
148
 
149
  # Anthropic format: content blocks with type=tool_use
150
  content = msg.get("content", [])
@@ -160,16 +167,20 @@ class ReadLifecycleManager:
160
 
161
  inp = block.get("input", {})
162
  file_path = None
 
 
163
  if isinstance(inp, dict):
164
  file_path = inp.get("file_path") or inp.get("path")
165
- metadata[tc_id] = (name, file_path)
 
 
166
 
167
  return metadata
168
 
169
  def _build_file_operation_index(
170
  self,
171
  messages: list[dict[str, Any]],
172
- tool_metadata: dict[str, tuple[str, str | None]],
173
  ) -> dict[str, list[FileOperation]]:
174
  """Build file_path → [FileOperation] index in a single pass.
175
 
@@ -177,7 +188,7 @@ class ReadLifecycleManager:
177
  """
178
  file_ops: dict[str, list[FileOperation]] = defaultdict(list)
179
 
180
- for tc_id, (name, file_path) in tool_metadata.items():
181
  if not file_path:
182
  continue
183
 
@@ -200,6 +211,8 @@ class ReadLifecycleManager:
200
  tool_name=name,
201
  file_path=file_path,
202
  operation=operation,
 
 
203
  )
204
  )
205
 
@@ -231,6 +244,29 @@ class ReadLifecycleManager:
231
 
232
  return None
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  def _classify_reads(self, file_ops: dict[str, list[FileOperation]]) -> list[ReadClassification]:
235
  """Classify each Read as fresh, stale, or superseded."""
236
  classifications: list[ReadClassification] = []
@@ -248,9 +284,13 @@ class ReadLifecycleManager:
248
  e.msg_index > read_op.msg_index for e in edits
249
  )
250
 
251
- # Check superseded: any later read of this file?
 
 
 
 
252
  is_superseded = self.config.compress_superseded and any(
253
- r.msg_index > read_op.msg_index for r in reads
254
  )
255
 
256
  if is_stale:
 
50
  file_path: str
51
  operation: str # "read" | "edit" | "write"
52
  content_size: int = 0 # Size of tool_result content (for reads only)
53
+ read_offset: int | None = None # Line offset for partial reads
54
+ read_limit: int | None = None # Line limit for partial reads
55
 
56
 
57
  @dataclass
 
118
 
119
  def _build_tool_metadata(
120
  self, messages: list[dict[str, Any]]
121
+ ) -> dict[str, tuple[str, str | None, int | None, int | None]]:
122
  """Build tool_call_id → (tool_name, file_path, offset, limit) mapping.
123
 
124
  Scans assistant messages for tool calls, extracts name, file_path, and
125
  the optional read offset/limit from tool inputs. Handles both OpenAI and Anthropic formats.
126
  """
127
+ # Maps tool_call_id → (name, file_path, offset, limit)
128
+ metadata: dict[str, tuple[str, str | None, int | None, int | None]] = {}
129
 
130
  for msg in messages:
131
  if msg.get("role") != "assistant":
 
142
  continue
143
 
144
  file_path = None
145
+ offset = None
146
+ limit = None
147
  try:
148
  args = json.loads(func.get("arguments", "{}"))
149
  file_path = args.get("file_path") or args.get("path")
150
+ offset = args.get("offset")
151
+ limit = args.get("limit")
152
  except (json.JSONDecodeError, TypeError):
153
  pass
154
+ metadata[tc_id] = (name, file_path, offset, limit)
155
 
156
  # Anthropic format: content blocks with type=tool_use
157
  content = msg.get("content", [])
 
167
 
168
  inp = block.get("input", {})
169
  file_path = None
170
+ offset = None
171
+ limit = None
172
  if isinstance(inp, dict):
173
  file_path = inp.get("file_path") or inp.get("path")
174
+ offset = inp.get("offset")
175
+ limit = inp.get("limit")
176
+ metadata[tc_id] = (name, file_path, offset, limit)
177
 
178
  return metadata
179
 
180
  def _build_file_operation_index(
181
  self,
182
  messages: list[dict[str, Any]],
183
+ tool_metadata: dict[str, tuple[str, str | None, int | None, int | None]],
184
  ) -> dict[str, list[FileOperation]]:
185
  """Build file_path → [FileOperation] index in a single pass.
186
 
 
188
  """
189
  file_ops: dict[str, list[FileOperation]] = defaultdict(list)
190
 
191
+ for tc_id, (name, file_path, offset, limit) in tool_metadata.items():
192
  if not file_path:
193
  continue
194
 
 
211
  tool_name=name,
212
  file_path=file_path,
213
  operation=operation,
214
+ read_offset=offset if operation == "read" else None,
215
+ read_limit=limit if operation == "read" else None,
216
  )
217
  )
218
 
 
244
 
245
  return None
246
 
247
+ @staticmethod
248
+ def _read_covers(later: FileOperation, earlier: FileOperation) -> bool:
249
+ """Check if `later` read fully covers the line range of `earlier`.
250
+
251
+ A full-file read (no offset/limit) covers everything.
252
+ A partial read only covers another partial if its range is a superset.
253
+ """
254
+ # Full-file read supersedes anything
255
+ if later.read_offset is None and later.read_limit is None:
256
+ return True
257
+
258
+ # If the earlier read was a full-file read, a partial read can't cover it
259
+ if earlier.read_offset is None and earlier.read_limit is None:
260
+ return False
261
+
262
+ # Both are partial reads — check range containment
263
+ later_start = later.read_offset or 0
264
+ later_end = later_start + (later.read_limit or 2000)
265
+ earlier_start = earlier.read_offset or 0
266
+ earlier_end = earlier_start + (earlier.read_limit or 2000)
267
+
268
+ return later_start <= earlier_start and later_end >= earlier_end
269
+
270
  def _classify_reads(self, file_ops: dict[str, list[FileOperation]]) -> list[ReadClassification]:
271
  """Classify each Read as fresh, stale, or superseded."""
272
  classifications: list[ReadClassification] = []
 
284
  e.msg_index > read_op.msg_index for e in edits
285
  )
286
 
287
+ # Check superseded: any later read that FULLY COVERS this read's range?
288
+ # A partial read (offset=100, limit=50) is NOT superseded by a
289
+ # different partial read (offset=200, limit=50) — they cover
290
+ # different lines. Only supersede when the later read contains
291
+ # all the lines of this read.
292
  is_superseded = self.config.compress_superseded and any(
293
+ r.msg_index > read_op.msg_index and self._read_covers(r, read_op) for r in reads
294
  )
295
 
296
  if is_stale:
tests/test_config.py CHANGED
@@ -91,7 +91,7 @@ class TestCacheAlignerConfig:
91
  def test_default_values(self):
92
  """Default values are correctly set."""
93
  config = CacheAlignerConfig()
94
- assert config.enabled is True
95
  assert config.normalize_whitespace is True
96
  assert config.collapse_blank_lines is True
97
 
@@ -389,6 +389,8 @@ class TestTransformResult:
389
  "warnings",
390
  "diff_artifact",
391
  "cache_metrics",
 
 
392
  }
393
  assert field_names == expected_fields
394
 
 
91
  def test_default_values(self):
92
  """Default values are correctly set."""
93
  config = CacheAlignerConfig()
94
+ assert config.enabled is False
95
  assert config.normalize_whitespace is True
96
  assert config.collapse_blank_lines is True
97
 
 
389
  "warnings",
390
  "diff_artifact",
391
  "cache_metrics",
392
+ "timing",
393
+ "waste_signals",
394
  }
395
  assert field_names == expected_fields
396
 
tests/test_proxy_streaming_resilience.py CHANGED
@@ -559,9 +559,16 @@ class TestCostTrackingAccuracy:
559
  patch("headroom.proxy.server.litellm") as mock_litellm,
560
  ):
561
  # Setup: $10/M input, $30/M output
562
- def mock_cost(model, prompt_tokens, completion_tokens):
563
  input_cost = prompt_tokens * 0.00001
564
  output_cost = completion_tokens * 0.00003
 
 
 
 
 
 
 
565
  return (input_cost, output_cost)
566
 
567
  mock_litellm.cost_per_token.side_effect = mock_cost
@@ -598,7 +605,7 @@ class TestCostTrackingAccuracy:
598
  patch("headroom.proxy.server.litellm") as mock_litellm,
599
  ):
600
  mock_litellm.cost_per_token.side_effect = (
601
- lambda model, prompt_tokens, completion_tokens: (
602
  prompt_tokens * 0.00001,
603
  completion_tokens * 0.00003,
604
  )
 
559
  patch("headroom.proxy.server.litellm") as mock_litellm,
560
  ):
561
  # Setup: $10/M input, $30/M output
562
+ def mock_cost(model, prompt_tokens, completion_tokens, **kwargs):
563
  input_cost = prompt_tokens * 0.00001
564
  output_cost = completion_tokens * 0.00003
565
+ # Add cache costs if provided
566
+ cache_read = kwargs.get("cache_read_input_tokens", 0)
567
+ cache_write = kwargs.get("cache_creation_input_tokens", 0)
568
+ if cache_read or cache_write:
569
+ model_info = mock_litellm.get_model_info()
570
+ input_cost += cache_read * model_info.get("cache_read_input_token_cost", 0)
571
+ input_cost += cache_write * model_info.get("cache_creation_input_token_cost", 0)
572
  return (input_cost, output_cost)
573
 
574
  mock_litellm.cost_per_token.side_effect = mock_cost
 
605
  patch("headroom.proxy.server.litellm") as mock_litellm,
606
  ):
607
  mock_litellm.cost_per_token.side_effect = (
608
+ lambda model, prompt_tokens, completion_tokens, **kwargs: (
609
  prompt_tokens * 0.00001,
610
  completion_tokens * 0.00003,
611
  )
tests/test_transforms/test_cache_aligner.py CHANGED
@@ -29,8 +29,8 @@ def tokenizer():
29
 
30
  @pytest.fixture
31
  def default_config():
32
- """Default CacheAlignerConfig."""
33
- return CacheAlignerConfig()
34
 
35
 
36
  @pytest.fixture
@@ -194,7 +194,7 @@ class TestDateExtraction:
194
  {"role": "user", "content": "Hello"},
195
  ]
196
 
197
- config = CacheAlignerConfig(date_patterns=custom_patterns)
198
  aligner = CacheAligner(config)
199
 
200
  assert aligner.should_apply(messages, tokenizer)
@@ -705,7 +705,7 @@ Please be helpful, harmless, and honest."""
705
  {"role": "user", "content": "What can you help me with today?"},
706
  ]
707
 
708
- aligner = CacheAligner()
709
 
710
  # Check should_apply
711
  assert aligner.should_apply(messages, tokenizer)
 
29
 
30
  @pytest.fixture
31
  def default_config():
32
+ """Default CacheAlignerConfig with enabled=True for testing."""
33
+ return CacheAlignerConfig(enabled=True)
34
 
35
 
36
  @pytest.fixture
 
194
  {"role": "user", "content": "Hello"},
195
  ]
196
 
197
+ config = CacheAlignerConfig(enabled=True, date_patterns=custom_patterns)
198
  aligner = CacheAligner(config)
199
 
200
  assert aligner.should_apply(messages, tokenizer)
 
705
  {"role": "user", "content": "What can you help me with today?"},
706
  ]
707
 
708
+ aligner = CacheAligner(CacheAlignerConfig(enabled=True))
709
 
710
  # Check should_apply
711
  assert aligner.should_apply(messages, tokenizer)
tests/test_transforms/test_read_lifecycle.py CHANGED
@@ -276,7 +276,7 @@ class TestSupersededDetection:
276
 
277
  def test_reread_makes_superseded(self):
278
  """Read(A) → Read(A): first Read becomes superseded."""
279
- config = ReadLifecycleConfig(enabled=True)
280
  mgr = ReadLifecycleManager(config)
281
 
282
  messages = [
 
276
 
277
  def test_reread_makes_superseded(self):
278
  """Read(A) → Read(A): first Read becomes superseded."""
279
+ config = ReadLifecycleConfig(enabled=True, compress_superseded=True)
280
  mgr = ReadLifecycleManager(config)
281
 
282
  messages = [