Spaces:

minhtudragon
/

headroom

Build error

chopratejas committed on Mar 8

Commit

0e85550

1 Parent(s): 7e6087f

fix: dashboard metrics, TTFB tracking, eager LLMLingua loading, and multi-provider consistency

Dashboard was showing wildly incorrect metrics (99.5% savings, 3ms overhead)
due to using the Anthropic API's non-cached input_tokens instead of optimized_tokens,
and dividing overhead by the total request count instead of the optimized-only count.

Key fixes:
- Use optimized_tokens (what we sent) for dashboard aggregation, not the API's
input_tokens, which excludes the cached portion
- Track overhead_count separately from latency_count for correct averages
- Add TTFB (time to first byte) measurement, replace full stream latency in UI
- Eager-load LLMLingua model at proxy startup (eliminates 5.9s first-request delay)
- Simplify CostTracker to token-based accounting with counterfactual cost display
- Add two-tier compression cache to ContentRouter (skip set + result cache)
- Fix compression pinning to detect both CCR and ReadLifecycle markers
- Clamp tokens_saved to max(0, ...) across all provider paths
- Add per-transform timing instrumentation to pipeline
- Guard against over-aggressive code compression (<5% ratio)
- Fix ReadLifecycle partial read supersede logic (_read_covers range check)
- Disable CacheAligner and compress_superseded by default
- Fix all pre-existing mypy errors (CompressionCache return types)
- Fix test mocks to accept **kwargs for cache token parameters

Files changed (12) hide show

headroom/config.py +6 -3
headroom/dashboard/templates/dashboard.html +388 -79
headroom/perf/analyzer.py +116 -6
headroom/proxy/server.py +241 -309
headroom/transforms/code_compressor.py +19 -0
headroom/transforms/content_router.py +274 -1
headroom/transforms/pipeline.py +36 -5
headroom/transforms/read_lifecycle.py +48 -8
tests/test_config.py +3 -1
tests/test_proxy_streaming_resilience.py +9 -2
tests/test_transforms/test_cache_aligner.py +4 -4
tests/test_transforms/test_read_lifecycle.py +1 -1

headroom/config.py CHANGED Viewed

@@ -78,7 +78,7 @@ class CacheAlignerConfig:
     SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
     """
-    enabled: bool = True
     # === Phase 1: DynamicContentDetector Integration ===
     # When True, uses the full DynamicContentDetector with 15+ patterns
@@ -397,7 +397,7 @@ class ReadLifecycleConfig:
     enabled: bool = True  # On by default: stale/superseded Reads are provably safe to compress
     compress_stale: bool = True  # Replace Reads of files that were later edited
-    compress_superseded: bool = True  # Replace Reads of files that were later re-Read
     min_size_bytes: int = 512  # Skip tiny Read outputs (not worth the overhead)
@@ -702,11 +702,13 @@ class TransformResult:
     warnings: list[str] = field(default_factory=list)
     diff_artifact: DiffArtifact | None = None  # Populated if generate_diff_artifact=True
     cache_metrics: CachePrefixMetrics | None = None  # Populated by CacheAligner
 @dataclass
 class TransformDiff:
-    """Diff info for a single transform (for debugging)."""
     transform_name: str
     tokens_before: int
@@ -715,6 +717,7 @@ class TransformDiff:
     items_removed: int = 0
     items_kept: int = 0
     details: str = ""  # Human-readable description of what changed
 @dataclass

     SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
     """
+    enabled: bool = False  # Disabled by default — prefix stability gains are marginal in practice
     # === Phase 1: DynamicContentDetector Integration ===
     # When True, uses the full DynamicContentDetector with 15+ patterns
     enabled: bool = True  # On by default: stale/superseded Reads are provably safe to compress
     compress_stale: bool = True  # Replace Reads of files that were later edited
+    compress_superseded: bool = False  # Disabled: busts Anthropic prompt cache prefix
     min_size_bytes: int = 512  # Skip tiny Read outputs (not worth the overhead)
     warnings: list[str] = field(default_factory=list)
     diff_artifact: DiffArtifact | None = None  # Populated if generate_diff_artifact=True
     cache_metrics: CachePrefixMetrics | None = None  # Populated by CacheAligner
+    timing: dict[str, float] = field(default_factory=dict)  # transform_name → ms
+    waste_signals: WasteSignals | None = None  # Detected waste in original messages
 @dataclass
 class TransformDiff:
+    """Diff info for a single transform (for debugging/perf)."""
     transform_name: str
     tokens_before: int
     items_removed: int = 0
     items_kept: int = 0
     details: str = ""  # Human-readable description of what changed
+    duration_ms: float = 0.0  # Wall-clock time for this transform
 @dataclass

headroom/dashboard/templates/dashboard.html CHANGED Viewed

@@ -25,6 +25,8 @@
         body { background: #0f0f0f; }
         .sparkline { stroke: #22d3ee; stroke-width: 1.5; fill: none; }
         .sparkline-area { fill: url(#sparkline-gradient); }
         @keyframes pulse-subtle { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } }
         .pulse-live { animation: pulse-subtle 2s ease-in-out infinite; }
     </style>
@@ -52,27 +54,20 @@
     </header>
     <main class="p-6 max-w-7xl mx-auto">
-        <!-- Hero Metrics -->
         <div class="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
-            <!-- Requests -->
             <div class="bg-surface rounded-lg p-4 border border-border">
-                <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Requests</div>
-                <div class="text-3xl font-light tabular-nums" x-text="formatNumber(stats.requests?.total || 0)"></div>
-                <div class="mt-2 h-8">
-                    <svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
-                        <defs>
-                            <linearGradient id="sparkline-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
-                                <stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.3"/>
-                                <stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
-                            </linearGradient>
-                        </defs>
-                        <path class="sparkline-area" :d="getSparklineArea(requestHistory)"></path>
-                        <path class="sparkline" :d="getSparkline(requestHistory)"></path>
-                    </svg>
                 </div>
             </div>
-            <!-- Tokens Saved -->
             <div class="bg-surface rounded-lg p-4 border border-border">
                 <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Tokens Saved</div>
                 <div class="flex items-baseline gap-2">
@@ -81,32 +76,120 @@
                 </div>
                 <div class="mt-2 h-8">
                     <svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
                         <path class="sparkline-area" :d="getSparklineArea(savingsHistory)"></path>
                         <path class="sparkline" :d="getSparkline(savingsHistory)"></path>
                     </svg>
                 </div>
             </div>
-            <!-- Cost Saved -->
             <div class="bg-surface rounded-lg p-4 border border-border">
-                <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Cost Saved</div>
                 <div class="flex items-baseline gap-2">
-                    <span class="text-3xl font-light tabular-nums text-accent" x-text="'$' + formatCost(stats.cost?.total_savings_usd || 0)"></span>
-                </div>
-                <div class="mt-3 text-xs text-gray-500">
-                    vs $<span x-text="formatCost(stats.cost?.total_cost_usd || 0)"></span> spent
                 </div>
             </div>
             <!-- Headroom Overhead -->
             <div class="bg-surface rounded-lg p-4 border border-border">
-                <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Headroom Overhead</div>
                 <div class="flex items-baseline gap-2">
                     <span class="text-3xl font-light tabular-nums" x-text="(stats.overhead?.average_ms || 0).toFixed(0) + 'ms'"></span>
                 </div>
-                <div class="mt-3 text-xs text-gray-500">
-                    Avg <span x-text="((stats.latency?.average_ms || 0) / 1000).toFixed(1)"></span>s total response time
                 </div>
             </div>
         </div>
@@ -117,21 +200,17 @@
                 <div class="text-sm font-medium mb-4 text-gray-300">Token Usage</div>
                 <div class="space-y-3">
                     <div class="flex justify-between items-center">
-                        <span class="text-sm text-gray-400">Input Tokens</span>
-                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
                     </div>
                     <div class="flex justify-between items-center">
-                        <span class="text-sm text-gray-400">Output Tokens</span>
-                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.output || 0)"></span>
                     </div>
                     <div class="border-t border-border my-2"></div>
                     <div class="flex justify-between items-center">
-                        <span class="text-sm text-gray-400">Original Size</span>
-                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.total_before_compression || 0)"></span>
-                    </div>
-                    <div class="flex justify-between items-center">
-                        <span class="text-sm text-gray-400">After Compression</span>
-                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
                     </div>
                 </div>
             </div>
@@ -171,55 +250,171 @@
                         <span class="font-mono text-sm" x-text="(stats.overhead?.min_ms || 0).toFixed(0) + ' - ' + (stats.overhead?.max_ms || 0).toFixed(0) + 'ms'"></span>
                     </div>
                     <div class="flex justify-between items-center">
-                        <span class="text-sm text-gray-400">Total Response Time</span>
-                        <span class="font-mono text-sm" x-text="((stats.latency?.average_ms || 0) / 1000).toFixed(1) + 's avg'"></span>
                     </div>
                     <div class="flex justify-between items-center">
                         <span class="text-sm text-gray-400">Failed Requests</span>
                         <span class="font-mono text-sm" x-text="stats.requests?.failed || 0"></span>
                     </div>
                 </div>
             </div>
         </div>
-        <!-- Recent Requests Table -->
         <div class="bg-surface rounded-lg border border-border overflow-hidden">
             <div class="px-4 py-3 border-b border-border flex justify-between items-center">
                 <span class="text-sm font-medium text-gray-300">Recent Requests</span>
-                <span class="text-xs text-gray-500">Last 10</span>
             </div>
             <div class="overflow-x-auto">
                 <table class="w-full text-sm">
                     <thead>
                         <tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
                             <th class="px-4 py-3 font-medium">Time</th>
                             <th class="px-4 py-3 font-medium">Model</th>
                             <th class="px-4 py-3 font-medium text-right">Input</th>
                             <th class="px-4 py-3 font-medium text-right">Output</th>
                             <th class="px-4 py-3 font-medium text-right">Saved</th>
-                            <th class="px-4 py-3 font-medium text-right">Cost</th>
                             <th class="px-4 py-3 font-medium text-right">Latency</th>
                         </tr>
                     </thead>
                     <tbody class="divide-y divide-border">
                         <template x-for="req in (stats.recent_requests || [])" :key="req.request_id">
-                            <tr class="hover:bg-border/30 transition-colors">
-                                <td class="px-4 py-3 font-mono text-gray-400" x-text="formatTime(req.timestamp)"></td>
-                                <td class="px-4 py-3">
-                                    <span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(req.model)"></span>
-                                </td>
-                                <td class="px-4 py-3 text-right font-mono" x-text="formatNumber(req.input_tokens_optimized)"></td>
-                                <td class="px-4 py-3 text-right font-mono" x-text="formatNumber(req.output_tokens || 0)"></td>
-                                <td class="px-4 py-3 text-right">
-                                    <span class="text-accent font-mono" x-text="req.savings_percent.toFixed(0) + '%'"></span>
                                 </td>
-                                <td class="px-4 py-3 text-right font-mono text-gray-400" x-text="'$' + (req.estimated_cost_usd || 0).toFixed(4)"></td>
-                                <td class="px-4 py-3 text-right font-mono text-gray-400" x-text="(req.total_latency_ms || 0).toFixed(0) + 'ms'"></td>
                             </tr>
                         </template>
                         <template x-if="(stats.recent_requests || []).length === 0">
                             <tr>
-                                <td colspan="7" class="px-4 py-8 text-center text-gray-500 italic">
                                     No requests yet. Start using the proxy to see activity here.
                                 </td>
                             </tr>
@@ -229,23 +424,6 @@
             </div>
         </div>
-        <!-- Budget Bar (if configured) -->
-        <template x-if="stats.cost?.budget_limit_usd">
-            <div class="mt-6 bg-surface rounded-lg p-4 border border-border">
-                <div class="flex justify-between items-center mb-2">
-                    <span class="text-sm text-gray-400">Budget (<span x-text="stats.cost?.budget_period || 'daily'"></span>)</span>
-                    <span class="font-mono text-sm">
-                        $<span x-text="formatCost(stats.cost?.period_cost_usd || 0)"></span>
-                        / $<span x-text="formatCost(stats.cost?.budget_limit_usd || 0)"></span>
-                    </span>
-                </div>
-                <div class="w-full h-2 bg-border rounded-full overflow-hidden">
-                    <div class="h-full rounded-full transition-all duration-500"
-                         :class="getBudgetPercent() > 90 ? 'bg-red-400' : getBudgetPercent() > 70 ? 'bg-amber-400' : 'bg-accent'"
-                         :style="'width: ' + Math.min(getBudgetPercent(), 100) + '%'"></div>
-                </div>
-            </div>
-        </template>
     </main>
     <!-- Footer -->
@@ -269,6 +447,7 @@
                 lastUpdate: 'never',
                 requestHistory: [],
                 savingsHistory: [],
                 pollInterval: null,
                 async init() {
@@ -310,14 +489,21 @@
                     }
                 },
                 formatNumber(n) {
                     if (n >= 1000000) return (n / 1000000).toFixed(1) + 'M';
                     if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
                     return n.toString();
                 },
-                formatCost(n) {
-                    return n.toFixed(2);
                 },
                 formatTime(ts) {
@@ -332,23 +518,123 @@
                 truncateModel(model) {
                     if (!model) return '-';
-                    // Remove provider prefix and version suffix for display
                     return model.replace(/^(anthropic\.|openai\.|bedrock\/)/, '')
                                .replace(/-\d{8}$/, '')
                                .substring(0, 20);
                 },
                 getProviderPercent(count) {
                     const total = this.stats.requests?.total || 1;
                     return Math.min((count / total) * 100, 100);
                 },
-                getBudgetPercent() {
-                    const limit = this.stats.cost?.budget_limit_usd || 1;
-                    const used = this.stats.cost?.period_cost_usd || 0;
-                    return (used / limit) * 100;
-                },
                 getSparkline(data) {
                     if (!data || data.length < 2) return '';
                     const min = Math.min(...data);
@@ -369,7 +655,30 @@
                     const line = this.getSparkline(data);
                     if (!line) return '';
                     return line + ` L100,32 L0,32 Z`;
-                }
             };
         }
     </script>

         body { background: #0f0f0f; }
         .sparkline { stroke: #22d3ee; stroke-width: 1.5; fill: none; }
         .sparkline-area { fill: url(#sparkline-gradient); }
+        .trend-line { stroke: #22d3ee; stroke-width: 2; fill: none; }
+        .trend-area { fill: url(#trend-gradient); }
         @keyframes pulse-subtle { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } }
         .pulse-live { animation: pulse-subtle 2s ease-in-out infinite; }
     </style>
     </header>
     <main class="p-6 max-w-7xl mx-auto">
+        <!-- Hero Metrics (reordered: Savings $ -> Tokens Saved % -> Quality Confidence -> Overhead) -->
         <div class="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
+            <!-- Savings ($) - Lead with dollars -->
             <div class="bg-surface rounded-lg p-4 border border-border">
+                <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Savings</div>
+                <div class="flex items-baseline gap-2">
+                    <span class="text-3xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.savings_usd || 0)"></span>
+                </div>
+                <div class="mt-2 text-xs text-gray-500">
+                    <span x-text="formatNumber(stats.requests?.total || 0)"></span> requests processed
                 </div>
             </div>
+            <!-- Tokens Saved (%) -->
             <div class="bg-surface rounded-lg p-4 border border-border">
                 <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Tokens Saved</div>
                 <div class="flex items-baseline gap-2">
                 </div>
                 <div class="mt-2 h-8">
                     <svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
+                        <defs>
+                            <linearGradient id="sparkline-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
+                                <stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.3"/>
+                                <stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
+                            </linearGradient>
+                        </defs>
                         <path class="sparkline-area" :d="getSparklineArea(savingsHistory)"></path>
                         <path class="sparkline" :d="getSparkline(savingsHistory)"></path>
                     </svg>
                 </div>
             </div>
+            <!-- Quality Confidence -->
             <div class="bg-surface rounded-lg p-4 border border-border">
+                <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Compression Quality</div>
                 <div class="flex items-baseline gap-2">
+                    <span class="w-3 h-3 rounded-full mt-1" :class="confidenceColor"></span>
+                    <span class="text-3xl font-light tabular-nums" :class="confidenceTextColor" x-text="confidenceLabel"></span>
                 </div>
+                <div class="mt-2 text-xs text-gray-500" x-text="confidenceDetail"></div>
             </div>
             <!-- Headroom Overhead -->
             <div class="bg-surface rounded-lg p-4 border border-border">
+                <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Overhead</div>
                 <div class="flex items-baseline gap-2">
                     <span class="text-3xl font-light tabular-nums" x-text="(stats.overhead?.average_ms || 0).toFixed(0) + 'ms'"></span>
                 </div>
+                <div class="mt-2 text-xs text-gray-500">
+                    TTFB <span x-text="((stats.ttfb?.average_ms || 0) / 1000).toFixed(2)"></span>s avg
+                </div>
+            </div>
+        </div>
+        <!-- "Without Headroom" Counterfactual -->
+        <template x-if="(stats.cost?.cost_without_headroom_usd || 0) > 0">
+            <div class="bg-surface rounded-lg p-4 border border-border mb-6">
+                <div class="flex items-center justify-between">
+                    <div>
+                        <div class="text-sm font-medium text-gray-300 mb-2">Without Headroom</div>
+                        <div class="flex items-center gap-8">
+                            <div>
+                                <div class="text-xs text-gray-500 mb-1">Input cost</div>
+                                <div class="text-2xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.cost_with_headroom_usd || 0)"></div>
+                            </div>
+                            <div class="text-gray-500 text-2xl font-light">vs</div>
+                            <div>
+                                <div class="text-xs text-gray-500 mb-1">Input cost without Headroom</div>
+                                <div class="text-2xl font-light tabular-nums text-red-400 line-through decoration-red-400/50" x-text="'$' + formatCurrency(stats.cost?.cost_without_headroom_usd || 0)"></div>
+                            </div>
+                            <div class="ml-auto text-right">
+                                <div class="text-xs text-gray-500 mb-1">Total saved</div>
+                                <div class="text-2xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.savings_usd || 0)"></div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </template>
+        <!-- New: Waste Signal Breakdown + Cumulative Savings Trend -->
+        <div class="grid grid-cols-1 lg:grid-cols-2 gap-4 mb-6">
+            <!-- What Headroom Removed -->
+            <div class="bg-surface rounded-lg p-4 border border-border">
+                <div class="text-sm font-medium mb-4 text-gray-300">What Headroom Removed</div>
+                <template x-if="Object.keys(stats.waste_signals || {}).length > 0">
+                    <div class="space-y-3">
+                        <template x-for="[signal, tokens] in sortedWasteSignals" :key="signal">
+                            <div>
+                                <div class="flex justify-between items-center mb-1">
+                                    <span class="text-sm text-gray-400" x-text="wasteSignalLabel(signal)"></span>
+                                    <span class="font-mono text-sm text-accent" x-text="formatNumber(tokens) + ' tokens'"></span>
+                                </div>
+                                <div class="w-full h-2 bg-border rounded-full overflow-hidden">
+                                    <div class="h-full rounded-full transition-all duration-500"
+                                         :class="wasteSignalColor(signal)"
+                                         :style="'width: ' + getWastePercent(tokens) + '%'"></div>
+                                </div>
+                            </div>
+                        </template>
+                    </div>
+                </template>
+                <template x-if="Object.keys(stats.waste_signals || {}).length === 0">
+                    <div class="text-sm text-gray-500 italic py-8 text-center">
+                        No waste signals detected yet. Data appears after requests are processed.
+                    </div>
+                </template>
+            </div>
+            <!-- Cumulative Savings Trend -->
+            <div class="bg-surface rounded-lg p-4 border border-border">
+                <div class="flex justify-between items-center mb-4">
+                    <span class="text-sm font-medium text-gray-300">Savings Over Time</span>
+                    <span class="text-xs text-gray-500 font-mono" x-text="formatNumber(stats.tokens?.saved || 0) + ' tokens total'"></span>
                 </div>
+                <template x-if="(stats.savings_history || []).length >= 2">
+                    <div class="h-32">
+                        <svg class="w-full h-full" viewBox="0 0 200 64" preserveAspectRatio="none">
+                            <defs>
+                                <linearGradient id="trend-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
+                                    <stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.2"/>
+                                    <stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
+                                </linearGradient>
+                            </defs>
+                            <path class="trend-area" :d="getTrendArea(stats.savings_history)"></path>
+                            <path class="trend-line" :d="getTrendLine(stats.savings_history)"></path>
+                        </svg>
+                    </div>
+                </template>
+                <template x-if="(stats.savings_history || []).length < 2">
+                    <div class="h-32 flex items-center justify-center text-sm text-gray-500 italic">
+                        Trend data will appear after multiple requests.
+                    </div>
+                </template>
             </div>
         </div>
                 <div class="text-sm font-medium mb-4 text-gray-300">Token Usage</div>
                 <div class="space-y-3">
                     <div class="flex justify-between items-center">
+                        <span class="text-sm text-gray-400">Before Compression</span>
+                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.total_before_compression || 0)"></span>
                     </div>
                     <div class="flex justify-between items-center">
+                        <span class="text-sm text-gray-400">After Compression (sent)</span>
+                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
                     </div>
                     <div class="border-t border-border my-2"></div>
                     <div class="flex justify-between items-center">
+                        <span class="text-sm text-gray-400">Output Tokens</span>
+                        <span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.output || 0)"></span>
                     </div>
                 </div>
             </div>
                         <span class="font-mono text-sm" x-text="(stats.overhead?.min_ms || 0).toFixed(0) + ' - ' + (stats.overhead?.max_ms || 0).toFixed(0) + 'ms'"></span>
                     </div>
                     <div class="flex justify-between items-center">
+                        <span class="text-sm text-gray-400">TTFB (upstream)</span>
+                        <span class="font-mono text-sm" x-text="((stats.ttfb?.average_ms || 0) / 1000).toFixed(2) + 's avg'"></span>
+                    </div>
+                    <div class="flex justify-between items-center">
+                        <span class="text-sm text-gray-400">TTFB Range</span>
+                        <span class="font-mono text-sm" x-text="((stats.ttfb?.min_ms || 0) / 1000).toFixed(2) + ' - ' + ((stats.ttfb?.max_ms || 0) / 1000).toFixed(2) + 's'"></span>
                     </div>
                     <div class="flex justify-between items-center">
                         <span class="text-sm text-gray-400">Failed Requests</span>
                         <span class="font-mono text-sm" x-text="stats.requests?.failed || 0"></span>
                     </div>
+                    <!-- Per-transform timing breakdown -->
+                    <template x-if="Object.keys(stats.pipeline_timing || {}).length > 0">
+                        <div>
+                            <div class="border-t border-border my-2"></div>
+                            <div class="text-xs text-gray-500 uppercase tracking-wide mb-2">Pipeline Breakdown</div>
+                            <template x-for="[name, t] in Object.entries(stats.pipeline_timing || {})" :key="name">
+                                <div class="flex justify-between items-center mb-1">
+                                    <span class="text-xs text-gray-400 font-mono truncate mr-2" x-text="name"></span>
+                                    <span class="text-xs font-mono whitespace-nowrap"
+                                          :class="t.average_ms > 100 ? 'text-amber-400' : t.average_ms > 50 ? 'text-yellow-400' : 'text-gray-400'"
+                                          x-text="t.average_ms.toFixed(0) + 'ms avg / ' + t.max_ms.toFixed(0) + 'ms max'"></span>
+                                </div>
+                            </template>
+                        </div>
+                    </template>
                 </div>
             </div>
         </div>
+        <!-- Per-Model Savings Breakdown -->
+        <template x-if="Object.keys(stats.cost?.per_model || {}).length > 0">
+            <div class="bg-surface rounded-lg border border-border overflow-hidden mb-6">
+                <div class="px-4 py-3 border-b border-border flex justify-between items-center">
+                    <span class="text-sm font-medium text-gray-300">Per-Model Token Savings</span>
+                    <span class="text-xs text-gray-500">Exact tokens saved per model</span>
+                </div>
+                <div class="overflow-x-auto">
+                    <table class="w-full text-sm">
+                        <thead>
+                            <tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
+                                <th class="px-4 py-3 font-medium">Model</th>
+                                <th class="px-4 py-3 font-medium text-right">Requests</th>
+                                <th class="px-4 py-3 font-medium text-right">Tokens Saved</th>
+                                <th class="px-4 py-3 font-medium text-right">Tokens Sent</th>
+                                <th class="px-4 py-3 font-medium text-right">Reduction</th>
+                            </tr>
+                        </thead>
+                        <tbody class="divide-y divide-border">
+                            <template x-for="[model, info] in Object.entries(stats.cost?.per_model || {})" :key="model">
+                                <tr class="hover:bg-border/30 transition-colors">
+                                    <td class="px-4 py-3">
+                                        <span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(model)"></span>
+                                    </td>
+                                    <td class="px-4 py-3 text-right font-mono" x-text="info.requests"></td>
+                                    <td class="px-4 py-3 text-right font-mono text-accent" x-text="formatNumber(info.tokens_saved)"></td>
+                                    <td class="px-4 py-3 text-right font-mono" x-text="formatNumber(info.tokens_sent)"></td>
+                                    <td class="px-4 py-3 text-right">
+                                        <!-- Fall back to 0 when reduction_pct is absent so toFixed() cannot throw -->
+                                        <span class="text-accent font-mono" x-text="(info.reduction_pct || 0).toFixed(1) + '%'"></span>
+                                    </td>
+                                </tr>
+                            </template>
+                        </tbody>
+                    </table>
+                </div>
+            </div>
+        </template>
+        <!-- Recent Requests Table (with expandable rows) -->
         <div class="bg-surface rounded-lg border border-border overflow-hidden">
             <div class="px-4 py-3 border-b border-border flex justify-between items-center">
                 <span class="text-sm font-medium text-gray-300">Recent Requests</span>
+                <span class="text-xs text-gray-500">Last 10 &mdash; click row to expand</span>
             </div>
             <div class="overflow-x-auto">
                 <table class="w-full text-sm">
                     <thead>
                         <tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
+                            <th class="px-4 py-3 font-medium w-6"></th>
                             <th class="px-4 py-3 font-medium">Time</th>
                             <th class="px-4 py-3 font-medium">Model</th>
                             <th class="px-4 py-3 font-medium text-right">Input</th>
                             <th class="px-4 py-3 font-medium text-right">Output</th>
                             <th class="px-4 py-3 font-medium text-right">Saved</th>
+                            <th class="px-4 py-3 font-medium text-right">Quality</th>
                             <th class="px-4 py-3 font-medium text-right">Latency</th>
                         </tr>
                     </thead>
                     <tbody class="divide-y divide-border">
                         <template x-for="req in (stats.recent_requests || [])" :key="req.request_id">
+                            <tr>
+                                <td colspan="8" class="p-0">
+                                    <div class="cursor-pointer" @click="toggleExpanded(req.request_id)">
+                                        <div class="flex hover:bg-border/30 transition-colors">
+                                            <div class="px-4 py-3 w-6 text-gray-500">
+                                                <span x-text="expandedRows[req.request_id] ? '-' : '+'"></span>
+                                            </div>
+                                            <div class="px-4 py-3 font-mono text-gray-400 flex-1" x-text="formatTime(req.timestamp)"></div>
+                                            <div class="px-4 py-3 flex-1">
+                                                <span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(req.model)"></span>
+                                            </div>
+                                            <div class="px-4 py-3 text-right font-mono flex-1" x-text="formatNumber(req.input_tokens_optimized)"></div>
+                                            <div class="px-4 py-3 text-right font-mono flex-1" x-text="formatNumber(req.output_tokens || 0)"></div>
+                                            <div class="px-4 py-3 text-right flex-1">
+                                                <!-- Fall back to 0 when savings_percent is absent so toFixed() cannot throw -->
+                                                <span class="text-accent font-mono" x-text="(req.savings_percent || 0).toFixed(0) + '%'"></span>
+                                            </div>
+                                            <div class="px-4 py-3 text-right flex-1">
+                                                <span class="w-2 h-2 rounded-full inline-block" :class="getRequestConfidenceColor(req)"></span>
+                                            </div>
+                                            <div class="px-4 py-3 text-right font-mono text-gray-400 flex-1" x-text="(req.total_latency_ms || 0).toFixed(0) + 'ms'"></div>
+                                        </div>
+                                    </div>
+                                    <!-- Expanded detail row -->
+                                    <template x-if="expandedRows[req.request_id]">
+                                        <div class="px-8 py-4 bg-[#151515] border-t border-border">
+                                            <div class="grid grid-cols-2 lg:grid-cols-4 gap-4 text-xs">
+                                                <div>
+                                                    <div class="text-gray-500 uppercase tracking-wide mb-1">Original Tokens</div>
+                                                    <div class="font-mono" x-text="formatNumber(req.input_tokens_original)"></div>
+                                                </div>
+                                                <div>
+                                                    <div class="text-gray-500 uppercase tracking-wide mb-1">Compressed Tokens</div>
+                                                    <div class="font-mono" x-text="formatNumber(req.input_tokens_optimized)"></div>
+                                                </div>
+                                                <div>
+                                                    <div class="text-gray-500 uppercase tracking-wide mb-1">Tokens Removed</div>
+                                                    <div class="font-mono text-accent" x-text="formatNumber(req.tokens_saved)"></div>
+                                                </div>
+                                                <div>
+                                                    <div class="text-gray-500 uppercase tracking-wide mb-1">Optimization Time</div>
+                                                    <div class="font-mono" x-text="(req.optimization_latency_ms || 0).toFixed(0) + 'ms'"></div>
+                                                </div>
+                                            </div>
+                                            <!-- Transforms Applied -->
+                                            <template x-if="(req.transforms_applied || []).length > 0">
+                                                <div class="mt-3">
+                                                    <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Transforms Applied</div>
+                                                    <div class="flex flex-wrap gap-1">
+                                                        <template x-for="t in req.transforms_applied" :key="t">
+                                                            <span class="px-2 py-0.5 bg-border rounded text-xs font-mono" x-text="t"></span>
+                                                        </template>
+                                                    </div>
+                                                </div>
+                                            </template>
+                                            <!-- Waste Signals for this request -->
+                                            <template x-if="req.waste_signals && Object.keys(req.waste_signals).length > 0">
+                                                <div class="mt-3">
+                                                    <div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Waste Detected</div>
+                                                    <div class="flex flex-wrap gap-2">
+                                                        <template x-for="[signal, tokens] in Object.entries(req.waste_signals).filter(([,v]) => v > 0)" :key="signal">
+                                                            <span class="px-2 py-0.5 rounded text-xs font-mono"
+                                                                  :class="wasteSignalBadgeColor(signal)"
+                                                                  x-text="wasteSignalLabel(signal) + ': ' + formatNumber(tokens)"></span>
+                                                        </template>
+                                                    </div>
+                                                </div>
+                                            </template>
+                                        </div>
+                                    </template>
                                 </td>
                             </tr>
                         </template>
                         <template x-if="(stats.recent_requests || []).length === 0">
                             <tr>
+                                <td colspan="8" class="px-4 py-8 text-center text-gray-500 italic">
                                     No requests yet. Start using the proxy to see activity here.
                                 </td>
                             </tr>
             </div>
         </div>
     </main>
     <!-- Footer -->
                 lastUpdate: 'never',
                 requestHistory: [],
                 savingsHistory: [],
+                expandedRows: {},
                 pollInterval: null,
                 async init() {
                     }
                 },
+                // --- Formatting ---
                 formatNumber(n) {
                     if (n >= 1000000) return (n / 1000000).toFixed(1) + 'M';
                     if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
                     return n.toString();
                 },
+                formatCurrency(n) {
+                    if (n < 0) return '-' + this.formatCurrency(-n);
+                    if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
+                    if (n >= 1) return n.toFixed(2);
+                    if (n >= 0.01) return n.toFixed(3);
+                    if (n > 0) return n.toFixed(4);
+                    return '0.00';
                 },
                 formatTime(ts) {
                 truncateModel(model) {
                     if (!model) return '-';
                     return model.replace(/^(anthropic\.|openai\.|bedrock\/)/, '')
                                .replace(/-\d{8}$/, '')
                                .substring(0, 20);
                 },
+                // --- Waste Signals ---
+                wasteSignalLabel(signal) {
+                    const labels = {
+                        json_bloat: 'JSON Bloat',
+                        html_noise: 'HTML Noise',
+                        base64: 'Base64 Blobs',
+                        whitespace: 'Whitespace',
+                        dynamic_date: 'Dynamic Dates',
+                        repetition: 'Repetition',
+                    };
+                    return labels[signal] || signal;
+                },
+                wasteSignalColor(signal) {
+                    const colors = {
+                        json_bloat: 'bg-amber-500',
+                        html_noise: 'bg-orange-500',
+                        base64: 'bg-red-500',
+                        whitespace: 'bg-blue-500',
+                        dynamic_date: 'bg-purple-500',
+                        repetition: 'bg-pink-500',
+                    };
+                    return colors[signal] || 'bg-gray-500';
+                },
+                wasteSignalBadgeColor(signal) {
+                    const colors = {
+                        json_bloat: 'bg-amber-500/20 text-amber-400',
+                        html_noise: 'bg-orange-500/20 text-orange-400',
+                        base64: 'bg-red-500/20 text-red-400',
+                        whitespace: 'bg-blue-500/20 text-blue-400',
+                        dynamic_date: 'bg-purple-500/20 text-purple-400',
+                        repetition: 'bg-pink-500/20 text-pink-400',
+                    };
+                    return colors[signal] || 'bg-gray-500/20 text-gray-400';
+                },
+                get sortedWasteSignals() {
+                    const signals = this.stats.waste_signals || {};
+                    return Object.entries(signals)
+                        .filter(([, v]) => v > 0)
+                        .sort((a, b) => b[1] - a[1]);
+                },
+                getWastePercent(tokens) {
+                    const signals = this.stats.waste_signals || {};
+                    const max = Math.max(...Object.values(signals), 1);
+                    return Math.min((tokens / max) * 100, 100);
+                },
+                // --- Compression Confidence ---
+                get confidenceLevel() {
+                    const saved = this.stats.tokens?.saved || 0;
+                    if (saved === 0) return 'none';
+                    const signals = this.stats.waste_signals || {};
+                    const totalWaste = Object.values(signals).reduce((a, b) => a + b, 0);
+                    if (totalWaste === 0) return 'unknown';
+                    const wasteRatio = totalWaste / saved;
+                    if (wasteRatio >= 0.7) return 'high';
+                    if (wasteRatio >= 0.3) return 'medium';
+                    return 'low';
+                },
+                get confidenceColor() {
+                    const c = { high: 'bg-emerald-400', medium: 'bg-yellow-400', low: 'bg-red-400', none: 'bg-gray-500', unknown: 'bg-gray-500' };
+                    return c[this.confidenceLevel];
+                },
+                get confidenceTextColor() {
+                    const c = { high: 'text-emerald-400', medium: 'text-yellow-400', low: 'text-red-400', none: 'text-gray-500', unknown: 'text-gray-500' };
+                    return c[this.confidenceLevel];
+                },
+                get confidenceLabel() {
+                    const l = { high: 'High', medium: 'Medium', low: 'Low', none: '-', unknown: '-' };
+                    return l[this.confidenceLevel];
+                },
+                get confidenceDetail() {
+                    const saved = this.stats.tokens?.saved || 0;
+                    if (saved === 0) return 'No compression yet';
+                    const signals = this.stats.waste_signals || {};
+                    const totalWaste = Object.values(signals).reduce((a, b) => a + b, 0);
+                    if (totalWaste === 0) return 'No waste signals detected';
+                    const pct = Math.round((totalWaste / saved) * 100);
+                    return pct + '% of removed tokens were identified waste';
+                },
+                getRequestConfidenceColor(req) {
+                    if (!req.waste_signals || req.tokens_saved === 0) return 'bg-gray-500';
+                    const totalWaste = Object.values(req.waste_signals).reduce((a, b) => a + b, 0);
+                    const ratio = totalWaste / req.tokens_saved;
+                    if (ratio >= 0.7) return 'bg-emerald-400';
+                    if (ratio >= 0.3) return 'bg-yellow-400';
+                    return 'bg-red-400';
+                },
+                // --- Expandable Rows ---
+                toggleExpanded(id) {
+                    this.expandedRows[id] = !this.expandedRows[id];
+                },
+                // --- Charts ---
                 getProviderPercent(count) {
                     const total = this.stats.requests?.total || 1;
                     return Math.min((count / total) * 100, 100);
                 },
                 getSparkline(data) {
                     if (!data || data.length < 2) return '';
                     const min = Math.min(...data);
                     const line = this.getSparkline(data);
                     if (!line) return '';
                     return line + ` L100,32 L0,32 Z`;
+                },
+                getTrendLine(history) {
+                    if (!history || history.length < 2) return '';
+                    const values = history.map(h => h[1]);
+                    const min = Math.min(...values);
+                    const max = Math.max(...values);
+                    const range = max - min || 1;
+                    const points = values.map((v, i) => {
+                        const x = (i / (values.length - 1)) * 200;
+                        const y = 60 - ((v - min) / range) * 56;
+                        return `${x},${y}`;
+                    });
+                    return 'M' + points.join(' L');
+                },
+                getTrendArea(history) {
+                    if (!history || history.length < 2) return '';
+                    const line = this.getTrendLine(history);
+                    if (!line) return '';
+                    return line + ` L200,64 L0,64 Z`;
+                },
             };
         }
     </script>

headroom/perf/analyzer.py CHANGED Viewed

@@ -2,14 +2,21 @@
 Parses PERF log lines from ~/.headroom/logs/proxy.log* and produces
 actionable reports on token savings, cache efficiency, and transform impact.
 """
 from __future__ import annotations
 import re
 from dataclasses import dataclass, field
 from pathlib import Path
 LOG_DIR = Path.home() / ".headroom" / "logs"
 # Matches: 2026-03-07 13:38:31,009 - headroom.proxy - INFO - [hr_...] PERF model=... ...
@@ -38,6 +45,90 @@ _TOIN_RE = re.compile(
 )
 def _parse_kv(kv_str: str) -> dict[str, str]:
     """Parse key=value pairs from a PERF log line.
@@ -276,16 +367,35 @@ def format_report(report: PerfReport) -> str:
         total_saved = sum(r.tokens_saved for r in records)
         pct = (total_saved / total_before * 100) if total_before > 0 else 0
-        models = {r.model for r in records}
         lines.append(f"Requests:     {len(records)}")
-        lines.append(f"Models:       {', '.join(sorted(models))}")
-        lines.append(
-            f"Tokens:       {total_before:,} input -> {total_after:,} after transforms "
-            f"({pct:.1f}% reduction)"
-        )
         lines.append(f"Total saved:  {total_saved:,} tokens")
         lines.append("")
         # Cache analysis
         cache_records = [r for r in records if (r.cache_read + r.cache_write) > 0]
         if cache_records:

 Parses PERF log lines from ~/.headroom/logs/proxy.log* and produces
 actionable reports on token savings, cache efficiency, and transform impact.
+Cost accounting is **cache-aware**: saved tokens that would have been served
+from the provider's prompt cache are valued at cache_read price (~10% for
+Anthropic), not the full input price.  This prevents overstating dollar savings.
 """
 from __future__ import annotations
+import logging
 import re
 from dataclasses import dataclass, field
 from pathlib import Path
+log = logging.getLogger(__name__)
 LOG_DIR = Path.home() / ".headroom" / "logs"
 # Matches: 2026-03-07 13:38:31,009 - headroom.proxy - INFO - [hr_...] PERF model=... ...
 )
+# ---------------------------------------------------------------------------
+# Cache-aware pricing via LiteLLM
+# ---------------------------------------------------------------------------
+# LiteLLM already knows per-token costs for 100+ models including
+# cache_read and cache_creation pricing.  We call it directly instead
+# of maintaining our own pricing tables.
+try:
+    import litellm as _litellm
+    _LITELLM_AVAILABLE = True
+except ImportError:
+    _LITELLM_AVAILABLE = False
+# Cache resolved model names (e.g. "claude-opus-4-6" → "anthropic/claude-opus-4-6")
+_resolved_model_cache: dict[str, str] = {}
+def _resolve_model(model: str) -> str:
+    """Resolve to a model name LiteLLM recognises, adding provider prefix if needed.
+    TODO: Duplicated with CostTracker._resolve_litellm_model in proxy/server.py.
+    Extract to shared utility.
+    """
+    if model in _resolved_model_cache:
+        return _resolved_model_cache[model]
+    if not _LITELLM_AVAILABLE:
+        _resolved_model_cache[model] = model
+        return model
+    # Try as-is
+    if model in _litellm.model_cost:
+        _resolved_model_cache[model] = model
+        return model
+    # Try provider prefixes
+    for prefix in ("anthropic/", "openai/", "google/", "mistral/", "deepseek/"):
+        prefixed = f"{prefix}{model}"
+        if prefixed in _litellm.model_cost:
+            _resolved_model_cache[model] = prefixed
+            return prefixed
+    _resolved_model_cache[model] = model
+    return model
+def _litellm_cost(
+    model: str,
+    prompt_tokens: int,
+    cache_read_tokens: int = 0,
+    cache_write_tokens: int = 0,
+) -> float | None:
+    """Compute input cost via litellm.cost_per_token (cache-aware).
+    Returns total input cost in USD, or None if model not found.
+    """
+    if not _LITELLM_AVAILABLE:
+        return None
+    resolved = _resolve_model(model)
+    try:
+        input_cost, _ = _litellm.cost_per_token(
+            model=resolved,
+            prompt_tokens=prompt_tokens,
+            completion_tokens=0,
+            cache_read_input_tokens=cache_read_tokens,
+            cache_creation_input_tokens=cache_write_tokens,
+        )
+        return float(input_cost)
+    except Exception:
+        return None
+def _get_list_price(model: str) -> float | None:
+    """Get list input price per 1M tokens."""
+    if not _LITELLM_AVAILABLE:
+        return None
+    resolved = _resolve_model(model)
+    info = _litellm.model_cost.get(resolved, {})
+    cost_per_token = info.get("input_cost_per_token")
+    return cost_per_token * 1_000_000 if cost_per_token else None
 def _parse_kv(kv_str: str) -> dict[str, str]:
     """Parse key=value pairs from a PERF log line.
         total_saved = sum(r.tokens_saved for r in records)
         pct = (total_saved / total_before * 100) if total_before > 0 else 0
         lines.append(f"Requests:     {len(records)}")
+        lines.append(f"Tokens:       {total_before:,} -> {total_after:,} ({pct:.1f}% reduction)")
         lines.append(f"Total saved:  {total_saved:,} tokens")
         lines.append("")
+        # Per-model breakdown with list prices
+        by_model: dict[str, list[PerfRecord]] = {}
+        for r in records:
+            by_model.setdefault(r.model, []).append(r)
+        lines.append("Per-Model Breakdown")
+        lines.append("-" * 40)
+        for model, model_recs in sorted(by_model.items()):
+            m_saved = sum(r.tokens_saved for r in model_recs)
+            m_before = sum(r.tokens_before for r in model_recs)
+            m_pct = (m_saved / m_before * 100) if m_before > 0 else 0
+            list_price = _get_list_price(model)
+            price_str = f"${list_price:.2f}/MTok" if list_price else "unknown"
+            est_str = (
+                f"  ~${m_saved * list_price / 1_000_000:.2f} at list price" if list_price else ""
+            )
+            lines.append(
+                f"  {model}: {len(model_recs)} reqs, "
+                f"{m_saved:,} tokens saved ({m_pct:.0f}%), "
+                f"list price {price_str}{est_str}"
+            )
+        lines.append("  * Actual bill savings depend on provider caching behavior")
+        lines.append("")
         # Cache analysis
         cache_records = [r for r in records if (r.cache_read + r.cache_write) > 0]
         if cache_records:

headroom/proxy/server.py CHANGED Viewed

@@ -216,10 +216,6 @@ class RequestLog:
     tokens_saved: int
     savings_percent: float
-    # Cost
-    estimated_cost_usd: float | None
-    estimated_savings_usd: float | None
     # Performance
     optimization_latency_ms: float
     total_latency_ms: float | None
@@ -229,6 +225,9 @@ class RequestLog:
     cache_hit: bool
     transforms_applied: list[str]
     # Request/Response (optional, for debugging)
     request_messages: list[dict] | None = None
     response_content: str | None = None
@@ -601,10 +600,13 @@ class CostTracker:
         # Cost tracking - using deque for efficient left-side removal
         self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
-        self._total_cost_usd: float = 0
-        self._total_savings_usd: float = 0
         self._last_prune_time: datetime = datetime.now()
     # Cache resolved model names to avoid repeated litellm lookups.
     # This is critical: litellm.cost_per_token() is synchronous and can block
     # the async event loop if it triggers I/O (lazy model info download).
@@ -667,83 +669,34 @@ class CostTracker:
     ) -> float | None:
         """Estimate cost in USD using LiteLLM's pricing database.
         Args:
             model: Model name for pricing lookup
-            input_tokens: Input tokens sent to API (does NOT include cache_read, which is served from cache)
             output_tokens: Output tokens
-            cache_read_tokens: Tokens read from cache (charged at ~10% of input rate)
-            cache_write_tokens: Tokens written to cache - this is a SUBSET of input_tokens (charged at ~125% of input rate)
         """
         if not LITELLM_AVAILABLE:
             logger.warning("LiteLLM not available - cannot calculate costs")
             return None
         try:
-            # Resolve model name (adds provider prefix if needed, e.g. claude-opus-4-6 → anthropic/claude-opus-4-6)
             resolved_model = self._resolve_litellm_model(model)
-            # cost_per_token returns (total_input_cost, total_output_cost) for the given token counts
-            # Despite the name, it returns total cost not per-token cost
-            # Anthropic's token semantics (all three are SEPARATE, not overlapping):
-            # - input_tokens: tokens sent that are NOT cached (neither read nor written)
-            # - cache_read_input_tokens: tokens served from existing cache
-            # - cache_creation_input_tokens: tokens being written to cache
-            # Total billable = input_tokens + cache_read + cache_write (each at different rates)
-            regular_input = input_tokens  # Don't subtract cache_write, they're separate
-            # Get cost for regular (non-cached) input tokens
-            input_cost, _ = litellm.cost_per_token(
-                model=resolved_model,
-                prompt_tokens=regular_input,
-                completion_tokens=0,
-            )
-            # Get cost for output tokens
-            _, output_cost = litellm.cost_per_token(
                 model=resolved_model,
-                prompt_tokens=0,
                 completion_tokens=output_tokens,
             )
-            # Get model info for cache pricing
-            model_info: dict[str, Any] = {}
-            try:
-                model_info = dict(litellm.get_model_info(resolved_model))
-            except Exception:
-                pass
-            # Calculate cache read cost (typically 10% of input price)
-            cache_read_cost = 0.0
-            if cache_read_tokens > 0:
-                cache_read_cost_per_token = model_info.get("cache_read_input_token_cost")
-                if cache_read_cost_per_token:
-                    cache_read_cost = cache_read_tokens * cache_read_cost_per_token
-                else:
-                    # Fallback: most providers charge ~10% of input price for cache reads
-                    cache_read_full_cost, _ = litellm.cost_per_token(
-                        model=resolved_model,
-                        prompt_tokens=cache_read_tokens,
-                        completion_tokens=0,
-                    )
-                    cache_read_cost = cache_read_full_cost * 0.1
-            # Calculate cache write cost (typically 125% of input price)
-            cache_write_cost = 0.0
-            if cache_write_tokens > 0:
-                cache_write_cost_per_token = model_info.get("cache_creation_input_token_cost")
-                if cache_write_cost_per_token:
-                    cache_write_cost = cache_write_tokens * cache_write_cost_per_token
-                else:
-                    # Fallback: most providers charge ~125% of input price for cache writes
-                    cache_write_full_cost, _ = litellm.cost_per_token(
-                        model=resolved_model,
-                        prompt_tokens=cache_write_tokens,
-                        completion_tokens=0,
-                    )
-                    cache_write_cost = cache_write_full_cost * 1.25
-            total_cost = input_cost + cache_read_cost + cache_write_cost + output_cost
             return float(total_cost) if total_cost > 0 else None
         except Exception as e:
@@ -769,16 +722,13 @@ class CostTracker:
         while self._costs and self._costs[0][0] < cutoff:
             self._costs.popleft()
-    def record_cost(self, cost_usd: float):
-        """Record a cost. Periodically prunes old entries."""
-        self._costs.append((datetime.now(), cost_usd))
-        self._total_cost_usd += cost_usd
-        # Periodically prune old costs to prevent memory growth
-        self._prune_old_costs()
-    def record_savings(self, savings_usd: float):
-        """Record savings from optimization."""
-        self._total_savings_usd += savings_usd
     def get_period_cost(self) -> float:
         """Get cost for current budget period."""
@@ -802,17 +752,55 @@ class CostTracker:
         remaining = self.budget_limit_usd - period_cost
         return remaining > 0, max(0, remaining)
     def stats(self) -> dict:
-        """Get cost statistics."""
         return {
-            "total_cost_usd": round(self._total_cost_usd, 4),
-            "total_savings_usd": round(self._total_savings_usd, 4),
-            "period_cost_usd": round(self.get_period_cost(), 4),
-            "budget_limit_usd": self.budget_limit_usd,
-            "budget_period": self.budget_period,
-            "budget_remaining_usd": round(self.check_budget()[1], 4)
-            if self.budget_limit_usd
-            else None,
         }
@@ -845,9 +833,24 @@ class PrometheusMetrics:
         self.overhead_sum_ms = 0.0
         self.overhead_min_ms = float("inf")
         self.overhead_max_ms = 0.0
-        self.cost_total_usd = 0.0
-        self.savings_total_usd = 0.0
         self._lock = asyncio.Lock()
@@ -860,9 +863,10 @@ class PrometheusMetrics:
         tokens_saved: int,
         latency_ms: float,
         cached: bool = False,
-        cost_usd: float = 0,
-        savings_usd: float = 0,
         overhead_ms: float = 0,
     ):
         """Record metrics for a request."""
         async with self._lock:
@@ -887,9 +891,34 @@ class PrometheusMetrics:
                 self.overhead_sum_ms += overhead_ms
                 self.overhead_min_ms = min(self.overhead_min_ms, overhead_ms)
                 self.overhead_max_ms = max(self.overhead_max_ms, overhead_ms)
-            self.cost_total_usd += cost_usd
-            self.savings_total_usd += savings_usd
     async def record_rate_limited(self):
         async with self._lock:
@@ -934,14 +963,6 @@ class PrometheusMetrics:
                 "# HELP headroom_latency_ms_sum Sum of request latencies",
                 "# TYPE headroom_latency_ms_sum counter",
                 f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
-                "",
-                "# HELP headroom_cost_usd_total Total cost in USD",
-                "# TYPE headroom_cost_usd_total counter",
-                f"headroom_cost_usd_total {self.cost_total_usd:.6f}",
-                "",
-                "# HELP headroom_savings_usd_total Total savings in USD",
-                "# TYPE headroom_savings_usd_total counter",
-                f"headroom_savings_usd_total {self.savings_total_usd:.6f}",
             ]
             # Per-provider metrics
@@ -1406,6 +1427,14 @@ class HeadroomProxy:
         else:
             logger.info("Smart Routing: DISABLED (legacy sequential mode)")
         # LLMLingua status with helpful hint
         if self._llmlingua_status == "enabled":
             logger.info(
@@ -1474,8 +1503,6 @@ class HeadroomProxy:
                 m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
             ) * 100
             logger.info(f"Token savings:         {savings_pct:.1f}%")
-        logger.info(f"Total cost:            ${m.cost_total_usd:.4f}")
-        logger.info(f"Total savings:         ${m.savings_total_usd:.4f}")
         if m.latency_count > 0:
             avg_latency = m.latency_sum_ms / m.latency_count
             logger.info(f"Avg latency:           {avg_latency:.0f}ms")
@@ -1727,6 +1754,8 @@ class HeadroomProxy:
         # Apply optimization
         transforms_applied = []
         optimized_messages = messages
         optimized_tokens = original_tokens
@@ -1745,13 +1774,16 @@ class HeadroomProxy:
                 if result.messages != messages:
                     optimized_messages = result.messages
                     transforms_applied = result.transforms_applied
                     # Use pipeline's token counts for consistency with pipeline logs
                     original_tokens = result.tokens_before
                     optimized_tokens = result.tokens_after
             except Exception as e:
                 logger.warning(f"Optimization failed: {e}")
-        tokens_saved = original_tokens - optimized_tokens
         optimization_latency = (time.time() - start_time) * 1000
         # Hook: post_compress — let hooks observe compression results
@@ -1933,6 +1965,7 @@ class HeadroomProxy:
                         transforms_applied,
                         tags,
                         optimization_latency,
                     )
                 else:
                     backend_response = await self.anthropic_backend.send_message(body, headers)
@@ -1957,22 +1990,11 @@ class HeadroomProxy:
                         latency_ms=total_latency,
                         cached=False,
                         overhead_ms=optimization_latency,
                     )
-                    cost_usd = None
-                    savings_usd = None
                     if self.cost_tracker:
-                        cost_usd = self.cost_tracker.estimate_cost(
-                            model, optimized_tokens, output_tokens
-                        )
-                        original_cost = self.cost_tracker.estimate_cost(
-                            model, original_tokens, output_tokens
-                        )
-                        if cost_usd:
-                            self.cost_tracker.record_cost(cost_usd)
-                        if cost_usd and original_cost:
-                            savings_usd = original_cost - cost_usd
-                            self.cost_tracker.record_savings(savings_usd)
                     # Log request
                     if self.logger:
@@ -1989,8 +2011,6 @@ class HeadroomProxy:
                                 savings_percent=(tokens_saved / original_tokens * 100)
                                 if original_tokens > 0
                                 else 0,
-                                estimated_cost_usd=cost_usd,
-                                estimated_savings_usd=savings_usd,
                                 optimization_latency_ms=optimization_latency,
                                 total_latency_ms=total_latency,
                                 tags=tags,
@@ -2035,6 +2055,7 @@ class HeadroomProxy:
                     tags,
                     optimization_latency,
                     memory_user_id=memory_user_id,
                 )
             else:
                 response = await self._retry_request("POST", url, headers, body)
@@ -2200,45 +2221,14 @@ class HeadroomProxy:
                 total_latency = (time.time() - start_time) * 1000
-                # Parse response for actual token counts from API
-                actual_input_tokens = optimized_tokens  # fallback
                 output_tokens = 0
-                cache_read_tokens = 0
-                cache_write_tokens = 0
                 if resp_json:
                     usage = resp_json.get("usage", {})
-                    actual_input_tokens = usage.get("input_tokens", optimized_tokens)
                     output_tokens = usage.get("output_tokens", 0)
-                    # Anthropic returns cache_read_input_tokens for cached prompt tokens
-                    # These are charged at 10% of the input price
-                    cache_read_tokens = usage.get("cache_read_input_tokens", 0)
-                    # Anthropic returns cache_creation_input_tokens for tokens written to cache
-                    # These are charged at 125% of the input price
-                    cache_write_tokens = usage.get("cache_creation_input_tokens", 0)
-                # Calculate cost using actual API tokens with proper cache pricing
-                cost_usd = None
-                savings_usd = None
                 if self.cost_tracker:
-                    cost_usd = self.cost_tracker.estimate_cost(
-                        model,
-                        actual_input_tokens,
-                        output_tokens,
-                        cache_read_tokens=cache_read_tokens,
-                        cache_write_tokens=cache_write_tokens,
-                    )
-                    # original_cost: what it would have cost without compression
-                    # Use only original_tokens at regular input rate — no cache params,
-                    # since caching is orthogonal to compression savings
-                    original_cost = self.cost_tracker.estimate_cost(
-                        model,
-                        original_tokens,
-                        output_tokens,
-                    )
-                    if cost_usd and original_cost:
-                        savings_usd = original_cost - cost_usd
-                        self.cost_tracker.record_cost(cost_usd)
-                        self.cost_tracker.record_savings(savings_usd)
                 # Cache response
                 if self.cache and response.status_code == 200:
@@ -2250,17 +2240,18 @@ class HeadroomProxy:
                         tokens_saved=tokens_saved,
                     )
-                # Record metrics with actual API tokens
                 await self.metrics.record_request(
                     provider="anthropic",
                     model=model,
-                    input_tokens=actual_input_tokens,
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
-                    cost_usd=cost_usd or 0,
-                    savings_usd=savings_usd or 0,
                     overhead_ms=optimization_latency,
                 )
                 # Log request
@@ -2278,13 +2269,12 @@ class HeadroomProxy:
                             savings_percent=(tokens_saved / original_tokens * 100)
                             if original_tokens > 0
                             else 0,
-                            estimated_cost_usd=cost_usd,
-                            estimated_savings_usd=savings_usd,
                             optimization_latency_ms=optimization_latency,
                             total_latency_ms=total_latency,
                             tags=tags,
                             cache_hit=cache_hit,
                             transforms_applied=transforms_applied,
                             request_messages=messages if self.config.log_full_messages else None,
                         )
                     )
@@ -2295,6 +2285,11 @@ class HeadroomProxy:
                 cr = resp_usage.get("cache_read_input_tokens", 0)
                 cw = resp_usage.get("cache_creation_input_tokens", 0)
                 chp = round(cr / (cr + cw) * 100) if (cr + cw) > 0 else 0
                 logger.info(
                     f"[{request_id}] PERF "
                     f"model={model} msgs={num_msgs} "
@@ -2303,6 +2298,7 @@ class HeadroomProxy:
                     f"cache_read={cr} cache_write={cw} cache_hit_pct={chp} "
                     f"opt_ms={optimization_latency:.0f} "
                     f"transforms={_summarize_transforms(transforms_applied)}"
                 )
                 # Remove compression headers since httpx already decompressed the response
@@ -2427,6 +2423,7 @@ class HeadroomProxy:
         total_optimized_tokens = 0
         total_tokens_saved = 0
         compressed_requests = []
         # Apply compression to each request in the batch
         for batch_req in requests_list:
@@ -2450,12 +2447,13 @@ class HeadroomProxy:
                 )
                 optimized_messages = result.messages
                 # Use pipeline's token counts for consistency with pipeline logs
                 original_tokens = result.tokens_before
                 optimized_tokens = result.tokens_after
                 total_original_tokens += original_tokens
                 total_optimized_tokens += optimized_tokens
-                tokens_saved = original_tokens - optimized_tokens
                 total_tokens_saved += tokens_saved
                 # CCR Tool Injection: Inject retrieval tool if compression occurred
@@ -2519,6 +2517,8 @@ class HeadroomProxy:
                 output_tokens=0,
                 tokens_saved=total_tokens_saved,
                 latency_ms=optimization_latency,
             )
             # Log compression stats
@@ -2857,6 +2857,7 @@ class HeadroomProxy:
         total_optimized_tokens = 0
         total_tokens_saved = 0
         compressed_requests = []
         # Apply compression to each request in the batch
         for idx, batch_req in enumerate(requests_list):
@@ -2897,12 +2898,13 @@ class HeadroomProxy:
                 )
                 optimized_messages = result.messages
                 # Use pipeline's token counts for consistency with pipeline logs
                 original_tokens = result.tokens_before
                 optimized_tokens = result.tokens_after
                 total_original_tokens += original_tokens
                 total_optimized_tokens += optimized_tokens
-                tokens_saved = original_tokens - optimized_tokens
                 total_tokens_saved += tokens_saved
                 # CCR Tool Injection: Inject retrieval tool if compression occurred
@@ -2993,6 +2995,8 @@ class HeadroomProxy:
                 output_tokens=0,
                 tokens_saved=total_tokens_saved,
                 latency_ms=optimization_latency,
             )
             # Log compression stats
@@ -3697,6 +3701,7 @@ class HeadroomProxy:
         tags: dict[str, str],
         optimization_latency: float,
         memory_user_id: str | None = None,
     ) -> StreamingResponse:
         """Stream response with metrics tracking and memory tool handling.
@@ -3719,6 +3724,7 @@ class HeadroomProxy:
             "cache_creation_input_tokens": 0,
             "total_bytes": 0,
             "sse_buffer": "",  # Buffer for incomplete SSE events
         }
         # Track if we need to handle memory tools
@@ -3740,6 +3746,10 @@ class HeadroomProxy:
                     "POST", url, json=body, headers=headers
                 ) as response:
                     async for chunk in response.aiter_bytes():
                         stream_state["total_bytes"] += len(chunk)
                         # Buffer SSE data to handle chunks split across calls
@@ -3901,12 +3911,9 @@ class HeadroomProxy:
                         f"[{request_id}] No usage in stream, estimated {output_tokens} output tokens"
                     )
-                # Use actual tokens from API if available, fallback to estimates
-                # Note: use 'is not None' instead of 'or' to handle 0 correctly
-                api_input_tokens = stream_state["input_tokens"]
-                total_input_tokens = (
-                    api_input_tokens if api_input_tokens is not None else optimized_tokens
-                )
                 cache_read_tokens = stream_state["cache_read_input_tokens"]
                 cache_write_tokens = stream_state["cache_creation_input_tokens"]
@@ -3928,54 +3935,19 @@ class HeadroomProxy:
                     f"transforms={_summarize_transforms(transforms_applied)}"
                 )
-                # Normalize input tokens based on provider semantics:
-                # - Anthropic: input_tokens excludes cache_read (it's separate), pass as-is
-                # - OpenAI/Gemini: input_tokens includes cache_read (it's a subset), subtract it
-                if provider == "anthropic":
-                    # Anthropic's input_tokens = non-cached tokens sent (excludes cache_read)
-                    non_cached_input = total_input_tokens
-                else:
-                    # OpenAI/Gemini's input_tokens = total (includes cache_read)
-                    non_cached_input = total_input_tokens - cache_read_tokens
-                # Calculate cost using actual API tokens with proper cache pricing
-                cost_usd = None
-                savings_usd = None
                 if self.cost_tracker:
-                    cost_usd = self.cost_tracker.estimate_cost(
-                        model,
-                        non_cached_input,
-                        output_tokens,
-                        cache_read_tokens=cache_read_tokens,
-                        cache_write_tokens=cache_write_tokens,
-                    )
-                    # For savings calculation, compare compression benefit using base token rates only
-                    # (cache effects are Anthropic's feature, not Headroom's compression benefit)
-                    compressed_base_cost = self.cost_tracker.estimate_cost(
-                        model,
-                        non_cached_input,
-                        output_tokens,
-                    )
-                    original_base_cost = self.cost_tracker.estimate_cost(
-                        model,
-                        original_tokens,
-                        output_tokens,
-                    )
-                    if cost_usd:
-                        self.cost_tracker.record_cost(cost_usd)
-                    if compressed_base_cost and original_base_cost:
-                        savings_usd = original_base_cost - compressed_base_cost
-                        self.cost_tracker.record_savings(max(0, savings_usd))
                 await self.metrics.record_request(
                     provider=provider,
                     model=model,
-                    input_tokens=total_input_tokens,  # Record total for accurate tracking
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
-                    cost_usd=cost_usd or 0,
-                    savings_usd=savings_usd or 0,
                 )
         return StreamingResponse(
@@ -3996,6 +3968,7 @@ class HeadroomProxy:
         transforms_applied: list[str],
         tags: dict[str, str],
         optimization_latency: float,
     ) -> StreamingResponse:
         """Stream response from Bedrock backend with metrics tracking.
@@ -4007,6 +3980,7 @@ class HeadroomProxy:
         stream_state: dict[str, Any] = {
             "input_tokens": 0,
             "output_tokens": 0,
         }
         async def generate():
@@ -4014,6 +3988,10 @@ class HeadroomProxy:
                 assert self.anthropic_backend is not None
                 async for event in self.anthropic_backend.stream_message(body, headers):
                     # Format as SSE
                     if event.raw_sse:
                         yield event.raw_sse.encode()
@@ -4060,22 +4038,12 @@ class HeadroomProxy:
                     latency_ms=total_latency,
                     cached=False,
                     overhead_ms=optimization_latency,
                 )
-                cost_usd = None
-                savings_usd = None
                 if self.cost_tracker:
-                    cost_usd = self.cost_tracker.estimate_cost(
-                        model, optimized_tokens, output_tokens
-                    )
-                    original_cost = self.cost_tracker.estimate_cost(
-                        model, original_tokens, output_tokens
-                    )
-                    if cost_usd:
-                        self.cost_tracker.record_cost(cost_usd)
-                    if cost_usd and original_cost:
-                        savings_usd = original_cost - cost_usd
-                        self.cost_tracker.record_savings(savings_usd)
                 # Log request
                 if self.logger:
@@ -4092,8 +4060,6 @@ class HeadroomProxy:
                             savings_percent=(tokens_saved / original_tokens * 100)
                             if original_tokens > 0
                             else 0,
-                            estimated_cost_usd=cost_usd,
-                            estimated_savings_usd=savings_usd,
                             optimization_latency_ms=optimization_latency,
                             total_latency_ms=total_latency,
                             tags=tags,
@@ -4230,6 +4196,8 @@ class HeadroomProxy:
         # Optimization
         transforms_applied = []
         optimized_messages = messages
         optimized_tokens = original_tokens
@@ -4245,12 +4213,15 @@ class HeadroomProxy:
                 if result.messages != messages:
                     optimized_messages = result.messages
                     transforms_applied = result.transforms_applied
                     original_tokens = result.tokens_before
                     optimized_tokens = result.tokens_after
             except Exception as e:
                 logger.warning(f"Optimization failed: {e}")
-        tokens_saved = original_tokens - optimized_tokens
         optimization_latency = (time.time() - start_time) * 1000
         # Hook: post_compress
@@ -4335,6 +4306,7 @@ class HeadroomProxy:
                     latency_ms=total_latency,
                     cached=False,
                     overhead_ms=optimization_latency,
                 )
                 if tokens_saved > 0:
@@ -4385,6 +4357,7 @@ class HeadroomProxy:
                     transforms_applied,
                     tags,
                     optimization_latency,
                 )
             else:
                 response = await self._retry_request("POST", url, headers, body)
@@ -4407,30 +4380,8 @@ class HeadroomProxy:
                         f"[{request_id}] Failed to extract cached tokens from OpenAI response: {e}"
                     )
-                # For OpenAI, prompt_tokens is TOTAL (includes cached)
-                # Normalize to non-cached input for consistent cost calculation
-                non_cached_input = total_input_tokens - cache_read_tokens
-                # Cost tracking using actual API tokens
-                cost_usd = savings_usd = None
                 if self.cost_tracker:
-                    cost_usd = self.cost_tracker.estimate_cost(
-                        model,
-                        non_cached_input,  # Pass non-cached portion
-                        output_tokens,
-                        cache_read_tokens=cache_read_tokens,
-                    )
-                    # original_cost: what it would have cost without compression
-                    # No cache params — caching is orthogonal to compression savings
-                    original_cost = self.cost_tracker.estimate_cost(
-                        model,
-                        original_tokens,
-                        output_tokens,
-                    )
-                    if cost_usd and original_cost:
-                        savings_usd = original_cost - cost_usd
-                        self.cost_tracker.record_cost(cost_usd)
-                        self.cost_tracker.record_savings(savings_usd)
                 # Cache
                 if self.cache and response.status_code == 200:
@@ -4438,7 +4389,6 @@ class HeadroomProxy:
                         messages, model, response.content, dict(response.headers), tokens_saved
                     )
-                # Metrics with actual API tokens (total, for accurate tracking)
                 await self.metrics.record_request(
                     provider="openai",
                     model=model,
@@ -4446,8 +4396,9 @@ class HeadroomProxy:
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
-                    cost_usd=cost_usd or 0,
-                    savings_usd=savings_usd or 0,
                 )
                 if tokens_saved > 0:
@@ -4543,8 +4494,6 @@ class HeadroomProxy:
                 tokens_saved=0,
                 latency_ms=latency_ms,
                 cached=False,
-                cost_usd=0,
-                savings_usd=0,
             )
         return Response(
@@ -5191,39 +5140,19 @@ class HeadroomProxy:
                 total_input_tokens = original_tokens  # fallback
                 output_tokens = 0
-                cache_read_tokens = 0
                 try:
                     resp_json = response.json()
                     usage = resp_json.get("usage", {})
                     total_input_tokens = usage.get("input_tokens", original_tokens)
                     output_tokens = usage.get("output_tokens", 0)
-                    # OpenAI returns cached_tokens in prompt_tokens_details (or input_tokens_details)
-                    prompt_details = usage.get(
-                        "prompt_tokens_details", usage.get("input_tokens_details", {})
-                    )
-                    cache_read_tokens = prompt_details.get("cached_tokens", 0)
                 except (KeyError, TypeError, AttributeError) as e:
                     logger.debug(
                         f"[{request_id}] Failed to extract cached tokens from OpenAI passthrough response: {e}"
                     )
-                # For OpenAI, input_tokens is TOTAL (includes cached)
-                # Normalize to non-cached input for consistent cost calculation
-                non_cached_input = total_input_tokens - cache_read_tokens
-                # Cost tracking using actual API tokens
-                cost_usd = savings_usd = None
                 if self.cost_tracker:
-                    cost_usd = self.cost_tracker.estimate_cost(
-                        model,
-                        non_cached_input,  # Pass non-cached portion
-                        output_tokens,
-                        cache_read_tokens=cache_read_tokens,
-                    )
-                    if cost_usd:
-                        self.cost_tracker.record_cost(cost_usd)
-                # Metrics with actual API tokens (total, for accurate tracking)
                 await self.metrics.record_request(
                     provider="openai",
                     model=model,
@@ -5231,8 +5160,7 @@ class HeadroomProxy:
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
-                    cost_usd=cost_usd or 0,
-                    savings_usd=savings_usd or 0,
                 )
                 logger.info(f"[{request_id}] /v1/responses {model}: {total_input_tokens:,} tokens")
@@ -5378,6 +5306,7 @@ class HeadroomProxy:
         # Optimization
         transforms_applied: list[str] = []
         optimized_messages = messages
         optimized_tokens = original_tokens
@@ -5396,10 +5325,12 @@ class HeadroomProxy:
                     # Use pipeline's token counts for consistency with pipeline logs
                     original_tokens = result.tokens_before
                     optimized_tokens = result.tokens_after
             except Exception as e:
                 logger.warning(f"[{request_id}] Gemini optimization failed: {e}")
-        tokens_saved = original_tokens - optimized_tokens
         optimization_latency = (time.time() - start_time) * 1000
         # Query Echo: re-inject user's question after compressed tool outputs
@@ -5481,32 +5412,9 @@ class HeadroomProxy:
                         f"[{request_id}] Failed to extract cached tokens from Gemini response: {e}"
                     )
-                # For Gemini, promptTokenCount is TOTAL (includes cached)
-                # Normalize to non-cached input for consistent cost calculation
-                non_cached_input = total_input_tokens - cache_read_tokens
-                # Cost tracking using actual API tokens
-                cost_usd = savings_usd = None
                 if self.cost_tracker:
-                    cost_usd = self.cost_tracker.estimate_cost(
-                        model,
-                        non_cached_input,  # Pass non-cached portion
-                        output_tokens,
-                        cache_read_tokens=cache_read_tokens,
-                    )
-                    # original_cost: what it would have cost without compression
-                    # No cache params — caching is orthogonal to compression savings
-                    original_cost = self.cost_tracker.estimate_cost(
-                        model,
-                        original_tokens,
-                        output_tokens,
-                    )
-                    if cost_usd and original_cost:
-                        savings_usd = original_cost - cost_usd
-                        self.cost_tracker.record_cost(cost_usd)
-                        self.cost_tracker.record_savings(savings_usd)
-                # Metrics with actual API tokens (total, for accurate tracking)
                 await self.metrics.record_request(
                     provider="gemini",
                     model=model,
@@ -5514,8 +5422,8 @@ class HeadroomProxy:
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
-                    cost_usd=cost_usd or 0,
-                    savings_usd=savings_usd or 0,
                 )
                 if tokens_saved > 0:
@@ -5745,7 +5653,7 @@ class HeadroomProxy:
                 logger.debug(f"[{request_id}] Failed to parse Gemini token count response: {e}")
             # Track stats
-            tokens_saved = original_tokens - compressed_tokens if compressed_tokens > 0 else 0
             await self.metrics.record_request(
                 provider="gemini",
@@ -5754,8 +5662,6 @@ class HeadroomProxy:
                 output_tokens=0,
                 tokens_saved=tokens_saved,
                 latency_ms=total_latency,
-                cost_usd=0,
-                savings_usd=0,
             )
             if tokens_saved > 0:
@@ -5937,16 +5843,23 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
         )
         max_latency_ms = round(m.latency_max_ms, 2) if m.latency_count > 0 else 0
-        # Calculate Headroom overhead (optimization time only)
         avg_overhead_ms = (
-            round(m.overhead_sum_ms / m.latency_count, 2) if m.latency_count > 0 else 0
         )
         min_overhead_ms = (
             round(m.overhead_min_ms, 2)
-            if m.latency_count > 0 and m.overhead_min_ms != float("inf")
             else 0
         )
-        max_overhead_ms = round(m.overhead_max_ms, 2) if m.latency_count > 0 else 0
         # Get compression store stats
         store = get_compression_store()
@@ -5995,6 +5908,25 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
                 "min_ms": min_overhead_ms,
                 "max_ms": max_overhead_ms,
             },
             "cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
             "compression": {
                 "ccr_entries": compression_stats.get("entry_count", 0),

     tokens_saved: int
     savings_percent: float
     # Performance
     optimization_latency_ms: float
     total_latency_ms: float | None
     cache_hit: bool
     transforms_applied: list[str]
+    # Waste signals detected in original messages
+    waste_signals: dict[str, int] | None = None
     # Request/Response (optional, for debugging)
     request_messages: list[dict] | None = None
     response_content: str | None = None
         # Cost tracking - using deque for efficient left-side removal
         self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
         self._last_prune_time: datetime = datetime.now()
+        # Token savings per model (exact, no dollar estimation)
+        self._tokens_saved_by_model: dict[str, int] = {}
+        self._tokens_sent_by_model: dict[str, int] = {}
+        self._requests_by_model: dict[str, int] = {}
     # Cache resolved model names to avoid repeated litellm lookups.
     # This is critical: litellm.cost_per_token() is synchronous and can block
     # the async event loop if it triggers I/O (lazy model info download).
     ) -> float | None:
         """Estimate cost in USD using LiteLLM's pricing database.
+        LiteLLM natively handles cache_read and cache_creation pricing
+        for all providers (Anthropic, OpenAI, Google, etc.) in a single call.
         Args:
             model: Model name for pricing lookup
+            input_tokens: Non-cached input tokens (excludes cache_read)
             output_tokens: Output tokens
+            cache_read_tokens: Tokens served from cache (~10% of input rate)
+            cache_write_tokens: Tokens written to cache (~125% of input rate)
         """
         if not LITELLM_AVAILABLE:
             logger.warning("LiteLLM not available - cannot calculate costs")
             return None
         try:
             resolved_model = self._resolve_litellm_model(model)
+            # litellm.cost_per_token handles all token types natively:
+            # prompt_tokens at input rate, cache_read at ~10%, cache_creation at ~125%
+            input_cost, output_cost = litellm.cost_per_token(
                 model=resolved_model,
+                prompt_tokens=input_tokens,
                 completion_tokens=output_tokens,
+                cache_read_input_tokens=cache_read_tokens,
+                cache_creation_input_tokens=cache_write_tokens,
             )
+            total_cost = input_cost + output_cost
             return float(total_cost) if total_cost > 0 else None
         except Exception as e:
         while self._costs and self._costs[0][0] < cutoff:
             self._costs.popleft()
+    def record_tokens(self, model: str, tokens_saved: int, tokens_sent: int):
+        """Accumulate the exact per-model token counters for one request.
+
+        Args:
+            model: Key under which the running totals are kept.
+            tokens_saved: Amount added to the model's saved-token total.
+            tokens_sent: Amount added to the model's sent-token total.
+        """
+        # Every call also counts as one request for the model; a model seen
+        # for the first time starts each counter from 0.
+        for counters, delta in (
+            (self._tokens_saved_by_model, tokens_saved),
+            (self._tokens_sent_by_model, tokens_sent),
+            (self._requests_by_model, 1),
+        ):
+            counters[model] = counters.get(model, 0) + delta
     def get_period_cost(self) -> float:
         """Get cost for current budget period."""
         remaining = self.budget_limit_usd - period_cost
         return remaining > 0, max(0, remaining)
+    def _get_list_price(self, model: str) -> float | None:
+        """Return the model's list input price per 1M tokens, or None if unknown.
+
+        None is returned when LiteLLM is not available, when the resolved
+        model has no non-zero ``input_cost_per_token`` entry, or when the
+        lookup raises.
+        """
+        if not LITELLM_AVAILABLE:
+            return None
+        try:
+            litellm_name = self._resolve_litellm_model(model)
+            pricing = litellm.model_cost.get(litellm_name, {})
+            per_token = pricing.get("input_cost_per_token")
+            if not per_token:
+                return None
+            # Scale the per-token price up to a per-million-token price.
+            return per_token * 1_000_000
+        except Exception:
+            # Best-effort lookup: any failure is reported as "price unknown".
+            return None
     def stats(self) -> dict:
+        """Summarize per-model token counters plus an input-price cost estimate.
+
+        Returns a dict holding the total tokens saved, a per-model breakdown
+        (requests, tokens saved, tokens sent, reduction percentage) keyed in
+        sorted model order, and the estimated input cost with and without the
+        saved tokens.
+        """
+        saved_map = self._tokens_saved_by_model
+        sent_map = self._tokens_sent_by_model
+
+        per_model = {}
+        for name in sorted(saved_map):
+            n_saved = saved_map[name]
+            n_sent = sent_map.get(name, 0)
+            n_total = n_saved + n_sent
+            if n_total > 0:
+                pct = round(n_saved / n_total * 100, 1)
+            else:
+                pct = 0
+            per_model[name] = {
+                "requests": self._requests_by_model.get(name, 0),
+                "tokens_saved": n_saved,
+                "tokens_sent": n_sent,
+                "reduction_pct": pct,
+            }
+        total_saved = sum(saved_map.values())
+
+        # Counterfactual: what would you have paid without Headroom?
+        # Only input token pricing is used. Output tokens and cache pricing
+        # are excluded since Headroom only compresses input tokens.
+        cost_with_headroom = 0.0
+        cost_without_headroom = 0.0
+        for name, n_saved in saved_map.items():
+            unit_price = self._get_list_price(name)
+            if not unit_price:
+                # No usable price for this model: it contributes to neither total.
+                continue
+            n_sent = sent_map.get(name, 0)
+            cost_with_headroom += (n_sent / 1_000_000) * unit_price
+            cost_without_headroom += ((n_saved + n_sent) / 1_000_000) * unit_price
         return {
+            "total_tokens_saved": total_saved,
+            "per_model": per_model,
+            "cost_with_headroom_usd": round(cost_with_headroom, 4),
+            "cost_without_headroom_usd": round(cost_without_headroom, 4),
+            "savings_usd": round(cost_without_headroom - cost_with_headroom, 4),
         }
         self.overhead_sum_ms = 0.0
         self.overhead_min_ms = float("inf")
         self.overhead_max_ms = 0.0
+        self.overhead_count = 0
+        # Time to first byte (TTFB) from upstream — what the user actually feels
+        self.ttfb_sum_ms = 0.0
+        self.ttfb_min_ms = float("inf")
+        self.ttfb_max_ms = 0.0
+        self.ttfb_count = 0
+        # Per-transform timing (name → cumulative ms, count)
+        self.transform_timing_sum: dict[str, float] = defaultdict(float)
+        self.transform_timing_count: dict[str, int] = defaultdict(int)
+        self.transform_timing_max: dict[str, float] = defaultdict(float)
+        # Aggregate waste signals
+        self.waste_signals_total: dict[str, int] = defaultdict(int)
+        # Cumulative savings history (timestamp → cumulative tokens saved)
+        self.savings_history: list[tuple[str, int]] = []
         self._lock = asyncio.Lock()
         tokens_saved: int,
         latency_ms: float,
         cached: bool = False,
         overhead_ms: float = 0,
+        ttfb_ms: float = 0,
+        pipeline_timing: dict[str, float] | None = None,
+        waste_signals: dict[str, int] | None = None,
     ):
         """Record metrics for a request."""
         async with self._lock:
                 self.overhead_sum_ms += overhead_ms
                 self.overhead_min_ms = min(self.overhead_min_ms, overhead_ms)
                 self.overhead_max_ms = max(self.overhead_max_ms, overhead_ms)
+                self.overhead_count += 1
+            # Track TTFB (time to first byte from upstream)
+            if ttfb_ms > 0:
+                self.ttfb_sum_ms += ttfb_ms
+                self.ttfb_min_ms = min(self.ttfb_min_ms, ttfb_ms)
+                self.ttfb_max_ms = max(self.ttfb_max_ms, ttfb_ms)
+                self.ttfb_count += 1
+            # Track per-transform timing
+            if pipeline_timing:
+                for name, ms in pipeline_timing.items():
+                    self.transform_timing_sum[name] += ms
+                    self.transform_timing_count[name] += 1
+                    self.transform_timing_max[name] = max(self.transform_timing_max[name], ms)
+            # Track waste signals
+            if waste_signals:
+                for signal_name, token_count in waste_signals.items():
+                    self.waste_signals_total[signal_name] += token_count
+            # Track cumulative savings history (record every request)
+            from datetime import datetime
+            self.savings_history.append((datetime.now().isoformat(), self.tokens_saved_total))
+            # Keep last 500 data points
+            if len(self.savings_history) > 500:
+                self.savings_history = self.savings_history[-500:]
     async def record_rate_limited(self):
         async with self._lock:
                 "# HELP headroom_latency_ms_sum Sum of request latencies",
                 "# TYPE headroom_latency_ms_sum counter",
                 f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
             ]
             # Per-provider metrics
         else:
             logger.info("Smart Routing: DISABLED (legacy sequential mode)")
+        # Eagerly load LLMLingua model at startup (avoids 5s delay on first request)
+        if self.config.llmlingua_enabled:
+            for transform in self.anthropic_pipeline.transforms:
+                if hasattr(transform, "eager_load_compressors"):
+                    transform.eager_load_compressors()
+                    self._llmlingua_status = "enabled"
+                    break
         # LLMLingua status with helpful hint
         if self._llmlingua_status == "enabled":
             logger.info(
                 m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
             ) * 100
             logger.info(f"Token savings:         {savings_pct:.1f}%")
         if m.latency_count > 0:
             avg_latency = m.latency_sum_ms / m.latency_count
             logger.info(f"Avg latency:           {avg_latency:.0f}ms")
         # Apply optimization
         transforms_applied = []
+        pipeline_timing: dict[str, float] = {}
+        waste_signals_dict: dict[str, int] | None = None
         optimized_messages = messages
         optimized_tokens = original_tokens
                 if result.messages != messages:
                     optimized_messages = result.messages
                     transforms_applied = result.transforms_applied
+                    pipeline_timing = result.timing
                     # Use pipeline's token counts for consistency with pipeline logs
                     original_tokens = result.tokens_before
                     optimized_tokens = result.tokens_after
+                if result.waste_signals:
+                    waste_signals_dict = result.waste_signals.to_dict()
             except Exception as e:
                 logger.warning(f"Optimization failed: {e}")
+        tokens_saved = max(0, original_tokens - optimized_tokens)
         optimization_latency = (time.time() - start_time) * 1000
         # Hook: post_compress — let hooks observe compression results
                         transforms_applied,
                         tags,
                         optimization_latency,
+                        pipeline_timing=pipeline_timing,
                     )
                 else:
                     backend_response = await self.anthropic_backend.send_message(body, headers)
                         latency_ms=total_latency,
                         cached=False,
                         overhead_ms=optimization_latency,
+                        pipeline_timing=pipeline_timing,
                     )
                     if self.cost_tracker:
+                        self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
                     # Log request
                     if self.logger:
                                 savings_percent=(tokens_saved / original_tokens * 100)
                                 if original_tokens > 0
                                 else 0,
                                 optimization_latency_ms=optimization_latency,
                                 total_latency_ms=total_latency,
                                 tags=tags,
                     tags,
                     optimization_latency,
                     memory_user_id=memory_user_id,
+                    pipeline_timing=pipeline_timing,
                 )
             else:
                 response = await self._retry_request("POST", url, headers, body)
                 total_latency = (time.time() - start_time) * 1000
+                # Parse response for output token count
                 output_tokens = 0
                 if resp_json:
                     usage = resp_json.get("usage", {})
                     output_tokens = usage.get("output_tokens", 0)
                 if self.cost_tracker:
+                    self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
                 # Cache response
                 if self.cache and response.status_code == 200:
                         tokens_saved=tokens_saved,
                     )
+                # Record metrics — use optimized_tokens (what we sent), not API's
+                # input_tokens which is just the non-cached portion with prompt caching
                 await self.metrics.record_request(
                     provider="anthropic",
                     model=model,
+                    input_tokens=optimized_tokens,
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
                     overhead_ms=optimization_latency,
+                    pipeline_timing=pipeline_timing,
+                    waste_signals=waste_signals_dict,
                 )
                 # Log request
                             savings_percent=(tokens_saved / original_tokens * 100)
                             if original_tokens > 0
                             else 0,
                             optimization_latency_ms=optimization_latency,
                             total_latency_ms=total_latency,
                             tags=tags,
                             cache_hit=cache_hit,
                             transforms_applied=transforms_applied,
+                            waste_signals=waste_signals_dict,
                             request_messages=messages if self.config.log_full_messages else None,
                         )
                     )
                 cr = resp_usage.get("cache_read_input_tokens", 0)
                 cw = resp_usage.get("cache_creation_input_tokens", 0)
                 chp = round(cr / (cr + cw) * 100) if (cr + cw) > 0 else 0
+                timing_str = (
+                    " ".join(f"{k}={v:.0f}ms" for k, v in pipeline_timing.items())
+                    if pipeline_timing
+                    else ""
+                )
                 logger.info(
                     f"[{request_id}] PERF "
                     f"model={model} msgs={num_msgs} "
                     f"cache_read={cr} cache_write={cw} cache_hit_pct={chp} "
                     f"opt_ms={optimization_latency:.0f} "
                     f"transforms={_summarize_transforms(transforms_applied)}"
+                    f"{' timing=' + timing_str if timing_str else ''}"
                 )
                 # Remove compression headers since httpx already decompressed the response
         total_optimized_tokens = 0
         total_tokens_saved = 0
         compressed_requests = []
+        pipeline_timing: dict[str, float] = {}
         # Apply compression to each request in the batch
         for batch_req in requests_list:
                 )
                 optimized_messages = result.messages
+                pipeline_timing = result.timing
                 # Use pipeline's token counts for consistency with pipeline logs
                 original_tokens = result.tokens_before
                 optimized_tokens = result.tokens_after
                 total_original_tokens += original_tokens
                 total_optimized_tokens += optimized_tokens
+                tokens_saved = max(0, original_tokens - optimized_tokens)
                 total_tokens_saved += tokens_saved
                 # CCR Tool Injection: Inject retrieval tool if compression occurred
                 output_tokens=0,
                 tokens_saved=total_tokens_saved,
                 latency_ms=optimization_latency,
+                overhead_ms=optimization_latency,
+                pipeline_timing=pipeline_timing,
             )
             # Log compression stats
         total_optimized_tokens = 0
         total_tokens_saved = 0
         compressed_requests = []
+        pipeline_timing: dict[str, float] = {}
         # Apply compression to each request in the batch
         for idx, batch_req in enumerate(requests_list):
                 )
                 optimized_messages = result.messages
+                pipeline_timing = result.timing
                 # Use pipeline's token counts for consistency with pipeline logs
                 original_tokens = result.tokens_before
                 optimized_tokens = result.tokens_after
                 total_original_tokens += original_tokens
                 total_optimized_tokens += optimized_tokens
+                tokens_saved = max(0, original_tokens - optimized_tokens)
                 total_tokens_saved += tokens_saved
                 # CCR Tool Injection: Inject retrieval tool if compression occurred
                 output_tokens=0,
                 tokens_saved=total_tokens_saved,
                 latency_ms=optimization_latency,
+                overhead_ms=optimization_latency,
+                pipeline_timing=pipeline_timing,
             )
             # Log compression stats
         tags: dict[str, str],
         optimization_latency: float,
         memory_user_id: str | None = None,
+        pipeline_timing: dict[str, float] | None = None,
     ) -> StreamingResponse:
         """Stream response with metrics tracking and memory tool handling.
             "cache_creation_input_tokens": 0,
             "total_bytes": 0,
             "sse_buffer": "",  # Buffer for incomplete SSE events
+            "ttfb_ms": None,  # Time to first byte from upstream
         }
         # Track if we need to handle memory tools
                     "POST", url, json=body, headers=headers
                 ) as response:
                     async for chunk in response.aiter_bytes():
+                        # Record TTFB on first chunk
+                        if stream_state["ttfb_ms"] is None:
+                            stream_state["ttfb_ms"] = (time.time() - start_time) * 1000
                         stream_state["total_bytes"] += len(chunk)
                         # Buffer SSE data to handle chunks split across calls
                         f"[{request_id}] No usage in stream, estimated {output_tokens} output tokens"
                     )
+                # Use optimized_tokens for dashboard metrics (what we actually sent).
+                # API's input_tokens is the non-cached portion only, which is
+                # misleading for aggregation (often just 1 with prompt caching).
                 cache_read_tokens = stream_state["cache_read_input_tokens"]
                 cache_write_tokens = stream_state["cache_creation_input_tokens"]
                     f"transforms={_summarize_transforms(transforms_applied)}"
                 )
                 if self.cost_tracker:
+                    self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
                 await self.metrics.record_request(
                     provider=provider,
                     model=model,
+                    input_tokens=optimized_tokens,  # What we sent, not API's non-cached count
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
+                    overhead_ms=optimization_latency,
+                    ttfb_ms=stream_state["ttfb_ms"] or 0,
+                    pipeline_timing=pipeline_timing,
                 )
         return StreamingResponse(
         transforms_applied: list[str],
         tags: dict[str, str],
         optimization_latency: float,
+        pipeline_timing: dict[str, float] | None = None,
     ) -> StreamingResponse:
         """Stream response from Bedrock backend with metrics tracking.
         stream_state: dict[str, Any] = {
             "input_tokens": 0,
             "output_tokens": 0,
+            "ttfb_ms": None,
         }
         async def generate():
                 assert self.anthropic_backend is not None
                 async for event in self.anthropic_backend.stream_message(body, headers):
+                    # Record TTFB on first event
+                    if stream_state["ttfb_ms"] is None:
+                        stream_state["ttfb_ms"] = (time.time() - start_time) * 1000
                     # Format as SSE
                     if event.raw_sse:
                         yield event.raw_sse.encode()
                     latency_ms=total_latency,
                     cached=False,
                     overhead_ms=optimization_latency,
+                    ttfb_ms=stream_state["ttfb_ms"] or 0,
+                    pipeline_timing=pipeline_timing,
                 )
                 if self.cost_tracker:
+                    self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
                 # Log request
                 if self.logger:
                             savings_percent=(tokens_saved / original_tokens * 100)
                             if original_tokens > 0
                             else 0,
                             optimization_latency_ms=optimization_latency,
                             total_latency_ms=total_latency,
                             tags=tags,
         # Optimization
         transforms_applied = []
+        pipeline_timing: dict[str, float] = {}
+        waste_signals_dict: dict[str, int] | None = None
         optimized_messages = messages
         optimized_tokens = original_tokens
                 if result.messages != messages:
                     optimized_messages = result.messages
                     transforms_applied = result.transforms_applied
+                    pipeline_timing = result.timing
                     original_tokens = result.tokens_before
                     optimized_tokens = result.tokens_after
+                if result.waste_signals:
+                    waste_signals_dict = result.waste_signals.to_dict()
             except Exception as e:
                 logger.warning(f"Optimization failed: {e}")
+        tokens_saved = max(0, original_tokens - optimized_tokens)
         optimization_latency = (time.time() - start_time) * 1000
         # Hook: post_compress
                     latency_ms=total_latency,
                     cached=False,
                     overhead_ms=optimization_latency,
+                    pipeline_timing=pipeline_timing,
                 )
                 if tokens_saved > 0:
                     transforms_applied,
                     tags,
                     optimization_latency,
+                    pipeline_timing=pipeline_timing,
                 )
             else:
                 response = await self._retry_request("POST", url, headers, body)
                         f"[{request_id}] Failed to extract cached tokens from OpenAI response: {e}"
                     )
                 if self.cost_tracker:
+                    self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
                 # Cache
                 if self.cache and response.status_code == 200:
                         messages, model, response.content, dict(response.headers), tokens_saved
                     )
                 await self.metrics.record_request(
                     provider="openai",
                     model=model,
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
+                    overhead_ms=optimization_latency,
+                    pipeline_timing=pipeline_timing,
+                    waste_signals=waste_signals_dict,
                 )
                 if tokens_saved > 0:
                 tokens_saved=0,
                 latency_ms=latency_ms,
                 cached=False,
             )
         return Response(
                 total_input_tokens = original_tokens  # fallback
                 output_tokens = 0
                 try:
                     resp_json = response.json()
                     usage = resp_json.get("usage", {})
                     total_input_tokens = usage.get("input_tokens", original_tokens)
                     output_tokens = usage.get("output_tokens", 0)
                 except (KeyError, TypeError, AttributeError) as e:
                     logger.debug(
                         f"[{request_id}] Failed to extract cached tokens from OpenAI passthrough response: {e}"
                     )
                 if self.cost_tracker:
+                    self.cost_tracker.record_tokens(model, tokens_saved, total_input_tokens)
                 await self.metrics.record_request(
                     provider="openai",
                     model=model,
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
+                    overhead_ms=optimization_latency,
                 )
                 logger.info(f"[{request_id}] /v1/responses {model}: {total_input_tokens:,} tokens")
         # Optimization
         transforms_applied: list[str] = []
+        waste_signals_dict: dict[str, int] | None = None
         optimized_messages = messages
         optimized_tokens = original_tokens
                     # Use pipeline's token counts for consistency with pipeline logs
                     original_tokens = result.tokens_before
                     optimized_tokens = result.tokens_after
+                if result.waste_signals:
+                    waste_signals_dict = result.waste_signals.to_dict()
             except Exception as e:
                 logger.warning(f"[{request_id}] Gemini optimization failed: {e}")
+        tokens_saved = max(0, original_tokens - optimized_tokens)
         optimization_latency = (time.time() - start_time) * 1000
         # Query Echo: re-inject user's question after compressed tool outputs
                         f"[{request_id}] Failed to extract cached tokens from Gemini response: {e}"
                     )
                 if self.cost_tracker:
+                    self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
                 await self.metrics.record_request(
                     provider="gemini",
                     model=model,
                     output_tokens=output_tokens,
                     tokens_saved=tokens_saved,
                     latency_ms=total_latency,
+                    overhead_ms=optimization_latency,
+                    waste_signals=waste_signals_dict,
                 )
                 if tokens_saved > 0:
                 logger.debug(f"[{request_id}] Failed to parse Gemini token count response: {e}")
             # Track stats
+            tokens_saved = max(0, original_tokens - compressed_tokens) if compressed_tokens > 0 else 0
             await self.metrics.record_request(
                 provider="gemini",
                 output_tokens=0,
                 tokens_saved=tokens_saved,
                 latency_ms=total_latency,
             )
             if tokens_saved > 0:
         )
         max_latency_ms = round(m.latency_max_ms, 2) if m.latency_count > 0 else 0
+        # Calculate Headroom overhead (optimization time only, excludes pass-through requests)
         avg_overhead_ms = (
+            round(m.overhead_sum_ms / m.overhead_count, 2) if m.overhead_count > 0 else 0
         )
         min_overhead_ms = (
             round(m.overhead_min_ms, 2)
+            if m.overhead_count > 0 and m.overhead_min_ms != float("inf")
             else 0
         )
+        max_overhead_ms = round(m.overhead_max_ms, 2) if m.overhead_count > 0 else 0
+        # Calculate TTFB (time to first byte)
+        avg_ttfb_ms = round(m.ttfb_sum_ms / m.ttfb_count, 2) if m.ttfb_count > 0 else 0
+        min_ttfb_ms = (
+            round(m.ttfb_min_ms, 2) if m.ttfb_count > 0 and m.ttfb_min_ms != float("inf") else 0
+        )
+        max_ttfb_ms = round(m.ttfb_max_ms, 2) if m.ttfb_count > 0 else 0
         # Get compression store stats
         store = get_compression_store()
                 "min_ms": min_overhead_ms,
                 "max_ms": max_overhead_ms,
             },
+            "ttfb": {
+                "average_ms": avg_ttfb_ms,
+                "min_ms": min_ttfb_ms,
+                "max_ms": max_ttfb_ms,
+            },
+            "pipeline_timing": {
+                name: {
+                    "average_ms": round(
+                        m.transform_timing_sum[name] / m.transform_timing_count[name], 2
+                    ),
+                    "max_ms": round(m.transform_timing_max[name], 2),
+                    "count": m.transform_timing_count[name],
+                }
+                for name in sorted(m.transform_timing_sum.keys())
+            }
+            if m.transform_timing_sum
+            else {},
+            "waste_signals": dict(m.waste_signals_total) if m.waste_signals_total else {},
+            "savings_history": m.savings_history[-100:],  # Last 100 data points
             "cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
             "compression": {
                 "ccr_entries": compression_stats.get("entry_count", 0),

headroom/transforms/code_compressor.py CHANGED Viewed

@@ -867,6 +867,25 @@ class CodeAwareCompressor(Transform):
             ratio = compressed_tokens / max(original_tokens, 1)
             # Store in CCR if significant compression
             cache_key = None
             if self.config.enable_ccr and ratio < 0.8:

             ratio = compressed_tokens / max(original_tokens, 1)
+            # Guard against over-aggressive compression (data loss).
+            # If AST extraction stripped content to <5% of original,
+            # the output is essentially empty — return original.
+            if ratio < 0.05:
+                logger.warning(
+                    "Code compression too aggressive (ratio=%.3f), returning original",
+                    ratio,
+                )
+                return CodeCompressionResult(
+                    compressed=code,
+                    original=code,
+                    original_tokens=original_tokens,
+                    compressed_tokens=original_tokens,
+                    compression_ratio=1.0,
+                    language=detected_lang,
+                    language_confidence=confidence,
+                    syntax_valid=True,
+                )
             # Store in CCR if significant compression
             cache_key = None
             if self.config.enable_ccr and ratio < 0.8:

headroom/transforms/content_router.py CHANGED Viewed

@@ -38,6 +38,7 @@ from __future__ import annotations
 import hashlib
 import logging
 import re
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any
@@ -132,6 +133,112 @@ def _create_content_signature(
         return None
 class CompressionStrategy(Enum):
     """Available compression strategies."""
@@ -533,6 +640,8 @@ class ContentRouter(Transform):
         # TOIN integration for cross-strategy learning
         self._toin: Any = None
     def _record_to_toin(
         self,
         strategy: CompressionStrategy,
@@ -1042,6 +1151,25 @@ class ContentRouter(Transform):
                 logger.debug("HTMLExtractor not available (install trafilatura)")
         return self._html_extractor
     def _get_llmlingua(self) -> Any:
         """Get LLMLinguaCompressor (lazy load)."""
         if self._llmlingua is None:
@@ -1269,6 +1397,7 @@ class ContentRouter(Transform):
         transformed_messages: list[dict[str, Any]] = []
         transforms_applied: list[str] = []
         warnings: list[str] = []
         # Routing reason counters for summary logging
         route_counts: dict[str, int] = {
@@ -1309,6 +1438,7 @@ class ContentRouter(Transform):
                     min_ratio=min_ratio,
                     read_protection_window=read_protection_window,
                     messages_from_end=messages_from_end,
                 )
                 transformed_messages.append(transformed_message)
                 route_counts["content_blocks"] += 1
@@ -1375,14 +1505,70 @@ class ContentRouter(Transform):
                 route_counts["analysis_ctx"] += 1
                 continue
             # Route and compress based on content detection
             # Merge tool-specific bias with hook-provided bias (multiplicative)
             msg_bias = bias if role == "tool" else 1.0
             if i in hook_biases:
                 msg_bias *= hook_biases[i]
             result = self.compress(content, context=context, bias=msg_bias)
             if result.compression_ratio < min_ratio:
                 transformed_messages.append({**message, "content": result.compressed})
                 transforms_applied.append(
                     f"router:{result.strategy_used.value}:{result.compression_ratio:.2f}"
@@ -1391,6 +1577,8 @@ class ContentRouter(Transform):
                     f"{result.strategy_used.value}:{result.compression_ratio:.2f}"
                 )
             else:
                 transformed_messages.append(message)
                 route_counts["ratio_too_high"] += 1
@@ -1398,6 +1586,14 @@ class ContentRouter(Transform):
             tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
         )
         # Log routing summary
         parts = []
         if compressed_details:
@@ -1412,12 +1608,24 @@ class ContentRouter(Transform):
             parts.append(f"{route_counts['recent_code']} protected (recent code)")
         if route_counts["analysis_ctx"]:
             parts.append(f"{route_counts['analysis_ctx']} protected (analysis ctx)")
         if route_counts["ratio_too_high"]:
             parts.append(f"{route_counts['ratio_too_high']} unchanged (ratio>={min_ratio:.2f})")
         if route_counts["content_blocks"]:
             parts.append(f"{route_counts['content_blocks']} content-block msgs")
         if route_counts["non_string"]:
             parts.append(f"{route_counts['non_string']} non-string")
         if parts:
             logger.info(
                 "content_router: %d msgs — %s",
@@ -1433,6 +1641,7 @@ class ContentRouter(Transform):
             transforms_applied=all_transforms if all_transforms else ["router:noop"],
             markers_inserted=lifecycle_ccr_hashes,
             warnings=warnings,
         )
     def _get_tool_bias(self, tool_name: str) -> float:
@@ -1469,6 +1678,7 @@ class ContentRouter(Transform):
         min_ratio: float = 0.85,
         read_protection_window: int = 8,
         messages_from_end: int = 0,
     ) -> dict[str, Any]:
         """Process content blocks (Anthropic format) for tool_result compression.
@@ -1523,9 +1733,70 @@ class ContentRouter(Transform):
                 # Only process string content
                 if isinstance(tool_content, str) and len(tool_content) > 500:
-                    # Compress using content detection (will auto-detect JSON arrays, etc.)
                     result = self.compress(tool_content, context=context, bias=bias)
                     if result.compression_ratio < min_ratio:
                         new_blocks.append({**block, "content": result.compressed})
                         transforms_applied.append(
                             f"router:tool_result:{result.strategy_used.value}"
@@ -1537,6 +1808,8 @@ class ContentRouter(Transform):
                         any_compressed = True
                         continue
                     else:
                         if route_counts is not None:
                             route_counts["ratio_too_high"] += 1
                 else:

 import hashlib
 import logging
 import re
+import time
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any
         return None
+class CompressionCache:
+    """Two-tier compression cache with TTL.
+    Tier 1 (skip set): content hashes that won't compress — instant skip,
+    near-zero memory (just ints in a set).
+    Tier 2 (result cache): compressed results for content that DID compress —
+    reuse the compressed text on subsequent requests.
+    Entries expire after TTL (default 30min). No max-entries cap — TTL is the
+    natural bound. Memory grows proportional to compressible content × TTL,
+    which is bounded by session duration.
+    Uses in-process dict for ultra-fast lookups (~100ns). Could be backed
+    by memcached/Redis for multi-process deployments.
+    """
+    def __init__(self, ttl_seconds: int = 1800):
+        # Tier 2: compressed results {hash: (text, ratio, strategy, timestamp)}
+        self._results: dict[int, tuple[str, float, str, float]] = {}
+        # Tier 1: hashes of content that won't compress {hash: timestamp}
+        self._skip: dict[int, float] = {}
+        self._ttl_seconds = ttl_seconds
+        # Metrics
+        self._hits = 0
+        self._misses = 0
+        self._skip_hits = 0
+        self._evictions = 0
+        self._total_lookup_ns = 0
+        self._lookup_count = 0
+    def get(self, key: int) -> tuple[str, float, str] | None:
+        """Get cached compression result.
+        Returns (compressed_text, ratio, strategy) or None if not found/expired.
+        Use is_skipped() first to check if content is known non-compressible.
+        """
+        t0 = time.perf_counter_ns()
+        entry = self._results.get(key)
+        if entry is not None:
+            compressed, ratio, strategy, created_at = entry
+            if (time.time() - created_at) < self._ttl_seconds:
+                self._hits += 1
+                self._total_lookup_ns += time.perf_counter_ns() - t0
+                self._lookup_count += 1
+                return (compressed, ratio, strategy)
+            else:
+                del self._results[key]
+                self._evictions += 1
+        self._misses += 1
+        self._total_lookup_ns += time.perf_counter_ns() - t0
+        self._lookup_count += 1
+        return None
+    def is_skipped(self, key: int) -> bool:
+        """Check if content is known non-compressible (Tier 1)."""
+        ts = self._skip.get(key)
+        if ts is not None:
+            if (time.time() - ts) < self._ttl_seconds:
+                self._skip_hits += 1
+                return True
+            else:
+                del self._skip[key]
+                self._evictions += 1
+        return False
+    def put(self, key: int, compressed: str, ratio: float, strategy: str) -> None:
+        """Store a compressed result (Tier 2)."""
+        self._results[key] = (compressed, ratio, strategy, time.time())
+    def mark_skip(self, key: int) -> None:
+        """Mark content as non-compressible (Tier 1)."""
+        self._skip[key] = time.time()
+    def move_to_skip(self, key: int) -> None:
+        """Move a result to skip set (threshold tightened, no longer qualifies)."""
+        self._results.pop(key, None)
+        self._skip[key] = time.time()
+    @property
+    def size(self) -> int:
+        return len(self._results)
+    @property
+    def skip_size(self) -> int:
+        return len(self._skip)
+    @property
+    def stats(self) -> dict[str, int | float]:
+        avg_ns = self._total_lookup_ns / self._lookup_count if self._lookup_count else 0
+        return {
+            "cache_hits": self._hits,
+            "cache_skip_hits": self._skip_hits,
+            "cache_misses": self._misses,
+            "cache_evictions": self._evictions,
+            "cache_size": len(self._results),
+            "cache_skip_size": len(self._skip),
+            "cache_avg_lookup_ns": avg_ns,
+        }
+    def clear(self) -> None:
+        """Clear all entries (e.g., on session end)."""
+        self._results.clear()
+        self._skip.clear()
 class CompressionStrategy(Enum):
     """Available compression strategies."""
         # TOIN integration for cross-strategy learning
         self._toin: Any = None
+        self._cache = CompressionCache()
     def _record_to_toin(
         self,
         strategy: CompressionStrategy,
                 logger.debug("HTMLExtractor not available (install trafilatura)")
         return self._html_extractor
+    def eager_load_compressors(self) -> None:
+        """Pre-load compressors at startup to avoid first-request latency.
+        Call this during proxy startup to load LLMLingua model (~5s)
+        before any requests arrive.
+        """
+        if self.config.enable_llmlingua:
+            compressor = self._get_llmlingua()
+            if compressor:
+                # Trigger the underlying model load by accessing it
+                try:
+                    from .llmlingua_compressor import _get_llmlingua_compressor
+                    device = compressor._resolve_device()
+                    _get_llmlingua_compressor(compressor.config.model_name, device)
+                    logger.info("LLMLingua model pre-loaded at startup")
+                except Exception as e:
+                    logger.warning("Failed to pre-load LLMLingua model: %s", e)
     def _get_llmlingua(self) -> Any:
         """Get LLMLinguaCompressor (lazy load)."""
         if self._llmlingua is None:
         transformed_messages: list[dict[str, Any]] = []
         transforms_applied: list[str] = []
         warnings: list[str] = []
+        compressor_timing: dict[str, float] = {}  # strategy → cumulative ms
         # Routing reason counters for summary logging
         route_counts: dict[str, int] = {
                     min_ratio=min_ratio,
                     read_protection_window=read_protection_window,
                     messages_from_end=messages_from_end,
+                    compressor_timing=compressor_timing,
                 )
                 transformed_messages.append(transformed_message)
                 route_counts["content_blocks"] += 1
                 route_counts["analysis_ctx"] += 1
                 continue
+            # Compression pinning: if this message was already compressed
+            # (contains a CCR retrieval marker), skip recompression.
+            # Recompressing would change byte content and break provider
+            # prefix caching with no meaningful further reduction.
+            if "Retrieve more: hash=" in content or "Retrieve original: hash=" in content:
+                transformed_messages.append(message)
+                route_counts.setdefault("already_compressed", 0)
+                route_counts["already_compressed"] += 1
+                continue
             # Route and compress based on content detection
             # Merge tool-specific bias with hook-provided bias (multiplicative)
             msg_bias = bias if role == "tool" else 1.0
             if i in hook_biases:
                 msg_bias *= hook_biases[i]
+            # Two-tier compression cache.
+            # Tier 1 (skip): known won't-compress → instant skip.
+            # Tier 2 (result): known compresses → reuse compressed text.
+            content_key = hash(content)
+            # Tier 1: skip set — instant rejection
+            if self._cache.is_skipped(content_key):
+                transformed_messages.append(message)
+                route_counts["ratio_too_high"] += 1
+                route_counts.setdefault("cache_hit", 0)
+                route_counts["cache_hit"] += 1
+                continue
+            # Tier 2: result cache — reuse compressed output
+            cached = self._cache.get(content_key)
+            if cached is not None:
+                cached_compressed, cached_ratio, cached_strategy = cached
+                # Re-check ratio against current min_ratio (shifts with context pressure)
+                if cached_ratio < min_ratio:
+                    transformed_messages.append({**message, "content": cached_compressed})
+                    transforms_applied.append(f"router:{cached_strategy}:{cached_ratio:.2f}")
+                    compressed_details.append(f"{cached_strategy}:{cached_ratio:.2f}")
+                else:
+                    # Threshold tightened — no longer qualifies. Move to skip.
+                    self._cache.move_to_skip(content_key)
+                    transformed_messages.append(message)
+                    route_counts["ratio_too_high"] += 1
+                route_counts.setdefault("cache_hit", 0)
+                route_counts["cache_hit"] += 1
+                continue
+            # Cache miss — run full compression
+            route_counts.setdefault("cache_miss", 0)
+            route_counts["cache_miss"] += 1
+            t0 = time.perf_counter()
             result = self.compress(content, context=context, bias=msg_bias)
+            compress_ms = (time.perf_counter() - t0) * 1000
+            strategy_key = f"compressor:{result.strategy_used.value}"
+            compressor_timing[strategy_key] = compressor_timing.get(strategy_key, 0.0) + compress_ms
             if result.compression_ratio < min_ratio:
+                # Compressed — store in result cache
+                self._cache.put(
+                    content_key,
+                    result.compressed,
+                    result.compression_ratio,
+                    result.strategy_used.value,
+                )
                 transformed_messages.append({**message, "content": result.compressed})
                 transforms_applied.append(
                     f"router:{result.strategy_used.value}:{result.compression_ratio:.2f}"
                     f"{result.strategy_used.value}:{result.compression_ratio:.2f}"
                 )
             else:
+                # Didn't compress — add to skip set
+                self._cache.mark_skip(content_key)
                 transformed_messages.append(message)
                 route_counts["ratio_too_high"] += 1
             tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
         )
+        # Add cache performance metrics to timing
+        cache_stats = self._cache.stats
+        compressor_timing["cache_hits"] = float(cache_stats["cache_hits"])
+        compressor_timing["cache_skip_hits"] = float(cache_stats["cache_skip_hits"])
+        compressor_timing["cache_size"] = float(cache_stats["cache_size"])
+        compressor_timing["cache_skip_size"] = float(cache_stats["cache_skip_size"])
+        compressor_timing["cache_avg_lookup_ns"] = cache_stats["cache_avg_lookup_ns"]
         # Log routing summary
         parts = []
         if compressed_details:
             parts.append(f"{route_counts['recent_code']} protected (recent code)")
         if route_counts["analysis_ctx"]:
             parts.append(f"{route_counts['analysis_ctx']} protected (analysis ctx)")
+        if route_counts.get("already_compressed"):
+            parts.append(f"{route_counts['already_compressed']} pinned (already compressed)")
         if route_counts["ratio_too_high"]:
             parts.append(f"{route_counts['ratio_too_high']} unchanged (ratio>={min_ratio:.2f})")
         if route_counts["content_blocks"]:
             parts.append(f"{route_counts['content_blocks']} content-block msgs")
         if route_counts["non_string"]:
             parts.append(f"{route_counts['non_string']} non-string")
+        if route_counts.get("cache_hit"):
+            parts.append(f"{route_counts['cache_hit']} cache hits")
+        if route_counts.get("cache_miss"):
+            parts.append(f"{route_counts['cache_miss']} cache misses")
+        cs = self._cache.stats
+        if cs["cache_size"] > 0 or cs["cache_skip_size"] > 0:
+            parts.append(
+                f"cache[{cs['cache_size']} results, {cs['cache_skip_size']} skips, "
+                f"{cs['cache_avg_lookup_ns']:.0f}ns avg]"
+            )
         if parts:
             logger.info(
                 "content_router: %d msgs — %s",
             transforms_applied=all_transforms if all_transforms else ["router:noop"],
             markers_inserted=lifecycle_ccr_hashes,
             warnings=warnings,
+            timing=compressor_timing,
         )
     def _get_tool_bias(self, tool_name: str) -> float:
         min_ratio: float = 0.85,
         read_protection_window: int = 8,
         messages_from_end: int = 0,
+        compressor_timing: dict[str, float] | None = None,
     ) -> dict[str, Any]:
         """Process content blocks (Anthropic format) for tool_result compression.
                 # Only process string content
                 if isinstance(tool_content, str) and len(tool_content) > 500:
+                    # Compression pinning: skip already-compressed content
+                    if (
+                        "Retrieve more: hash=" in tool_content
+                        or "Retrieve original: hash=" in tool_content
+                    ):
+                        new_blocks.append(block)
+                        if route_counts is not None:
+                            route_counts.setdefault("already_compressed", 0)
+                            route_counts["already_compressed"] += 1
+                        continue
+                    # Two-tier compression cache
+                    content_key = hash(tool_content)
+                    # Tier 1: skip set — instant rejection
+                    if self._cache.is_skipped(content_key):
+                        new_blocks.append(block)
+                        if route_counts is not None:
+                            route_counts["ratio_too_high"] += 1
+                            route_counts.setdefault("cache_hit", 0)
+                            route_counts["cache_hit"] += 1
+                        continue
+                    # Tier 2: result cache — reuse compressed output
+                    cached = self._cache.get(content_key)
+                    if cached is not None:
+                        cached_compressed, cached_ratio, cached_strategy = cached
+                        if cached_ratio < min_ratio:
+                            new_blocks.append({**block, "content": cached_compressed})
+                            transforms_applied.append(f"router:tool_result:{cached_strategy}")
+                            if compressed_details is not None:
+                                compressed_details.append(
+                                    f"tool:{cached_strategy}:{cached_ratio:.2f}"
+                                )
+                            any_compressed = True
+                        else:
+                            # Threshold tightened — move to skip
+                            self._cache.move_to_skip(content_key)
+                            new_blocks.append(block)
+                            if route_counts is not None:
+                                route_counts["ratio_too_high"] += 1
+                        if route_counts is not None:
+                            route_counts.setdefault("cache_hit", 0)
+                            route_counts["cache_hit"] += 1
+                        continue
+                    # Cache miss — run full compression
+                    if route_counts is not None:
+                        route_counts.setdefault("cache_miss", 0)
+                        route_counts["cache_miss"] += 1
+                    t0 = time.perf_counter()
                     result = self.compress(tool_content, context=context, bias=bias)
+                    compress_ms = (time.perf_counter() - t0) * 1000
+                    if compressor_timing is not None:
+                        key = f"compressor:{result.strategy_used.value}"
+                        compressor_timing[key] = compressor_timing.get(key, 0.0) + compress_ms
                     if result.compression_ratio < min_ratio:
+                        # Compressed — store in result cache
+                        self._cache.put(
+                            content_key,
+                            result.compressed,
+                            result.compression_ratio,
+                            result.strategy_used.value,
+                        )
                         new_blocks.append({**block, "content": result.compressed})
                         transforms_applied.append(
                             f"router:tool_result:{result.strategy_used.value}"
                         any_compressed = True
                         continue
                     else:
+                        # Didn't compress — add to skip set
+                        self._cache.mark_skip(content_key)
                         if route_counts is not None:
                             route_counts["ratio_too_high"] += 1
                 else:

headroom/transforms/pipeline.py CHANGED Viewed

@@ -3,6 +3,7 @@
 from __future__ import annotations
 import logging
 from typing import TYPE_CHECKING, Any
 from ..config import (
@@ -14,6 +15,7 @@ from ..config import (
     ToolCrusherConfig,
     TransformDiff,
     TransformResult,
 )
 from ..tokenizer import Tokenizer
 from ..utils import deep_copy_messages
@@ -188,12 +190,14 @@ class TransformPipeline:
         all_transforms: list[str] = []
         all_markers: list[str] = []
         all_warnings: list[str] = []
         # Track transform diffs if enabled
         transform_diffs: list[TransformDiff] = []
         generate_diff = self.config.generate_diff_artifact
         current_messages = deep_copy_messages(messages)
         for transform in self.transforms:
             # Check if transform should run
@@ -203,8 +207,10 @@ class TransformPipeline:
             # Track tokens before this transform (for diff)
             tokens_before_transform = tokenizer.count_messages(current_messages)
-            # Apply transform
             result = transform.apply(current_messages, tokenizer, **kwargs)
             # Update messages for next transform
             current_messages = result.messages
@@ -216,18 +222,24 @@ class TransformPipeline:
             all_transforms.extend(result.transforms_applied)
             all_markers.extend(result.markers_inserted)
             all_warnings.extend(result.warnings)
             # Log transform results
             if result.transforms_applied:
                 logger.info(
-                    "Transform %s: %d -> %d tokens (saved %d)",
                     transform.name,
                     tokens_before_transform,
                     tokens_after_transform,
                     tokens_before_transform - tokens_after_transform,
                 )
             else:
-                logger.debug("Transform %s: no changes", transform.name)
             # Record diff if enabled
             if generate_diff:
@@ -240,24 +252,29 @@ class TransformPipeline:
                         details=", ".join(result.transforms_applied)
                         if result.transforms_applied
                         else "",
                     )
                 )
         # Final token count
         tokens_after = tokenizer.count_messages(current_messages)
         # Log pipeline summary
         total_saved = tokens_before - tokens_after
         if total_saved > 0:
             logger.info(
-                "Pipeline complete: %d -> %d tokens (saved %d, %.1f%% reduction)",
                 tokens_before,
                 tokens_after,
                 total_saved,
                 (total_saved / tokens_before * 100) if tokens_before > 0 else 0,
             )
         else:
-            logger.debug("Pipeline complete: no token savings")
         # Build diff artifact if enabled
         diff_artifact = None
@@ -270,6 +287,18 @@ class TransformPipeline:
                 transforms=transform_diffs,
             )
         return TransformResult(
             messages=current_messages,
             tokens_before=tokens_before,
@@ -278,6 +307,8 @@ class TransformPipeline:
             markers_inserted=all_markers,
             warnings=all_warnings,
             diff_artifact=diff_artifact,
         )
     def simulate(

 from __future__ import annotations
 import logging
+import time
 from typing import TYPE_CHECKING, Any
 from ..config import (
     ToolCrusherConfig,
     TransformDiff,
     TransformResult,
+    WasteSignals,
 )
 from ..tokenizer import Tokenizer
 from ..utils import deep_copy_messages
         all_transforms: list[str] = []
         all_markers: list[str] = []
         all_warnings: list[str] = []
+        all_timing: dict[str, float] = {}  # transform_name → ms
         # Track transform diffs if enabled
         transform_diffs: list[TransformDiff] = []
         generate_diff = self.config.generate_diff_artifact
         current_messages = deep_copy_messages(messages)
+        pipeline_start = time.perf_counter()
         for transform in self.transforms:
             # Check if transform should run
             # Track tokens before this transform (for diff)
             tokens_before_transform = tokenizer.count_messages(current_messages)
+            # Time the transform
+            t0 = time.perf_counter()
             result = transform.apply(current_messages, tokenizer, **kwargs)
+            duration_ms = (time.perf_counter() - t0) * 1000
             # Update messages for next transform
             current_messages = result.messages
             all_transforms.extend(result.transforms_applied)
             all_markers.extend(result.markers_inserted)
             all_warnings.extend(result.warnings)
+            all_timing[transform.name] = duration_ms
+            # Merge sub-transform timing (e.g. ContentRouter's per-compressor breakdown)
+            if result.timing:
+                all_timing.update(result.timing)
             # Log transform results
             if result.transforms_applied:
                 logger.info(
+                    "Transform %s: %d -> %d tokens (saved %d) [%.1fms]",
                     transform.name,
                     tokens_before_transform,
                     tokens_after_transform,
                     tokens_before_transform - tokens_after_transform,
+                    duration_ms,
                 )
             else:
+                logger.debug("Transform %s: no changes [%.1fms]", transform.name, duration_ms)
             # Record diff if enabled
             if generate_diff:
                         details=", ".join(result.transforms_applied)
                         if result.transforms_applied
                         else "",
+                        duration_ms=duration_ms,
                     )
                 )
         # Final token count
         tokens_after = tokenizer.count_messages(current_messages)
+        pipeline_ms = (time.perf_counter() - pipeline_start) * 1000
+        all_timing["pipeline_total"] = pipeline_ms
         # Log pipeline summary
         total_saved = tokens_before - tokens_after
+        timing_parts = " ".join(f"{k}={v:.0f}ms" for k, v in all_timing.items())
         if total_saved > 0:
             logger.info(
+                "Pipeline complete: %d -> %d tokens (saved %d, %.1f%% reduction) [%s]",
                 tokens_before,
                 tokens_after,
                 total_saved,
                 (total_saved / tokens_before * 100) if tokens_before > 0 else 0,
+                timing_parts,
             )
         else:
+            logger.debug("Pipeline complete: no token savings [%s]", timing_parts)
         # Build diff artifact if enabled
         diff_artifact = None
                 transforms=transform_diffs,
             )
+        # Detect waste signals in original messages (only when significant compression)
+        waste_signals: WasteSignals | None = None
+        if tokens_before > tokens_after and (tokens_before - tokens_after) > 100:
+            try:
+                from ..parser import parse_messages
+                _, _, waste_signals = parse_messages(messages, tokenizer)
+                if waste_signals.total() == 0:
+                    waste_signals = None
+            except Exception:
+                pass
         return TransformResult(
             messages=current_messages,
             tokens_before=tokens_before,
             markers_inserted=all_markers,
             warnings=all_warnings,
             diff_artifact=diff_artifact,
+            timing=all_timing,
+            waste_signals=waste_signals,
         )
     def simulate(

headroom/transforms/read_lifecycle.py CHANGED Viewed

@@ -50,6 +50,8 @@ class FileOperation:
     file_path: str
     operation: str  # "read" | "edit" | "write"
     content_size: int = 0  # Size of tool_result content (for reads only)
 @dataclass
@@ -116,13 +118,14 @@ class ReadLifecycleManager:
     def _build_tool_metadata(
         self, messages: list[dict[str, Any]]
-    ) -> dict[str, tuple[str, str | None]]:
         """Build tool_call_id → (tool_name, file_path) mapping.
         Scans assistant messages for tool calls, extracts name and file_path
         from tool inputs. Handles both OpenAI and Anthropic formats.
         """
-        metadata: dict[str, tuple[str, str | None]] = {}
         for msg in messages:
             if msg.get("role") != "assistant":
@@ -139,12 +142,16 @@ class ReadLifecycleManager:
                     continue
                 file_path = None
                 try:
                     args = json.loads(func.get("arguments", "{}"))
                     file_path = args.get("file_path") or args.get("path")
                 except (json.JSONDecodeError, TypeError):
                     pass
-                metadata[tc_id] = (name, file_path)
             # Anthropic format: content blocks with type=tool_use
             content = msg.get("content", [])
@@ -160,16 +167,20 @@ class ReadLifecycleManager:
                 inp = block.get("input", {})
                 file_path = None
                 if isinstance(inp, dict):
                     file_path = inp.get("file_path") or inp.get("path")
-                metadata[tc_id] = (name, file_path)
         return metadata
     def _build_file_operation_index(
         self,
         messages: list[dict[str, Any]],
-        tool_metadata: dict[str, tuple[str, str | None]],
     ) -> dict[str, list[FileOperation]]:
         """Build file_path → [FileOperation] index in a single pass.
@@ -177,7 +188,7 @@ class ReadLifecycleManager:
         """
         file_ops: dict[str, list[FileOperation]] = defaultdict(list)
-        for tc_id, (name, file_path) in tool_metadata.items():
             if not file_path:
                 continue
@@ -200,6 +211,8 @@ class ReadLifecycleManager:
                     tool_name=name,
                     file_path=file_path,
                     operation=operation,
                 )
             )
@@ -231,6 +244,29 @@ class ReadLifecycleManager:
         return None
     def _classify_reads(self, file_ops: dict[str, list[FileOperation]]) -> list[ReadClassification]:
         """Classify each Read as fresh, stale, or superseded."""
         classifications: list[ReadClassification] = []
@@ -248,9 +284,13 @@ class ReadLifecycleManager:
                     e.msg_index > read_op.msg_index for e in edits
                 )
-                # Check superseded: any later read of this file?
                 is_superseded = self.config.compress_superseded and any(
-                    r.msg_index > read_op.msg_index for r in reads
                 )
                 if is_stale:

     file_path: str
     operation: str  # "read" | "edit" | "write"
     content_size: int = 0  # Size of tool_result content (for reads only)
+    read_offset: int | None = None  # Line offset for partial reads
+    read_limit: int | None = None  # Line limit for partial reads
 @dataclass
     def _build_tool_metadata(
         self, messages: list[dict[str, Any]]
+    ) -> dict[str, tuple[str, str | None, int | None, int | None]]:
         """Build tool_call_id → (tool_name, file_path, offset, limit) mapping.
         Scans assistant messages for tool calls, extracts name, file_path, and
         offset/limit from tool inputs. Handles both OpenAI and Anthropic formats.
         """
+        # Maps tool_call_id → (name, file_path, offset, limit)
+        metadata: dict[str, tuple[str, str | None, int | None, int | None]] = {}
         for msg in messages:
             if msg.get("role") != "assistant":
                     continue
                 file_path = None
+                offset = None
+                limit = None
                 try:
                     args = json.loads(func.get("arguments", "{}"))
                     file_path = args.get("file_path") or args.get("path")
+                    offset = args.get("offset")
+                    limit = args.get("limit")
                 except (json.JSONDecodeError, TypeError):
                     pass
+                metadata[tc_id] = (name, file_path, offset, limit)
             # Anthropic format: content blocks with type=tool_use
             content = msg.get("content", [])
                 inp = block.get("input", {})
                 file_path = None
+                offset = None
+                limit = None
                 if isinstance(inp, dict):
                     file_path = inp.get("file_path") or inp.get("path")
+                    offset = inp.get("offset")
+                    limit = inp.get("limit")
+                metadata[tc_id] = (name, file_path, offset, limit)
         return metadata
     def _build_file_operation_index(
         self,
         messages: list[dict[str, Any]],
+        tool_metadata: dict[str, tuple[str, str | None, int | None, int | None]],
     ) -> dict[str, list[FileOperation]]:
         """Build file_path → [FileOperation] index in a single pass.
         """
         file_ops: dict[str, list[FileOperation]] = defaultdict(list)
+        for tc_id, (name, file_path, offset, limit) in tool_metadata.items():
             if not file_path:
                 continue
                     tool_name=name,
                     file_path=file_path,
                     operation=operation,
+                    read_offset=offset if operation == "read" else None,
+                    read_limit=limit if operation == "read" else None,
                 )
             )
         return None
+    @staticmethod
+    def _read_covers(later: FileOperation, earlier: FileOperation) -> bool:
+        """Check if `later` read fully covers the line range of `earlier`.
+        A full-file read (no offset/limit) covers everything.
+        A partial read only covers another partial if its range is a superset.
+        """
+        # Full-file read supersedes anything
+        if later.read_offset is None and later.read_limit is None:
+            return True
+        # If the earlier was a full-file read, a partial can't cover it
+        if earlier.read_offset is None and earlier.read_limit is None:
+            return False
+        # Both are partial reads — check range containment
+        later_start = later.read_offset or 0
+        later_end = later_start + (later.read_limit or 2000)
+        earlier_start = earlier.read_offset or 0
+        earlier_end = earlier_start + (earlier.read_limit or 2000)
+        return later_start <= earlier_start and later_end >= earlier_end
     def _classify_reads(self, file_ops: dict[str, list[FileOperation]]) -> list[ReadClassification]:
         """Classify each Read as fresh, stale, or superseded."""
         classifications: list[ReadClassification] = []
                     e.msg_index > read_op.msg_index for e in edits
                 )
+                # Check superseded: any later read that FULLY COVERS this read's range?
+                # A partial read (offset=100, limit=50) is NOT superseded by a
+                # different partial read (offset=200, limit=50) — they cover
+                # different lines. Only supersede when the later read contains
+                # all the lines of this read.
                 is_superseded = self.config.compress_superseded and any(
+                    r.msg_index > read_op.msg_index and self._read_covers(r, read_op) for r in reads
                 )
                 if is_stale:

tests/test_config.py CHANGED Viewed

@@ -91,7 +91,7 @@ class TestCacheAlignerConfig:
     def test_default_values(self):
         """Default values are correctly set."""
         config = CacheAlignerConfig()
-        assert config.enabled is True
         assert config.normalize_whitespace is True
         assert config.collapse_blank_lines is True
@@ -389,6 +389,8 @@ class TestTransformResult:
             "warnings",
             "diff_artifact",
             "cache_metrics",
         }
         assert field_names == expected_fields

     def test_default_values(self):
         """Default values are correctly set."""
         config = CacheAlignerConfig()
+        assert config.enabled is False
         assert config.normalize_whitespace is True
         assert config.collapse_blank_lines is True
             "warnings",
             "diff_artifact",
             "cache_metrics",
+            "timing",
+            "waste_signals",
         }
         assert field_names == expected_fields

tests/test_proxy_streaming_resilience.py CHANGED Viewed

@@ -559,9 +559,16 @@ class TestCostTrackingAccuracy:
             patch("headroom.proxy.server.litellm") as mock_litellm,
         ):
             # Setup: $10/M input, $30/M output
-            def mock_cost(model, prompt_tokens, completion_tokens):
                 input_cost = prompt_tokens * 0.00001
                 output_cost = completion_tokens * 0.00003
                 return (input_cost, output_cost)
             mock_litellm.cost_per_token.side_effect = mock_cost
@@ -598,7 +605,7 @@ class TestCostTrackingAccuracy:
             patch("headroom.proxy.server.litellm") as mock_litellm,
         ):
             mock_litellm.cost_per_token.side_effect = (
-                lambda model, prompt_tokens, completion_tokens: (
                     prompt_tokens * 0.00001,
                     completion_tokens * 0.00003,
                 )

             patch("headroom.proxy.server.litellm") as mock_litellm,
         ):
             # Setup: $10/M input, $30/M output
+            def mock_cost(model, prompt_tokens, completion_tokens, **kwargs):
                 input_cost = prompt_tokens * 0.00001
                 output_cost = completion_tokens * 0.00003
+                # Add cache costs if provided
+                cache_read = kwargs.get("cache_read_input_tokens", 0)
+                cache_write = kwargs.get("cache_creation_input_tokens", 0)
+                if cache_read or cache_write:
+                    model_info = mock_litellm.get_model_info()
+                    input_cost += cache_read * model_info.get("cache_read_input_token_cost", 0)
+                    input_cost += cache_write * model_info.get("cache_creation_input_token_cost", 0)
                 return (input_cost, output_cost)
             mock_litellm.cost_per_token.side_effect = mock_cost
             patch("headroom.proxy.server.litellm") as mock_litellm,
         ):
             mock_litellm.cost_per_token.side_effect = (
+                lambda model, prompt_tokens, completion_tokens, **kwargs: (
                     prompt_tokens * 0.00001,
                     completion_tokens * 0.00003,
                 )

tests/test_transforms/test_cache_aligner.py CHANGED Viewed

@@ -29,8 +29,8 @@ def tokenizer():
 @pytest.fixture
 def default_config():
-    """Default CacheAlignerConfig."""
-    return CacheAlignerConfig()
 @pytest.fixture
@@ -194,7 +194,7 @@ class TestDateExtraction:
             {"role": "user", "content": "Hello"},
         ]
-        config = CacheAlignerConfig(date_patterns=custom_patterns)
         aligner = CacheAligner(config)
         assert aligner.should_apply(messages, tokenizer)
@@ -705,7 +705,7 @@ Please be helpful, harmless, and honest."""
             {"role": "user", "content": "What can you help me with today?"},
         ]
-        aligner = CacheAligner()
         # Check should_apply
         assert aligner.should_apply(messages, tokenizer)

 @pytest.fixture
 def default_config():
+    """Default CacheAlignerConfig with enabled=True for testing."""
+    return CacheAlignerConfig(enabled=True)
 @pytest.fixture
             {"role": "user", "content": "Hello"},
         ]
+        config = CacheAlignerConfig(enabled=True, date_patterns=custom_patterns)
         aligner = CacheAligner(config)
         assert aligner.should_apply(messages, tokenizer)
             {"role": "user", "content": "What can you help me with today?"},
         ]
+        aligner = CacheAligner(CacheAlignerConfig(enabled=True))
         # Check should_apply
         assert aligner.should_apply(messages, tokenizer)

tests/test_transforms/test_read_lifecycle.py CHANGED Viewed

@@ -276,7 +276,7 @@ class TestSupersededDetection:
     def test_reread_makes_superseded(self):
         """Read(A) → Read(A): first Read becomes superseded."""
-        config = ReadLifecycleConfig(enabled=True)
         mgr = ReadLifecycleManager(config)
         messages = [

     def test_reread_makes_superseded(self):
         """Read(A) → Read(A): first Read becomes superseded."""
+        config = ReadLifecycleConfig(enabled=True, compress_superseded=True)
         mgr = ReadLifecycleManager(config)
         messages = [