Spaces:
Build error
fix: dashboard metrics, TTFB tracking, eager LLMLingua loading, and multi-provider consistency
Browse files
Dashboard was showing wildly incorrect metrics (99.5% savings, 3ms overhead)
due to using the Anthropic API's non-cached input_tokens instead of optimized_tokens,
and dividing overhead by total request count instead of optimized-only count.
Key fixes:
- Use optimized_tokens (what we sent) for dashboard aggregation, not the API's
input_tokens, which excludes the cached portion
- Track overhead_count separately from latency_count for correct averages
- Add TTFB (time to first byte) measurement, replace full stream latency in UI
- Eager-load LLMLingua model at proxy startup (eliminates 5.9s first-request delay)
- Simplify CostTracker to token-based accounting with counterfactual cost display
- Add two-tier compression cache to ContentRouter (skip set + result cache)
- Fix compression pinning to detect both CCR and ReadLifecycle markers
- Clamp tokens_saved to max(0, ...) across all provider paths
- Add per-transform timing instrumentation to pipeline
- Guard against over-aggressive code compression (<5% ratio)
- Fix ReadLifecycle partial read supersede logic (_read_covers range check)
- Disable CacheAligner and compress_superseded by default
- Fix all pre-existing mypy errors (CompressionCache return types)
- Fix test mocks to accept **kwargs for cache token parameters
- headroom/config.py +6 -3
- headroom/dashboard/templates/dashboard.html +388 -79
- headroom/perf/analyzer.py +116 -6
- headroom/proxy/server.py +241 -309
- headroom/transforms/code_compressor.py +19 -0
- headroom/transforms/content_router.py +274 -1
- headroom/transforms/pipeline.py +36 -5
- headroom/transforms/read_lifecycle.py +48 -8
- tests/test_config.py +3 -1
- tests/test_proxy_streaming_resilience.py +9 -2
- tests/test_transforms/test_cache_aligner.py +4 -4
- tests/test_transforms/test_read_lifecycle.py +1 -1
|
@@ -78,7 +78,7 @@ class CacheAlignerConfig:
|
|
| 78 |
SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
|
| 79 |
"""
|
| 80 |
|
| 81 |
-
enabled: bool =
|
| 82 |
|
| 83 |
# === Phase 1: DynamicContentDetector Integration ===
|
| 84 |
# When True, uses the full DynamicContentDetector with 15+ patterns
|
|
@@ -397,7 +397,7 @@ class ReadLifecycleConfig:
|
|
| 397 |
|
| 398 |
enabled: bool = True # On by default: stale/superseded Reads are provably safe to compress
|
| 399 |
compress_stale: bool = True # Replace Reads of files that were later edited
|
| 400 |
-
compress_superseded: bool =
|
| 401 |
min_size_bytes: int = 512 # Skip tiny Read outputs (not worth the overhead)
|
| 402 |
|
| 403 |
|
|
@@ -702,11 +702,13 @@ class TransformResult:
|
|
| 702 |
warnings: list[str] = field(default_factory=list)
|
| 703 |
diff_artifact: DiffArtifact | None = None # Populated if generate_diff_artifact=True
|
| 704 |
cache_metrics: CachePrefixMetrics | None = None # Populated by CacheAligner
|
|
|
|
|
|
|
| 705 |
|
| 706 |
|
| 707 |
@dataclass
|
| 708 |
class TransformDiff:
|
| 709 |
-
"""Diff info for a single transform (for debugging)."""
|
| 710 |
|
| 711 |
transform_name: str
|
| 712 |
tokens_before: int
|
|
@@ -715,6 +717,7 @@ class TransformDiff:
|
|
| 715 |
items_removed: int = 0
|
| 716 |
items_kept: int = 0
|
| 717 |
details: str = "" # Human-readable description of what changed
|
|
|
|
| 718 |
|
| 719 |
|
| 720 |
@dataclass
|
|
|
|
| 78 |
SAFE: Only applied to SYSTEM messages, not user/assistant/tool content.
|
| 79 |
"""
|
| 80 |
|
| 81 |
+
enabled: bool = False # Disabled by default — prefix stability gains are marginal in practice
|
| 82 |
|
| 83 |
# === Phase 1: DynamicContentDetector Integration ===
|
| 84 |
# When True, uses the full DynamicContentDetector with 15+ patterns
|
|
|
|
| 397 |
|
| 398 |
enabled: bool = True # On by default: stale/superseded Reads are provably safe to compress
|
| 399 |
compress_stale: bool = True # Replace Reads of files that were later edited
|
| 400 |
+
compress_superseded: bool = False # Disabled: busts Anthropic prompt cache prefix
|
| 401 |
min_size_bytes: int = 512 # Skip tiny Read outputs (not worth the overhead)
|
| 402 |
|
| 403 |
|
|
|
|
| 702 |
warnings: list[str] = field(default_factory=list)
|
| 703 |
diff_artifact: DiffArtifact | None = None # Populated if generate_diff_artifact=True
|
| 704 |
cache_metrics: CachePrefixMetrics | None = None # Populated by CacheAligner
|
| 705 |
+
timing: dict[str, float] = field(default_factory=dict) # transform_name → ms
|
| 706 |
+
waste_signals: WasteSignals | None = None # Detected waste in original messages
|
| 707 |
|
| 708 |
|
| 709 |
@dataclass
|
| 710 |
class TransformDiff:
|
| 711 |
+
"""Diff info for a single transform (for debugging/perf)."""
|
| 712 |
|
| 713 |
transform_name: str
|
| 714 |
tokens_before: int
|
|
|
|
| 717 |
items_removed: int = 0
|
| 718 |
items_kept: int = 0
|
| 719 |
details: str = "" # Human-readable description of what changed
|
| 720 |
+
duration_ms: float = 0.0 # Wall-clock time for this transform
|
| 721 |
|
| 722 |
|
| 723 |
@dataclass
|
|
@@ -25,6 +25,8 @@
|
|
| 25 |
body { background: #0f0f0f; }
|
| 26 |
.sparkline { stroke: #22d3ee; stroke-width: 1.5; fill: none; }
|
| 27 |
.sparkline-area { fill: url(#sparkline-gradient); }
|
|
|
|
|
|
|
| 28 |
@keyframes pulse-subtle { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } }
|
| 29 |
.pulse-live { animation: pulse-subtle 2s ease-in-out infinite; }
|
| 30 |
</style>
|
|
@@ -52,27 +54,20 @@
|
|
| 52 |
</header>
|
| 53 |
|
| 54 |
<main class="p-6 max-w-7xl mx-auto">
|
| 55 |
-
<!-- Hero Metrics -->
|
| 56 |
<div class="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
|
| 57 |
-
<!--
|
| 58 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 59 |
-
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">
|
| 60 |
-
<div class="
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
<stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.3"/>
|
| 66 |
-
<stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
|
| 67 |
-
</linearGradient>
|
| 68 |
-
</defs>
|
| 69 |
-
<path class="sparkline-area" :d="getSparklineArea(requestHistory)"></path>
|
| 70 |
-
<path class="sparkline" :d="getSparkline(requestHistory)"></path>
|
| 71 |
-
</svg>
|
| 72 |
</div>
|
| 73 |
</div>
|
| 74 |
|
| 75 |
-
<!-- Tokens Saved -->
|
| 76 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 77 |
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Tokens Saved</div>
|
| 78 |
<div class="flex items-baseline gap-2">
|
|
@@ -81,32 +76,120 @@
|
|
| 81 |
</div>
|
| 82 |
<div class="mt-2 h-8">
|
| 83 |
<svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
<path class="sparkline-area" :d="getSparklineArea(savingsHistory)"></path>
|
| 85 |
<path class="sparkline" :d="getSparkline(savingsHistory)"></path>
|
| 86 |
</svg>
|
| 87 |
</div>
|
| 88 |
</div>
|
| 89 |
|
| 90 |
-
<!--
|
| 91 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 92 |
-
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">
|
| 93 |
<div class="flex items-baseline gap-2">
|
| 94 |
-
<span class="
|
| 95 |
-
|
| 96 |
-
<div class="mt-3 text-xs text-gray-500">
|
| 97 |
-
vs $<span x-text="formatCost(stats.cost?.total_cost_usd || 0)"></span> spent
|
| 98 |
</div>
|
|
|
|
| 99 |
</div>
|
| 100 |
|
| 101 |
<!-- Headroom Overhead -->
|
| 102 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 103 |
-
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">
|
| 104 |
<div class="flex items-baseline gap-2">
|
| 105 |
<span class="text-3xl font-light tabular-nums" x-text="(stats.overhead?.average_ms || 0).toFixed(0) + 'ms'"></span>
|
| 106 |
</div>
|
| 107 |
-
<div class="mt-
|
| 108 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
</div>
|
| 111 |
</div>
|
| 112 |
|
|
@@ -117,21 +200,17 @@
|
|
| 117 |
<div class="text-sm font-medium mb-4 text-gray-300">Token Usage</div>
|
| 118 |
<div class="space-y-3">
|
| 119 |
<div class="flex justify-between items-center">
|
| 120 |
-
<span class="text-sm text-gray-400">
|
| 121 |
-
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.
|
| 122 |
</div>
|
| 123 |
<div class="flex justify-between items-center">
|
| 124 |
-
<span class="text-sm text-gray-400">
|
| 125 |
-
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.
|
| 126 |
</div>
|
| 127 |
<div class="border-t border-border my-2"></div>
|
| 128 |
<div class="flex justify-between items-center">
|
| 129 |
-
<span class="text-sm text-gray-400">
|
| 130 |
-
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.
|
| 131 |
-
</div>
|
| 132 |
-
<div class="flex justify-between items-center">
|
| 133 |
-
<span class="text-sm text-gray-400">After Compression</span>
|
| 134 |
-
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
|
| 135 |
</div>
|
| 136 |
</div>
|
| 137 |
</div>
|
|
@@ -171,55 +250,171 @@
|
|
| 171 |
<span class="font-mono text-sm" x-text="(stats.overhead?.min_ms || 0).toFixed(0) + ' - ' + (stats.overhead?.max_ms || 0).toFixed(0) + 'ms'"></span>
|
| 172 |
</div>
|
| 173 |
<div class="flex justify-between items-center">
|
| 174 |
-
<span class="text-sm text-gray-400">
|
| 175 |
-
<span class="font-mono text-sm" x-text="((stats.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
</div>
|
| 177 |
<div class="flex justify-between items-center">
|
| 178 |
<span class="text-sm text-gray-400">Failed Requests</span>
|
| 179 |
<span class="font-mono text-sm" x-text="stats.requests?.failed || 0"></span>
|
| 180 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
</div>
|
| 182 |
</div>
|
| 183 |
</div>
|
| 184 |
|
| 185 |
-
<!--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
<div class="bg-surface rounded-lg border border-border overflow-hidden">
|
| 187 |
<div class="px-4 py-3 border-b border-border flex justify-between items-center">
|
| 188 |
<span class="text-sm font-medium text-gray-300">Recent Requests</span>
|
| 189 |
-
<span class="text-xs text-gray-500">Last 10</span>
|
| 190 |
</div>
|
| 191 |
<div class="overflow-x-auto">
|
| 192 |
<table class="w-full text-sm">
|
| 193 |
<thead>
|
| 194 |
<tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
|
|
|
|
| 195 |
<th class="px-4 py-3 font-medium">Time</th>
|
| 196 |
<th class="px-4 py-3 font-medium">Model</th>
|
| 197 |
<th class="px-4 py-3 font-medium text-right">Input</th>
|
| 198 |
<th class="px-4 py-3 font-medium text-right">Output</th>
|
| 199 |
<th class="px-4 py-3 font-medium text-right">Saved</th>
|
| 200 |
-
<th class="px-4 py-3 font-medium text-right">
|
| 201 |
<th class="px-4 py-3 font-medium text-right">Latency</th>
|
| 202 |
</tr>
|
| 203 |
</thead>
|
| 204 |
<tbody class="divide-y divide-border">
|
| 205 |
<template x-for="req in (stats.recent_requests || [])" :key="req.request_id">
|
| 206 |
-
<tr
|
| 207 |
-
<td
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
</td>
|
| 216 |
-
<td class="px-4 py-3 text-right font-mono text-gray-400" x-text="'$' + (req.estimated_cost_usd || 0).toFixed(4)"></td>
|
| 217 |
-
<td class="px-4 py-3 text-right font-mono text-gray-400" x-text="(req.total_latency_ms || 0).toFixed(0) + 'ms'"></td>
|
| 218 |
</tr>
|
| 219 |
</template>
|
| 220 |
<template x-if="(stats.recent_requests || []).length === 0">
|
| 221 |
<tr>
|
| 222 |
-
<td colspan="
|
| 223 |
No requests yet. Start using the proxy to see activity here.
|
| 224 |
</td>
|
| 225 |
</tr>
|
|
@@ -229,23 +424,6 @@
|
|
| 229 |
</div>
|
| 230 |
</div>
|
| 231 |
|
| 232 |
-
<!-- Budget Bar (if configured) -->
|
| 233 |
-
<template x-if="stats.cost?.budget_limit_usd">
|
| 234 |
-
<div class="mt-6 bg-surface rounded-lg p-4 border border-border">
|
| 235 |
-
<div class="flex justify-between items-center mb-2">
|
| 236 |
-
<span class="text-sm text-gray-400">Budget (<span x-text="stats.cost?.budget_period || 'daily'"></span>)</span>
|
| 237 |
-
<span class="font-mono text-sm">
|
| 238 |
-
$<span x-text="formatCost(stats.cost?.period_cost_usd || 0)"></span>
|
| 239 |
-
/ $<span x-text="formatCost(stats.cost?.budget_limit_usd || 0)"></span>
|
| 240 |
-
</span>
|
| 241 |
-
</div>
|
| 242 |
-
<div class="w-full h-2 bg-border rounded-full overflow-hidden">
|
| 243 |
-
<div class="h-full rounded-full transition-all duration-500"
|
| 244 |
-
:class="getBudgetPercent() > 90 ? 'bg-red-400' : getBudgetPercent() > 70 ? 'bg-amber-400' : 'bg-accent'"
|
| 245 |
-
:style="'width: ' + Math.min(getBudgetPercent(), 100) + '%'"></div>
|
| 246 |
-
</div>
|
| 247 |
-
</div>
|
| 248 |
-
</template>
|
| 249 |
</main>
|
| 250 |
|
| 251 |
<!-- Footer -->
|
|
@@ -269,6 +447,7 @@
|
|
| 269 |
lastUpdate: 'never',
|
| 270 |
requestHistory: [],
|
| 271 |
savingsHistory: [],
|
|
|
|
| 272 |
pollInterval: null,
|
| 273 |
|
| 274 |
async init() {
|
|
@@ -310,14 +489,21 @@
|
|
| 310 |
}
|
| 311 |
},
|
| 312 |
|
|
|
|
|
|
|
| 313 |
formatNumber(n) {
|
| 314 |
if (n >= 1000000) return (n / 1000000).toFixed(1) + 'M';
|
| 315 |
if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
|
| 316 |
return n.toString();
|
| 317 |
},
|
| 318 |
|
| 319 |
-
|
| 320 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 321 |
},
|
| 322 |
|
| 323 |
formatTime(ts) {
|
|
@@ -332,23 +518,123 @@
|
|
| 332 |
|
| 333 |
truncateModel(model) {
|
| 334 |
if (!model) return '-';
|
| 335 |
-
// Remove provider prefix and version suffix for display
|
| 336 |
return model.replace(/^(anthropic\.|openai\.|bedrock\/)/, '')
|
| 337 |
.replace(/-\d{8}$/, '')
|
| 338 |
.substring(0, 20);
|
| 339 |
},
|
| 340 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
getProviderPercent(count) {
|
| 342 |
const total = this.stats.requests?.total || 1;
|
| 343 |
return Math.min((count / total) * 100, 100);
|
| 344 |
},
|
| 345 |
|
| 346 |
-
getBudgetPercent() {
|
| 347 |
-
const limit = this.stats.cost?.budget_limit_usd || 1;
|
| 348 |
-
const used = this.stats.cost?.period_cost_usd || 0;
|
| 349 |
-
return (used / limit) * 100;
|
| 350 |
-
},
|
| 351 |
-
|
| 352 |
getSparkline(data) {
|
| 353 |
if (!data || data.length < 2) return '';
|
| 354 |
const min = Math.min(...data);
|
|
@@ -369,7 +655,30 @@
|
|
| 369 |
const line = this.getSparkline(data);
|
| 370 |
if (!line) return '';
|
| 371 |
return line + ` L100,32 L0,32 Z`;
|
| 372 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 373 |
};
|
| 374 |
}
|
| 375 |
</script>
|
|
|
|
| 25 |
body { background: #0f0f0f; }
|
| 26 |
.sparkline { stroke: #22d3ee; stroke-width: 1.5; fill: none; }
|
| 27 |
.sparkline-area { fill: url(#sparkline-gradient); }
|
| 28 |
+
.trend-line { stroke: #22d3ee; stroke-width: 2; fill: none; }
|
| 29 |
+
.trend-area { fill: url(#trend-gradient); }
|
| 30 |
@keyframes pulse-subtle { 0%, 100% { opacity: 1; } 50% { opacity: 0.7; } }
|
| 31 |
.pulse-live { animation: pulse-subtle 2s ease-in-out infinite; }
|
| 32 |
</style>
|
|
|
|
| 54 |
</header>
|
| 55 |
|
| 56 |
<main class="p-6 max-w-7xl mx-auto">
|
| 57 |
+
<!-- Hero Metrics (reordered: Savings $ -> Tokens Saved % -> Quality Confidence -> Overhead) -->
|
| 58 |
<div class="grid grid-cols-2 md:grid-cols-4 gap-4 mb-6">
|
| 59 |
+
<!-- Savings ($) - Lead with dollars -->
|
| 60 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 61 |
+
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Savings</div>
|
| 62 |
+
<div class="flex items-baseline gap-2">
|
| 63 |
+
<span class="text-3xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.savings_usd || 0)"></span>
|
| 64 |
+
</div>
|
| 65 |
+
<div class="mt-2 text-xs text-gray-500">
|
| 66 |
+
<span x-text="formatNumber(stats.requests?.total || 0)"></span> requests processed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
</div>
|
| 68 |
</div>
|
| 69 |
|
| 70 |
+
<!-- Tokens Saved (%) -->
|
| 71 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 72 |
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Tokens Saved</div>
|
| 73 |
<div class="flex items-baseline gap-2">
|
|
|
|
| 76 |
</div>
|
| 77 |
<div class="mt-2 h-8">
|
| 78 |
<svg class="w-full h-full" viewBox="0 0 100 32" preserveAspectRatio="none">
|
| 79 |
+
<defs>
|
| 80 |
+
<linearGradient id="sparkline-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
|
| 81 |
+
<stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.3"/>
|
| 82 |
+
<stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
|
| 83 |
+
</linearGradient>
|
| 84 |
+
</defs>
|
| 85 |
<path class="sparkline-area" :d="getSparklineArea(savingsHistory)"></path>
|
| 86 |
<path class="sparkline" :d="getSparkline(savingsHistory)"></path>
|
| 87 |
</svg>
|
| 88 |
</div>
|
| 89 |
</div>
|
| 90 |
|
| 91 |
+
<!-- Quality Confidence -->
|
| 92 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 93 |
+
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Compression Quality</div>
|
| 94 |
<div class="flex items-baseline gap-2">
|
| 95 |
+
<span class="w-3 h-3 rounded-full mt-1" :class="confidenceColor"></span>
|
| 96 |
+
<span class="text-3xl font-light tabular-nums" :class="confidenceTextColor" x-text="confidenceLabel"></span>
|
|
|
|
|
|
|
| 97 |
</div>
|
| 98 |
+
<div class="mt-2 text-xs text-gray-500" x-text="confidenceDetail"></div>
|
| 99 |
</div>
|
| 100 |
|
| 101 |
<!-- Headroom Overhead -->
|
| 102 |
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 103 |
+
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Overhead</div>
|
| 104 |
<div class="flex items-baseline gap-2">
|
| 105 |
<span class="text-3xl font-light tabular-nums" x-text="(stats.overhead?.average_ms || 0).toFixed(0) + 'ms'"></span>
|
| 106 |
</div>
|
| 107 |
+
<div class="mt-2 text-xs text-gray-500">
|
| 108 |
+
TTFB <span x-text="((stats.ttfb?.average_ms || 0) / 1000).toFixed(2)"></span>s avg
|
| 109 |
+
</div>
|
| 110 |
+
</div>
|
| 111 |
+
</div>
|
| 112 |
+
|
| 113 |
+
<!-- "Without Headroom" Counterfactual -->
|
| 114 |
+
<template x-if="(stats.cost?.cost_without_headroom_usd || 0) > 0">
|
| 115 |
+
<div class="bg-surface rounded-lg p-4 border border-border mb-6">
|
| 116 |
+
<div class="flex items-center justify-between">
|
| 117 |
+
<div>
|
| 118 |
+
<div class="text-sm font-medium text-gray-300 mb-2">Without Headroom</div>
|
| 119 |
+
<div class="flex items-center gap-8">
|
| 120 |
+
<div>
|
| 121 |
+
<div class="text-xs text-gray-500 mb-1">Input cost</div>
|
| 122 |
+
<div class="text-2xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.cost_with_headroom_usd || 0)"></div>
|
| 123 |
+
</div>
|
| 124 |
+
<div class="text-gray-500 text-2xl font-light">vs</div>
|
| 125 |
+
<div>
|
| 126 |
+
<div class="text-xs text-gray-500 mb-1">Input cost without Headroom</div>
|
| 127 |
+
<div class="text-2xl font-light tabular-nums text-red-400 line-through decoration-red-400/50" x-text="'$' + formatCurrency(stats.cost?.cost_without_headroom_usd || 0)"></div>
|
| 128 |
+
</div>
|
| 129 |
+
<div class="ml-auto text-right">
|
| 130 |
+
<div class="text-xs text-gray-500 mb-1">Total saved</div>
|
| 131 |
+
<div class="text-2xl font-light tabular-nums text-emerald-400" x-text="'$' + formatCurrency(stats.cost?.savings_usd || 0)"></div>
|
| 132 |
+
</div>
|
| 133 |
+
</div>
|
| 134 |
+
</div>
|
| 135 |
+
</div>
|
| 136 |
+
</div>
|
| 137 |
+
</template>
|
| 138 |
+
|
| 139 |
+
<!-- New: Waste Signal Breakdown + Cumulative Savings Trend -->
|
| 140 |
+
<div class="grid grid-cols-1 lg:grid-cols-2 gap-4 mb-6">
|
| 141 |
+
<!-- What Headroom Removed -->
|
| 142 |
+
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 143 |
+
<div class="text-sm font-medium mb-4 text-gray-300">What Headroom Removed</div>
|
| 144 |
+
<template x-if="Object.keys(stats.waste_signals || {}).length > 0">
|
| 145 |
+
<div class="space-y-3">
|
| 146 |
+
<template x-for="[signal, tokens] in sortedWasteSignals" :key="signal">
|
| 147 |
+
<div>
|
| 148 |
+
<div class="flex justify-between items-center mb-1">
|
| 149 |
+
<span class="text-sm text-gray-400" x-text="wasteSignalLabel(signal)"></span>
|
| 150 |
+
<span class="font-mono text-sm text-accent" x-text="formatNumber(tokens) + ' tokens'"></span>
|
| 151 |
+
</div>
|
| 152 |
+
<div class="w-full h-2 bg-border rounded-full overflow-hidden">
|
| 153 |
+
<div class="h-full rounded-full transition-all duration-500"
|
| 154 |
+
:class="wasteSignalColor(signal)"
|
| 155 |
+
:style="'width: ' + getWastePercent(tokens) + '%'"></div>
|
| 156 |
+
</div>
|
| 157 |
+
</div>
|
| 158 |
+
</template>
|
| 159 |
+
</div>
|
| 160 |
+
</template>
|
| 161 |
+
<template x-if="Object.keys(stats.waste_signals || {}).length === 0">
|
| 162 |
+
<div class="text-sm text-gray-500 italic py-8 text-center">
|
| 163 |
+
No waste signals detected yet. Data appears after requests are processed.
|
| 164 |
+
</div>
|
| 165 |
+
</template>
|
| 166 |
+
</div>
|
| 167 |
+
|
| 168 |
+
<!-- Cumulative Savings Trend -->
|
| 169 |
+
<div class="bg-surface rounded-lg p-4 border border-border">
|
| 170 |
+
<div class="flex justify-between items-center mb-4">
|
| 171 |
+
<span class="text-sm font-medium text-gray-300">Savings Over Time</span>
|
| 172 |
+
<span class="text-xs text-gray-500 font-mono" x-text="formatNumber(stats.tokens?.saved || 0) + ' tokens total'"></span>
|
| 173 |
</div>
|
| 174 |
+
<template x-if="(stats.savings_history || []).length >= 2">
|
| 175 |
+
<div class="h-32">
|
| 176 |
+
<svg class="w-full h-full" viewBox="0 0 200 64" preserveAspectRatio="none">
|
| 177 |
+
<defs>
|
| 178 |
+
<linearGradient id="trend-gradient" x1="0%" y1="0%" x2="0%" y2="100%">
|
| 179 |
+
<stop offset="0%" style="stop-color:#22d3ee;stop-opacity:0.2"/>
|
| 180 |
+
<stop offset="100%" style="stop-color:#22d3ee;stop-opacity:0"/>
|
| 181 |
+
</linearGradient>
|
| 182 |
+
</defs>
|
| 183 |
+
<path class="trend-area" :d="getTrendArea(stats.savings_history)"></path>
|
| 184 |
+
<path class="trend-line" :d="getTrendLine(stats.savings_history)"></path>
|
| 185 |
+
</svg>
|
| 186 |
+
</div>
|
| 187 |
+
</template>
|
| 188 |
+
<template x-if="(stats.savings_history || []).length < 2">
|
| 189 |
+
<div class="h-32 flex items-center justify-center text-sm text-gray-500 italic">
|
| 190 |
+
Trend data will appear after multiple requests.
|
| 191 |
+
</div>
|
| 192 |
+
</template>
|
| 193 |
</div>
|
| 194 |
</div>
|
| 195 |
|
|
|
|
| 200 |
<div class="text-sm font-medium mb-4 text-gray-300">Token Usage</div>
|
| 201 |
<div class="space-y-3">
|
| 202 |
<div class="flex justify-between items-center">
|
| 203 |
+
<span class="text-sm text-gray-400">Before Compression</span>
|
| 204 |
+
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.total_before_compression || 0)"></span>
|
| 205 |
</div>
|
| 206 |
<div class="flex justify-between items-center">
|
| 207 |
+
<span class="text-sm text-gray-400">After Compression (sent)</span>
|
| 208 |
+
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.input || 0)"></span>
|
| 209 |
</div>
|
| 210 |
<div class="border-t border-border my-2"></div>
|
| 211 |
<div class="flex justify-between items-center">
|
| 212 |
+
<span class="text-sm text-gray-400">Output Tokens</span>
|
| 213 |
+
<span class="font-mono text-sm" x-text="formatNumber(stats.tokens?.output || 0)"></span>
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
</div>
|
| 215 |
</div>
|
| 216 |
</div>
|
|
|
|
| 250 |
<span class="font-mono text-sm" x-text="(stats.overhead?.min_ms || 0).toFixed(0) + ' - ' + (stats.overhead?.max_ms || 0).toFixed(0) + 'ms'"></span>
|
| 251 |
</div>
|
| 252 |
<div class="flex justify-between items-center">
|
| 253 |
+
<span class="text-sm text-gray-400">TTFB (upstream)</span>
|
| 254 |
+
<span class="font-mono text-sm" x-text="((stats.ttfb?.average_ms || 0) / 1000).toFixed(2) + 's avg'"></span>
|
| 255 |
+
</div>
|
| 256 |
+
<div class="flex justify-between items-center">
|
| 257 |
+
<span class="text-sm text-gray-400">TTFB Range</span>
|
| 258 |
+
<span class="font-mono text-sm" x-text="((stats.ttfb?.min_ms || 0) / 1000).toFixed(2) + ' - ' + ((stats.ttfb?.max_ms || 0) / 1000).toFixed(2) + 's'"></span>
|
| 259 |
</div>
|
| 260 |
<div class="flex justify-between items-center">
|
| 261 |
<span class="text-sm text-gray-400">Failed Requests</span>
|
| 262 |
<span class="font-mono text-sm" x-text="stats.requests?.failed || 0"></span>
|
| 263 |
</div>
|
| 264 |
+
<!-- Per-transform timing breakdown -->
|
| 265 |
+
<template x-if="Object.keys(stats.pipeline_timing || {}).length > 0">
|
| 266 |
+
<div>
|
| 267 |
+
<div class="border-t border-border my-2"></div>
|
| 268 |
+
<div class="text-xs text-gray-500 uppercase tracking-wide mb-2">Pipeline Breakdown</div>
|
| 269 |
+
<template x-for="[name, t] in Object.entries(stats.pipeline_timing || {})" :key="name">
|
| 270 |
+
<div class="flex justify-between items-center mb-1">
|
| 271 |
+
<span class="text-xs text-gray-400 font-mono truncate mr-2" x-text="name"></span>
|
| 272 |
+
<span class="text-xs font-mono whitespace-nowrap"
|
| 273 |
+
:class="t.average_ms > 100 ? 'text-amber-400' : t.average_ms > 50 ? 'text-yellow-400' : 'text-gray-400'"
|
| 274 |
+
x-text="t.average_ms.toFixed(0) + 'ms avg / ' + t.max_ms.toFixed(0) + 'ms max'"></span>
|
| 275 |
+
</div>
|
| 276 |
+
</template>
|
| 277 |
+
</div>
|
| 278 |
+
</template>
|
| 279 |
</div>
|
| 280 |
</div>
|
| 281 |
</div>
|
| 282 |
|
| 283 |
+
<!-- Per-Model Savings Breakdown -->
|
| 284 |
+
<template x-if="Object.keys(stats.cost?.per_model || {}).length > 0">
|
| 285 |
+
<div class="bg-surface rounded-lg border border-border overflow-hidden mb-6">
|
| 286 |
+
<div class="px-4 py-3 border-b border-border flex justify-between items-center">
|
| 287 |
+
<span class="text-sm font-medium text-gray-300">Per-Model Token Savings</span>
|
| 288 |
+
<span class="text-xs text-gray-500">Exact tokens saved per model</span>
|
| 289 |
+
</div>
|
| 290 |
+
<div class="overflow-x-auto">
|
| 291 |
+
<table class="w-full text-sm">
|
| 292 |
+
<thead>
|
| 293 |
+
<tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
|
| 294 |
+
<th class="px-4 py-3 font-medium">Model</th>
|
| 295 |
+
<th class="px-4 py-3 font-medium text-right">Requests</th>
|
| 296 |
+
<th class="px-4 py-3 font-medium text-right">Tokens Saved</th>
|
| 297 |
+
<th class="px-4 py-3 font-medium text-right">Tokens Sent</th>
|
| 298 |
+
<th class="px-4 py-3 font-medium text-right">Reduction</th>
|
| 299 |
+
</tr>
|
| 300 |
+
</thead>
|
| 301 |
+
<tbody class="divide-y divide-border">
|
| 302 |
+
<template x-for="[model, info] in Object.entries(stats.cost?.per_model || {})" :key="model">
|
| 303 |
+
<tr class="hover:bg-border/30 transition-colors">
|
| 304 |
+
<td class="px-4 py-3">
|
| 305 |
+
<span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(model)"></span>
|
| 306 |
+
</td>
|
| 307 |
+
<td class="px-4 py-3 text-right font-mono" x-text="info.requests"></td>
|
| 308 |
+
<td class="px-4 py-3 text-right font-mono text-accent" x-text="formatNumber(info.tokens_saved)"></td>
|
| 309 |
+
<td class="px-4 py-3 text-right font-mono" x-text="formatNumber(info.tokens_sent)"></td>
|
| 310 |
+
<td class="px-4 py-3 text-right">
|
| 311 |
+
<span class="text-accent font-mono" x-text="info.reduction_pct.toFixed(1) + '%'"></span>
|
| 312 |
+
</td>
|
| 313 |
+
</tr>
|
| 314 |
+
</template>
|
| 315 |
+
</tbody>
|
| 316 |
+
</table>
|
| 317 |
+
</div>
|
| 318 |
+
</div>
|
| 319 |
+
</template>
|
| 320 |
+
|
| 321 |
+
<!-- Recent Requests Table (with expandable rows) -->
|
| 322 |
<div class="bg-surface rounded-lg border border-border overflow-hidden">
|
| 323 |
<div class="px-4 py-3 border-b border-border flex justify-between items-center">
|
| 324 |
<span class="text-sm font-medium text-gray-300">Recent Requests</span>
|
| 325 |
+
<span class="text-xs text-gray-500">Last 10 — click row to expand</span>
|
| 326 |
</div>
|
| 327 |
<div class="overflow-x-auto">
|
| 328 |
<table class="w-full text-sm">
|
| 329 |
<thead>
|
| 330 |
<tr class="text-left text-xs text-gray-500 uppercase tracking-wide">
|
| 331 |
+
<th class="px-4 py-3 font-medium w-6"></th>
|
| 332 |
<th class="px-4 py-3 font-medium">Time</th>
|
| 333 |
<th class="px-4 py-3 font-medium">Model</th>
|
| 334 |
<th class="px-4 py-3 font-medium text-right">Input</th>
|
| 335 |
<th class="px-4 py-3 font-medium text-right">Output</th>
|
| 336 |
<th class="px-4 py-3 font-medium text-right">Saved</th>
|
| 337 |
+
<th class="px-4 py-3 font-medium text-right">Quality</th>
|
| 338 |
<th class="px-4 py-3 font-medium text-right">Latency</th>
|
| 339 |
</tr>
|
| 340 |
</thead>
|
| 341 |
<tbody class="divide-y divide-border">
|
| 342 |
<template x-for="req in (stats.recent_requests || [])" :key="req.request_id">
|
| 343 |
+
<tr>
|
| 344 |
+
<td colspan="8" class="p-0">
|
| 345 |
+
<div class="cursor-pointer" @click="toggleExpanded(req.request_id)">
|
| 346 |
+
<div class="flex hover:bg-border/30 transition-colors">
|
| 347 |
+
<div class="px-4 py-3 w-6 text-gray-500">
|
| 348 |
+
<span x-text="expandedRows[req.request_id] ? '-' : '+'"></span>
|
| 349 |
+
</div>
|
| 350 |
+
<div class="px-4 py-3 font-mono text-gray-400 flex-1" x-text="formatTime(req.timestamp)"></div>
|
| 351 |
+
<div class="px-4 py-3 flex-1">
|
| 352 |
+
<span class="px-2 py-0.5 bg-border rounded text-xs" x-text="truncateModel(req.model)"></span>
|
| 353 |
+
</div>
|
| 354 |
+
<div class="px-4 py-3 text-right font-mono flex-1" x-text="formatNumber(req.input_tokens_optimized)"></div>
|
| 355 |
+
<div class="px-4 py-3 text-right font-mono flex-1" x-text="formatNumber(req.output_tokens || 0)"></div>
|
| 356 |
+
<div class="px-4 py-3 text-right flex-1">
|
| 357 |
+
<span class="text-accent font-mono" x-text="req.savings_percent.toFixed(0) + '%'"></span>
|
| 358 |
+
</div>
|
| 359 |
+
<div class="px-4 py-3 text-right flex-1">
|
| 360 |
+
<span class="w-2 h-2 rounded-full inline-block" :class="getRequestConfidenceColor(req)"></span>
|
| 361 |
+
</div>
|
| 362 |
+
<div class="px-4 py-3 text-right font-mono text-gray-400 flex-1" x-text="(req.total_latency_ms || 0).toFixed(0) + 'ms'"></div>
|
| 363 |
+
</div>
|
| 364 |
+
</div>
|
| 365 |
+
<!-- Expanded detail row -->
|
| 366 |
+
<template x-if="expandedRows[req.request_id]">
|
| 367 |
+
<div class="px-8 py-4 bg-[#151515] border-t border-border">
|
| 368 |
+
<div class="grid grid-cols-2 lg:grid-cols-4 gap-4 text-xs">
|
| 369 |
+
<div>
|
| 370 |
+
<div class="text-gray-500 uppercase tracking-wide mb-1">Original Tokens</div>
|
| 371 |
+
<div class="font-mono" x-text="formatNumber(req.input_tokens_original)"></div>
|
| 372 |
+
</div>
|
| 373 |
+
<div>
|
| 374 |
+
<div class="text-gray-500 uppercase tracking-wide mb-1">Compressed Tokens</div>
|
| 375 |
+
<div class="font-mono" x-text="formatNumber(req.input_tokens_optimized)"></div>
|
| 376 |
+
</div>
|
| 377 |
+
<div>
|
| 378 |
+
<div class="text-gray-500 uppercase tracking-wide mb-1">Tokens Removed</div>
|
| 379 |
+
<div class="font-mono text-accent" x-text="formatNumber(req.tokens_saved)"></div>
|
| 380 |
+
</div>
|
| 381 |
+
<div>
|
| 382 |
+
<div class="text-gray-500 uppercase tracking-wide mb-1">Optimization Time</div>
|
| 383 |
+
<div class="font-mono" x-text="(req.optimization_latency_ms || 0).toFixed(0) + 'ms'"></div>
|
| 384 |
+
</div>
|
| 385 |
+
</div>
|
| 386 |
+
<!-- Transforms Applied -->
|
| 387 |
+
<template x-if="(req.transforms_applied || []).length > 0">
|
| 388 |
+
<div class="mt-3">
|
| 389 |
+
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Transforms Applied</div>
|
| 390 |
+
<div class="flex flex-wrap gap-1">
|
| 391 |
+
<template x-for="t in req.transforms_applied" :key="t">
|
| 392 |
+
<span class="px-2 py-0.5 bg-border rounded text-xs font-mono" x-text="t"></span>
|
| 393 |
+
</template>
|
| 394 |
+
</div>
|
| 395 |
+
</div>
|
| 396 |
+
</template>
|
| 397 |
+
<!-- Waste Signals for this request -->
|
| 398 |
+
<template x-if="req.waste_signals && Object.keys(req.waste_signals).length > 0">
|
| 399 |
+
<div class="mt-3">
|
| 400 |
+
<div class="text-xs text-gray-500 uppercase tracking-wide mb-1">Waste Detected</div>
|
| 401 |
+
<div class="flex flex-wrap gap-2">
|
| 402 |
+
<template x-for="[signal, tokens] in Object.entries(req.waste_signals).filter(([,v]) => v > 0)" :key="signal">
|
| 403 |
+
<span class="px-2 py-0.5 rounded text-xs font-mono"
|
| 404 |
+
:class="wasteSignalBadgeColor(signal)"
|
| 405 |
+
x-text="wasteSignalLabel(signal) + ': ' + formatNumber(tokens)"></span>
|
| 406 |
+
</template>
|
| 407 |
+
</div>
|
| 408 |
+
</div>
|
| 409 |
+
</template>
|
| 410 |
+
</div>
|
| 411 |
+
</template>
|
| 412 |
</td>
|
|
|
|
|
|
|
| 413 |
</tr>
|
| 414 |
</template>
|
| 415 |
<template x-if="(stats.recent_requests || []).length === 0">
|
| 416 |
<tr>
|
| 417 |
+
<td colspan="8" class="px-4 py-8 text-center text-gray-500 italic">
|
| 418 |
No requests yet. Start using the proxy to see activity here.
|
| 419 |
</td>
|
| 420 |
</tr>
|
|
|
|
| 424 |
</div>
|
| 425 |
</div>
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
</main>
|
| 428 |
|
| 429 |
<!-- Footer -->
|
|
|
|
| 447 |
lastUpdate: 'never',
|
| 448 |
requestHistory: [],
|
| 449 |
savingsHistory: [],
|
| 450 |
+
expandedRows: {},
|
| 451 |
pollInterval: null,
|
| 452 |
|
| 453 |
async init() {
|
|
|
|
| 489 |
}
|
| 490 |
},
|
| 491 |
|
| 492 |
+
// --- Formatting ---
|
| 493 |
+
|
| 494 |
formatNumber(n) {
|
| 495 |
if (n >= 1000000) return (n / 1000000).toFixed(1) + 'M';
|
| 496 |
if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
|
| 497 |
return n.toString();
|
| 498 |
},
|
| 499 |
|
| 500 |
+
formatCurrency(n) {
|
| 501 |
+
if (n < 0) return '-' + this.formatCurrency(-n);
|
| 502 |
+
if (n >= 1000) return (n / 1000).toFixed(1) + 'k';
|
| 503 |
+
if (n >= 1) return n.toFixed(2);
|
| 504 |
+
if (n >= 0.01) return n.toFixed(3);
|
| 505 |
+
if (n > 0) return n.toFixed(4);
|
| 506 |
+
return '0.00';
|
| 507 |
},
|
| 508 |
|
| 509 |
formatTime(ts) {
|
|
|
|
| 518 |
|
| 519 |
truncateModel(model) {
|
| 520 |
if (!model) return '-';
|
|
|
|
| 521 |
return model.replace(/^(anthropic\.|openai\.|bedrock\/)/, '')
|
| 522 |
.replace(/-\d{8}$/, '')
|
| 523 |
.substring(0, 20);
|
| 524 |
},
|
| 525 |
|
| 526 |
+
// --- Waste Signals ---
|
| 527 |
+
|
| 528 |
+
wasteSignalLabel(signal) {
|
| 529 |
+
const labels = {
|
| 530 |
+
json_bloat: 'JSON Bloat',
|
| 531 |
+
html_noise: 'HTML Noise',
|
| 532 |
+
base64: 'Base64 Blobs',
|
| 533 |
+
whitespace: 'Whitespace',
|
| 534 |
+
dynamic_date: 'Dynamic Dates',
|
| 535 |
+
repetition: 'Repetition',
|
| 536 |
+
};
|
| 537 |
+
return labels[signal] || signal;
|
| 538 |
+
},
|
| 539 |
+
|
| 540 |
+
wasteSignalColor(signal) {
|
| 541 |
+
const colors = {
|
| 542 |
+
json_bloat: 'bg-amber-500',
|
| 543 |
+
html_noise: 'bg-orange-500',
|
| 544 |
+
base64: 'bg-red-500',
|
| 545 |
+
whitespace: 'bg-blue-500',
|
| 546 |
+
dynamic_date: 'bg-purple-500',
|
| 547 |
+
repetition: 'bg-pink-500',
|
| 548 |
+
};
|
| 549 |
+
return colors[signal] || 'bg-gray-500';
|
| 550 |
+
},
|
| 551 |
+
|
| 552 |
+
wasteSignalBadgeColor(signal) {
|
| 553 |
+
const colors = {
|
| 554 |
+
json_bloat: 'bg-amber-500/20 text-amber-400',
|
| 555 |
+
html_noise: 'bg-orange-500/20 text-orange-400',
|
| 556 |
+
base64: 'bg-red-500/20 text-red-400',
|
| 557 |
+
whitespace: 'bg-blue-500/20 text-blue-400',
|
| 558 |
+
dynamic_date: 'bg-purple-500/20 text-purple-400',
|
| 559 |
+
repetition: 'bg-pink-500/20 text-pink-400',
|
| 560 |
+
};
|
| 561 |
+
return colors[signal] || 'bg-gray-500/20 text-gray-400';
|
| 562 |
+
},
|
| 563 |
+
|
| 564 |
+
get sortedWasteSignals() {
|
| 565 |
+
const signals = this.stats.waste_signals || {};
|
| 566 |
+
return Object.entries(signals)
|
| 567 |
+
.filter(([, v]) => v > 0)
|
| 568 |
+
.sort((a, b) => b[1] - a[1]);
|
| 569 |
+
},
|
| 570 |
+
|
| 571 |
+
getWastePercent(tokens) {
|
| 572 |
+
const signals = this.stats.waste_signals || {};
|
| 573 |
+
const max = Math.max(...Object.values(signals), 1);
|
| 574 |
+
return Math.min((tokens / max) * 100, 100);
|
| 575 |
+
},
|
| 576 |
+
|
| 577 |
+
// --- Compression Confidence ---
|
| 578 |
+
|
| 579 |
+
get confidenceLevel() {
|
| 580 |
+
const saved = this.stats.tokens?.saved || 0;
|
| 581 |
+
if (saved === 0) return 'none';
|
| 582 |
+
const signals = this.stats.waste_signals || {};
|
| 583 |
+
const totalWaste = Object.values(signals).reduce((a, b) => a + b, 0);
|
| 584 |
+
if (totalWaste === 0) return 'unknown';
|
| 585 |
+
const wasteRatio = totalWaste / saved;
|
| 586 |
+
if (wasteRatio >= 0.7) return 'high';
|
| 587 |
+
if (wasteRatio >= 0.3) return 'medium';
|
| 588 |
+
return 'low';
|
| 589 |
+
},
|
| 590 |
+
|
| 591 |
+
get confidenceColor() {
|
| 592 |
+
const c = { high: 'bg-emerald-400', medium: 'bg-yellow-400', low: 'bg-red-400', none: 'bg-gray-500', unknown: 'bg-gray-500' };
|
| 593 |
+
return c[this.confidenceLevel];
|
| 594 |
+
},
|
| 595 |
+
|
| 596 |
+
get confidenceTextColor() {
|
| 597 |
+
const c = { high: 'text-emerald-400', medium: 'text-yellow-400', low: 'text-red-400', none: 'text-gray-500', unknown: 'text-gray-500' };
|
| 598 |
+
return c[this.confidenceLevel];
|
| 599 |
+
},
|
| 600 |
+
|
| 601 |
+
get confidenceLabel() {
|
| 602 |
+
const l = { high: 'High', medium: 'Medium', low: 'Low', none: '-', unknown: '-' };
|
| 603 |
+
return l[this.confidenceLevel];
|
| 604 |
+
},
|
| 605 |
+
|
| 606 |
+
get confidenceDetail() {
|
| 607 |
+
const saved = this.stats.tokens?.saved || 0;
|
| 608 |
+
if (saved === 0) return 'No compression yet';
|
| 609 |
+
const signals = this.stats.waste_signals || {};
|
| 610 |
+
const totalWaste = Object.values(signals).reduce((a, b) => a + b, 0);
|
| 611 |
+
if (totalWaste === 0) return 'No waste signals detected';
|
| 612 |
+
const pct = Math.round((totalWaste / saved) * 100);
|
| 613 |
+
return pct + '% of removed tokens were identified waste';
|
| 614 |
+
},
|
| 615 |
+
|
| 616 |
+
getRequestConfidenceColor(req) {
|
| 617 |
+
if (!req.waste_signals || req.tokens_saved === 0) return 'bg-gray-500';
|
| 618 |
+
const totalWaste = Object.values(req.waste_signals).reduce((a, b) => a + b, 0);
|
| 619 |
+
const ratio = totalWaste / req.tokens_saved;
|
| 620 |
+
if (ratio >= 0.7) return 'bg-emerald-400';
|
| 621 |
+
if (ratio >= 0.3) return 'bg-yellow-400';
|
| 622 |
+
return 'bg-red-400';
|
| 623 |
+
},
|
| 624 |
+
|
| 625 |
+
// --- Expandable Rows ---
|
| 626 |
+
|
| 627 |
+
toggleExpanded(id) {
|
| 628 |
+
this.expandedRows[id] = !this.expandedRows[id];
|
| 629 |
+
},
|
| 630 |
+
|
| 631 |
+
// --- Charts ---
|
| 632 |
+
|
| 633 |
getProviderPercent(count) {
|
| 634 |
const total = this.stats.requests?.total || 1;
|
| 635 |
return Math.min((count / total) * 100, 100);
|
| 636 |
},
|
| 637 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
getSparkline(data) {
|
| 639 |
if (!data || data.length < 2) return '';
|
| 640 |
const min = Math.min(...data);
|
|
|
|
| 655 |
const line = this.getSparkline(data);
|
| 656 |
if (!line) return '';
|
| 657 |
return line + ` L100,32 L0,32 Z`;
|
| 658 |
+
},
|
| 659 |
+
|
| 660 |
+
getTrendLine(history) {
|
| 661 |
+
if (!history || history.length < 2) return '';
|
| 662 |
+
const values = history.map(h => h[1]);
|
| 663 |
+
const min = Math.min(...values);
|
| 664 |
+
const max = Math.max(...values);
|
| 665 |
+
const range = max - min || 1;
|
| 666 |
+
|
| 667 |
+
const points = values.map((v, i) => {
|
| 668 |
+
const x = (i / (values.length - 1)) * 200;
|
| 669 |
+
const y = 60 - ((v - min) / range) * 56;
|
| 670 |
+
return `${x},${y}`;
|
| 671 |
+
});
|
| 672 |
+
|
| 673 |
+
return 'M' + points.join(' L');
|
| 674 |
+
},
|
| 675 |
+
|
| 676 |
+
getTrendArea(history) {
|
| 677 |
+
if (!history || history.length < 2) return '';
|
| 678 |
+
const line = this.getTrendLine(history);
|
| 679 |
+
if (!line) return '';
|
| 680 |
+
return line + ` L200,64 L0,64 Z`;
|
| 681 |
+
},
|
| 682 |
};
|
| 683 |
}
|
| 684 |
</script>
|
|
@@ -2,14 +2,21 @@
|
|
| 2 |
|
| 3 |
Parses PERF log lines from ~/.headroom/logs/proxy.log* and produces
|
| 4 |
actionable reports on token savings, cache efficiency, and transform impact.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"""
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
|
|
|
| 9 |
import re
|
| 10 |
from dataclasses import dataclass, field
|
| 11 |
from pathlib import Path
|
| 12 |
|
|
|
|
|
|
|
| 13 |
LOG_DIR = Path.home() / ".headroom" / "logs"
|
| 14 |
|
| 15 |
# Matches: 2026-03-07 13:38:31,009 - headroom.proxy - INFO - [hr_...] PERF model=... ...
|
|
@@ -38,6 +45,90 @@ _TOIN_RE = re.compile(
|
|
| 38 |
)
|
| 39 |
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
def _parse_kv(kv_str: str) -> dict[str, str]:
|
| 42 |
"""Parse key=value pairs from a PERF log line.
|
| 43 |
|
|
@@ -276,16 +367,35 @@ def format_report(report: PerfReport) -> str:
|
|
| 276 |
total_saved = sum(r.tokens_saved for r in records)
|
| 277 |
pct = (total_saved / total_before * 100) if total_before > 0 else 0
|
| 278 |
|
| 279 |
-
models = {r.model for r in records}
|
| 280 |
lines.append(f"Requests: {len(records)}")
|
| 281 |
-
lines.append(f"
|
| 282 |
-
lines.append(
|
| 283 |
-
f"Tokens: {total_before:,} input -> {total_after:,} after transforms "
|
| 284 |
-
f"({pct:.1f}% reduction)"
|
| 285 |
-
)
|
| 286 |
lines.append(f"Total saved: {total_saved:,} tokens")
|
| 287 |
lines.append("")
|
| 288 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
# Cache analysis
|
| 290 |
cache_records = [r for r in records if (r.cache_read + r.cache_write) > 0]
|
| 291 |
if cache_records:
|
|
|
|
| 2 |
|
| 3 |
Parses PERF log lines from ~/.headroom/logs/proxy.log* and produces
|
| 4 |
actionable reports on token savings, cache efficiency, and transform impact.
|
| 5 |
+
|
| 6 |
+
Cost accounting is **cache-aware**: saved tokens that would have been served
|
| 7 |
+
from the provider's prompt cache are valued at cache_read price (~10% for
|
| 8 |
+
Anthropic), not the full input price. This prevents overstating dollar savings.
|
| 9 |
"""
|
| 10 |
|
| 11 |
from __future__ import annotations
|
| 12 |
|
| 13 |
+
import logging
|
| 14 |
import re
|
| 15 |
from dataclasses import dataclass, field
|
| 16 |
from pathlib import Path
|
| 17 |
|
| 18 |
+
log = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
LOG_DIR = Path.home() / ".headroom" / "logs"
|
| 21 |
|
| 22 |
# Matches: 2026-03-07 13:38:31,009 - headroom.proxy - INFO - [hr_...] PERF model=... ...
|
|
|
|
| 45 |
)
|
| 46 |
|
| 47 |
|
| 48 |
+
# ---------------------------------------------------------------------------
|
| 49 |
+
# Cache-aware pricing via LiteLLM
|
| 50 |
+
# ---------------------------------------------------------------------------
|
| 51 |
+
|
| 52 |
+
# LiteLLM already knows per-token costs for 100+ models including
|
| 53 |
+
# cache_read and cache_creation pricing. We call it directly instead
|
| 54 |
+
# of maintaining our own pricing tables.
|
| 55 |
+
|
| 56 |
+
try:
|
| 57 |
+
import litellm as _litellm
|
| 58 |
+
|
| 59 |
+
_LITELLM_AVAILABLE = True
|
| 60 |
+
except ImportError:
|
| 61 |
+
_LITELLM_AVAILABLE = False
|
| 62 |
+
|
| 63 |
+
# Cache resolved model names (e.g. "claude-opus-4-6" → "anthropic/claude-opus-4-6")
|
| 64 |
+
_resolved_model_cache: dict[str, str] = {}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def _resolve_model(model: str) -> str:
    """Return the name under which LiteLLM prices *model*.

    The name is used as-is when LiteLLM already lists it (or when LiteLLM is
    not installed). Otherwise each known provider prefix is tried in order and
    the first prefixed name found in ``litellm.model_cost`` wins. If nothing
    matches, the input is returned unchanged. Every outcome is memoized in
    ``_resolved_model_cache``.

    TODO: Duplicated with CostTracker._resolve_litellm_model in proxy/server.py.
    Extract to shared utility.
    """
    try:
        return _resolved_model_cache[model]
    except KeyError:
        pass

    resolved = model
    if _LITELLM_AVAILABLE and model not in _litellm.model_cost:
        prefixes = ("anthropic/", "openai/", "google/", "mistral/", "deepseek/")
        candidates = (f"{prefix}{model}" for prefix in prefixes)
        # First prefixed name LiteLLM knows; fall back to the bare name.
        resolved = next(
            (name for name in candidates if name in _litellm.model_cost),
            model,
        )

    _resolved_model_cache[model] = resolved
    return resolved
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _litellm_cost(
    model: str,
    prompt_tokens: int,
    cache_read_tokens: int = 0,
    cache_write_tokens: int = 0,
) -> float | None:
    """Compute input cost via litellm.cost_per_token (cache-aware).

    Args:
        model: Model name; resolved to a LiteLLM name via ``_resolve_model``.
        prompt_tokens: Prompt token count passed to LiteLLM.
        cache_read_tokens: Passed as ``cache_read_input_tokens``.
        cache_write_tokens: Passed as ``cache_creation_input_tokens``.

    Returns:
        Total input cost in USD, or None if LiteLLM is unavailable or the
        lookup fails (e.g. model not found).
    """
    if not _LITELLM_AVAILABLE:
        return None
    resolved = _resolve_model(model)
    try:
        input_cost, _ = _litellm.cost_per_token(
            model=resolved,
            prompt_tokens=prompt_tokens,
            completion_tokens=0,
            cache_read_input_tokens=cache_read_tokens,
            cache_creation_input_tokens=cache_write_tokens,
        )
        return float(input_cost)
    except Exception:
        # Best effort: pricing is optional for the report, so never raise.
        # Leave a debug trace so unexpected failures are not invisible.
        log.debug("LiteLLM cost lookup failed for model %r", resolved, exc_info=True)
        return None
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def _get_list_price(model: str) -> float | None:
    """Return the list input price of *model* in USD per 1M tokens.

    Returns None when LiteLLM is unavailable, the model has no entry in
    ``litellm.model_cost``, or its ``input_cost_per_token`` is missing or zero.
    """
    if not _LITELLM_AVAILABLE:
        return None
    pricing = _litellm.model_cost.get(_resolve_model(model), {})
    per_token = pricing.get("input_cost_per_token")
    if not per_token:
        return None
    return per_token * 1_000_000
|
| 130 |
+
|
| 131 |
+
|
| 132 |
def _parse_kv(kv_str: str) -> dict[str, str]:
|
| 133 |
"""Parse key=value pairs from a PERF log line.
|
| 134 |
|
|
|
|
| 367 |
total_saved = sum(r.tokens_saved for r in records)
|
| 368 |
pct = (total_saved / total_before * 100) if total_before > 0 else 0
|
| 369 |
|
|
|
|
| 370 |
lines.append(f"Requests: {len(records)}")
|
| 371 |
+
lines.append(f"Tokens: {total_before:,} -> {total_after:,} ({pct:.1f}% reduction)")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
lines.append(f"Total saved: {total_saved:,} tokens")
|
| 373 |
lines.append("")
|
| 374 |
|
| 375 |
+
# Per-model breakdown with list prices
|
| 376 |
+
by_model: dict[str, list[PerfRecord]] = {}
|
| 377 |
+
for r in records:
|
| 378 |
+
by_model.setdefault(r.model, []).append(r)
|
| 379 |
+
|
| 380 |
+
lines.append("Per-Model Breakdown")
|
| 381 |
+
lines.append("-" * 40)
|
| 382 |
+
for model, model_recs in sorted(by_model.items()):
|
| 383 |
+
m_saved = sum(r.tokens_saved for r in model_recs)
|
| 384 |
+
m_before = sum(r.tokens_before for r in model_recs)
|
| 385 |
+
m_pct = (m_saved / m_before * 100) if m_before > 0 else 0
|
| 386 |
+
list_price = _get_list_price(model)
|
| 387 |
+
price_str = f"${list_price:.2f}/MTok" if list_price else "unknown"
|
| 388 |
+
est_str = (
|
| 389 |
+
f" ~${m_saved * list_price / 1_000_000:.2f} at list price" if list_price else ""
|
| 390 |
+
)
|
| 391 |
+
lines.append(
|
| 392 |
+
f" {model}: {len(model_recs)} reqs, "
|
| 393 |
+
f"{m_saved:,} tokens saved ({m_pct:.0f}%), "
|
| 394 |
+
f"list price {price_str}{est_str}"
|
| 395 |
+
)
|
| 396 |
+
lines.append(" * Actual bill savings depend on provider caching behavior")
|
| 397 |
+
lines.append("")
|
| 398 |
+
|
| 399 |
# Cache analysis
|
| 400 |
cache_records = [r for r in records if (r.cache_read + r.cache_write) > 0]
|
| 401 |
if cache_records:
|
|
@@ -216,10 +216,6 @@ class RequestLog:
|
|
| 216 |
tokens_saved: int
|
| 217 |
savings_percent: float
|
| 218 |
|
| 219 |
-
# Cost
|
| 220 |
-
estimated_cost_usd: float | None
|
| 221 |
-
estimated_savings_usd: float | None
|
| 222 |
-
|
| 223 |
# Performance
|
| 224 |
optimization_latency_ms: float
|
| 225 |
total_latency_ms: float | None
|
|
@@ -229,6 +225,9 @@ class RequestLog:
|
|
| 229 |
cache_hit: bool
|
| 230 |
transforms_applied: list[str]
|
| 231 |
|
|
|
|
|
|
|
|
|
|
| 232 |
# Request/Response (optional, for debugging)
|
| 233 |
request_messages: list[dict] | None = None
|
| 234 |
response_content: str | None = None
|
|
@@ -601,10 +600,13 @@ class CostTracker:
|
|
| 601 |
|
| 602 |
# Cost tracking - using deque for efficient left-side removal
|
| 603 |
self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
|
| 604 |
-
self._total_cost_usd: float = 0
|
| 605 |
-
self._total_savings_usd: float = 0
|
| 606 |
self._last_prune_time: datetime = datetime.now()
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
# Cache resolved model names to avoid repeated litellm lookups.
|
| 609 |
# This is critical: litellm.cost_per_token() is synchronous and can block
|
| 610 |
# the async event loop if it triggers I/O (lazy model info download).
|
|
@@ -667,83 +669,34 @@ class CostTracker:
|
|
| 667 |
) -> float | None:
|
| 668 |
"""Estimate cost in USD using LiteLLM's pricing database.
|
| 669 |
|
|
|
|
|
|
|
|
|
|
| 670 |
Args:
|
| 671 |
model: Model name for pricing lookup
|
| 672 |
-
input_tokens:
|
| 673 |
output_tokens: Output tokens
|
| 674 |
-
cache_read_tokens: Tokens
|
| 675 |
-
cache_write_tokens: Tokens written to cache
|
| 676 |
"""
|
| 677 |
if not LITELLM_AVAILABLE:
|
| 678 |
logger.warning("LiteLLM not available - cannot calculate costs")
|
| 679 |
return None
|
| 680 |
|
| 681 |
try:
|
| 682 |
-
# Resolve model name (adds provider prefix if needed, e.g. claude-opus-4-6 → anthropic/claude-opus-4-6)
|
| 683 |
resolved_model = self._resolve_litellm_model(model)
|
| 684 |
|
| 685 |
-
# cost_per_token
|
| 686 |
-
#
|
| 687 |
-
|
| 688 |
-
# Anthropic's token semantics (all three are SEPARATE, not overlapping):
|
| 689 |
-
# - input_tokens: tokens sent that are NOT cached (neither read nor written)
|
| 690 |
-
# - cache_read_input_tokens: tokens served from existing cache
|
| 691 |
-
# - cache_creation_input_tokens: tokens being written to cache
|
| 692 |
-
# Total billable = input_tokens + cache_read + cache_write (each at different rates)
|
| 693 |
-
regular_input = input_tokens # Don't subtract cache_write, they're separate
|
| 694 |
-
|
| 695 |
-
# Get cost for regular (non-cached) input tokens
|
| 696 |
-
input_cost, _ = litellm.cost_per_token(
|
| 697 |
-
model=resolved_model,
|
| 698 |
-
prompt_tokens=regular_input,
|
| 699 |
-
completion_tokens=0,
|
| 700 |
-
)
|
| 701 |
-
|
| 702 |
-
# Get cost for output tokens
|
| 703 |
-
_, output_cost = litellm.cost_per_token(
|
| 704 |
model=resolved_model,
|
| 705 |
-
prompt_tokens=
|
| 706 |
completion_tokens=output_tokens,
|
|
|
|
|
|
|
| 707 |
)
|
| 708 |
|
| 709 |
-
|
| 710 |
-
model_info: dict[str, Any] = {}
|
| 711 |
-
try:
|
| 712 |
-
model_info = dict(litellm.get_model_info(resolved_model))
|
| 713 |
-
except Exception:
|
| 714 |
-
pass
|
| 715 |
-
|
| 716 |
-
# Calculate cache read cost (typically 10% of input price)
|
| 717 |
-
cache_read_cost = 0.0
|
| 718 |
-
if cache_read_tokens > 0:
|
| 719 |
-
cache_read_cost_per_token = model_info.get("cache_read_input_token_cost")
|
| 720 |
-
if cache_read_cost_per_token:
|
| 721 |
-
cache_read_cost = cache_read_tokens * cache_read_cost_per_token
|
| 722 |
-
else:
|
| 723 |
-
# Fallback: most providers charge ~10% of input price for cache reads
|
| 724 |
-
cache_read_full_cost, _ = litellm.cost_per_token(
|
| 725 |
-
model=resolved_model,
|
| 726 |
-
prompt_tokens=cache_read_tokens,
|
| 727 |
-
completion_tokens=0,
|
| 728 |
-
)
|
| 729 |
-
cache_read_cost = cache_read_full_cost * 0.1
|
| 730 |
-
|
| 731 |
-
# Calculate cache write cost (typically 125% of input price)
|
| 732 |
-
cache_write_cost = 0.0
|
| 733 |
-
if cache_write_tokens > 0:
|
| 734 |
-
cache_write_cost_per_token = model_info.get("cache_creation_input_token_cost")
|
| 735 |
-
if cache_write_cost_per_token:
|
| 736 |
-
cache_write_cost = cache_write_tokens * cache_write_cost_per_token
|
| 737 |
-
else:
|
| 738 |
-
# Fallback: most providers charge ~125% of input price for cache writes
|
| 739 |
-
cache_write_full_cost, _ = litellm.cost_per_token(
|
| 740 |
-
model=resolved_model,
|
| 741 |
-
prompt_tokens=cache_write_tokens,
|
| 742 |
-
completion_tokens=0,
|
| 743 |
-
)
|
| 744 |
-
cache_write_cost = cache_write_full_cost * 1.25
|
| 745 |
-
|
| 746 |
-
total_cost = input_cost + cache_read_cost + cache_write_cost + output_cost
|
| 747 |
return float(total_cost) if total_cost > 0 else None
|
| 748 |
|
| 749 |
except Exception as e:
|
|
@@ -769,16 +722,13 @@ class CostTracker:
|
|
| 769 |
while self._costs and self._costs[0][0] < cutoff:
|
| 770 |
self._costs.popleft()
|
| 771 |
|
| 772 |
-
def
|
| 773 |
-
"""Record
|
| 774 |
-
self.
|
| 775 |
-
|
| 776 |
-
|
| 777 |
-
self.
|
| 778 |
-
|
| 779 |
-
def record_savings(self, savings_usd: float):
|
| 780 |
-
"""Record savings from optimization."""
|
| 781 |
-
self._total_savings_usd += savings_usd
|
| 782 |
|
| 783 |
def get_period_cost(self) -> float:
|
| 784 |
"""Get cost for current budget period."""
|
|
@@ -802,17 +752,55 @@ class CostTracker:
|
|
| 802 |
remaining = self.budget_limit_usd - period_cost
|
| 803 |
return remaining > 0, max(0, remaining)
|
| 804 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 805 |
def stats(self) -> dict:
|
| 806 |
-
"""Get
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
return {
|
| 808 |
-
"
|
| 809 |
-
"
|
| 810 |
-
"
|
| 811 |
-
"
|
| 812 |
-
"
|
| 813 |
-
"budget_remaining_usd": round(self.check_budget()[1], 4)
|
| 814 |
-
if self.budget_limit_usd
|
| 815 |
-
else None,
|
| 816 |
}
|
| 817 |
|
| 818 |
|
|
@@ -845,9 +833,24 @@ class PrometheusMetrics:
|
|
| 845 |
self.overhead_sum_ms = 0.0
|
| 846 |
self.overhead_min_ms = float("inf")
|
| 847 |
self.overhead_max_ms = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 848 |
|
| 849 |
-
|
| 850 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 851 |
|
| 852 |
self._lock = asyncio.Lock()
|
| 853 |
|
|
@@ -860,9 +863,10 @@ class PrometheusMetrics:
|
|
| 860 |
tokens_saved: int,
|
| 861 |
latency_ms: float,
|
| 862 |
cached: bool = False,
|
| 863 |
-
cost_usd: float = 0,
|
| 864 |
-
savings_usd: float = 0,
|
| 865 |
overhead_ms: float = 0,
|
|
|
|
|
|
|
|
|
|
| 866 |
):
|
| 867 |
"""Record metrics for a request."""
|
| 868 |
async with self._lock:
|
|
@@ -887,9 +891,34 @@ class PrometheusMetrics:
|
|
| 887 |
self.overhead_sum_ms += overhead_ms
|
| 888 |
self.overhead_min_ms = min(self.overhead_min_ms, overhead_ms)
|
| 889 |
self.overhead_max_ms = max(self.overhead_max_ms, overhead_ms)
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 893 |
|
| 894 |
async def record_rate_limited(self):
|
| 895 |
async with self._lock:
|
|
@@ -934,14 +963,6 @@ class PrometheusMetrics:
|
|
| 934 |
"# HELP headroom_latency_ms_sum Sum of request latencies",
|
| 935 |
"# TYPE headroom_latency_ms_sum counter",
|
| 936 |
f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
|
| 937 |
-
"",
|
| 938 |
-
"# HELP headroom_cost_usd_total Total cost in USD",
|
| 939 |
-
"# TYPE headroom_cost_usd_total counter",
|
| 940 |
-
f"headroom_cost_usd_total {self.cost_total_usd:.6f}",
|
| 941 |
-
"",
|
| 942 |
-
"# HELP headroom_savings_usd_total Total savings in USD",
|
| 943 |
-
"# TYPE headroom_savings_usd_total counter",
|
| 944 |
-
f"headroom_savings_usd_total {self.savings_total_usd:.6f}",
|
| 945 |
]
|
| 946 |
|
| 947 |
# Per-provider metrics
|
|
@@ -1406,6 +1427,14 @@ class HeadroomProxy:
|
|
| 1406 |
else:
|
| 1407 |
logger.info("Smart Routing: DISABLED (legacy sequential mode)")
|
| 1408 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1409 |
# LLMLingua status with helpful hint
|
| 1410 |
if self._llmlingua_status == "enabled":
|
| 1411 |
logger.info(
|
|
@@ -1474,8 +1503,6 @@ class HeadroomProxy:
|
|
| 1474 |
m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
|
| 1475 |
) * 100
|
| 1476 |
logger.info(f"Token savings: {savings_pct:.1f}%")
|
| 1477 |
-
logger.info(f"Total cost: ${m.cost_total_usd:.4f}")
|
| 1478 |
-
logger.info(f"Total savings: ${m.savings_total_usd:.4f}")
|
| 1479 |
if m.latency_count > 0:
|
| 1480 |
avg_latency = m.latency_sum_ms / m.latency_count
|
| 1481 |
logger.info(f"Avg latency: {avg_latency:.0f}ms")
|
|
@@ -1727,6 +1754,8 @@ class HeadroomProxy:
|
|
| 1727 |
|
| 1728 |
# Apply optimization
|
| 1729 |
transforms_applied = []
|
|
|
|
|
|
|
| 1730 |
optimized_messages = messages
|
| 1731 |
optimized_tokens = original_tokens
|
| 1732 |
|
|
@@ -1745,13 +1774,16 @@ class HeadroomProxy:
|
|
| 1745 |
if result.messages != messages:
|
| 1746 |
optimized_messages = result.messages
|
| 1747 |
transforms_applied = result.transforms_applied
|
|
|
|
| 1748 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 1749 |
original_tokens = result.tokens_before
|
| 1750 |
optimized_tokens = result.tokens_after
|
|
|
|
|
|
|
| 1751 |
except Exception as e:
|
| 1752 |
logger.warning(f"Optimization failed: {e}")
|
| 1753 |
|
| 1754 |
-
tokens_saved = original_tokens - optimized_tokens
|
| 1755 |
optimization_latency = (time.time() - start_time) * 1000
|
| 1756 |
|
| 1757 |
# Hook: post_compress — let hooks observe compression results
|
|
@@ -1933,6 +1965,7 @@ class HeadroomProxy:
|
|
| 1933 |
transforms_applied,
|
| 1934 |
tags,
|
| 1935 |
optimization_latency,
|
|
|
|
| 1936 |
)
|
| 1937 |
else:
|
| 1938 |
backend_response = await self.anthropic_backend.send_message(body, headers)
|
|
@@ -1957,22 +1990,11 @@ class HeadroomProxy:
|
|
| 1957 |
latency_ms=total_latency,
|
| 1958 |
cached=False,
|
| 1959 |
overhead_ms=optimization_latency,
|
|
|
|
| 1960 |
)
|
| 1961 |
|
| 1962 |
-
cost_usd = None
|
| 1963 |
-
savings_usd = None
|
| 1964 |
if self.cost_tracker:
|
| 1965 |
-
|
| 1966 |
-
model, optimized_tokens, output_tokens
|
| 1967 |
-
)
|
| 1968 |
-
original_cost = self.cost_tracker.estimate_cost(
|
| 1969 |
-
model, original_tokens, output_tokens
|
| 1970 |
-
)
|
| 1971 |
-
if cost_usd:
|
| 1972 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 1973 |
-
if cost_usd and original_cost:
|
| 1974 |
-
savings_usd = original_cost - cost_usd
|
| 1975 |
-
self.cost_tracker.record_savings(savings_usd)
|
| 1976 |
|
| 1977 |
# Log request
|
| 1978 |
if self.logger:
|
|
@@ -1989,8 +2011,6 @@ class HeadroomProxy:
|
|
| 1989 |
savings_percent=(tokens_saved / original_tokens * 100)
|
| 1990 |
if original_tokens > 0
|
| 1991 |
else 0,
|
| 1992 |
-
estimated_cost_usd=cost_usd,
|
| 1993 |
-
estimated_savings_usd=savings_usd,
|
| 1994 |
optimization_latency_ms=optimization_latency,
|
| 1995 |
total_latency_ms=total_latency,
|
| 1996 |
tags=tags,
|
|
@@ -2035,6 +2055,7 @@ class HeadroomProxy:
|
|
| 2035 |
tags,
|
| 2036 |
optimization_latency,
|
| 2037 |
memory_user_id=memory_user_id,
|
|
|
|
| 2038 |
)
|
| 2039 |
else:
|
| 2040 |
response = await self._retry_request("POST", url, headers, body)
|
|
@@ -2200,45 +2221,14 @@ class HeadroomProxy:
|
|
| 2200 |
|
| 2201 |
total_latency = (time.time() - start_time) * 1000
|
| 2202 |
|
| 2203 |
-
# Parse response for
|
| 2204 |
-
actual_input_tokens = optimized_tokens # fallback
|
| 2205 |
output_tokens = 0
|
| 2206 |
-
cache_read_tokens = 0
|
| 2207 |
-
cache_write_tokens = 0
|
| 2208 |
if resp_json:
|
| 2209 |
usage = resp_json.get("usage", {})
|
| 2210 |
-
actual_input_tokens = usage.get("input_tokens", optimized_tokens)
|
| 2211 |
output_tokens = usage.get("output_tokens", 0)
|
| 2212 |
-
|
| 2213 |
-
# These are charged at 10% of the input price
|
| 2214 |
-
cache_read_tokens = usage.get("cache_read_input_tokens", 0)
|
| 2215 |
-
# Anthropic returns cache_creation_input_tokens for tokens written to cache
|
| 2216 |
-
# These are charged at 125% of the input price
|
| 2217 |
-
cache_write_tokens = usage.get("cache_creation_input_tokens", 0)
|
| 2218 |
-
|
| 2219 |
-
# Calculate cost using actual API tokens with proper cache pricing
|
| 2220 |
-
cost_usd = None
|
| 2221 |
-
savings_usd = None
|
| 2222 |
if self.cost_tracker:
|
| 2223 |
-
|
| 2224 |
-
model,
|
| 2225 |
-
actual_input_tokens,
|
| 2226 |
-
output_tokens,
|
| 2227 |
-
cache_read_tokens=cache_read_tokens,
|
| 2228 |
-
cache_write_tokens=cache_write_tokens,
|
| 2229 |
-
)
|
| 2230 |
-
# original_cost: what it would have cost without compression
|
| 2231 |
-
# Use only original_tokens at regular input rate — no cache params,
|
| 2232 |
-
# since caching is orthogonal to compression savings
|
| 2233 |
-
original_cost = self.cost_tracker.estimate_cost(
|
| 2234 |
-
model,
|
| 2235 |
-
original_tokens,
|
| 2236 |
-
output_tokens,
|
| 2237 |
-
)
|
| 2238 |
-
if cost_usd and original_cost:
|
| 2239 |
-
savings_usd = original_cost - cost_usd
|
| 2240 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 2241 |
-
self.cost_tracker.record_savings(savings_usd)
|
| 2242 |
|
| 2243 |
# Cache response
|
| 2244 |
if self.cache and response.status_code == 200:
|
|
@@ -2250,17 +2240,18 @@ class HeadroomProxy:
|
|
| 2250 |
tokens_saved=tokens_saved,
|
| 2251 |
)
|
| 2252 |
|
| 2253 |
-
# Record metrics
|
|
|
|
| 2254 |
await self.metrics.record_request(
|
| 2255 |
provider="anthropic",
|
| 2256 |
model=model,
|
| 2257 |
-
input_tokens=
|
| 2258 |
output_tokens=output_tokens,
|
| 2259 |
tokens_saved=tokens_saved,
|
| 2260 |
latency_ms=total_latency,
|
| 2261 |
-
cost_usd=cost_usd or 0,
|
| 2262 |
-
savings_usd=savings_usd or 0,
|
| 2263 |
overhead_ms=optimization_latency,
|
|
|
|
|
|
|
| 2264 |
)
|
| 2265 |
|
| 2266 |
# Log request
|
|
@@ -2278,13 +2269,12 @@ class HeadroomProxy:
|
|
| 2278 |
savings_percent=(tokens_saved / original_tokens * 100)
|
| 2279 |
if original_tokens > 0
|
| 2280 |
else 0,
|
| 2281 |
-
estimated_cost_usd=cost_usd,
|
| 2282 |
-
estimated_savings_usd=savings_usd,
|
| 2283 |
optimization_latency_ms=optimization_latency,
|
| 2284 |
total_latency_ms=total_latency,
|
| 2285 |
tags=tags,
|
| 2286 |
cache_hit=cache_hit,
|
| 2287 |
transforms_applied=transforms_applied,
|
|
|
|
| 2288 |
request_messages=messages if self.config.log_full_messages else None,
|
| 2289 |
)
|
| 2290 |
)
|
|
@@ -2295,6 +2285,11 @@ class HeadroomProxy:
|
|
| 2295 |
cr = resp_usage.get("cache_read_input_tokens", 0)
|
| 2296 |
cw = resp_usage.get("cache_creation_input_tokens", 0)
|
| 2297 |
chp = round(cr / (cr + cw) * 100) if (cr + cw) > 0 else 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2298 |
logger.info(
|
| 2299 |
f"[{request_id}] PERF "
|
| 2300 |
f"model={model} msgs={num_msgs} "
|
|
@@ -2303,6 +2298,7 @@ class HeadroomProxy:
|
|
| 2303 |
f"cache_read={cr} cache_write={cw} cache_hit_pct={chp} "
|
| 2304 |
f"opt_ms={optimization_latency:.0f} "
|
| 2305 |
f"transforms={_summarize_transforms(transforms_applied)}"
|
|
|
|
| 2306 |
)
|
| 2307 |
|
| 2308 |
# Remove compression headers since httpx already decompressed the response
|
|
@@ -2427,6 +2423,7 @@ class HeadroomProxy:
|
|
| 2427 |
total_optimized_tokens = 0
|
| 2428 |
total_tokens_saved = 0
|
| 2429 |
compressed_requests = []
|
|
|
|
| 2430 |
|
| 2431 |
# Apply compression to each request in the batch
|
| 2432 |
for batch_req in requests_list:
|
|
@@ -2450,12 +2447,13 @@ class HeadroomProxy:
|
|
| 2450 |
)
|
| 2451 |
|
| 2452 |
optimized_messages = result.messages
|
|
|
|
| 2453 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 2454 |
original_tokens = result.tokens_before
|
| 2455 |
optimized_tokens = result.tokens_after
|
| 2456 |
total_original_tokens += original_tokens
|
| 2457 |
total_optimized_tokens += optimized_tokens
|
| 2458 |
-
tokens_saved = original_tokens - optimized_tokens
|
| 2459 |
total_tokens_saved += tokens_saved
|
| 2460 |
|
| 2461 |
# CCR Tool Injection: Inject retrieval tool if compression occurred
|
|
@@ -2519,6 +2517,8 @@ class HeadroomProxy:
|
|
| 2519 |
output_tokens=0,
|
| 2520 |
tokens_saved=total_tokens_saved,
|
| 2521 |
latency_ms=optimization_latency,
|
|
|
|
|
|
|
| 2522 |
)
|
| 2523 |
|
| 2524 |
# Log compression stats
|
|
@@ -2857,6 +2857,7 @@ class HeadroomProxy:
|
|
| 2857 |
total_optimized_tokens = 0
|
| 2858 |
total_tokens_saved = 0
|
| 2859 |
compressed_requests = []
|
|
|
|
| 2860 |
|
| 2861 |
# Apply compression to each request in the batch
|
| 2862 |
for idx, batch_req in enumerate(requests_list):
|
|
@@ -2897,12 +2898,13 @@ class HeadroomProxy:
|
|
| 2897 |
)
|
| 2898 |
|
| 2899 |
optimized_messages = result.messages
|
|
|
|
| 2900 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 2901 |
original_tokens = result.tokens_before
|
| 2902 |
optimized_tokens = result.tokens_after
|
| 2903 |
total_original_tokens += original_tokens
|
| 2904 |
total_optimized_tokens += optimized_tokens
|
| 2905 |
-
tokens_saved = original_tokens - optimized_tokens
|
| 2906 |
total_tokens_saved += tokens_saved
|
| 2907 |
|
| 2908 |
# CCR Tool Injection: Inject retrieval tool if compression occurred
|
|
@@ -2993,6 +2995,8 @@ class HeadroomProxy:
|
|
| 2993 |
output_tokens=0,
|
| 2994 |
tokens_saved=total_tokens_saved,
|
| 2995 |
latency_ms=optimization_latency,
|
|
|
|
|
|
|
| 2996 |
)
|
| 2997 |
|
| 2998 |
# Log compression stats
|
|
@@ -3697,6 +3701,7 @@ class HeadroomProxy:
|
|
| 3697 |
tags: dict[str, str],
|
| 3698 |
optimization_latency: float,
|
| 3699 |
memory_user_id: str | None = None,
|
|
|
|
| 3700 |
) -> StreamingResponse:
|
| 3701 |
"""Stream response with metrics tracking and memory tool handling.
|
| 3702 |
|
|
@@ -3719,6 +3724,7 @@ class HeadroomProxy:
|
|
| 3719 |
"cache_creation_input_tokens": 0,
|
| 3720 |
"total_bytes": 0,
|
| 3721 |
"sse_buffer": "", # Buffer for incomplete SSE events
|
|
|
|
| 3722 |
}
|
| 3723 |
|
| 3724 |
# Track if we need to handle memory tools
|
|
@@ -3740,6 +3746,10 @@ class HeadroomProxy:
|
|
| 3740 |
"POST", url, json=body, headers=headers
|
| 3741 |
) as response:
|
| 3742 |
async for chunk in response.aiter_bytes():
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3743 |
stream_state["total_bytes"] += len(chunk)
|
| 3744 |
|
| 3745 |
# Buffer SSE data to handle chunks split across calls
|
|
@@ -3901,12 +3911,9 @@ class HeadroomProxy:
|
|
| 3901 |
f"[{request_id}] No usage in stream, estimated {output_tokens} output tokens"
|
| 3902 |
)
|
| 3903 |
|
| 3904 |
-
# Use
|
| 3905 |
-
#
|
| 3906 |
-
|
| 3907 |
-
total_input_tokens = (
|
| 3908 |
-
api_input_tokens if api_input_tokens is not None else optimized_tokens
|
| 3909 |
-
)
|
| 3910 |
cache_read_tokens = stream_state["cache_read_input_tokens"]
|
| 3911 |
cache_write_tokens = stream_state["cache_creation_input_tokens"]
|
| 3912 |
|
|
@@ -3928,54 +3935,19 @@ class HeadroomProxy:
|
|
| 3928 |
f"transforms={_summarize_transforms(transforms_applied)}"
|
| 3929 |
)
|
| 3930 |
|
| 3931 |
-
# Normalize input tokens based on provider semantics:
|
| 3932 |
-
# - Anthropic: input_tokens excludes cache_read (it's separate), pass as-is
|
| 3933 |
-
# - OpenAI/Gemini: input_tokens includes cache_read (it's a subset), subtract it
|
| 3934 |
-
if provider == "anthropic":
|
| 3935 |
-
# Anthropic's input_tokens = non-cached tokens sent (excludes cache_read)
|
| 3936 |
-
non_cached_input = total_input_tokens
|
| 3937 |
-
else:
|
| 3938 |
-
# OpenAI/Gemini's input_tokens = total (includes cache_read)
|
| 3939 |
-
non_cached_input = total_input_tokens - cache_read_tokens
|
| 3940 |
-
|
| 3941 |
-
# Calculate cost using actual API tokens with proper cache pricing
|
| 3942 |
-
cost_usd = None
|
| 3943 |
-
savings_usd = None
|
| 3944 |
if self.cost_tracker:
|
| 3945 |
-
|
| 3946 |
-
model,
|
| 3947 |
-
non_cached_input,
|
| 3948 |
-
output_tokens,
|
| 3949 |
-
cache_read_tokens=cache_read_tokens,
|
| 3950 |
-
cache_write_tokens=cache_write_tokens,
|
| 3951 |
-
)
|
| 3952 |
-
# For savings calculation, compare compression benefit using base token rates only
|
| 3953 |
-
# (cache effects are Anthropic's feature, not Headroom's compression benefit)
|
| 3954 |
-
compressed_base_cost = self.cost_tracker.estimate_cost(
|
| 3955 |
-
model,
|
| 3956 |
-
non_cached_input,
|
| 3957 |
-
output_tokens,
|
| 3958 |
-
)
|
| 3959 |
-
original_base_cost = self.cost_tracker.estimate_cost(
|
| 3960 |
-
model,
|
| 3961 |
-
original_tokens,
|
| 3962 |
-
output_tokens,
|
| 3963 |
-
)
|
| 3964 |
-
if cost_usd:
|
| 3965 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 3966 |
-
if compressed_base_cost and original_base_cost:
|
| 3967 |
-
savings_usd = original_base_cost - compressed_base_cost
|
| 3968 |
-
self.cost_tracker.record_savings(max(0, savings_usd))
|
| 3969 |
|
| 3970 |
await self.metrics.record_request(
|
| 3971 |
provider=provider,
|
| 3972 |
model=model,
|
| 3973 |
-
input_tokens=
|
| 3974 |
output_tokens=output_tokens,
|
| 3975 |
tokens_saved=tokens_saved,
|
| 3976 |
latency_ms=total_latency,
|
| 3977 |
-
|
| 3978 |
-
|
|
|
|
| 3979 |
)
|
| 3980 |
|
| 3981 |
return StreamingResponse(
|
|
@@ -3996,6 +3968,7 @@ class HeadroomProxy:
|
|
| 3996 |
transforms_applied: list[str],
|
| 3997 |
tags: dict[str, str],
|
| 3998 |
optimization_latency: float,
|
|
|
|
| 3999 |
) -> StreamingResponse:
|
| 4000 |
"""Stream response from Bedrock backend with metrics tracking.
|
| 4001 |
|
|
@@ -4007,6 +3980,7 @@ class HeadroomProxy:
|
|
| 4007 |
stream_state: dict[str, Any] = {
|
| 4008 |
"input_tokens": 0,
|
| 4009 |
"output_tokens": 0,
|
|
|
|
| 4010 |
}
|
| 4011 |
|
| 4012 |
async def generate():
|
|
@@ -4014,6 +3988,10 @@ class HeadroomProxy:
|
|
| 4014 |
assert self.anthropic_backend is not None
|
| 4015 |
|
| 4016 |
async for event in self.anthropic_backend.stream_message(body, headers):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4017 |
# Format as SSE
|
| 4018 |
if event.raw_sse:
|
| 4019 |
yield event.raw_sse.encode()
|
|
@@ -4060,22 +4038,12 @@ class HeadroomProxy:
|
|
| 4060 |
latency_ms=total_latency,
|
| 4061 |
cached=False,
|
| 4062 |
overhead_ms=optimization_latency,
|
|
|
|
|
|
|
| 4063 |
)
|
| 4064 |
|
| 4065 |
-
cost_usd = None
|
| 4066 |
-
savings_usd = None
|
| 4067 |
if self.cost_tracker:
|
| 4068 |
-
|
| 4069 |
-
model, optimized_tokens, output_tokens
|
| 4070 |
-
)
|
| 4071 |
-
original_cost = self.cost_tracker.estimate_cost(
|
| 4072 |
-
model, original_tokens, output_tokens
|
| 4073 |
-
)
|
| 4074 |
-
if cost_usd:
|
| 4075 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 4076 |
-
if cost_usd and original_cost:
|
| 4077 |
-
savings_usd = original_cost - cost_usd
|
| 4078 |
-
self.cost_tracker.record_savings(savings_usd)
|
| 4079 |
|
| 4080 |
# Log request
|
| 4081 |
if self.logger:
|
|
@@ -4092,8 +4060,6 @@ class HeadroomProxy:
|
|
| 4092 |
savings_percent=(tokens_saved / original_tokens * 100)
|
| 4093 |
if original_tokens > 0
|
| 4094 |
else 0,
|
| 4095 |
-
estimated_cost_usd=cost_usd,
|
| 4096 |
-
estimated_savings_usd=savings_usd,
|
| 4097 |
optimization_latency_ms=optimization_latency,
|
| 4098 |
total_latency_ms=total_latency,
|
| 4099 |
tags=tags,
|
|
@@ -4230,6 +4196,8 @@ class HeadroomProxy:
|
|
| 4230 |
|
| 4231 |
# Optimization
|
| 4232 |
transforms_applied = []
|
|
|
|
|
|
|
| 4233 |
optimized_messages = messages
|
| 4234 |
optimized_tokens = original_tokens
|
| 4235 |
|
|
@@ -4245,12 +4213,15 @@ class HeadroomProxy:
|
|
| 4245 |
if result.messages != messages:
|
| 4246 |
optimized_messages = result.messages
|
| 4247 |
transforms_applied = result.transforms_applied
|
|
|
|
| 4248 |
original_tokens = result.tokens_before
|
| 4249 |
optimized_tokens = result.tokens_after
|
|
|
|
|
|
|
| 4250 |
except Exception as e:
|
| 4251 |
logger.warning(f"Optimization failed: {e}")
|
| 4252 |
|
| 4253 |
-
tokens_saved = original_tokens - optimized_tokens
|
| 4254 |
optimization_latency = (time.time() - start_time) * 1000
|
| 4255 |
|
| 4256 |
# Hook: post_compress
|
|
@@ -4335,6 +4306,7 @@ class HeadroomProxy:
|
|
| 4335 |
latency_ms=total_latency,
|
| 4336 |
cached=False,
|
| 4337 |
overhead_ms=optimization_latency,
|
|
|
|
| 4338 |
)
|
| 4339 |
|
| 4340 |
if tokens_saved > 0:
|
|
@@ -4385,6 +4357,7 @@ class HeadroomProxy:
|
|
| 4385 |
transforms_applied,
|
| 4386 |
tags,
|
| 4387 |
optimization_latency,
|
|
|
|
| 4388 |
)
|
| 4389 |
else:
|
| 4390 |
response = await self._retry_request("POST", url, headers, body)
|
|
@@ -4407,30 +4380,8 @@ class HeadroomProxy:
|
|
| 4407 |
f"[{request_id}] Failed to extract cached tokens from OpenAI response: {e}"
|
| 4408 |
)
|
| 4409 |
|
| 4410 |
-
# For OpenAI, prompt_tokens is TOTAL (includes cached)
|
| 4411 |
-
# Normalize to non-cached input for consistent cost calculation
|
| 4412 |
-
non_cached_input = total_input_tokens - cache_read_tokens
|
| 4413 |
-
|
| 4414 |
-
# Cost tracking using actual API tokens
|
| 4415 |
-
cost_usd = savings_usd = None
|
| 4416 |
if self.cost_tracker:
|
| 4417 |
-
|
| 4418 |
-
model,
|
| 4419 |
-
non_cached_input, # Pass non-cached portion
|
| 4420 |
-
output_tokens,
|
| 4421 |
-
cache_read_tokens=cache_read_tokens,
|
| 4422 |
-
)
|
| 4423 |
-
# original_cost: what it would have cost without compression
|
| 4424 |
-
# No cache params — caching is orthogonal to compression savings
|
| 4425 |
-
original_cost = self.cost_tracker.estimate_cost(
|
| 4426 |
-
model,
|
| 4427 |
-
original_tokens,
|
| 4428 |
-
output_tokens,
|
| 4429 |
-
)
|
| 4430 |
-
if cost_usd and original_cost:
|
| 4431 |
-
savings_usd = original_cost - cost_usd
|
| 4432 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 4433 |
-
self.cost_tracker.record_savings(savings_usd)
|
| 4434 |
|
| 4435 |
# Cache
|
| 4436 |
if self.cache and response.status_code == 200:
|
|
@@ -4438,7 +4389,6 @@ class HeadroomProxy:
|
|
| 4438 |
messages, model, response.content, dict(response.headers), tokens_saved
|
| 4439 |
)
|
| 4440 |
|
| 4441 |
-
# Metrics with actual API tokens (total, for accurate tracking)
|
| 4442 |
await self.metrics.record_request(
|
| 4443 |
provider="openai",
|
| 4444 |
model=model,
|
|
@@ -4446,8 +4396,9 @@ class HeadroomProxy:
|
|
| 4446 |
output_tokens=output_tokens,
|
| 4447 |
tokens_saved=tokens_saved,
|
| 4448 |
latency_ms=total_latency,
|
| 4449 |
-
|
| 4450 |
-
|
|
|
|
| 4451 |
)
|
| 4452 |
|
| 4453 |
if tokens_saved > 0:
|
|
@@ -4543,8 +4494,6 @@ class HeadroomProxy:
|
|
| 4543 |
tokens_saved=0,
|
| 4544 |
latency_ms=latency_ms,
|
| 4545 |
cached=False,
|
| 4546 |
-
cost_usd=0,
|
| 4547 |
-
savings_usd=0,
|
| 4548 |
)
|
| 4549 |
|
| 4550 |
return Response(
|
|
@@ -5191,39 +5140,19 @@ class HeadroomProxy:
|
|
| 5191 |
|
| 5192 |
total_input_tokens = original_tokens # fallback
|
| 5193 |
output_tokens = 0
|
| 5194 |
-
cache_read_tokens = 0
|
| 5195 |
try:
|
| 5196 |
resp_json = response.json()
|
| 5197 |
usage = resp_json.get("usage", {})
|
| 5198 |
total_input_tokens = usage.get("input_tokens", original_tokens)
|
| 5199 |
output_tokens = usage.get("output_tokens", 0)
|
| 5200 |
-
# OpenAI returns cached_tokens in prompt_tokens_details (or input_tokens_details)
|
| 5201 |
-
prompt_details = usage.get(
|
| 5202 |
-
"prompt_tokens_details", usage.get("input_tokens_details", {})
|
| 5203 |
-
)
|
| 5204 |
-
cache_read_tokens = prompt_details.get("cached_tokens", 0)
|
| 5205 |
except (KeyError, TypeError, AttributeError) as e:
|
| 5206 |
logger.debug(
|
| 5207 |
f"[{request_id}] Failed to extract cached tokens from OpenAI passthrough response: {e}"
|
| 5208 |
)
|
| 5209 |
|
| 5210 |
-
# For OpenAI, input_tokens is TOTAL (includes cached)
|
| 5211 |
-
# Normalize to non-cached input for consistent cost calculation
|
| 5212 |
-
non_cached_input = total_input_tokens - cache_read_tokens
|
| 5213 |
-
|
| 5214 |
-
# Cost tracking using actual API tokens
|
| 5215 |
-
cost_usd = savings_usd = None
|
| 5216 |
if self.cost_tracker:
|
| 5217 |
-
|
| 5218 |
-
model,
|
| 5219 |
-
non_cached_input, # Pass non-cached portion
|
| 5220 |
-
output_tokens,
|
| 5221 |
-
cache_read_tokens=cache_read_tokens,
|
| 5222 |
-
)
|
| 5223 |
-
if cost_usd:
|
| 5224 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 5225 |
|
| 5226 |
-
# Metrics with actual API tokens (total, for accurate tracking)
|
| 5227 |
await self.metrics.record_request(
|
| 5228 |
provider="openai",
|
| 5229 |
model=model,
|
|
@@ -5231,8 +5160,7 @@ class HeadroomProxy:
|
|
| 5231 |
output_tokens=output_tokens,
|
| 5232 |
tokens_saved=tokens_saved,
|
| 5233 |
latency_ms=total_latency,
|
| 5234 |
-
|
| 5235 |
-
savings_usd=savings_usd or 0,
|
| 5236 |
)
|
| 5237 |
|
| 5238 |
logger.info(f"[{request_id}] /v1/responses {model}: {total_input_tokens:,} tokens")
|
|
@@ -5378,6 +5306,7 @@ class HeadroomProxy:
|
|
| 5378 |
|
| 5379 |
# Optimization
|
| 5380 |
transforms_applied: list[str] = []
|
|
|
|
| 5381 |
optimized_messages = messages
|
| 5382 |
optimized_tokens = original_tokens
|
| 5383 |
|
|
@@ -5396,10 +5325,12 @@ class HeadroomProxy:
|
|
| 5396 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 5397 |
original_tokens = result.tokens_before
|
| 5398 |
optimized_tokens = result.tokens_after
|
|
|
|
|
|
|
| 5399 |
except Exception as e:
|
| 5400 |
logger.warning(f"[{request_id}] Gemini optimization failed: {e}")
|
| 5401 |
|
| 5402 |
-
tokens_saved = original_tokens - optimized_tokens
|
| 5403 |
optimization_latency = (time.time() - start_time) * 1000
|
| 5404 |
|
| 5405 |
# Query Echo: re-inject user's question after compressed tool outputs
|
|
@@ -5481,32 +5412,9 @@ class HeadroomProxy:
|
|
| 5481 |
f"[{request_id}] Failed to extract cached tokens from Gemini response: {e}"
|
| 5482 |
)
|
| 5483 |
|
| 5484 |
-
# For Gemini, promptTokenCount is TOTAL (includes cached)
|
| 5485 |
-
# Normalize to non-cached input for consistent cost calculation
|
| 5486 |
-
non_cached_input = total_input_tokens - cache_read_tokens
|
| 5487 |
-
|
| 5488 |
-
# Cost tracking using actual API tokens
|
| 5489 |
-
cost_usd = savings_usd = None
|
| 5490 |
if self.cost_tracker:
|
| 5491 |
-
|
| 5492 |
-
model,
|
| 5493 |
-
non_cached_input, # Pass non-cached portion
|
| 5494 |
-
output_tokens,
|
| 5495 |
-
cache_read_tokens=cache_read_tokens,
|
| 5496 |
-
)
|
| 5497 |
-
# original_cost: what it would have cost without compression
|
| 5498 |
-
# No cache params — caching is orthogonal to compression savings
|
| 5499 |
-
original_cost = self.cost_tracker.estimate_cost(
|
| 5500 |
-
model,
|
| 5501 |
-
original_tokens,
|
| 5502 |
-
output_tokens,
|
| 5503 |
-
)
|
| 5504 |
-
if cost_usd and original_cost:
|
| 5505 |
-
savings_usd = original_cost - cost_usd
|
| 5506 |
-
self.cost_tracker.record_cost(cost_usd)
|
| 5507 |
-
self.cost_tracker.record_savings(savings_usd)
|
| 5508 |
|
| 5509 |
-
# Metrics with actual API tokens (total, for accurate tracking)
|
| 5510 |
await self.metrics.record_request(
|
| 5511 |
provider="gemini",
|
| 5512 |
model=model,
|
|
@@ -5514,8 +5422,8 @@ class HeadroomProxy:
|
|
| 5514 |
output_tokens=output_tokens,
|
| 5515 |
tokens_saved=tokens_saved,
|
| 5516 |
latency_ms=total_latency,
|
| 5517 |
-
|
| 5518 |
-
|
| 5519 |
)
|
| 5520 |
|
| 5521 |
if tokens_saved > 0:
|
|
@@ -5745,7 +5653,7 @@ class HeadroomProxy:
|
|
| 5745 |
logger.debug(f"[{request_id}] Failed to parse Gemini token count response: {e}")
|
| 5746 |
|
| 5747 |
# Track stats
|
| 5748 |
-
tokens_saved = original_tokens - compressed_tokens if compressed_tokens > 0 else 0
|
| 5749 |
|
| 5750 |
await self.metrics.record_request(
|
| 5751 |
provider="gemini",
|
|
@@ -5754,8 +5662,6 @@ class HeadroomProxy:
|
|
| 5754 |
output_tokens=0,
|
| 5755 |
tokens_saved=tokens_saved,
|
| 5756 |
latency_ms=total_latency,
|
| 5757 |
-
cost_usd=0,
|
| 5758 |
-
savings_usd=0,
|
| 5759 |
)
|
| 5760 |
|
| 5761 |
if tokens_saved > 0:
|
|
@@ -5937,16 +5843,23 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
|
|
| 5937 |
)
|
| 5938 |
max_latency_ms = round(m.latency_max_ms, 2) if m.latency_count > 0 else 0
|
| 5939 |
|
| 5940 |
-
# Calculate Headroom overhead (optimization time only)
|
| 5941 |
avg_overhead_ms = (
|
| 5942 |
-
round(m.overhead_sum_ms / m.
|
| 5943 |
)
|
| 5944 |
min_overhead_ms = (
|
| 5945 |
round(m.overhead_min_ms, 2)
|
| 5946 |
-
if m.
|
| 5947 |
else 0
|
| 5948 |
)
|
| 5949 |
-
max_overhead_ms = round(m.overhead_max_ms, 2) if m.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5950 |
|
| 5951 |
# Get compression store stats
|
| 5952 |
store = get_compression_store()
|
|
@@ -5995,6 +5908,25 @@ def create_app(config: ProxyConfig | None = None) -> FastAPI:
|
|
| 5995 |
"min_ms": min_overhead_ms,
|
| 5996 |
"max_ms": max_overhead_ms,
|
| 5997 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5998 |
"cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
|
| 5999 |
"compression": {
|
| 6000 |
"ccr_entries": compression_stats.get("entry_count", 0),
|
|
|
|
| 216 |
tokens_saved: int
|
| 217 |
savings_percent: float
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
# Performance
|
| 220 |
optimization_latency_ms: float
|
| 221 |
total_latency_ms: float | None
|
|
|
|
| 225 |
cache_hit: bool
|
| 226 |
transforms_applied: list[str]
|
| 227 |
|
| 228 |
+
# Waste signals detected in original messages
|
| 229 |
+
waste_signals: dict[str, int] | None = None
|
| 230 |
+
|
| 231 |
# Request/Response (optional, for debugging)
|
| 232 |
request_messages: list[dict] | None = None
|
| 233 |
response_content: str | None = None
|
|
|
|
| 600 |
|
| 601 |
# Cost tracking - using deque for efficient left-side removal
|
| 602 |
self._costs: deque[tuple[datetime, float]] = deque(maxlen=self.MAX_COST_ENTRIES)
|
|
|
|
|
|
|
| 603 |
self._last_prune_time: datetime = datetime.now()
|
| 604 |
|
| 605 |
+
# Token savings per model (exact, no dollar estimation)
|
| 606 |
+
self._tokens_saved_by_model: dict[str, int] = {}
|
| 607 |
+
self._tokens_sent_by_model: dict[str, int] = {}
|
| 608 |
+
self._requests_by_model: dict[str, int] = {}
|
| 609 |
+
|
| 610 |
# Cache resolved model names to avoid repeated litellm lookups.
|
| 611 |
# This is critical: litellm.cost_per_token() is synchronous and can block
|
| 612 |
# the async event loop if it triggers I/O (lazy model info download).
|
|
|
|
| 669 |
) -> float | None:
|
| 670 |
"""Estimate cost in USD using LiteLLM's pricing database.
|
| 671 |
|
| 672 |
+
LiteLLM natively handles cache_read and cache_creation pricing
|
| 673 |
+
for all providers (Anthropic, OpenAI, Google, etc.) in a single call.
|
| 674 |
+
|
| 675 |
Args:
|
| 676 |
model: Model name for pricing lookup
|
| 677 |
+
input_tokens: Non-cached input tokens (excludes cache_read)
|
| 678 |
output_tokens: Output tokens
|
| 679 |
+
cache_read_tokens: Tokens served from cache (~10% of input rate)
|
| 680 |
+
cache_write_tokens: Tokens written to cache (~125% of input rate)
|
| 681 |
"""
|
| 682 |
if not LITELLM_AVAILABLE:
|
| 683 |
logger.warning("LiteLLM not available - cannot calculate costs")
|
| 684 |
return None
|
| 685 |
|
| 686 |
try:
|
|
|
|
| 687 |
resolved_model = self._resolve_litellm_model(model)
|
| 688 |
|
| 689 |
+
# litellm.cost_per_token handles all token types natively:
|
| 690 |
+
# prompt_tokens at input rate, cache_read at ~10%, cache_creation at ~125%
|
| 691 |
+
input_cost, output_cost = litellm.cost_per_token(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 692 |
model=resolved_model,
|
| 693 |
+
prompt_tokens=input_tokens,
|
| 694 |
completion_tokens=output_tokens,
|
| 695 |
+
cache_read_input_tokens=cache_read_tokens,
|
| 696 |
+
cache_creation_input_tokens=cache_write_tokens,
|
| 697 |
)
|
| 698 |
|
| 699 |
+
total_cost = input_cost + output_cost
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
return float(total_cost) if total_cost > 0 else None
|
| 701 |
|
| 702 |
except Exception as e:
|
|
|
|
| 722 |
while self._costs and self._costs[0][0] < cutoff:
|
| 723 |
self._costs.popleft()
|
| 724 |
|
| 725 |
+
def record_tokens(self, model: str, tokens_saved: int, tokens_sent: int):
|
| 726 |
+
"""Record token counts per model. This is exact — no estimation."""
|
| 727 |
+
self._tokens_saved_by_model[model] = (
|
| 728 |
+
self._tokens_saved_by_model.get(model, 0) + tokens_saved
|
| 729 |
+
)
|
| 730 |
+
self._tokens_sent_by_model[model] = self._tokens_sent_by_model.get(model, 0) + tokens_sent
|
| 731 |
+
self._requests_by_model[model] = self._requests_by_model.get(model, 0) + 1
|
|
|
|
|
|
|
|
|
|
| 732 |
|
| 733 |
def get_period_cost(self) -> float:
|
| 734 |
"""Get cost for current budget period."""
|
|
|
|
| 752 |
remaining = self.budget_limit_usd - period_cost
|
| 753 |
return remaining > 0, max(0, remaining)
|
| 754 |
|
| 755 |
+
def _get_list_price(self, model: str) -> float | None:
|
| 756 |
+
"""Get list input price per 1M tokens for a model."""
|
| 757 |
+
if not LITELLM_AVAILABLE:
|
| 758 |
+
return None
|
| 759 |
+
try:
|
| 760 |
+
resolved = self._resolve_litellm_model(model)
|
| 761 |
+
info = litellm.model_cost.get(resolved, {})
|
| 762 |
+
cost_per_token = info.get("input_cost_per_token")
|
| 763 |
+
return cost_per_token * 1_000_000 if cost_per_token else None
|
| 764 |
+
except Exception:
|
| 765 |
+
return None
|
| 766 |
+
|
| 767 |
def stats(self) -> dict:
|
| 768 |
+
"""Get token statistics per model."""
|
| 769 |
+
per_model = {}
|
| 770 |
+
total_saved = 0
|
| 771 |
+
for model in sorted(self._tokens_saved_by_model.keys()):
|
| 772 |
+
saved = self._tokens_saved_by_model[model]
|
| 773 |
+
sent = self._tokens_sent_by_model.get(model, 0)
|
| 774 |
+
reqs = self._requests_by_model.get(model, 0)
|
| 775 |
+
total_saved += saved
|
| 776 |
+
per_model[model] = {
|
| 777 |
+
"requests": reqs,
|
| 778 |
+
"tokens_saved": saved,
|
| 779 |
+
"tokens_sent": sent,
|
| 780 |
+
"reduction_pct": round(saved / (saved + sent) * 100, 1)
|
| 781 |
+
if (saved + sent) > 0
|
| 782 |
+
else 0,
|
| 783 |
+
}
|
| 784 |
+
|
| 785 |
+
# Compute counterfactual: what would you have paid without Headroom?
|
| 786 |
+
# Note: uses input token pricing only. Output tokens and cache pricing
|
| 787 |
+
# are excluded since Headroom only compresses input tokens.
|
| 788 |
+
cost_with_headroom = 0.0
|
| 789 |
+
cost_without_headroom = 0.0
|
| 790 |
+
for model in self._tokens_saved_by_model:
|
| 791 |
+
saved = self._tokens_saved_by_model[model]
|
| 792 |
+
sent = self._tokens_sent_by_model.get(model, 0)
|
| 793 |
+
price_per_1m = self._get_list_price(model)
|
| 794 |
+
if price_per_1m:
|
| 795 |
+
cost_with_headroom += (sent / 1_000_000) * price_per_1m
|
| 796 |
+
cost_without_headroom += ((saved + sent) / 1_000_000) * price_per_1m
|
| 797 |
+
|
| 798 |
return {
|
| 799 |
+
"total_tokens_saved": total_saved,
|
| 800 |
+
"per_model": per_model,
|
| 801 |
+
"cost_with_headroom_usd": round(cost_with_headroom, 4),
|
| 802 |
+
"cost_without_headroom_usd": round(cost_without_headroom, 4),
|
| 803 |
+
"savings_usd": round(cost_without_headroom - cost_with_headroom, 4),
|
|
|
|
|
|
|
|
|
|
| 804 |
}
|
| 805 |
|
| 806 |
|
|
|
|
| 833 |
self.overhead_sum_ms = 0.0
|
| 834 |
self.overhead_min_ms = float("inf")
|
| 835 |
self.overhead_max_ms = 0.0
|
| 836 |
+
self.overhead_count = 0
|
| 837 |
+
|
| 838 |
+
# Time to first byte (TTFB) from upstream — what the user actually feels
|
| 839 |
+
self.ttfb_sum_ms = 0.0
|
| 840 |
+
self.ttfb_min_ms = float("inf")
|
| 841 |
+
self.ttfb_max_ms = 0.0
|
| 842 |
+
self.ttfb_count = 0
|
| 843 |
|
| 844 |
+
# Per-transform timing (name → cumulative ms, count)
|
| 845 |
+
self.transform_timing_sum: dict[str, float] = defaultdict(float)
|
| 846 |
+
self.transform_timing_count: dict[str, int] = defaultdict(int)
|
| 847 |
+
self.transform_timing_max: dict[str, float] = defaultdict(float)
|
| 848 |
+
|
| 849 |
+
# Aggregate waste signals
|
| 850 |
+
self.waste_signals_total: dict[str, int] = defaultdict(int)
|
| 851 |
+
|
| 852 |
+
# Cumulative savings history (timestamp → cumulative tokens saved)
|
| 853 |
+
self.savings_history: list[tuple[str, int]] = []
|
| 854 |
|
| 855 |
self._lock = asyncio.Lock()
|
| 856 |
|
|
|
|
| 863 |
tokens_saved: int,
|
| 864 |
latency_ms: float,
|
| 865 |
cached: bool = False,
|
|
|
|
|
|
|
| 866 |
overhead_ms: float = 0,
|
| 867 |
+
ttfb_ms: float = 0,
|
| 868 |
+
pipeline_timing: dict[str, float] | None = None,
|
| 869 |
+
waste_signals: dict[str, int] | None = None,
|
| 870 |
):
|
| 871 |
"""Record metrics for a request."""
|
| 872 |
async with self._lock:
|
|
|
|
| 891 |
self.overhead_sum_ms += overhead_ms
|
| 892 |
self.overhead_min_ms = min(self.overhead_min_ms, overhead_ms)
|
| 893 |
self.overhead_max_ms = max(self.overhead_max_ms, overhead_ms)
|
| 894 |
+
self.overhead_count += 1
|
| 895 |
+
|
| 896 |
+
# Track TTFB (time to first byte from upstream)
|
| 897 |
+
if ttfb_ms > 0:
|
| 898 |
+
self.ttfb_sum_ms += ttfb_ms
|
| 899 |
+
self.ttfb_min_ms = min(self.ttfb_min_ms, ttfb_ms)
|
| 900 |
+
self.ttfb_max_ms = max(self.ttfb_max_ms, ttfb_ms)
|
| 901 |
+
self.ttfb_count += 1
|
| 902 |
+
|
| 903 |
+
# Track per-transform timing
|
| 904 |
+
if pipeline_timing:
|
| 905 |
+
for name, ms in pipeline_timing.items():
|
| 906 |
+
self.transform_timing_sum[name] += ms
|
| 907 |
+
self.transform_timing_count[name] += 1
|
| 908 |
+
self.transform_timing_max[name] = max(self.transform_timing_max[name], ms)
|
| 909 |
+
|
| 910 |
+
# Track waste signals
|
| 911 |
+
if waste_signals:
|
| 912 |
+
for signal_name, token_count in waste_signals.items():
|
| 913 |
+
self.waste_signals_total[signal_name] += token_count
|
| 914 |
+
|
| 915 |
+
# Track cumulative savings history (record every request)
|
| 916 |
+
from datetime import datetime
|
| 917 |
+
|
| 918 |
+
self.savings_history.append((datetime.now().isoformat(), self.tokens_saved_total))
|
| 919 |
+
# Keep last 500 data points
|
| 920 |
+
if len(self.savings_history) > 500:
|
| 921 |
+
self.savings_history = self.savings_history[-500:]
|
| 922 |
|
| 923 |
async def record_rate_limited(self):
|
| 924 |
async with self._lock:
|
|
|
|
| 963 |
"# HELP headroom_latency_ms_sum Sum of request latencies",
|
| 964 |
"# TYPE headroom_latency_ms_sum counter",
|
| 965 |
f"headroom_latency_ms_sum {self.latency_sum_ms:.2f}",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 966 |
]
|
| 967 |
|
| 968 |
# Per-provider metrics
|
|
|
|
| 1427 |
else:
|
| 1428 |
logger.info("Smart Routing: DISABLED (legacy sequential mode)")
|
| 1429 |
|
| 1430 |
+
# Eagerly load LLMLingua model at startup (avoids 5s delay on first request)
|
| 1431 |
+
if self.config.llmlingua_enabled:
|
| 1432 |
+
for transform in self.anthropic_pipeline.transforms:
|
| 1433 |
+
if hasattr(transform, "eager_load_compressors"):
|
| 1434 |
+
transform.eager_load_compressors()
|
| 1435 |
+
self._llmlingua_status = "enabled"
|
| 1436 |
+
break
|
| 1437 |
+
|
| 1438 |
# LLMLingua status with helpful hint
|
| 1439 |
if self._llmlingua_status == "enabled":
|
| 1440 |
logger.info(
|
|
|
|
| 1503 |
m.tokens_saved_total / (m.tokens_input_total + m.tokens_saved_total)
|
| 1504 |
) * 100
|
| 1505 |
logger.info(f"Token savings: {savings_pct:.1f}%")
|
|
|
|
|
|
|
| 1506 |
if m.latency_count > 0:
|
| 1507 |
avg_latency = m.latency_sum_ms / m.latency_count
|
| 1508 |
logger.info(f"Avg latency: {avg_latency:.0f}ms")
|
|
|
|
| 1754 |
|
| 1755 |
# Apply optimization
|
| 1756 |
transforms_applied = []
|
| 1757 |
+
pipeline_timing: dict[str, float] = {}
|
| 1758 |
+
waste_signals_dict: dict[str, int] | None = None
|
| 1759 |
optimized_messages = messages
|
| 1760 |
optimized_tokens = original_tokens
|
| 1761 |
|
|
|
|
| 1774 |
if result.messages != messages:
|
| 1775 |
optimized_messages = result.messages
|
| 1776 |
transforms_applied = result.transforms_applied
|
| 1777 |
+
pipeline_timing = result.timing
|
| 1778 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 1779 |
original_tokens = result.tokens_before
|
| 1780 |
optimized_tokens = result.tokens_after
|
| 1781 |
+
if result.waste_signals:
|
| 1782 |
+
waste_signals_dict = result.waste_signals.to_dict()
|
| 1783 |
except Exception as e:
|
| 1784 |
logger.warning(f"Optimization failed: {e}")
|
| 1785 |
|
| 1786 |
+
tokens_saved = max(0, original_tokens - optimized_tokens)
|
| 1787 |
optimization_latency = (time.time() - start_time) * 1000
|
| 1788 |
|
| 1789 |
# Hook: post_compress — let hooks observe compression results
|
|
|
|
| 1965 |
transforms_applied,
|
| 1966 |
tags,
|
| 1967 |
optimization_latency,
|
| 1968 |
+
pipeline_timing=pipeline_timing,
|
| 1969 |
)
|
| 1970 |
else:
|
| 1971 |
backend_response = await self.anthropic_backend.send_message(body, headers)
|
|
|
|
| 1990 |
latency_ms=total_latency,
|
| 1991 |
cached=False,
|
| 1992 |
overhead_ms=optimization_latency,
|
| 1993 |
+
pipeline_timing=pipeline_timing,
|
| 1994 |
)
|
| 1995 |
|
|
|
|
|
|
|
| 1996 |
if self.cost_tracker:
|
| 1997 |
+
self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1998 |
|
| 1999 |
# Log request
|
| 2000 |
if self.logger:
|
|
|
|
| 2011 |
savings_percent=(tokens_saved / original_tokens * 100)
|
| 2012 |
if original_tokens > 0
|
| 2013 |
else 0,
|
|
|
|
|
|
|
| 2014 |
optimization_latency_ms=optimization_latency,
|
| 2015 |
total_latency_ms=total_latency,
|
| 2016 |
tags=tags,
|
|
|
|
| 2055 |
tags,
|
| 2056 |
optimization_latency,
|
| 2057 |
memory_user_id=memory_user_id,
|
| 2058 |
+
pipeline_timing=pipeline_timing,
|
| 2059 |
)
|
| 2060 |
else:
|
| 2061 |
response = await self._retry_request("POST", url, headers, body)
|
|
|
|
| 2221 |
|
| 2222 |
total_latency = (time.time() - start_time) * 1000
|
| 2223 |
|
| 2224 |
+
# Parse response for output token count
|
|
|
|
| 2225 |
output_tokens = 0
|
|
|
|
|
|
|
| 2226 |
if resp_json:
|
| 2227 |
usage = resp_json.get("usage", {})
|
|
|
|
| 2228 |
output_tokens = usage.get("output_tokens", 0)
|
| 2229 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2230 |
if self.cost_tracker:
|
| 2231 |
+
self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2232 |
|
| 2233 |
# Cache response
|
| 2234 |
if self.cache and response.status_code == 200:
|
|
|
|
| 2240 |
tokens_saved=tokens_saved,
|
| 2241 |
)
|
| 2242 |
|
| 2243 |
+
# Record metrics — use optimized_tokens (what we sent), not API's
|
| 2244 |
+
# input_tokens which is just the non-cached portion with prompt caching
|
| 2245 |
await self.metrics.record_request(
|
| 2246 |
provider="anthropic",
|
| 2247 |
model=model,
|
| 2248 |
+
input_tokens=optimized_tokens,
|
| 2249 |
output_tokens=output_tokens,
|
| 2250 |
tokens_saved=tokens_saved,
|
| 2251 |
latency_ms=total_latency,
|
|
|
|
|
|
|
| 2252 |
overhead_ms=optimization_latency,
|
| 2253 |
+
pipeline_timing=pipeline_timing,
|
| 2254 |
+
waste_signals=waste_signals_dict,
|
| 2255 |
)
|
| 2256 |
|
| 2257 |
# Log request
|
|
|
|
| 2269 |
savings_percent=(tokens_saved / original_tokens * 100)
|
| 2270 |
if original_tokens > 0
|
| 2271 |
else 0,
|
|
|
|
|
|
|
| 2272 |
optimization_latency_ms=optimization_latency,
|
| 2273 |
total_latency_ms=total_latency,
|
| 2274 |
tags=tags,
|
| 2275 |
cache_hit=cache_hit,
|
| 2276 |
transforms_applied=transforms_applied,
|
| 2277 |
+
waste_signals=waste_signals_dict,
|
| 2278 |
request_messages=messages if self.config.log_full_messages else None,
|
| 2279 |
)
|
| 2280 |
)
|
|
|
|
| 2285 |
cr = resp_usage.get("cache_read_input_tokens", 0)
|
| 2286 |
cw = resp_usage.get("cache_creation_input_tokens", 0)
|
| 2287 |
chp = round(cr / (cr + cw) * 100) if (cr + cw) > 0 else 0
|
| 2288 |
+
timing_str = (
|
| 2289 |
+
" ".join(f"{k}={v:.0f}ms" for k, v in pipeline_timing.items())
|
| 2290 |
+
if pipeline_timing
|
| 2291 |
+
else ""
|
| 2292 |
+
)
|
| 2293 |
logger.info(
|
| 2294 |
f"[{request_id}] PERF "
|
| 2295 |
f"model={model} msgs={num_msgs} "
|
|
|
|
| 2298 |
f"cache_read={cr} cache_write={cw} cache_hit_pct={chp} "
|
| 2299 |
f"opt_ms={optimization_latency:.0f} "
|
| 2300 |
f"transforms={_summarize_transforms(transforms_applied)}"
|
| 2301 |
+
f"{' timing=' + timing_str if timing_str else ''}"
|
| 2302 |
)
|
| 2303 |
|
| 2304 |
# Remove compression headers since httpx already decompressed the response
|
|
|
|
| 2423 |
total_optimized_tokens = 0
|
| 2424 |
total_tokens_saved = 0
|
| 2425 |
compressed_requests = []
|
| 2426 |
+
pipeline_timing: dict[str, float] = {}
|
| 2427 |
|
| 2428 |
# Apply compression to each request in the batch
|
| 2429 |
for batch_req in requests_list:
|
|
|
|
| 2447 |
)
|
| 2448 |
|
| 2449 |
optimized_messages = result.messages
|
| 2450 |
+
pipeline_timing = result.timing
|
| 2451 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 2452 |
original_tokens = result.tokens_before
|
| 2453 |
optimized_tokens = result.tokens_after
|
| 2454 |
total_original_tokens += original_tokens
|
| 2455 |
total_optimized_tokens += optimized_tokens
|
| 2456 |
+
tokens_saved = max(0, original_tokens - optimized_tokens)
|
| 2457 |
total_tokens_saved += tokens_saved
|
| 2458 |
|
| 2459 |
# CCR Tool Injection: Inject retrieval tool if compression occurred
|
|
|
|
| 2517 |
output_tokens=0,
|
| 2518 |
tokens_saved=total_tokens_saved,
|
| 2519 |
latency_ms=optimization_latency,
|
| 2520 |
+
overhead_ms=optimization_latency,
|
| 2521 |
+
pipeline_timing=pipeline_timing,
|
| 2522 |
)
|
| 2523 |
|
| 2524 |
# Log compression stats
|
|
|
|
| 2857 |
total_optimized_tokens = 0
|
| 2858 |
total_tokens_saved = 0
|
| 2859 |
compressed_requests = []
|
| 2860 |
+
pipeline_timing: dict[str, float] = {}
|
| 2861 |
|
| 2862 |
# Apply compression to each request in the batch
|
| 2863 |
for idx, batch_req in enumerate(requests_list):
|
|
|
|
| 2898 |
)
|
| 2899 |
|
| 2900 |
optimized_messages = result.messages
|
| 2901 |
+
pipeline_timing = result.timing
|
| 2902 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 2903 |
original_tokens = result.tokens_before
|
| 2904 |
optimized_tokens = result.tokens_after
|
| 2905 |
total_original_tokens += original_tokens
|
| 2906 |
total_optimized_tokens += optimized_tokens
|
| 2907 |
+
tokens_saved = max(0, original_tokens - optimized_tokens)
|
| 2908 |
total_tokens_saved += tokens_saved
|
| 2909 |
|
| 2910 |
# CCR Tool Injection: Inject retrieval tool if compression occurred
|
|
|
|
| 2995 |
output_tokens=0,
|
| 2996 |
tokens_saved=total_tokens_saved,
|
| 2997 |
latency_ms=optimization_latency,
|
| 2998 |
+
overhead_ms=optimization_latency,
|
| 2999 |
+
pipeline_timing=pipeline_timing,
|
| 3000 |
)
|
| 3001 |
|
| 3002 |
# Log compression stats
|
|
|
|
| 3701 |
tags: dict[str, str],
|
| 3702 |
optimization_latency: float,
|
| 3703 |
memory_user_id: str | None = None,
|
| 3704 |
+
pipeline_timing: dict[str, float] | None = None,
|
| 3705 |
) -> StreamingResponse:
|
| 3706 |
"""Stream response with metrics tracking and memory tool handling.
|
| 3707 |
|
|
|
|
| 3724 |
"cache_creation_input_tokens": 0,
|
| 3725 |
"total_bytes": 0,
|
| 3726 |
"sse_buffer": "", # Buffer for incomplete SSE events
|
| 3727 |
+
"ttfb_ms": None, # Time to first byte from upstream
|
| 3728 |
}
|
| 3729 |
|
| 3730 |
# Track if we need to handle memory tools
|
|
|
|
| 3746 |
"POST", url, json=body, headers=headers
|
| 3747 |
) as response:
|
| 3748 |
async for chunk in response.aiter_bytes():
|
| 3749 |
+
# Record TTFB on first chunk
|
| 3750 |
+
if stream_state["ttfb_ms"] is None:
|
| 3751 |
+
stream_state["ttfb_ms"] = (time.time() - start_time) * 1000
|
| 3752 |
+
|
| 3753 |
stream_state["total_bytes"] += len(chunk)
|
| 3754 |
|
| 3755 |
# Buffer SSE data to handle chunks split across calls
|
|
|
|
| 3911 |
f"[{request_id}] No usage in stream, estimated {output_tokens} output tokens"
|
| 3912 |
)
|
| 3913 |
|
| 3914 |
+
# Use optimized_tokens for dashboard metrics (what we actually sent).
|
| 3915 |
+
# API's input_tokens is the non-cached portion only, which is
|
| 3916 |
+
# misleading for aggregation (often just 1 with prompt caching).
|
|
|
|
|
|
|
|
|
|
| 3917 |
cache_read_tokens = stream_state["cache_read_input_tokens"]
|
| 3918 |
cache_write_tokens = stream_state["cache_creation_input_tokens"]
|
| 3919 |
|
|
|
|
| 3935 |
f"transforms={_summarize_transforms(transforms_applied)}"
|
| 3936 |
)
|
| 3937 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3938 |
if self.cost_tracker:
|
| 3939 |
+
self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3940 |
|
| 3941 |
await self.metrics.record_request(
|
| 3942 |
provider=provider,
|
| 3943 |
model=model,
|
| 3944 |
+
input_tokens=optimized_tokens, # What we sent, not API's non-cached count
|
| 3945 |
output_tokens=output_tokens,
|
| 3946 |
tokens_saved=tokens_saved,
|
| 3947 |
latency_ms=total_latency,
|
| 3948 |
+
overhead_ms=optimization_latency,
|
| 3949 |
+
ttfb_ms=stream_state["ttfb_ms"] or 0,
|
| 3950 |
+
pipeline_timing=pipeline_timing,
|
| 3951 |
)
|
| 3952 |
|
| 3953 |
return StreamingResponse(
|
|
|
|
| 3968 |
transforms_applied: list[str],
|
| 3969 |
tags: dict[str, str],
|
| 3970 |
optimization_latency: float,
|
| 3971 |
+
pipeline_timing: dict[str, float] | None = None,
|
| 3972 |
) -> StreamingResponse:
|
| 3973 |
"""Stream response from Bedrock backend with metrics tracking.
|
| 3974 |
|
|
|
|
| 3980 |
stream_state: dict[str, Any] = {
|
| 3981 |
"input_tokens": 0,
|
| 3982 |
"output_tokens": 0,
|
| 3983 |
+
"ttfb_ms": None,
|
| 3984 |
}
|
| 3985 |
|
| 3986 |
async def generate():
|
|
|
|
| 3988 |
assert self.anthropic_backend is not None
|
| 3989 |
|
| 3990 |
async for event in self.anthropic_backend.stream_message(body, headers):
|
| 3991 |
+
# Record TTFB on first event
|
| 3992 |
+
if stream_state["ttfb_ms"] is None:
|
| 3993 |
+
stream_state["ttfb_ms"] = (time.time() - start_time) * 1000
|
| 3994 |
+
|
| 3995 |
# Format as SSE
|
| 3996 |
if event.raw_sse:
|
| 3997 |
yield event.raw_sse.encode()
|
|
|
|
| 4038 |
latency_ms=total_latency,
|
| 4039 |
cached=False,
|
| 4040 |
overhead_ms=optimization_latency,
|
| 4041 |
+
ttfb_ms=stream_state["ttfb_ms"] or 0,
|
| 4042 |
+
pipeline_timing=pipeline_timing,
|
| 4043 |
)
|
| 4044 |
|
|
|
|
|
|
|
| 4045 |
if self.cost_tracker:
|
| 4046 |
+
self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4047 |
|
| 4048 |
# Log request
|
| 4049 |
if self.logger:
|
|
|
|
| 4060 |
savings_percent=(tokens_saved / original_tokens * 100)
|
| 4061 |
if original_tokens > 0
|
| 4062 |
else 0,
|
|
|
|
|
|
|
| 4063 |
optimization_latency_ms=optimization_latency,
|
| 4064 |
total_latency_ms=total_latency,
|
| 4065 |
tags=tags,
|
|
|
|
| 4196 |
|
| 4197 |
# Optimization
|
| 4198 |
transforms_applied = []
|
| 4199 |
+
pipeline_timing: dict[str, float] = {}
|
| 4200 |
+
waste_signals_dict: dict[str, int] | None = None
|
| 4201 |
optimized_messages = messages
|
| 4202 |
optimized_tokens = original_tokens
|
| 4203 |
|
|
|
|
| 4213 |
if result.messages != messages:
|
| 4214 |
optimized_messages = result.messages
|
| 4215 |
transforms_applied = result.transforms_applied
|
| 4216 |
+
pipeline_timing = result.timing
|
| 4217 |
original_tokens = result.tokens_before
|
| 4218 |
optimized_tokens = result.tokens_after
|
| 4219 |
+
if result.waste_signals:
|
| 4220 |
+
waste_signals_dict = result.waste_signals.to_dict()
|
| 4221 |
except Exception as e:
|
| 4222 |
logger.warning(f"Optimization failed: {e}")
|
| 4223 |
|
| 4224 |
+
tokens_saved = max(0, original_tokens - optimized_tokens)
|
| 4225 |
optimization_latency = (time.time() - start_time) * 1000
|
| 4226 |
|
| 4227 |
# Hook: post_compress
|
|
|
|
| 4306 |
latency_ms=total_latency,
|
| 4307 |
cached=False,
|
| 4308 |
overhead_ms=optimization_latency,
|
| 4309 |
+
pipeline_timing=pipeline_timing,
|
| 4310 |
)
|
| 4311 |
|
| 4312 |
if tokens_saved > 0:
|
|
|
|
| 4357 |
transforms_applied,
|
| 4358 |
tags,
|
| 4359 |
optimization_latency,
|
| 4360 |
+
pipeline_timing=pipeline_timing,
|
| 4361 |
)
|
| 4362 |
else:
|
| 4363 |
response = await self._retry_request("POST", url, headers, body)
|
|
|
|
| 4380 |
f"[{request_id}] Failed to extract cached tokens from OpenAI response: {e}"
|
| 4381 |
)
|
| 4382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4383 |
if self.cost_tracker:
|
| 4384 |
+
self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4385 |
|
| 4386 |
# Cache
|
| 4387 |
if self.cache and response.status_code == 200:
|
|
|
|
| 4389 |
messages, model, response.content, dict(response.headers), tokens_saved
|
| 4390 |
)
|
| 4391 |
|
|
|
|
| 4392 |
await self.metrics.record_request(
|
| 4393 |
provider="openai",
|
| 4394 |
model=model,
|
|
|
|
| 4396 |
output_tokens=output_tokens,
|
| 4397 |
tokens_saved=tokens_saved,
|
| 4398 |
latency_ms=total_latency,
|
| 4399 |
+
overhead_ms=optimization_latency,
|
| 4400 |
+
pipeline_timing=pipeline_timing,
|
| 4401 |
+
waste_signals=waste_signals_dict,
|
| 4402 |
)
|
| 4403 |
|
| 4404 |
if tokens_saved > 0:
|
|
|
|
| 4494 |
tokens_saved=0,
|
| 4495 |
latency_ms=latency_ms,
|
| 4496 |
cached=False,
|
|
|
|
|
|
|
| 4497 |
)
|
| 4498 |
|
| 4499 |
return Response(
|
|
|
|
| 5140 |
|
| 5141 |
total_input_tokens = original_tokens # fallback
|
| 5142 |
output_tokens = 0
|
|
|
|
| 5143 |
try:
|
| 5144 |
resp_json = response.json()
|
| 5145 |
usage = resp_json.get("usage", {})
|
| 5146 |
total_input_tokens = usage.get("input_tokens", original_tokens)
|
| 5147 |
output_tokens = usage.get("output_tokens", 0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5148 |
except (KeyError, TypeError, AttributeError) as e:
|
| 5149 |
logger.debug(
|
| 5150 |
f"[{request_id}] Failed to extract cached tokens from OpenAI passthrough response: {e}"
|
| 5151 |
)
|
| 5152 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5153 |
if self.cost_tracker:
|
| 5154 |
+
self.cost_tracker.record_tokens(model, tokens_saved, total_input_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5155 |
|
|
|
|
| 5156 |
await self.metrics.record_request(
|
| 5157 |
provider="openai",
|
| 5158 |
model=model,
|
|
|
|
| 5160 |
output_tokens=output_tokens,
|
| 5161 |
tokens_saved=tokens_saved,
|
| 5162 |
latency_ms=total_latency,
|
| 5163 |
+
overhead_ms=optimization_latency,
|
|
|
|
| 5164 |
)
|
| 5165 |
|
| 5166 |
logger.info(f"[{request_id}] /v1/responses {model}: {total_input_tokens:,} tokens")
|
|
|
|
| 5306 |
|
| 5307 |
# Optimization
|
| 5308 |
transforms_applied: list[str] = []
|
| 5309 |
+
waste_signals_dict: dict[str, int] | None = None
|
| 5310 |
optimized_messages = messages
|
| 5311 |
optimized_tokens = original_tokens
|
| 5312 |
|
|
|
|
| 5325 |
# Use pipeline's token counts for consistency with pipeline logs
|
| 5326 |
original_tokens = result.tokens_before
|
| 5327 |
optimized_tokens = result.tokens_after
|
| 5328 |
+
if result.waste_signals:
|
| 5329 |
+
waste_signals_dict = result.waste_signals.to_dict()
|
| 5330 |
except Exception as e:
|
| 5331 |
logger.warning(f"[{request_id}] Gemini optimization failed: {e}")
|
| 5332 |
|
| 5333 |
+
tokens_saved = max(0, original_tokens - optimized_tokens)
|
| 5334 |
optimization_latency = (time.time() - start_time) * 1000
|
| 5335 |
|
| 5336 |
# Query Echo: re-inject user's question after compressed tool outputs
|
|
|
|
| 5412 |
f"[{request_id}] Failed to extract cached tokens from Gemini response: {e}"
|
| 5413 |
)
|
| 5414 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5415 |
if self.cost_tracker:
|
| 5416 |
+
self.cost_tracker.record_tokens(model, tokens_saved, optimized_tokens)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5417 |
|
|
|
|
| 5418 |
await self.metrics.record_request(
|
| 5419 |
provider="gemini",
|
| 5420 |
model=model,
|
|
|
|
| 5422 |
output_tokens=output_tokens,
|
| 5423 |
tokens_saved=tokens_saved,
|
| 5424 |
latency_ms=total_latency,
|
| 5425 |
+
overhead_ms=optimization_latency,
|
| 5426 |
+
waste_signals=waste_signals_dict,
|
| 5427 |
)
|
| 5428 |
|
| 5429 |
if tokens_saved > 0:
|
|
|
|
| 5653 |
logger.debug(f"[{request_id}] Failed to parse Gemini token count response: {e}")
|
| 5654 |
|
| 5655 |
# Track stats
|
| 5656 |
+
tokens_saved = max(0, original_tokens - compressed_tokens) if compressed_tokens > 0 else 0
|
| 5657 |
|
| 5658 |
await self.metrics.record_request(
|
| 5659 |
provider="gemini",
|
|
|
|
| 5662 |
output_tokens=0,
|
| 5663 |
tokens_saved=tokens_saved,
|
| 5664 |
latency_ms=total_latency,
|
|
|
|
|
|
|
| 5665 |
)
|
| 5666 |
|
| 5667 |
if tokens_saved > 0:
|
|
|
|
| 5843 |
)
|
| 5844 |
max_latency_ms = round(m.latency_max_ms, 2) if m.latency_count > 0 else 0
|
| 5845 |
|
| 5846 |
+
# Calculate Headroom overhead (optimization time only, excludes pass-through requests)
|
| 5847 |
avg_overhead_ms = (
|
| 5848 |
+
round(m.overhead_sum_ms / m.overhead_count, 2) if m.overhead_count > 0 else 0
|
| 5849 |
)
|
| 5850 |
min_overhead_ms = (
|
| 5851 |
round(m.overhead_min_ms, 2)
|
| 5852 |
+
if m.overhead_count > 0 and m.overhead_min_ms != float("inf")
|
| 5853 |
else 0
|
| 5854 |
)
|
| 5855 |
+
max_overhead_ms = round(m.overhead_max_ms, 2) if m.overhead_count > 0 else 0
|
| 5856 |
+
|
| 5857 |
+
# Calculate TTFB (time to first byte)
|
| 5858 |
+
avg_ttfb_ms = round(m.ttfb_sum_ms / m.ttfb_count, 2) if m.ttfb_count > 0 else 0
|
| 5859 |
+
min_ttfb_ms = (
|
| 5860 |
+
round(m.ttfb_min_ms, 2) if m.ttfb_count > 0 and m.ttfb_min_ms != float("inf") else 0
|
| 5861 |
+
)
|
| 5862 |
+
max_ttfb_ms = round(m.ttfb_max_ms, 2) if m.ttfb_count > 0 else 0
|
| 5863 |
|
| 5864 |
# Get compression store stats
|
| 5865 |
store = get_compression_store()
|
|
|
|
| 5908 |
"min_ms": min_overhead_ms,
|
| 5909 |
"max_ms": max_overhead_ms,
|
| 5910 |
},
|
| 5911 |
+
"ttfb": {
|
| 5912 |
+
"average_ms": avg_ttfb_ms,
|
| 5913 |
+
"min_ms": min_ttfb_ms,
|
| 5914 |
+
"max_ms": max_ttfb_ms,
|
| 5915 |
+
},
|
| 5916 |
+
"pipeline_timing": {
|
| 5917 |
+
name: {
|
| 5918 |
+
"average_ms": round(
|
| 5919 |
+
m.transform_timing_sum[name] / m.transform_timing_count[name], 2
|
| 5920 |
+
),
|
| 5921 |
+
"max_ms": round(m.transform_timing_max[name], 2),
|
| 5922 |
+
"count": m.transform_timing_count[name],
|
| 5923 |
+
}
|
| 5924 |
+
for name in sorted(m.transform_timing_sum.keys())
|
| 5925 |
+
}
|
| 5926 |
+
if m.transform_timing_sum
|
| 5927 |
+
else {},
|
| 5928 |
+
"waste_signals": dict(m.waste_signals_total) if m.waste_signals_total else {},
|
| 5929 |
+
"savings_history": m.savings_history[-100:], # Last 100 data points
|
| 5930 |
"cost": proxy.cost_tracker.stats() if proxy.cost_tracker else None,
|
| 5931 |
"compression": {
|
| 5932 |
"ccr_entries": compression_stats.get("entry_count", 0),
|
|
@@ -867,6 +867,25 @@ class CodeAwareCompressor(Transform):
|
|
| 867 |
|
| 868 |
ratio = compressed_tokens / max(original_tokens, 1)
|
| 869 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 870 |
# Store in CCR if significant compression
|
| 871 |
cache_key = None
|
| 872 |
if self.config.enable_ccr and ratio < 0.8:
|
|
|
|
| 867 |
|
| 868 |
ratio = compressed_tokens / max(original_tokens, 1)
|
| 869 |
|
| 870 |
+
# Guard against over-aggressive compression (data loss).
|
| 871 |
+
# If AST extraction stripped content to <5% of original,
|
| 872 |
+
# the output is essentially empty — return original.
|
| 873 |
+
if ratio < 0.05:
|
| 874 |
+
logger.warning(
|
| 875 |
+
"Code compression too aggressive (ratio=%.3f), returning original",
|
| 876 |
+
ratio,
|
| 877 |
+
)
|
| 878 |
+
return CodeCompressionResult(
|
| 879 |
+
compressed=code,
|
| 880 |
+
original=code,
|
| 881 |
+
original_tokens=original_tokens,
|
| 882 |
+
compressed_tokens=original_tokens,
|
| 883 |
+
compression_ratio=1.0,
|
| 884 |
+
language=detected_lang,
|
| 885 |
+
language_confidence=confidence,
|
| 886 |
+
syntax_valid=True,
|
| 887 |
+
)
|
| 888 |
+
|
| 889 |
# Store in CCR if significant compression
|
| 890 |
cache_key = None
|
| 891 |
if self.config.enable_ccr and ratio < 0.8:
|
|
@@ -38,6 +38,7 @@ from __future__ import annotations
|
|
| 38 |
import hashlib
|
| 39 |
import logging
|
| 40 |
import re
|
|
|
|
| 41 |
from dataclasses import dataclass, field
|
| 42 |
from enum import Enum
|
| 43 |
from typing import Any
|
|
@@ -132,6 +133,112 @@ def _create_content_signature(
|
|
| 132 |
return None
|
| 133 |
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
class CompressionStrategy(Enum):
|
| 136 |
"""Available compression strategies."""
|
| 137 |
|
|
@@ -533,6 +640,8 @@ class ContentRouter(Transform):
|
|
| 533 |
# TOIN integration for cross-strategy learning
|
| 534 |
self._toin: Any = None
|
| 535 |
|
|
|
|
|
|
|
| 536 |
def _record_to_toin(
|
| 537 |
self,
|
| 538 |
strategy: CompressionStrategy,
|
|
@@ -1042,6 +1151,25 @@ class ContentRouter(Transform):
|
|
| 1042 |
logger.debug("HTMLExtractor not available (install trafilatura)")
|
| 1043 |
return self._html_extractor
|
| 1044 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1045 |
def _get_llmlingua(self) -> Any:
|
| 1046 |
"""Get LLMLinguaCompressor (lazy load)."""
|
| 1047 |
if self._llmlingua is None:
|
|
@@ -1269,6 +1397,7 @@ class ContentRouter(Transform):
|
|
| 1269 |
transformed_messages: list[dict[str, Any]] = []
|
| 1270 |
transforms_applied: list[str] = []
|
| 1271 |
warnings: list[str] = []
|
|
|
|
| 1272 |
|
| 1273 |
# Routing reason counters for summary logging
|
| 1274 |
route_counts: dict[str, int] = {
|
|
@@ -1309,6 +1438,7 @@ class ContentRouter(Transform):
|
|
| 1309 |
min_ratio=min_ratio,
|
| 1310 |
read_protection_window=read_protection_window,
|
| 1311 |
messages_from_end=messages_from_end,
|
|
|
|
| 1312 |
)
|
| 1313 |
transformed_messages.append(transformed_message)
|
| 1314 |
route_counts["content_blocks"] += 1
|
|
@@ -1375,14 +1505,70 @@ class ContentRouter(Transform):
|
|
| 1375 |
route_counts["analysis_ctx"] += 1
|
| 1376 |
continue
|
| 1377 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1378 |
# Route and compress based on content detection
|
| 1379 |
# Merge tool-specific bias with hook-provided bias (multiplicative)
|
| 1380 |
msg_bias = bias if role == "tool" else 1.0
|
| 1381 |
if i in hook_biases:
|
| 1382 |
msg_bias *= hook_biases[i]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1383 |
result = self.compress(content, context=context, bias=msg_bias)
|
|
|
|
|
|
|
|
|
|
| 1384 |
|
| 1385 |
if result.compression_ratio < min_ratio:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1386 |
transformed_messages.append({**message, "content": result.compressed})
|
| 1387 |
transforms_applied.append(
|
| 1388 |
f"router:{result.strategy_used.value}:{result.compression_ratio:.2f}"
|
|
@@ -1391,6 +1577,8 @@ class ContentRouter(Transform):
|
|
| 1391 |
f"{result.strategy_used.value}:{result.compression_ratio:.2f}"
|
| 1392 |
)
|
| 1393 |
else:
|
|
|
|
|
|
|
| 1394 |
transformed_messages.append(message)
|
| 1395 |
route_counts["ratio_too_high"] += 1
|
| 1396 |
|
|
@@ -1398,6 +1586,14 @@ class ContentRouter(Transform):
|
|
| 1398 |
tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
|
| 1399 |
)
|
| 1400 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1401 |
# Log routing summary
|
| 1402 |
parts = []
|
| 1403 |
if compressed_details:
|
|
@@ -1412,12 +1608,24 @@ class ContentRouter(Transform):
|
|
| 1412 |
parts.append(f"{route_counts['recent_code']} protected (recent code)")
|
| 1413 |
if route_counts["analysis_ctx"]:
|
| 1414 |
parts.append(f"{route_counts['analysis_ctx']} protected (analysis ctx)")
|
|
|
|
|
|
|
| 1415 |
if route_counts["ratio_too_high"]:
|
| 1416 |
parts.append(f"{route_counts['ratio_too_high']} unchanged (ratio>={min_ratio:.2f})")
|
| 1417 |
if route_counts["content_blocks"]:
|
| 1418 |
parts.append(f"{route_counts['content_blocks']} content-block msgs")
|
| 1419 |
if route_counts["non_string"]:
|
| 1420 |
parts.append(f"{route_counts['non_string']} non-string")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1421 |
if parts:
|
| 1422 |
logger.info(
|
| 1423 |
"content_router: %d msgs — %s",
|
|
@@ -1433,6 +1641,7 @@ class ContentRouter(Transform):
|
|
| 1433 |
transforms_applied=all_transforms if all_transforms else ["router:noop"],
|
| 1434 |
markers_inserted=lifecycle_ccr_hashes,
|
| 1435 |
warnings=warnings,
|
|
|
|
| 1436 |
)
|
| 1437 |
|
| 1438 |
def _get_tool_bias(self, tool_name: str) -> float:
|
|
@@ -1469,6 +1678,7 @@ class ContentRouter(Transform):
|
|
| 1469 |
min_ratio: float = 0.85,
|
| 1470 |
read_protection_window: int = 8,
|
| 1471 |
messages_from_end: int = 0,
|
|
|
|
| 1472 |
) -> dict[str, Any]:
|
| 1473 |
"""Process content blocks (Anthropic format) for tool_result compression.
|
| 1474 |
|
|
@@ -1523,9 +1733,70 @@ class ContentRouter(Transform):
|
|
| 1523 |
|
| 1524 |
# Only process string content
|
| 1525 |
if isinstance(tool_content, str) and len(tool_content) > 500:
|
| 1526 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1527 |
result = self.compress(tool_content, context=context, bias=bias)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1528 |
if result.compression_ratio < min_ratio:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1529 |
new_blocks.append({**block, "content": result.compressed})
|
| 1530 |
transforms_applied.append(
|
| 1531 |
f"router:tool_result:{result.strategy_used.value}"
|
|
@@ -1537,6 +1808,8 @@ class ContentRouter(Transform):
|
|
| 1537 |
any_compressed = True
|
| 1538 |
continue
|
| 1539 |
else:
|
|
|
|
|
|
|
| 1540 |
if route_counts is not None:
|
| 1541 |
route_counts["ratio_too_high"] += 1
|
| 1542 |
else:
|
|
|
|
| 38 |
import hashlib
|
| 39 |
import logging
|
| 40 |
import re
|
| 41 |
+
import time
|
| 42 |
from dataclasses import dataclass, field
|
| 43 |
from enum import Enum
|
| 44 |
from typing import Any
|
|
|
|
| 133 |
return None
|
| 134 |
|
| 135 |
|
| 136 |
+
class CompressionCache:
    """Two-tier, TTL-bound cache of compression outcomes keyed by content hash.

    Tier 1 (skip map): hashes of content that did not compress well enough,
    stored as ``{hash: timestamp}`` so callers can skip recompression.

    Tier 2 (result cache): ``{hash: (text, ratio, strategy, timestamp)}`` for
    content that did compress, so the compressed text can be reused.

    An entry is live while ``time.time() - timestamp < ttl_seconds``. Expired
    entries are removed in two ways:

    * lazily, when the same key is looked up again (``get`` / ``is_skipped``);
    * by a throttled sweep that runs on writes (``put`` / ``mark_skip`` /
      ``move_to_skip``), so keys that are never looked up again are still
      released and memory stays bounded by the TTL.

    There is no max-entries cap. The cache is a plain in-process dict and is
    not synchronized; a shared backend would be needed for multi-process use.
    """

    # Upper bound, in seconds, on how often the full expiry sweep runs.
    # The effective interval is min(this, ttl_seconds).
    _SWEEP_INTERVAL_SECONDS = 60.0

    def __init__(self, ttl_seconds: int = 1800):
        """Create an empty cache.

        Args:
            ttl_seconds: Lifetime of every entry, in seconds (default 30 min).
        """
        # Tier 2: compressed results {hash: (text, ratio, strategy, timestamp)}
        self._results: dict[int, tuple[str, float, str, float]] = {}
        # Tier 1: hashes of content that won't compress {hash: timestamp}
        self._skip: dict[int, float] = {}
        self._ttl_seconds = ttl_seconds
        # Metrics
        self._hits = 0
        self._misses = 0
        self._skip_hits = 0
        self._evictions = 0
        self._total_lookup_ns = 0
        self._lookup_count = 0
        # Wall-clock time of the last full expiry sweep.
        self._last_sweep = time.time()

    def _purge_expired(self, now: float) -> None:
        """Remove all expired entries from both tiers (throttled).

        Does nothing if the previous sweep ran less than the sweep interval
        ago, so the O(n) scan is amortized across many writes.
        """
        interval = min(self._SWEEP_INTERVAL_SECONDS, self._ttl_seconds)
        if (now - self._last_sweep) < interval:
            return
        self._last_sweep = now

        ttl = self._ttl_seconds
        # Same liveness rule as get()/is_skipped(): live iff age < ttl.
        dead_results = [k for k, entry in self._results.items() if (now - entry[3]) >= ttl]
        dead_skips = [k for k, ts in self._skip.items() if (now - ts) >= ttl]
        for k in dead_results:
            del self._results[k]
        for k in dead_skips:
            del self._skip[k]
        self._evictions += len(dead_results) + len(dead_skips)

    def get(self, key: int) -> tuple[str, float, str] | None:
        """Get cached compression result.

        Returns (compressed_text, ratio, strategy) or None if not found/expired.
        Use is_skipped() first to check if content is known non-compressible.
        An expired entry found here is deleted and counted as an eviction
        and as a miss.
        """
        t0 = time.perf_counter_ns()
        entry = self._results.get(key)
        if entry is not None:
            compressed, ratio, strategy, created_at = entry
            if (time.time() - created_at) < self._ttl_seconds:
                self._hits += 1
                self._total_lookup_ns += time.perf_counter_ns() - t0
                self._lookup_count += 1
                return (compressed, ratio, strategy)
            # Expired: drop it and fall through to the miss accounting.
            del self._results[key]
            self._evictions += 1
        self._misses += 1
        self._total_lookup_ns += time.perf_counter_ns() - t0
        self._lookup_count += 1
        return None

    def is_skipped(self, key: int) -> bool:
        """Check if content is known non-compressible (Tier 1).

        An expired skip marker is deleted (counted as an eviction) and the
        content is reported as not skipped.
        """
        ts = self._skip.get(key)
        if ts is not None:
            if (time.time() - ts) < self._ttl_seconds:
                self._skip_hits += 1
                return True
            del self._skip[key]
            self._evictions += 1
        return False

    def put(self, key: int, compressed: str, ratio: float, strategy: str) -> None:
        """Store a compressed result (Tier 2), sweeping expired entries first."""
        now = time.time()
        # Sweep before inserting so the new entry is never a sweep candidate.
        self._purge_expired(now)
        self._results[key] = (compressed, ratio, strategy, now)

    def mark_skip(self, key: int) -> None:
        """Mark content as non-compressible (Tier 1), sweeping expired entries first."""
        now = time.time()
        self._purge_expired(now)
        self._skip[key] = now

    def move_to_skip(self, key: int) -> None:
        """Move a result to skip set (threshold tightened, no longer qualifies)."""
        now = time.time()
        self._purge_expired(now)
        self._results.pop(key, None)
        self._skip[key] = now

    @property
    def size(self) -> int:
        """Number of entries currently held in the result cache (Tier 2)."""
        return len(self._results)

    @property
    def skip_size(self) -> int:
        """Number of entries currently held in the skip map (Tier 1)."""
        return len(self._skip)

    @property
    def stats(self) -> dict[str, int | float]:
        """Snapshot of hit/miss/eviction counters, tier sizes and mean get() time.

        ``cache_avg_lookup_ns`` averages only ``get()`` calls; ``is_skipped()``
        is not timed. It is 0 before the first lookup.
        """
        avg_ns = self._total_lookup_ns / self._lookup_count if self._lookup_count else 0
        return {
            "cache_hits": self._hits,
            "cache_skip_hits": self._skip_hits,
            "cache_misses": self._misses,
            "cache_evictions": self._evictions,
            "cache_size": len(self._results),
            "cache_skip_size": len(self._skip),
            "cache_avg_lookup_ns": avg_ns,
        }

    def clear(self) -> None:
        """Clear all entries (e.g., on session end). Counters are left untouched."""
        self._results.clear()
        self._skip.clear()
|
| 240 |
+
|
| 241 |
+
|
| 242 |
class CompressionStrategy(Enum):
|
| 243 |
"""Available compression strategies."""
|
| 244 |
|
|
|
|
| 640 |
# TOIN integration for cross-strategy learning
|
| 641 |
self._toin: Any = None
|
| 642 |
|
| 643 |
+
self._cache = CompressionCache()
|
| 644 |
+
|
| 645 |
def _record_to_toin(
|
| 646 |
self,
|
| 647 |
strategy: CompressionStrategy,
|
|
|
|
| 1151 |
logger.debug("HTMLExtractor not available (install trafilatura)")
|
| 1152 |
return self._html_extractor
|
| 1153 |
|
| 1154 |
+
def eager_load_compressors(self) -> None:
    """Warm up compressors ahead of the first request.

    Intended to be called during proxy startup so that the LLMLingua model
    load does not land on the first request. Does nothing when LLMLingua is
    disabled in the config or when no LLMLingua compressor is available.
    Any failure while loading is logged as a warning and swallowed, so
    startup is never aborted by this warm-up.
    """
    if not self.config.enable_llmlingua:
        return

    llmlingua = self._get_llmlingua()
    if not llmlingua:
        return

    # Force the underlying model to load now by requesting it for the
    # compressor's resolved device.
    try:
        from .llmlingua_compressor import _get_llmlingua_compressor

        target_device = llmlingua._resolve_device()
        _get_llmlingua_compressor(llmlingua.config.model_name, target_device)
        logger.info("LLMLingua model pre-loaded at startup")
    except Exception as e:
        # Best-effort warm-up: report the problem and carry on.
        logger.warning("Failed to pre-load LLMLingua model: %s", e)
|
| 1172 |
+
|
| 1173 |
def _get_llmlingua(self) -> Any:
|
| 1174 |
"""Get LLMLinguaCompressor (lazy load)."""
|
| 1175 |
if self._llmlingua is None:
|
|
|
|
| 1397 |
transformed_messages: list[dict[str, Any]] = []
|
| 1398 |
transforms_applied: list[str] = []
|
| 1399 |
warnings: list[str] = []
|
| 1400 |
+
compressor_timing: dict[str, float] = {} # strategy → cumulative ms
|
| 1401 |
|
| 1402 |
# Routing reason counters for summary logging
|
| 1403 |
route_counts: dict[str, int] = {
|
|
|
|
| 1438 |
min_ratio=min_ratio,
|
| 1439 |
read_protection_window=read_protection_window,
|
| 1440 |
messages_from_end=messages_from_end,
|
| 1441 |
+
compressor_timing=compressor_timing,
|
| 1442 |
)
|
| 1443 |
transformed_messages.append(transformed_message)
|
| 1444 |
route_counts["content_blocks"] += 1
|
|
|
|
| 1505 |
route_counts["analysis_ctx"] += 1
|
| 1506 |
continue
|
| 1507 |
|
| 1508 |
+
# Compression pinning: if this message was already compressed
|
| 1509 |
+
# (contains a CCR retrieval marker), skip recompression.
|
| 1510 |
+
# Recompressing would change byte content and break provider
|
| 1511 |
+
# prefix caching with no meaningful further reduction.
|
| 1512 |
+
if "Retrieve more: hash=" in content or "Retrieve original: hash=" in content:
|
| 1513 |
+
transformed_messages.append(message)
|
| 1514 |
+
route_counts.setdefault("already_compressed", 0)
|
| 1515 |
+
route_counts["already_compressed"] += 1
|
| 1516 |
+
continue
|
| 1517 |
+
|
| 1518 |
# Route and compress based on content detection
|
| 1519 |
# Merge tool-specific bias with hook-provided bias (multiplicative)
|
| 1520 |
msg_bias = bias if role == "tool" else 1.0
|
| 1521 |
if i in hook_biases:
|
| 1522 |
msg_bias *= hook_biases[i]
|
| 1523 |
+
|
| 1524 |
+
# Two-tier compression cache.
|
| 1525 |
+
# Tier 1 (skip): known won't-compress → instant skip.
|
| 1526 |
+
# Tier 2 (result): known compresses → reuse compressed text.
|
| 1527 |
+
content_key = hash(content)
|
| 1528 |
+
|
| 1529 |
+
# Tier 1: skip set — instant rejection
|
| 1530 |
+
if self._cache.is_skipped(content_key):
|
| 1531 |
+
transformed_messages.append(message)
|
| 1532 |
+
route_counts["ratio_too_high"] += 1
|
| 1533 |
+
route_counts.setdefault("cache_hit", 0)
|
| 1534 |
+
route_counts["cache_hit"] += 1
|
| 1535 |
+
continue
|
| 1536 |
+
|
| 1537 |
+
# Tier 2: result cache — reuse compressed output
|
| 1538 |
+
cached = self._cache.get(content_key)
|
| 1539 |
+
if cached is not None:
|
| 1540 |
+
cached_compressed, cached_ratio, cached_strategy = cached
|
| 1541 |
+
# Re-check ratio against current min_ratio (shifts with context pressure)
|
| 1542 |
+
if cached_ratio < min_ratio:
|
| 1543 |
+
transformed_messages.append({**message, "content": cached_compressed})
|
| 1544 |
+
transforms_applied.append(f"router:{cached_strategy}:{cached_ratio:.2f}")
|
| 1545 |
+
compressed_details.append(f"{cached_strategy}:{cached_ratio:.2f}")
|
| 1546 |
+
else:
|
| 1547 |
+
# Threshold tightened — no longer qualifies. Move to skip.
|
| 1548 |
+
self._cache.move_to_skip(content_key)
|
| 1549 |
+
transformed_messages.append(message)
|
| 1550 |
+
route_counts["ratio_too_high"] += 1
|
| 1551 |
+
route_counts.setdefault("cache_hit", 0)
|
| 1552 |
+
route_counts["cache_hit"] += 1
|
| 1553 |
+
continue
|
| 1554 |
+
|
| 1555 |
+
# Cache miss — run full compression
|
| 1556 |
+
route_counts.setdefault("cache_miss", 0)
|
| 1557 |
+
route_counts["cache_miss"] += 1
|
| 1558 |
+
t0 = time.perf_counter()
|
| 1559 |
result = self.compress(content, context=context, bias=msg_bias)
|
| 1560 |
+
compress_ms = (time.perf_counter() - t0) * 1000
|
| 1561 |
+
strategy_key = f"compressor:{result.strategy_used.value}"
|
| 1562 |
+
compressor_timing[strategy_key] = compressor_timing.get(strategy_key, 0.0) + compress_ms
|
| 1563 |
|
| 1564 |
if result.compression_ratio < min_ratio:
|
| 1565 |
+
# Compressed — store in result cache
|
| 1566 |
+
self._cache.put(
|
| 1567 |
+
content_key,
|
| 1568 |
+
result.compressed,
|
| 1569 |
+
result.compression_ratio,
|
| 1570 |
+
result.strategy_used.value,
|
| 1571 |
+
)
|
| 1572 |
transformed_messages.append({**message, "content": result.compressed})
|
| 1573 |
transforms_applied.append(
|
| 1574 |
f"router:{result.strategy_used.value}:{result.compression_ratio:.2f}"
|
|
|
|
| 1577 |
f"{result.strategy_used.value}:{result.compression_ratio:.2f}"
|
| 1578 |
)
|
| 1579 |
else:
|
| 1580 |
+
# Didn't compress — add to skip set
|
| 1581 |
+
self._cache.mark_skip(content_key)
|
| 1582 |
transformed_messages.append(message)
|
| 1583 |
route_counts["ratio_too_high"] += 1
|
| 1584 |
|
|
|
|
| 1586 |
tokenizer.count_text(str(m.get("content", ""))) for m in transformed_messages
|
| 1587 |
)
|
| 1588 |
|
| 1589 |
+
# Add cache performance metrics to timing
|
| 1590 |
+
cache_stats = self._cache.stats
|
| 1591 |
+
compressor_timing["cache_hits"] = float(cache_stats["cache_hits"])
|
| 1592 |
+
compressor_timing["cache_skip_hits"] = float(cache_stats["cache_skip_hits"])
|
| 1593 |
+
compressor_timing["cache_size"] = float(cache_stats["cache_size"])
|
| 1594 |
+
compressor_timing["cache_skip_size"] = float(cache_stats["cache_skip_size"])
|
| 1595 |
+
compressor_timing["cache_avg_lookup_ns"] = cache_stats["cache_avg_lookup_ns"]
|
| 1596 |
+
|
| 1597 |
# Log routing summary
|
| 1598 |
parts = []
|
| 1599 |
if compressed_details:
|
|
|
|
| 1608 |
parts.append(f"{route_counts['recent_code']} protected (recent code)")
|
| 1609 |
if route_counts["analysis_ctx"]:
|
| 1610 |
parts.append(f"{route_counts['analysis_ctx']} protected (analysis ctx)")
|
| 1611 |
+
if route_counts.get("already_compressed"):
|
| 1612 |
+
parts.append(f"{route_counts['already_compressed']} pinned (already compressed)")
|
| 1613 |
if route_counts["ratio_too_high"]:
|
| 1614 |
parts.append(f"{route_counts['ratio_too_high']} unchanged (ratio>={min_ratio:.2f})")
|
| 1615 |
if route_counts["content_blocks"]:
|
| 1616 |
parts.append(f"{route_counts['content_blocks']} content-block msgs")
|
| 1617 |
if route_counts["non_string"]:
|
| 1618 |
parts.append(f"{route_counts['non_string']} non-string")
|
| 1619 |
+
if route_counts.get("cache_hit"):
|
| 1620 |
+
parts.append(f"{route_counts['cache_hit']} cache hits")
|
| 1621 |
+
if route_counts.get("cache_miss"):
|
| 1622 |
+
parts.append(f"{route_counts['cache_miss']} cache misses")
|
| 1623 |
+
cs = self._cache.stats
|
| 1624 |
+
if cs["cache_size"] > 0 or cs["cache_skip_size"] > 0:
|
| 1625 |
+
parts.append(
|
| 1626 |
+
f"cache[{cs['cache_size']} results, {cs['cache_skip_size']} skips, "
|
| 1627 |
+
f"{cs['cache_avg_lookup_ns']:.0f}ns avg]"
|
| 1628 |
+
)
|
| 1629 |
if parts:
|
| 1630 |
logger.info(
|
| 1631 |
"content_router: %d msgs — %s",
|
|
|
|
| 1641 |
transforms_applied=all_transforms if all_transforms else ["router:noop"],
|
| 1642 |
markers_inserted=lifecycle_ccr_hashes,
|
| 1643 |
warnings=warnings,
|
| 1644 |
+
timing=compressor_timing,
|
| 1645 |
)
|
| 1646 |
|
| 1647 |
def _get_tool_bias(self, tool_name: str) -> float:
|
|
|
|
| 1678 |
min_ratio: float = 0.85,
|
| 1679 |
read_protection_window: int = 8,
|
| 1680 |
messages_from_end: int = 0,
|
| 1681 |
+
compressor_timing: dict[str, float] | None = None,
|
| 1682 |
) -> dict[str, Any]:
|
| 1683 |
"""Process content blocks (Anthropic format) for tool_result compression.
|
| 1684 |
|
|
|
|
| 1733 |
|
| 1734 |
# Only process string content
|
| 1735 |
if isinstance(tool_content, str) and len(tool_content) > 500:
|
| 1736 |
+
# Compression pinning: skip already-compressed content
|
| 1737 |
+
if (
|
| 1738 |
+
"Retrieve more: hash=" in tool_content
|
| 1739 |
+
or "Retrieve original: hash=" in tool_content
|
| 1740 |
+
):
|
| 1741 |
+
new_blocks.append(block)
|
| 1742 |
+
if route_counts is not None:
|
| 1743 |
+
route_counts.setdefault("already_compressed", 0)
|
| 1744 |
+
route_counts["already_compressed"] += 1
|
| 1745 |
+
continue
|
| 1746 |
+
|
| 1747 |
+
# Two-tier compression cache
|
| 1748 |
+
content_key = hash(tool_content)
|
| 1749 |
+
|
| 1750 |
+
# Tier 1: skip set — instant rejection
|
| 1751 |
+
if self._cache.is_skipped(content_key):
|
| 1752 |
+
new_blocks.append(block)
|
| 1753 |
+
if route_counts is not None:
|
| 1754 |
+
route_counts["ratio_too_high"] += 1
|
| 1755 |
+
route_counts.setdefault("cache_hit", 0)
|
| 1756 |
+
route_counts["cache_hit"] += 1
|
| 1757 |
+
continue
|
| 1758 |
+
|
| 1759 |
+
# Tier 2: result cache — reuse compressed output
|
| 1760 |
+
cached = self._cache.get(content_key)
|
| 1761 |
+
if cached is not None:
|
| 1762 |
+
cached_compressed, cached_ratio, cached_strategy = cached
|
| 1763 |
+
if cached_ratio < min_ratio:
|
| 1764 |
+
new_blocks.append({**block, "content": cached_compressed})
|
| 1765 |
+
transforms_applied.append(f"router:tool_result:{cached_strategy}")
|
| 1766 |
+
if compressed_details is not None:
|
| 1767 |
+
compressed_details.append(
|
| 1768 |
+
f"tool:{cached_strategy}:{cached_ratio:.2f}"
|
| 1769 |
+
)
|
| 1770 |
+
any_compressed = True
|
| 1771 |
+
else:
|
| 1772 |
+
# Threshold tightened — move to skip
|
| 1773 |
+
self._cache.move_to_skip(content_key)
|
| 1774 |
+
new_blocks.append(block)
|
| 1775 |
+
if route_counts is not None:
|
| 1776 |
+
route_counts["ratio_too_high"] += 1
|
| 1777 |
+
if route_counts is not None:
|
| 1778 |
+
route_counts.setdefault("cache_hit", 0)
|
| 1779 |
+
route_counts["cache_hit"] += 1
|
| 1780 |
+
continue
|
| 1781 |
+
|
| 1782 |
+
# Cache miss — run full compression
|
| 1783 |
+
if route_counts is not None:
|
| 1784 |
+
route_counts.setdefault("cache_miss", 0)
|
| 1785 |
+
route_counts["cache_miss"] += 1
|
| 1786 |
+
t0 = time.perf_counter()
|
| 1787 |
result = self.compress(tool_content, context=context, bias=bias)
|
| 1788 |
+
compress_ms = (time.perf_counter() - t0) * 1000
|
| 1789 |
+
if compressor_timing is not None:
|
| 1790 |
+
key = f"compressor:{result.strategy_used.value}"
|
| 1791 |
+
compressor_timing[key] = compressor_timing.get(key, 0.0) + compress_ms
|
| 1792 |
if result.compression_ratio < min_ratio:
|
| 1793 |
+
# Compressed — store in result cache
|
| 1794 |
+
self._cache.put(
|
| 1795 |
+
content_key,
|
| 1796 |
+
result.compressed,
|
| 1797 |
+
result.compression_ratio,
|
| 1798 |
+
result.strategy_used.value,
|
| 1799 |
+
)
|
| 1800 |
new_blocks.append({**block, "content": result.compressed})
|
| 1801 |
transforms_applied.append(
|
| 1802 |
f"router:tool_result:{result.strategy_used.value}"
|
|
|
|
| 1808 |
any_compressed = True
|
| 1809 |
continue
|
| 1810 |
else:
|
| 1811 |
+
# Didn't compress — add to skip set
|
| 1812 |
+
self._cache.mark_skip(content_key)
|
| 1813 |
if route_counts is not None:
|
| 1814 |
route_counts["ratio_too_high"] += 1
|
| 1815 |
else:
|
|
@@ -3,6 +3,7 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import logging
|
|
|
|
| 6 |
from typing import TYPE_CHECKING, Any
|
| 7 |
|
| 8 |
from ..config import (
|
|
@@ -14,6 +15,7 @@ from ..config import (
|
|
| 14 |
ToolCrusherConfig,
|
| 15 |
TransformDiff,
|
| 16 |
TransformResult,
|
|
|
|
| 17 |
)
|
| 18 |
from ..tokenizer import Tokenizer
|
| 19 |
from ..utils import deep_copy_messages
|
|
@@ -188,12 +190,14 @@ class TransformPipeline:
|
|
| 188 |
all_transforms: list[str] = []
|
| 189 |
all_markers: list[str] = []
|
| 190 |
all_warnings: list[str] = []
|
|
|
|
| 191 |
|
| 192 |
# Track transform diffs if enabled
|
| 193 |
transform_diffs: list[TransformDiff] = []
|
| 194 |
generate_diff = self.config.generate_diff_artifact
|
| 195 |
|
| 196 |
current_messages = deep_copy_messages(messages)
|
|
|
|
| 197 |
|
| 198 |
for transform in self.transforms:
|
| 199 |
# Check if transform should run
|
|
@@ -203,8 +207,10 @@ class TransformPipeline:
|
|
| 203 |
# Track tokens before this transform (for diff)
|
| 204 |
tokens_before_transform = tokenizer.count_messages(current_messages)
|
| 205 |
|
| 206 |
-
#
|
|
|
|
| 207 |
result = transform.apply(current_messages, tokenizer, **kwargs)
|
|
|
|
| 208 |
|
| 209 |
# Update messages for next transform
|
| 210 |
current_messages = result.messages
|
|
@@ -216,18 +222,24 @@ class TransformPipeline:
|
|
| 216 |
all_transforms.extend(result.transforms_applied)
|
| 217 |
all_markers.extend(result.markers_inserted)
|
| 218 |
all_warnings.extend(result.warnings)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
|
| 220 |
# Log transform results
|
| 221 |
if result.transforms_applied:
|
| 222 |
logger.info(
|
| 223 |
-
"Transform %s: %d -> %d tokens (saved %d)",
|
| 224 |
transform.name,
|
| 225 |
tokens_before_transform,
|
| 226 |
tokens_after_transform,
|
| 227 |
tokens_before_transform - tokens_after_transform,
|
|
|
|
| 228 |
)
|
| 229 |
else:
|
| 230 |
-
logger.debug("Transform %s: no changes", transform.name)
|
| 231 |
|
| 232 |
# Record diff if enabled
|
| 233 |
if generate_diff:
|
|
@@ -240,24 +252,29 @@ class TransformPipeline:
|
|
| 240 |
details=", ".join(result.transforms_applied)
|
| 241 |
if result.transforms_applied
|
| 242 |
else "",
|
|
|
|
| 243 |
)
|
| 244 |
)
|
| 245 |
|
| 246 |
# Final token count
|
| 247 |
tokens_after = tokenizer.count_messages(current_messages)
|
|
|
|
|
|
|
| 248 |
|
| 249 |
# Log pipeline summary
|
| 250 |
total_saved = tokens_before - tokens_after
|
|
|
|
| 251 |
if total_saved > 0:
|
| 252 |
logger.info(
|
| 253 |
-
"Pipeline complete: %d -> %d tokens (saved %d, %.1f%% reduction)",
|
| 254 |
tokens_before,
|
| 255 |
tokens_after,
|
| 256 |
total_saved,
|
| 257 |
(total_saved / tokens_before * 100) if tokens_before > 0 else 0,
|
|
|
|
| 258 |
)
|
| 259 |
else:
|
| 260 |
-
logger.debug("Pipeline complete: no token savings")
|
| 261 |
|
| 262 |
# Build diff artifact if enabled
|
| 263 |
diff_artifact = None
|
|
@@ -270,6 +287,18 @@ class TransformPipeline:
|
|
| 270 |
transforms=transform_diffs,
|
| 271 |
)
|
| 272 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
return TransformResult(
|
| 274 |
messages=current_messages,
|
| 275 |
tokens_before=tokens_before,
|
|
@@ -278,6 +307,8 @@ class TransformPipeline:
|
|
| 278 |
markers_inserted=all_markers,
|
| 279 |
warnings=all_warnings,
|
| 280 |
diff_artifact=diff_artifact,
|
|
|
|
|
|
|
| 281 |
)
|
| 282 |
|
| 283 |
def simulate(
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import logging
|
| 6 |
+
import time
|
| 7 |
from typing import TYPE_CHECKING, Any
|
| 8 |
|
| 9 |
from ..config import (
|
|
|
|
| 15 |
ToolCrusherConfig,
|
| 16 |
TransformDiff,
|
| 17 |
TransformResult,
|
| 18 |
+
WasteSignals,
|
| 19 |
)
|
| 20 |
from ..tokenizer import Tokenizer
|
| 21 |
from ..utils import deep_copy_messages
|
|
|
|
| 190 |
all_transforms: list[str] = []
|
| 191 |
all_markers: list[str] = []
|
| 192 |
all_warnings: list[str] = []
|
| 193 |
+
all_timing: dict[str, float] = {} # transform_name → ms
|
| 194 |
|
| 195 |
# Track transform diffs if enabled
|
| 196 |
transform_diffs: list[TransformDiff] = []
|
| 197 |
generate_diff = self.config.generate_diff_artifact
|
| 198 |
|
| 199 |
current_messages = deep_copy_messages(messages)
|
| 200 |
+
pipeline_start = time.perf_counter()
|
| 201 |
|
| 202 |
for transform in self.transforms:
|
| 203 |
# Check if transform should run
|
|
|
|
| 207 |
# Track tokens before this transform (for diff)
|
| 208 |
tokens_before_transform = tokenizer.count_messages(current_messages)
|
| 209 |
|
| 210 |
+
# Time the transform
|
| 211 |
+
t0 = time.perf_counter()
|
| 212 |
result = transform.apply(current_messages, tokenizer, **kwargs)
|
| 213 |
+
duration_ms = (time.perf_counter() - t0) * 1000
|
| 214 |
|
| 215 |
# Update messages for next transform
|
| 216 |
current_messages = result.messages
|
|
|
|
| 222 |
all_transforms.extend(result.transforms_applied)
|
| 223 |
all_markers.extend(result.markers_inserted)
|
| 224 |
all_warnings.extend(result.warnings)
|
| 225 |
+
all_timing[transform.name] = duration_ms
|
| 226 |
+
|
| 227 |
+
# Merge sub-transform timing (e.g. ContentRouter's per-compressor breakdown)
|
| 228 |
+
if result.timing:
|
| 229 |
+
all_timing.update(result.timing)
|
| 230 |
|
| 231 |
# Log transform results
|
| 232 |
if result.transforms_applied:
|
| 233 |
logger.info(
|
| 234 |
+
"Transform %s: %d -> %d tokens (saved %d) [%.1fms]",
|
| 235 |
transform.name,
|
| 236 |
tokens_before_transform,
|
| 237 |
tokens_after_transform,
|
| 238 |
tokens_before_transform - tokens_after_transform,
|
| 239 |
+
duration_ms,
|
| 240 |
)
|
| 241 |
else:
|
| 242 |
+
logger.debug("Transform %s: no changes [%.1fms]", transform.name, duration_ms)
|
| 243 |
|
| 244 |
# Record diff if enabled
|
| 245 |
if generate_diff:
|
|
|
|
| 252 |
details=", ".join(result.transforms_applied)
|
| 253 |
if result.transforms_applied
|
| 254 |
else "",
|
| 255 |
+
duration_ms=duration_ms,
|
| 256 |
)
|
| 257 |
)
|
| 258 |
|
| 259 |
# Final token count
|
| 260 |
tokens_after = tokenizer.count_messages(current_messages)
|
| 261 |
+
pipeline_ms = (time.perf_counter() - pipeline_start) * 1000
|
| 262 |
+
all_timing["pipeline_total"] = pipeline_ms
|
| 263 |
|
| 264 |
# Log pipeline summary
|
| 265 |
total_saved = tokens_before - tokens_after
|
| 266 |
+
timing_parts = " ".join(f"{k}={v:.0f}ms" for k, v in all_timing.items())
|
| 267 |
if total_saved > 0:
|
| 268 |
logger.info(
|
| 269 |
+
"Pipeline complete: %d -> %d tokens (saved %d, %.1f%% reduction) [%s]",
|
| 270 |
tokens_before,
|
| 271 |
tokens_after,
|
| 272 |
total_saved,
|
| 273 |
(total_saved / tokens_before * 100) if tokens_before > 0 else 0,
|
| 274 |
+
timing_parts,
|
| 275 |
)
|
| 276 |
else:
|
| 277 |
+
logger.debug("Pipeline complete: no token savings [%s]", timing_parts)
|
| 278 |
|
| 279 |
# Build diff artifact if enabled
|
| 280 |
diff_artifact = None
|
|
|
|
| 287 |
transforms=transform_diffs,
|
| 288 |
)
|
| 289 |
|
| 290 |
+
# Detect waste signals in original messages (only when significant compression)
|
| 291 |
+
waste_signals: WasteSignals | None = None
|
| 292 |
+
if tokens_before > tokens_after and (tokens_before - tokens_after) > 100:
|
| 293 |
+
try:
|
| 294 |
+
from ..parser import parse_messages
|
| 295 |
+
|
| 296 |
+
_, _, waste_signals = parse_messages(messages, tokenizer)
|
| 297 |
+
if waste_signals.total() == 0:
|
| 298 |
+
waste_signals = None
|
| 299 |
+
except Exception:
|
| 300 |
+
pass
|
| 301 |
+
|
| 302 |
return TransformResult(
|
| 303 |
messages=current_messages,
|
| 304 |
tokens_before=tokens_before,
|
|
|
|
| 307 |
markers_inserted=all_markers,
|
| 308 |
warnings=all_warnings,
|
| 309 |
diff_artifact=diff_artifact,
|
| 310 |
+
timing=all_timing,
|
| 311 |
+
waste_signals=waste_signals,
|
| 312 |
)
|
| 313 |
|
| 314 |
def simulate(
|
|
@@ -50,6 +50,8 @@ class FileOperation:
|
|
| 50 |
file_path: str
|
| 51 |
operation: str # "read" | "edit" | "write"
|
| 52 |
content_size: int = 0 # Size of tool_result content (for reads only)
|
|
|
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
@dataclass
|
|
@@ -116,13 +118,14 @@ class ReadLifecycleManager:
|
|
| 116 |
|
| 117 |
def _build_tool_metadata(
|
| 118 |
self, messages: list[dict[str, Any]]
|
| 119 |
-
) -> dict[str, tuple[str, str | None]]:
|
| 120 |
"""Build tool_call_id → (tool_name, file_path) mapping.
|
| 121 |
|
| 122 |
Scans assistant messages for tool calls, extracts name and file_path
|
| 123 |
from tool inputs. Handles both OpenAI and Anthropic formats.
|
| 124 |
"""
|
| 125 |
-
|
|
|
|
| 126 |
|
| 127 |
for msg in messages:
|
| 128 |
if msg.get("role") != "assistant":
|
|
@@ -139,12 +142,16 @@ class ReadLifecycleManager:
|
|
| 139 |
continue
|
| 140 |
|
| 141 |
file_path = None
|
|
|
|
|
|
|
| 142 |
try:
|
| 143 |
args = json.loads(func.get("arguments", "{}"))
|
| 144 |
file_path = args.get("file_path") or args.get("path")
|
|
|
|
|
|
|
| 145 |
except (json.JSONDecodeError, TypeError):
|
| 146 |
pass
|
| 147 |
-
metadata[tc_id] = (name, file_path)
|
| 148 |
|
| 149 |
# Anthropic format: content blocks with type=tool_use
|
| 150 |
content = msg.get("content", [])
|
|
@@ -160,16 +167,20 @@ class ReadLifecycleManager:
|
|
| 160 |
|
| 161 |
inp = block.get("input", {})
|
| 162 |
file_path = None
|
|
|
|
|
|
|
| 163 |
if isinstance(inp, dict):
|
| 164 |
file_path = inp.get("file_path") or inp.get("path")
|
| 165 |
-
|
|
|
|
|
|
|
| 166 |
|
| 167 |
return metadata
|
| 168 |
|
| 169 |
def _build_file_operation_index(
|
| 170 |
self,
|
| 171 |
messages: list[dict[str, Any]],
|
| 172 |
-
tool_metadata: dict[str, tuple[str, str | None]],
|
| 173 |
) -> dict[str, list[FileOperation]]:
|
| 174 |
"""Build file_path → [FileOperation] index in a single pass.
|
| 175 |
|
|
@@ -177,7 +188,7 @@ class ReadLifecycleManager:
|
|
| 177 |
"""
|
| 178 |
file_ops: dict[str, list[FileOperation]] = defaultdict(list)
|
| 179 |
|
| 180 |
-
for tc_id, (name, file_path) in tool_metadata.items():
|
| 181 |
if not file_path:
|
| 182 |
continue
|
| 183 |
|
|
@@ -200,6 +211,8 @@ class ReadLifecycleManager:
|
|
| 200 |
tool_name=name,
|
| 201 |
file_path=file_path,
|
| 202 |
operation=operation,
|
|
|
|
|
|
|
| 203 |
)
|
| 204 |
)
|
| 205 |
|
|
@@ -231,6 +244,29 @@ class ReadLifecycleManager:
|
|
| 231 |
|
| 232 |
return None
|
| 233 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
def _classify_reads(self, file_ops: dict[str, list[FileOperation]]) -> list[ReadClassification]:
|
| 235 |
"""Classify each Read as fresh, stale, or superseded."""
|
| 236 |
classifications: list[ReadClassification] = []
|
|
@@ -248,9 +284,13 @@ class ReadLifecycleManager:
|
|
| 248 |
e.msg_index > read_op.msg_index for e in edits
|
| 249 |
)
|
| 250 |
|
| 251 |
-
# Check superseded: any later read
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
is_superseded = self.config.compress_superseded and any(
|
| 253 |
-
r.msg_index > read_op.msg_index for r in reads
|
| 254 |
)
|
| 255 |
|
| 256 |
if is_stale:
|
|
|
|
| 50 |
file_path: str
|
| 51 |
operation: str # "read" | "edit" | "write"
|
| 52 |
content_size: int = 0 # Size of tool_result content (for reads only)
|
| 53 |
+
read_offset: int | None = None # Line offset for partial reads
|
| 54 |
+
read_limit: int | None = None # Line limit for partial reads
|
| 55 |
|
| 56 |
|
| 57 |
@dataclass
|
|
|
|
| 118 |
|
| 119 |
def _build_tool_metadata(
|
| 120 |
self, messages: list[dict[str, Any]]
|
| 121 |
+
) -> dict[str, tuple[str, str | None, int | None, int | None]]:
|
| 122 |
"""Build tool_call_id → (tool_name, file_path) mapping.
|
| 123 |
|
| 124 |
Scans assistant messages for tool calls, extracts name and file_path
|
| 125 |
from tool inputs. Handles both OpenAI and Anthropic formats.
|
| 126 |
"""
|
| 127 |
+
# Maps tool_call_id → (name, file_path, offset, limit)
|
| 128 |
+
metadata: dict[str, tuple[str, str | None, int | None, int | None]] = {}
|
| 129 |
|
| 130 |
for msg in messages:
|
| 131 |
if msg.get("role") != "assistant":
|
|
|
|
| 142 |
continue
|
| 143 |
|
| 144 |
file_path = None
|
| 145 |
+
offset = None
|
| 146 |
+
limit = None
|
| 147 |
try:
|
| 148 |
args = json.loads(func.get("arguments", "{}"))
|
| 149 |
file_path = args.get("file_path") or args.get("path")
|
| 150 |
+
offset = args.get("offset")
|
| 151 |
+
limit = args.get("limit")
|
| 152 |
except (json.JSONDecodeError, TypeError):
|
| 153 |
pass
|
| 154 |
+
metadata[tc_id] = (name, file_path, offset, limit)
|
| 155 |
|
| 156 |
# Anthropic format: content blocks with type=tool_use
|
| 157 |
content = msg.get("content", [])
|
|
|
|
| 167 |
|
| 168 |
inp = block.get("input", {})
|
| 169 |
file_path = None
|
| 170 |
+
offset = None
|
| 171 |
+
limit = None
|
| 172 |
if isinstance(inp, dict):
|
| 173 |
file_path = inp.get("file_path") or inp.get("path")
|
| 174 |
+
offset = inp.get("offset")
|
| 175 |
+
limit = inp.get("limit")
|
| 176 |
+
metadata[tc_id] = (name, file_path, offset, limit)
|
| 177 |
|
| 178 |
return metadata
|
| 179 |
|
| 180 |
def _build_file_operation_index(
|
| 181 |
self,
|
| 182 |
messages: list[dict[str, Any]],
|
| 183 |
+
tool_metadata: dict[str, tuple[str, str | None, int | None, int | None]],
|
| 184 |
) -> dict[str, list[FileOperation]]:
|
| 185 |
"""Build file_path → [FileOperation] index in a single pass.
|
| 186 |
|
|
|
|
| 188 |
"""
|
| 189 |
file_ops: dict[str, list[FileOperation]] = defaultdict(list)
|
| 190 |
|
| 191 |
+
for tc_id, (name, file_path, offset, limit) in tool_metadata.items():
|
| 192 |
if not file_path:
|
| 193 |
continue
|
| 194 |
|
|
|
|
| 211 |
tool_name=name,
|
| 212 |
file_path=file_path,
|
| 213 |
operation=operation,
|
| 214 |
+
read_offset=offset if operation == "read" else None,
|
| 215 |
+
read_limit=limit if operation == "read" else None,
|
| 216 |
)
|
| 217 |
)
|
| 218 |
|
|
|
|
| 244 |
|
| 245 |
return None
|
| 246 |
|
| 247 |
+
@staticmethod
|
| 248 |
+
def _read_covers(later: FileOperation, earlier: FileOperation) -> bool:
|
| 249 |
+
"""Check if `later` read fully covers the line range of `earlier`.
|
| 250 |
+
|
| 251 |
+
A full-file read (no offset/limit) covers everything.
|
| 252 |
+
A partial read only covers another partial if its range is a superset.
|
| 253 |
+
"""
|
| 254 |
+
# Full-file read supersedes anything
|
| 255 |
+
if later.read_offset is None and later.read_limit is None:
|
| 256 |
+
return True
|
| 257 |
+
|
| 258 |
+
# If the earlier was a full-file read, a partial can't cover it
|
| 259 |
+
if earlier.read_offset is None and earlier.read_limit is None:
|
| 260 |
+
return False
|
| 261 |
+
|
| 262 |
+
# Both are partial reads — check range containment
|
| 263 |
+
later_start = later.read_offset or 0
|
| 264 |
+
later_end = later_start + (later.read_limit or 2000)
|
| 265 |
+
earlier_start = earlier.read_offset or 0
|
| 266 |
+
earlier_end = earlier_start + (earlier.read_limit or 2000)
|
| 267 |
+
|
| 268 |
+
return later_start <= earlier_start and later_end >= earlier_end
|
| 269 |
+
|
| 270 |
def _classify_reads(self, file_ops: dict[str, list[FileOperation]]) -> list[ReadClassification]:
|
| 271 |
"""Classify each Read as fresh, stale, or superseded."""
|
| 272 |
classifications: list[ReadClassification] = []
|
|
|
|
| 284 |
e.msg_index > read_op.msg_index for e in edits
|
| 285 |
)
|
| 286 |
|
| 287 |
+
# Check superseded: any later read that FULLY COVERS this read's range?
|
| 288 |
+
# A partial read (offset=100, limit=50) is NOT superseded by a
|
| 289 |
+
# different partial read (offset=200, limit=50) — they cover
|
| 290 |
+
# different lines. Only supersede when the later read contains
|
| 291 |
+
# all the lines of this read.
|
| 292 |
is_superseded = self.config.compress_superseded and any(
|
| 293 |
+
r.msg_index > read_op.msg_index and self._read_covers(r, read_op) for r in reads
|
| 294 |
)
|
| 295 |
|
| 296 |
if is_stale:
|
|
@@ -91,7 +91,7 @@ class TestCacheAlignerConfig:
|
|
| 91 |
def test_default_values(self):
|
| 92 |
"""Default values are correctly set."""
|
| 93 |
config = CacheAlignerConfig()
|
| 94 |
-
assert config.enabled is
|
| 95 |
assert config.normalize_whitespace is True
|
| 96 |
assert config.collapse_blank_lines is True
|
| 97 |
|
|
@@ -389,6 +389,8 @@ class TestTransformResult:
|
|
| 389 |
"warnings",
|
| 390 |
"diff_artifact",
|
| 391 |
"cache_metrics",
|
|
|
|
|
|
|
| 392 |
}
|
| 393 |
assert field_names == expected_fields
|
| 394 |
|
|
|
|
| 91 |
def test_default_values(self):
|
| 92 |
"""Default values are correctly set."""
|
| 93 |
config = CacheAlignerConfig()
|
| 94 |
+
assert config.enabled is False
|
| 95 |
assert config.normalize_whitespace is True
|
| 96 |
assert config.collapse_blank_lines is True
|
| 97 |
|
|
|
|
| 389 |
"warnings",
|
| 390 |
"diff_artifact",
|
| 391 |
"cache_metrics",
|
| 392 |
+
"timing",
|
| 393 |
+
"waste_signals",
|
| 394 |
}
|
| 395 |
assert field_names == expected_fields
|
| 396 |
|
|
@@ -559,9 +559,16 @@ class TestCostTrackingAccuracy:
|
|
| 559 |
patch("headroom.proxy.server.litellm") as mock_litellm,
|
| 560 |
):
|
| 561 |
# Setup: $10/M input, $30/M output
|
| 562 |
-
def mock_cost(model, prompt_tokens, completion_tokens):
|
| 563 |
input_cost = prompt_tokens * 0.00001
|
| 564 |
output_cost = completion_tokens * 0.00003
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
return (input_cost, output_cost)
|
| 566 |
|
| 567 |
mock_litellm.cost_per_token.side_effect = mock_cost
|
|
@@ -598,7 +605,7 @@ class TestCostTrackingAccuracy:
|
|
| 598 |
patch("headroom.proxy.server.litellm") as mock_litellm,
|
| 599 |
):
|
| 600 |
mock_litellm.cost_per_token.side_effect = (
|
| 601 |
-
lambda model, prompt_tokens, completion_tokens: (
|
| 602 |
prompt_tokens * 0.00001,
|
| 603 |
completion_tokens * 0.00003,
|
| 604 |
)
|
|
|
|
| 559 |
patch("headroom.proxy.server.litellm") as mock_litellm,
|
| 560 |
):
|
| 561 |
# Setup: $10/M input, $30/M output
|
| 562 |
+
def mock_cost(model, prompt_tokens, completion_tokens, **kwargs):
    """Stand-in for ``litellm.cost_per_token`` used by the cost-tracking test.

    Prices prompt tokens at 0.00001 each and completion tokens at 0.00003
    each. When cache token counts are passed as keyword arguments, their
    cost is added to the input cost using the per-token rates returned by
    ``mock_litellm.get_model_info()``.

    Returns:
        A ``(input_cost, output_cost)`` tuple.
    """
    prompt_cost = prompt_tokens * 0.00001
    completion_cost = completion_tokens * 0.00003

    # Cache token counts are optional; absent or zero means no surcharge
    # and the model-info lookup is skipped entirely.
    read_tokens = kwargs.get("cache_read_input_tokens", 0)
    write_tokens = kwargs.get("cache_creation_input_tokens", 0)
    if not (read_tokens or write_tokens):
        return (prompt_cost, completion_cost)

    rates = mock_litellm.get_model_info()
    prompt_cost += read_tokens * rates.get("cache_read_input_token_cost", 0)
    prompt_cost += write_tokens * rates.get("cache_creation_input_token_cost", 0)
    return (prompt_cost, completion_cost)
|
| 573 |
|
| 574 |
mock_litellm.cost_per_token.side_effect = mock_cost
|
|
|
|
| 605 |
patch("headroom.proxy.server.litellm") as mock_litellm,
|
| 606 |
):
|
| 607 |
mock_litellm.cost_per_token.side_effect = (
|
| 608 |
+
lambda model, prompt_tokens, completion_tokens, **kwargs: (
|
| 609 |
prompt_tokens * 0.00001,
|
| 610 |
completion_tokens * 0.00003,
|
| 611 |
)
|
|
@@ -29,8 +29,8 @@ def tokenizer():
|
|
| 29 |
|
| 30 |
@pytest.fixture
|
| 31 |
def default_config():
|
| 32 |
-
"""Default CacheAlignerConfig."""
|
| 33 |
-
return CacheAlignerConfig()
|
| 34 |
|
| 35 |
|
| 36 |
@pytest.fixture
|
|
@@ -194,7 +194,7 @@ class TestDateExtraction:
|
|
| 194 |
{"role": "user", "content": "Hello"},
|
| 195 |
]
|
| 196 |
|
| 197 |
-
config = CacheAlignerConfig(date_patterns=custom_patterns)
|
| 198 |
aligner = CacheAligner(config)
|
| 199 |
|
| 200 |
assert aligner.should_apply(messages, tokenizer)
|
|
@@ -705,7 +705,7 @@ Please be helpful, harmless, and honest."""
|
|
| 705 |
{"role": "user", "content": "What can you help me with today?"},
|
| 706 |
]
|
| 707 |
|
| 708 |
-
aligner = CacheAligner()
|
| 709 |
|
| 710 |
# Check should_apply
|
| 711 |
assert aligner.should_apply(messages, tokenizer)
|
|
|
|
| 29 |
|
| 30 |
@pytest.fixture
|
| 31 |
def default_config():
|
| 32 |
+
"""Default CacheAlignerConfig with enabled=True for testing."""
|
| 33 |
+
return CacheAlignerConfig(enabled=True)
|
| 34 |
|
| 35 |
|
| 36 |
@pytest.fixture
|
|
|
|
| 194 |
{"role": "user", "content": "Hello"},
|
| 195 |
]
|
| 196 |
|
| 197 |
+
config = CacheAlignerConfig(enabled=True, date_patterns=custom_patterns)
|
| 198 |
aligner = CacheAligner(config)
|
| 199 |
|
| 200 |
assert aligner.should_apply(messages, tokenizer)
|
|
|
|
| 705 |
{"role": "user", "content": "What can you help me with today?"},
|
| 706 |
]
|
| 707 |
|
| 708 |
+
aligner = CacheAligner(CacheAlignerConfig(enabled=True))
|
| 709 |
|
| 710 |
# Check should_apply
|
| 711 |
assert aligner.should_apply(messages, tokenizer)
|
|
@@ -276,7 +276,7 @@ class TestSupersededDetection:
|
|
| 276 |
|
| 277 |
def test_reread_makes_superseded(self):
|
| 278 |
"""Read(A) → Read(A): first Read becomes superseded."""
|
| 279 |
-
config = ReadLifecycleConfig(enabled=True)
|
| 280 |
mgr = ReadLifecycleManager(config)
|
| 281 |
|
| 282 |
messages = [
|
|
|
|
| 276 |
|
| 277 |
def test_reread_makes_superseded(self):
|
| 278 |
"""Read(A) → Read(A): first Read becomes superseded."""
|
| 279 |
+
config = ReadLifecycleConfig(enabled=True, compress_superseded=True)
|
| 280 |
mgr = ReadLifecycleManager(config)
|
| 281 |
|
| 282 |
messages = [
|