Spaces:
Running
Running
tudragon154203 committed on
Commit ·
40f707d
1
Parent(s): 0144a3e
Revert "fix: offload all sync compression paths to threads and bump transformers for ModernBERT"
Browse files
This reverts commit d201c03a9349fd6c42d7ddcdb04915c28281d251.
- headroom/proxy/handlers/anthropic.py +6 -8
- headroom/proxy/handlers/batch.py +10 -14
- headroom/proxy/handlers/gemini.py +15 -21
- pyproject.toml +4 -4
headroom/proxy/handlers/anthropic.py
CHANGED
|
@@ -2070,14 +2070,12 @@ class AnthropicHandlerMixin:
|
|
| 2070 |
original_tokens = get_tokenizer(model).count_messages(messages)
|
| 2071 |
optimized_tokens = original_tokens
|
| 2072 |
else:
|
| 2073 |
-
result =
|
| 2074 |
-
|
| 2075 |
-
|
| 2076 |
-
|
| 2077 |
-
|
| 2078 |
-
|
| 2079 |
-
frozen_message_count=frozen_message_count,
|
| 2080 |
-
)
|
| 2081 |
)
|
| 2082 |
|
| 2083 |
optimized_messages = result.messages
|
|
|
|
| 2070 |
original_tokens = get_tokenizer(model).count_messages(messages)
|
| 2071 |
optimized_tokens = original_tokens
|
| 2072 |
else:
|
| 2073 |
+
result = self.anthropic_pipeline.apply(
|
| 2074 |
+
messages=messages,
|
| 2075 |
+
model=model,
|
| 2076 |
+
model_limit=context_limit,
|
| 2077 |
+
context=extract_user_query(messages),
|
| 2078 |
+
frozen_message_count=frozen_message_count,
|
|
|
|
|
|
|
| 2079 |
)
|
| 2080 |
|
| 2081 |
optimized_messages = result.messages
|
headroom/proxy/handlers/batch.py
CHANGED
|
@@ -145,13 +145,11 @@ class BatchHandlerMixin:
|
|
| 145 |
)
|
| 146 |
|
| 147 |
# Use OpenAI pipeline (similar message format after conversion)
|
| 148 |
-
result =
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
context=extract_user_query(messages),
|
| 154 |
-
)
|
| 155 |
)
|
| 156 |
|
| 157 |
optimized_messages = result.messages
|
|
@@ -906,13 +904,11 @@ class BatchHandlerMixin:
|
|
| 906 |
if self.config.optimize:
|
| 907 |
try:
|
| 908 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 909 |
-
result =
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
-
context=extract_user_query(messages),
|
| 915 |
-
)
|
| 916 |
)
|
| 917 |
compressed_messages = result.messages
|
| 918 |
# Use pipeline's token counts for consistency with pipeline logs
|
|
|
|
| 145 |
)
|
| 146 |
|
| 147 |
# Use OpenAI pipeline (similar message format after conversion)
|
| 148 |
+
result = self.openai_pipeline.apply(
|
| 149 |
+
messages=messages,
|
| 150 |
+
model=model,
|
| 151 |
+
model_limit=context_limit,
|
| 152 |
+
context=extract_user_query(messages),
|
|
|
|
|
|
|
| 153 |
)
|
| 154 |
|
| 155 |
optimized_messages = result.messages
|
|
|
|
| 904 |
if self.config.optimize:
|
| 905 |
try:
|
| 906 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 907 |
+
result = self.openai_pipeline.apply(
|
| 908 |
+
messages=messages,
|
| 909 |
+
model=model,
|
| 910 |
+
model_limit=context_limit,
|
| 911 |
+
context=extract_user_query(messages),
|
|
|
|
|
|
|
| 912 |
)
|
| 913 |
compressed_messages = result.messages
|
| 914 |
# Use pipeline's token counts for consistency with pipeline logs
|
headroom/proxy/handlers/gemini.py
CHANGED
|
@@ -277,13 +277,11 @@ class GeminiHandlerMixin:
|
|
| 277 |
try:
|
| 278 |
# Use OpenAI pipeline (similar message format)
|
| 279 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 280 |
-
result =
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
context=extract_user_query(messages),
|
| 286 |
-
)
|
| 287 |
)
|
| 288 |
if result.messages != messages:
|
| 289 |
optimized_messages = result.messages
|
|
@@ -539,13 +537,11 @@ class GeminiHandlerMixin:
|
|
| 539 |
if self.config.optimize and messages and _license_ok:
|
| 540 |
try:
|
| 541 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 542 |
-
result =
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
context=extract_user_query(messages),
|
| 548 |
-
)
|
| 549 |
)
|
| 550 |
if result.messages != messages:
|
| 551 |
optimized_messages = result.messages
|
|
@@ -748,13 +744,11 @@ class GeminiHandlerMixin:
|
|
| 748 |
if self.config.optimize and messages:
|
| 749 |
try:
|
| 750 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 751 |
-
result =
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
context=extract_user_query(messages),
|
| 757 |
-
)
|
| 758 |
)
|
| 759 |
if result.messages != messages:
|
| 760 |
optimized_messages = result.messages
|
|
|
|
| 277 |
try:
|
| 278 |
# Use OpenAI pipeline (similar message format)
|
| 279 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 280 |
+
result = self.openai_pipeline.apply(
|
| 281 |
+
messages=messages,
|
| 282 |
+
model=model,
|
| 283 |
+
model_limit=context_limit,
|
| 284 |
+
context=extract_user_query(messages),
|
|
|
|
|
|
|
| 285 |
)
|
| 286 |
if result.messages != messages:
|
| 287 |
optimized_messages = result.messages
|
|
|
|
| 537 |
if self.config.optimize and messages and _license_ok:
|
| 538 |
try:
|
| 539 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 540 |
+
result = self.openai_pipeline.apply(
|
| 541 |
+
messages=messages,
|
| 542 |
+
model=model,
|
| 543 |
+
model_limit=context_limit,
|
| 544 |
+
context=extract_user_query(messages),
|
|
|
|
|
|
|
| 545 |
)
|
| 546 |
if result.messages != messages:
|
| 547 |
optimized_messages = result.messages
|
|
|
|
| 744 |
if self.config.optimize and messages:
|
| 745 |
try:
|
| 746 |
context_limit = self.openai_provider.get_context_limit(model)
|
| 747 |
+
result = self.openai_pipeline.apply(
|
| 748 |
+
messages=messages,
|
| 749 |
+
model=model,
|
| 750 |
+
model_limit=context_limit,
|
| 751 |
+
context=extract_user_query(messages),
|
|
|
|
|
|
|
| 752 |
)
|
| 753 |
if result.messages != messages:
|
| 754 |
optimized_messages = result.messages
|
pyproject.toml
CHANGED
|
@@ -68,7 +68,7 @@ proxy = [
|
|
| 68 |
"zstandard>=0.20.0", # Decompress zstd request bodies (Codex, etc.)
|
| 69 |
"websockets>=13.0", # WebSocket proxy for /v1/responses (Codex gpt-5.4+)
|
| 70 |
"onnxruntime>=1.16.0", # Kompress ONNX INT8 text compression (no torch needed)
|
| 71 |
-
"transformers>=4.
|
| 72 |
"watchdog>=4.0.0", # File watcher for live code graph reindexing (--code-graph)
|
| 73 |
"sqlite-vec>=0.1.6", # Vector index for memory (--memory). Lightweight, no torch.
|
| 74 |
"redis>=5.0.0", # Shared stats aggregation for multi-worker proxy (TCP)
|
|
@@ -88,13 +88,13 @@ code = [
|
|
| 88 |
# ML-based compression with Kompress (ModernBERT)
|
| 89 |
ml = [
|
| 90 |
"torch>=2.0.0",
|
| 91 |
-
"transformers>=4.
|
| 92 |
]
|
| 93 |
# Legacy ML compression (LLMLingua-2 — use [ml] instead for Kompress)
|
| 94 |
llmlingua = [
|
| 95 |
"llmlingua>=0.2.0",
|
| 96 |
"torch>=2.0.0",
|
| 97 |
-
"transformers>=4.
|
| 98 |
]
|
| 99 |
# Memory system (hierarchical memory with vector search)
|
| 100 |
memory = [
|
|
@@ -159,7 +159,7 @@ mcp = [
|
|
| 159 |
# Voice filler detection
|
| 160 |
voice = [
|
| 161 |
"onnxruntime>=1.16.0",
|
| 162 |
-
"transformers>=4.
|
| 163 |
"torch>=2.0.0",
|
| 164 |
]
|
| 165 |
# Voice training (includes voice deps + training extras)
|
|
|
|
| 68 |
"zstandard>=0.20.0", # Decompress zstd request bodies (Codex, etc.)
|
| 69 |
"websockets>=13.0", # WebSocket proxy for /v1/responses (Codex gpt-5.4+)
|
| 70 |
"onnxruntime>=1.16.0", # Kompress ONNX INT8 text compression (no torch needed)
|
| 71 |
+
"transformers>=4.30.0", # Tokenizer only (for Kompress)
|
| 72 |
"watchdog>=4.0.0", # File watcher for live code graph reindexing (--code-graph)
|
| 73 |
"sqlite-vec>=0.1.6", # Vector index for memory (--memory). Lightweight, no torch.
|
| 74 |
"redis>=5.0.0", # Shared stats aggregation for multi-worker proxy (TCP)
|
|
|
|
| 88 |
# ML-based compression with Kompress (ModernBERT)
|
| 89 |
ml = [
|
| 90 |
"torch>=2.0.0",
|
| 91 |
+
"transformers>=4.30.0",
|
| 92 |
]
|
| 93 |
# Legacy ML compression (LLMLingua-2 — use [ml] instead for Kompress)
|
| 94 |
llmlingua = [
|
| 95 |
"llmlingua>=0.2.0",
|
| 96 |
"torch>=2.0.0",
|
| 97 |
+
"transformers>=4.30.0",
|
| 98 |
]
|
| 99 |
# Memory system (hierarchical memory with vector search)
|
| 100 |
memory = [
|
|
|
|
| 159 |
# Voice filler detection
|
| 160 |
voice = [
|
| 161 |
"onnxruntime>=1.16.0",
|
| 162 |
+
"transformers>=4.30.0",
|
| 163 |
"torch>=2.0.0",
|
| 164 |
]
|
| 165 |
# Voice training (includes voice deps + training extras)
|