tudragon154203 committed on
Commit
40f707d
·
1 Parent(s): 0144a3e

Revert "fix: offload all sync compression paths to threads and bump transformers for ModernBERT"

Browse files

This reverts commit d201c03a9349fd6c42d7ddcdb04915c28281d251.

headroom/proxy/handlers/anthropic.py CHANGED
@@ -2070,14 +2070,12 @@ class AnthropicHandlerMixin:
2070
  original_tokens = get_tokenizer(model).count_messages(messages)
2071
  optimized_tokens = original_tokens
2072
  else:
2073
- result = await asyncio.to_thread(
2074
- lambda: self.anthropic_pipeline.apply(
2075
- messages=messages,
2076
- model=model,
2077
- model_limit=context_limit,
2078
- context=extract_user_query(messages),
2079
- frozen_message_count=frozen_message_count,
2080
- )
2081
  )
2082
 
2083
  optimized_messages = result.messages
 
2070
  original_tokens = get_tokenizer(model).count_messages(messages)
2071
  optimized_tokens = original_tokens
2072
  else:
2073
+ result = self.anthropic_pipeline.apply(
2074
+ messages=messages,
2075
+ model=model,
2076
+ model_limit=context_limit,
2077
+ context=extract_user_query(messages),
2078
+ frozen_message_count=frozen_message_count,
 
 
2079
  )
2080
 
2081
  optimized_messages = result.messages
headroom/proxy/handlers/batch.py CHANGED
@@ -145,13 +145,11 @@ class BatchHandlerMixin:
145
  )
146
 
147
  # Use OpenAI pipeline (similar message format after conversion)
148
- result = await asyncio.to_thread(
149
- lambda: self.openai_pipeline.apply(
150
- messages=messages,
151
- model=model,
152
- model_limit=context_limit,
153
- context=extract_user_query(messages),
154
- )
155
  )
156
 
157
  optimized_messages = result.messages
@@ -906,13 +904,11 @@ class BatchHandlerMixin:
906
  if self.config.optimize:
907
  try:
908
  context_limit = self.openai_provider.get_context_limit(model)
909
- result = await asyncio.to_thread(
910
- lambda: self.openai_pipeline.apply(
911
- messages=messages,
912
- model=model,
913
- model_limit=context_limit,
914
- context=extract_user_query(messages),
915
- )
916
  )
917
  compressed_messages = result.messages
918
  # Use pipeline's token counts for consistency with pipeline logs
 
145
  )
146
 
147
  # Use OpenAI pipeline (similar message format after conversion)
148
+ result = self.openai_pipeline.apply(
149
+ messages=messages,
150
+ model=model,
151
+ model_limit=context_limit,
152
+ context=extract_user_query(messages),
 
 
153
  )
154
 
155
  optimized_messages = result.messages
 
904
  if self.config.optimize:
905
  try:
906
  context_limit = self.openai_provider.get_context_limit(model)
907
+ result = self.openai_pipeline.apply(
908
+ messages=messages,
909
+ model=model,
910
+ model_limit=context_limit,
911
+ context=extract_user_query(messages),
 
 
912
  )
913
  compressed_messages = result.messages
914
  # Use pipeline's token counts for consistency with pipeline logs
headroom/proxy/handlers/gemini.py CHANGED
@@ -277,13 +277,11 @@ class GeminiHandlerMixin:
277
  try:
278
  # Use OpenAI pipeline (similar message format)
279
  context_limit = self.openai_provider.get_context_limit(model)
280
- result = await asyncio.to_thread(
281
- lambda: self.openai_pipeline.apply(
282
- messages=messages,
283
- model=model,
284
- model_limit=context_limit,
285
- context=extract_user_query(messages),
286
- )
287
  )
288
  if result.messages != messages:
289
  optimized_messages = result.messages
@@ -539,13 +537,11 @@ class GeminiHandlerMixin:
539
  if self.config.optimize and messages and _license_ok:
540
  try:
541
  context_limit = self.openai_provider.get_context_limit(model)
542
- result = await asyncio.to_thread(
543
- lambda: self.openai_pipeline.apply(
544
- messages=messages,
545
- model=model,
546
- model_limit=context_limit,
547
- context=extract_user_query(messages),
548
- )
549
  )
550
  if result.messages != messages:
551
  optimized_messages = result.messages
@@ -748,13 +744,11 @@ class GeminiHandlerMixin:
748
  if self.config.optimize and messages:
749
  try:
750
  context_limit = self.openai_provider.get_context_limit(model)
751
- result = await asyncio.to_thread(
752
- lambda: self.openai_pipeline.apply(
753
- messages=messages,
754
- model=model,
755
- model_limit=context_limit,
756
- context=extract_user_query(messages),
757
- )
758
  )
759
  if result.messages != messages:
760
  optimized_messages = result.messages
 
277
  try:
278
  # Use OpenAI pipeline (similar message format)
279
  context_limit = self.openai_provider.get_context_limit(model)
280
+ result = self.openai_pipeline.apply(
281
+ messages=messages,
282
+ model=model,
283
+ model_limit=context_limit,
284
+ context=extract_user_query(messages),
 
 
285
  )
286
  if result.messages != messages:
287
  optimized_messages = result.messages
 
537
  if self.config.optimize and messages and _license_ok:
538
  try:
539
  context_limit = self.openai_provider.get_context_limit(model)
540
+ result = self.openai_pipeline.apply(
541
+ messages=messages,
542
+ model=model,
543
+ model_limit=context_limit,
544
+ context=extract_user_query(messages),
 
 
545
  )
546
  if result.messages != messages:
547
  optimized_messages = result.messages
 
744
  if self.config.optimize and messages:
745
  try:
746
  context_limit = self.openai_provider.get_context_limit(model)
747
+ result = self.openai_pipeline.apply(
748
+ messages=messages,
749
+ model=model,
750
+ model_limit=context_limit,
751
+ context=extract_user_query(messages),
 
 
752
  )
753
  if result.messages != messages:
754
  optimized_messages = result.messages
pyproject.toml CHANGED
@@ -68,7 +68,7 @@ proxy = [
68
  "zstandard>=0.20.0", # Decompress zstd request bodies (Codex, etc.)
69
  "websockets>=13.0", # WebSocket proxy for /v1/responses (Codex gpt-5.4+)
70
  "onnxruntime>=1.16.0", # Kompress ONNX INT8 text compression (no torch needed)
71
- "transformers>=4.48.0", # ModernBERT support (for Kompress)
72
  "watchdog>=4.0.0", # File watcher for live code graph reindexing (--code-graph)
73
  "sqlite-vec>=0.1.6", # Vector index for memory (--memory). Lightweight, no torch.
74
  "redis>=5.0.0", # Shared stats aggregation for multi-worker proxy (TCP)
@@ -88,13 +88,13 @@ code = [
88
  # ML-based compression with Kompress (ModernBERT)
89
  ml = [
90
  "torch>=2.0.0",
91
- "transformers>=4.48.0",
92
  ]
93
  # Legacy ML compression (LLMLingua-2 — use [ml] instead for Kompress)
94
  llmlingua = [
95
  "llmlingua>=0.2.0",
96
  "torch>=2.0.0",
97
- "transformers>=4.48.0",
98
  ]
99
  # Memory system (hierarchical memory with vector search)
100
  memory = [
@@ -159,7 +159,7 @@ mcp = [
159
  # Voice filler detection
160
  voice = [
161
  "onnxruntime>=1.16.0",
162
- "transformers>=4.48.0",
163
  "torch>=2.0.0",
164
  ]
165
  # Voice training (includes voice deps + training extras)
 
68
  "zstandard>=0.20.0", # Decompress zstd request bodies (Codex, etc.)
69
  "websockets>=13.0", # WebSocket proxy for /v1/responses (Codex gpt-5.4+)
70
  "onnxruntime>=1.16.0", # Kompress ONNX INT8 text compression (no torch needed)
71
+ "transformers>=4.30.0", # Tokenizer only (for Kompress)
72
  "watchdog>=4.0.0", # File watcher for live code graph reindexing (--code-graph)
73
  "sqlite-vec>=0.1.6", # Vector index for memory (--memory). Lightweight, no torch.
74
  "redis>=5.0.0", # Shared stats aggregation for multi-worker proxy (TCP)
 
88
  # ML-based compression with Kompress (ModernBERT)
89
  ml = [
90
  "torch>=2.0.0",
91
+ "transformers>=4.30.0",
92
  ]
93
  # Legacy ML compression (LLMLingua-2 — use [ml] instead for Kompress)
94
  llmlingua = [
95
  "llmlingua>=0.2.0",
96
  "torch>=2.0.0",
97
+ "transformers>=4.30.0",
98
  ]
99
  # Memory system (hierarchical memory with vector search)
100
  memory = [
 
159
  # Voice filler detection
160
  voice = [
161
  "onnxruntime>=1.16.0",
162
+ "transformers>=4.30.0",
163
  "torch>=2.0.0",
164
  ]
165
  # Voice training (includes voice deps + training extras)