headroom / examples /real_world_eval.py
chopratejas's picture
Fix all ruff lint and format errors for CI
e4a41fa
Raw
History Blame
27.8 kB
#!/usr/bin/env python3
"""
Real-world evaluation of Headroom SDK with Anthropic.
This script simulates a complex agentic scenario with:
- Multiple tool calls (search, fetch, analyze)
- Large tool outputs (realistic JSON payloads)
- Multi-turn conversation
We compare:
1. Baseline (no optimization) - audit mode
2. Optimized (Headroom transforms) - optimize mode
And evaluate:
- Token usage (before/after)
- Response quality (semantic similarity)
- Cost savings
"""
import json
import os
import tempfile
import time
from dataclasses import dataclass

from anthropic import Anthropic
from dotenv import load_dotenv

from headroom import AnthropicProvider, HeadroomClient, HeadroomConfig, ToolCrusherConfig
from headroom.transforms import TransformPipeline
# Read environment variables from the local env file before any client is
# built (presumably this is where ANTHROPIC_API_KEY comes from -- confirm).
load_dotenv(".env.local")

# Initialize clients
base_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
provider = AnthropicProvider()

# AGGRESSIVE optimization config
aggressive_tool_crusher = ToolCrusherConfig(
    enabled=True,
    min_tokens_to_crush=100,  # Crush smaller outputs
    max_array_items=3,  # Only keep first 3 items (was 10)
    max_string_length=200,  # Truncate strings > 200 chars (was 1000)
    max_depth=3,  # Limit nesting to 3 levels (was 5)
)

db_path = os.path.join(tempfile.gettempdir(), "headroom_eval.db")

# Default (conservative) client
headroom_client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path}",
    default_mode="audit",
)

# Aggressive optimization client: same underlying Anthropic client and
# provider, but its own SQLite store and a config carrying the tool-crusher
# limits defined above.
aggressive_config = HeadroomConfig()
aggressive_config.tool_crusher = aggressive_tool_crusher
db_path_aggressive = os.path.join(tempfile.gettempdir(), "headroom_eval_aggressive.db")
aggressive_client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path_aggressive}",
    default_mode="audit",
)

# Manually set aggressive config on pipeline.
# NOTE(review): this patches private attributes after construction; confirm
# whether HeadroomClient accepts a config argument so this can be dropped.
aggressive_client._config = aggressive_config
aggressive_client._pipeline = TransformPipeline(aggressive_config, provider=provider)
# =============================================================================
# REALISTIC AGENTIC SCENARIO: Research Assistant
# =============================================================================
def generate_search_results(query: str, count: int = 25) -> str:
    """Return a JSON string of synthetic academic search hits for ``query``.

    Args:
        query: Search phrase; it is echoed into each hit's title, URL slug,
            snippet and keyword list, and into the top-level ``query`` field.
        count: Number of hits to fabricate (also reported as ``total_count``).

    Returns:
        ``json.dumps`` of ``{"results": [...], "total_count": count,
        "query": query}``. Every numeric field of a hit is derived from the
        hit's zero-based index, so the output is deterministic.
    """
    heading = query.title()
    slug = query.replace(" ", "-")

    def _paper(idx: int) -> dict:
        # One fabricated paper record; all values scale with the index.
        participants = 1000 + idx * 100
        return {
            "id": f"doc_{idx:04d}",
            "title": f"Research Paper: {heading} - Study {idx + 1}",
            "url": f"https://research.example.com/papers/{slug}/{idx}",
            "snippet": (
                f"This comprehensive study examines {query} through multiple methodologies. "
                f"Key findings include significant correlations between variables A and B, "
                f"with p-values < 0.05. The sample size of {participants} participants "
                f"provides robust statistical power. Methods included: surveys, interviews, "
                f"longitudinal tracking, and meta-analysis of {50 + idx * 10} prior studies."
            ),
            "citations": 150 + idx * 23,
            "year": 2020 + (idx % 5),
            "authors": [
                {"name": f"Dr. Smith{idx}", "affiliation": "MIT"},
                {"name": f"Prof. Jones{idx}", "affiliation": "Stanford"},
                {"name": f"Dr. Williams{idx}", "affiliation": "Harvard"},
            ],
            "keywords": ["machine learning", "data science", query, "research", "analysis"],
            "abstract": f"Abstract for paper {idx}: " + "Lorem ipsum dolor sit amet. " * 20,
            "methodology": {
                "type": "mixed-methods",
                "sample_size": participants,
                "duration_months": 12 + idx,
                "instruments": ["survey", "interview", "observation"],
            },
        }

    payload = {
        "results": [_paper(idx) for idx in range(count)],
        "total_count": count,
        "query": query,
    }
    return json.dumps(payload)
def generate_document_content(doc_id: str) -> str:
    """Return a JSON string describing one synthetic full-text paper.

    Args:
        doc_id: Identifier echoed back in the ``id`` field; it does not
            influence any other part of the payload.

    Returns:
        ``json.dumps`` of a dict with ``id``, ``full_text`` (the same paper
        text repeated three times), fixed ``metadata`` counts and a fixed
        list of ``sections``.
    """
    paper_lines = (
        "Introduction:",
        "This research investigates the complex interplay between artificial intelligence",
        "and human decision-making processes. Our longitudinal study spanning 36 months",
        "collected data from 5,000 participants across 12 countries.",
        "Methodology:",
        "We employed a mixed-methods approach combining quantitative surveys (n=4,500)",
        "with qualitative interviews (n=500). Statistical analysis included regression",
        "modeling, factor analysis, and structural equation modeling.",
        "Results:",
        "Key findings indicate that AI-assisted decision making improved accuracy by 34%",
        "while reducing cognitive load by 28%. However, over-reliance on AI recommendations",
        "correlated with decreased critical thinking skills (r=-0.42, p<0.001).",
        "Discussion:",
        "These findings have significant implications for the design of AI systems in",
        "high-stakes environments. We recommend a balanced approach that leverages AI",
        "capabilities while maintaining human oversight and skill development.",
        "Conclusion:",
        "The integration of AI in decision-making processes offers substantial benefits",
        "but requires careful implementation to avoid potential negative outcomes.",
    )
    # A single copy starts and ends with a newline, like a triple-quoted block.
    single_copy = "\n" + "\n".join(paper_lines) + "\n"

    # (title, first page, word count) for each section entry.
    section_specs = (
        ("Introduction", 1, 2000),
        ("Literature Review", 5, 4000),
        ("Methodology", 15, 3000),
        ("Results", 22, 3500),
        ("Discussion", 32, 2000),
        ("Conclusion", 40, 500),
    )

    document = {
        "id": doc_id,
        "full_text": single_copy * 3,  # Make it longer
        "metadata": {
            "word_count": 15000,
            "pages": 45,
            "figures": 12,
            "tables": 8,
            "references": 150,
        },
        "sections": [
            {"title": title, "page": page, "word_count": words}
            for title, page, words in section_specs
        ],
    }
    return json.dumps(document)
def generate_analytics_data() -> str:
    """Return a JSON string of fixed, synthetic bibliometric analytics.

    The payload has five top-level sections: ``summary_statistics``,
    ``trend_analysis`` (one row per year 2020-2024), ``citation_distribution``,
    ``top_authors`` (three authors repeated five times) and
    ``collaboration_network``. Nothing is computed; every value is a constant.
    """
    # (year, papers, avg_citations, top_keywords)
    yearly_rows = (
        (2020, 80, 52.3, ["covid", "remote", "digital"]),
        (2021, 95, 48.1, ["hybrid", "adaptation", "resilience"]),
        (2022, 110, 44.2, ["AI", "automation", "efficiency"]),
        (2023, 120, 38.5, ["LLM", "generative", "ethics"]),
        (2024, 95, 25.1, ["agents", "multimodal", "safety"]),
    )
    # (name, papers, total_citations, h_index)
    author_rows = (
        ("Dr. Smith", 25, 1250, 18),
        ("Prof. Jones", 22, 980, 15),
        ("Dr. Williams", 20, 890, 14),
    )

    trend_analysis = [
        {
            "year": year,
            "papers": papers,
            "avg_citations": avg_citations,
            "top_keywords": keywords,
        }
        for year, papers, avg_citations, keywords in yearly_rows
    ]
    base_authors = [
        {"name": name, "papers": papers, "total_citations": cited, "h_index": h_index}
        for name, papers, cited, h_index in author_rows
    ]

    analytics = {
        "summary_statistics": {
            "total_papers_analyzed": 500,
            "date_range": {"start": "2020-01-01", "end": "2024-12-31"},
            "avg_citations": 45.7,
            "median_citations": 32,
            "std_dev": 28.3,
        },
        "trend_analysis": trend_analysis,
        "citation_distribution": {
            "0-10": 150,
            "11-25": 120,
            "26-50": 100,
            "51-100": 80,
            "101-200": 35,
            "200+": 15,
        },
        "top_authors": base_authors * 5,  # More authors
        "collaboration_network": {
            "nodes": 150,
            "edges": 450,
            "avg_degree": 6.0,
            "clustering_coefficient": 0.45,
        },
    }
    return json.dumps(analytics)
# =============================================================================
# BUILD COMPLEX AGENTIC CONVERSATION
# =============================================================================
def build_agentic_conversation() -> list[dict]:
    """Assemble the scripted eleven-turn research-assistant transcript.

    The transcript alternates user and assistant turns. Assistant turns 2, 4
    and 8 carry ``tool_use`` blocks; the following user turns carry the
    matching ``tool_result`` blocks whose payloads come from the synthetic
    generators in this file. The last turn is a user request, so the model
    under evaluation produces the next assistant message.

    Returns:
        A list of message dicts, each with ``role`` and ``content`` keys.
    """

    def _turn(role: str, content) -> dict:
        return {"role": role, "content": content}

    def _text(body: str) -> dict:
        return {"type": "text", "text": body}

    def _tool_use(call_id: str, tool: str, arguments: dict) -> dict:
        return {"type": "tool_use", "id": call_id, "name": tool, "input": arguments}

    def _tool_result(call_id: str, payload: str) -> dict:
        return {"type": "tool_result", "tool_use_id": call_id, "content": payload}

    productivity_query = "AI impact workplace productivity"
    adoption_query = "AI adoption success factors workplace training"

    # Note: Anthropic doesn't use a system role inside the messages array; the
    # system prompt is passed separately, so CacheAligner won't trigger here.
    # The date is placed in the first user message instead.
    conversation: list[dict] = []

    # Turn 1: user asks for research (with a date CacheAligner would detect)
    conversation.append(
        _turn(
            "user",
            "Current Date: 2024-12-15. I need you to research the impact of AI on workplace productivity. "
            "Search for recent papers, analyze the top results, and give me a summary.",
        )
    )

    # Turn 2: assistant decides to search
    conversation.append(
        _turn(
            "assistant",
            [
                _text(
                    "I'll help you research AI's impact on workplace productivity. Let me search for recent academic papers on this topic."
                ),
                _tool_use(
                    "search_1",
                    "academic_search",
                    {"query": productivity_query, "limit": 25},
                ),
            ],
        )
    )

    # Turn 3: tool result - large search results
    conversation.append(
        _turn(
            "user",
            [_tool_result("search_1", generate_search_results(productivity_query, 25))],
        )
    )

    # Turn 4: assistant analyzes and fetches more
    conversation.append(
        _turn(
            "assistant",
            [
                _text(
                    "I found 25 relevant papers. Let me fetch the full content of the top 3 most cited papers and get analytics on the overall trends."
                ),
                _tool_use("fetch_1", "fetch_document", {"doc_id": "doc_0001"}),
                _tool_use("fetch_2", "fetch_document", {"doc_id": "doc_0002"}),
                _tool_use("analytics_1", "get_analytics", {"topic": "AI workplace productivity"}),
            ],
        )
    )

    # Turn 5: multiple tool results, in the same order as the calls above
    conversation.append(
        _turn(
            "user",
            [
                _tool_result("fetch_1", generate_document_content("doc_0001")),
                _tool_result("fetch_2", generate_document_content("doc_0002")),
                _tool_result("analytics_1", generate_analytics_data()),
            ],
        )
    )

    # Turn 6: assistant provides initial summary
    conversation.append(
        _turn(
            "assistant",
            "Based on my analysis of 25 papers and detailed review of the top cited works, here's what the research shows:\n\n"
            "**Key Findings:**\n"
            "1. AI tools improve productivity by 25-40% on average\n"
            "2. Biggest gains in data analysis and routine tasks\n"
            "3. Mixed results for creative work\n"
            "4. Training and adoption are critical success factors\n\n"
            "Would you like me to dive deeper into any specific aspect?",
        )
    )

    # Turn 7: user asks follow-up
    conversation.append(
        _turn(
            "user",
            "Yes, tell me more about the training and adoption factors. What makes AI adoption successful?",
        )
    )

    # Turn 8: assistant searches for more specific info
    conversation.append(
        _turn(
            "assistant",
            [
                _text(
                    "Great question. Let me search for papers specifically about AI adoption success factors."
                ),
                _tool_use(
                    "search_2",
                    "academic_search",
                    {"query": adoption_query, "limit": 20},
                ),
            ],
        )
    )

    # Turn 9: more search results
    conversation.append(
        _turn(
            "user",
            [_tool_result("search_2", generate_search_results(adoption_query, 20))],
        )
    )

    # Turn 10: assistant synthesizes
    conversation.append(
        _turn(
            "assistant",
            "Based on the additional research on AI adoption success factors:\n\n"
            "**Critical Success Factors:**\n\n"
            "1. **Leadership Support** - Executive buy-in correlates with 3x higher adoption rates\n"
            "2. **Gradual Rollout** - Phased implementation outperforms big-bang approaches\n"
            "3. **Training Investment** - Organizations spending >40 hours on training see 50% better outcomes\n"
            "4. **Change Management** - Addressing employee concerns reduces resistance by 60%\n"
            "5. **Clear Use Cases** - Specific, measurable goals improve success rates\n\n"
            "The research consistently shows that technical capabilities matter less than organizational readiness.",
        )
    )

    # Turn 11: user asks for final deliverable
    conversation.append(
        _turn(
            "user",
            "Perfect. Now synthesize everything into a brief executive summary I can share with my team.",
        )
    )

    return conversation
# =============================================================================
# EVALUATION FRAMEWORK
# =============================================================================
@dataclass
class EvalResult:
    """Outcome of one model call made during the evaluation.

    Attributes:
        mode: Headroom mode string the call was made with.
        tokens_input: Number of input tokens recorded for the call.
        tokens_output: Number of output tokens recorded for the call.
        total_tokens: Sum of input and output tokens.
        latency_ms: Elapsed time of the call, in milliseconds.
        response: Text of the model's reply.
        cost_estimate: Estimated cost of the call (printed with a ``$`` prefix).
    """

    # Field order is part of the interface (positional construction).
    mode: str
    tokens_input: int
    tokens_output: int
    total_tokens: int
    latency_ms: float
    response: str
    cost_estimate: float
def run_evaluation(messages: list[dict], mode: str) -> EvalResult:
    """Send ``messages`` through the default Headroom client and measure the call.

    Args:
        messages: Conversation to send, in Anthropic message format.
        mode: Value forwarded as ``headroom_mode`` (this script passes
            ``"audit"`` or ``"optimize"``).

    Returns:
        An ``EvalResult`` with the token counts, latency, reply text and
        estimated cost of the single request.
    """
    model = "claude-3-5-haiku-latest"
    # Rates applied below, in USD per one million tokens.
    # NOTE(review): confirm these against the provider's current price list.
    input_usd_per_mtok = 0.80
    output_usd_per_mtok = 4.00

    # Local estimate of input tokens; only used when the response carries no
    # usage information.
    tokenizer = provider.get_token_counter(model)
    tokens_input = sum(tokenizer.count_message(m) for m in messages)

    # perf_counter is monotonic, so the interval cannot be skewed by system
    # clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    response = headroom_client.messages.create(
        model=model,
        messages=messages,
        max_tokens=1000,
        headroom_mode=mode,
    )
    latency_ms = (time.perf_counter() - start_time) * 1000

    # Extract response text (first content block only)
    response_text = response.content[0].text if response.content else ""
    tokens_output = response.usage.output_tokens if response.usage else 0

    # Get actual input tokens from response (more accurate)
    actual_input = response.usage.input_tokens if response.usage else tokens_input

    # Estimate cost from the per-million-token rates above
    cost = (actual_input / 1_000_000) * input_usd_per_mtok + (
        tokens_output / 1_000_000
    ) * output_usd_per_mtok

    return EvalResult(
        mode=mode,
        tokens_input=actual_input,
        tokens_output=tokens_output,
        total_tokens=actual_input + tokens_output,
        latency_ms=latency_ms,
        response=response_text,
        cost_estimate=cost,
    )
def evaluate_response_quality(baseline: str, optimized: str) -> dict:
    """Use Claude to evaluate response quality between baseline and optimized.

    Both responses are truncated to their first 2000 characters, embedded in a
    grading prompt and sent through ``base_client``. The outermost ``{...}``
    span of the reply is then parsed as JSON.

    Args:
        baseline: Reply produced without optimization.
        optimized: Reply produced with optimization.

    Returns:
        The parsed JSON object on success. If the reply has no content, holds
        no JSON object, or the JSON is malformed, returns
        ``{"error": "Could not parse evaluation", "raw": <reply text>}``.
    """
    eval_prompt = f"""Compare these two AI responses to the same question.
Rate each on a scale of 1-10 for:
1. Completeness - Does it fully answer the question?
2. Accuracy - Is the information correct and well-sourced?
3. Clarity - Is it well-organized and easy to understand?
4. Actionability - Does it provide useful, actionable insights?
Response A (Baseline):
{baseline[:2000]}
Response B (Optimized):
{optimized[:2000]}
Provide scores in this exact JSON format:
{{"baseline": {{"completeness": X, "accuracy": X, "clarity": X, "actionability": X}},
"optimized": {{"completeness": X, "accuracy": X, "clarity": X, "actionability": X}},
"winner": "baseline" or "optimized" or "tie",
"reasoning": "brief explanation"}}"""
    response = base_client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=500,
        messages=[{"role": "user", "content": eval_prompt}],
    )

    # Read the reply text once. An empty content list yields "" so the
    # fallback return below cannot raise IndexError.
    raw_text = response.content[0].text if response.content else ""

    # Find the outermost JSON object; the model may wrap it in prose.
    start = raw_text.find("{")
    end = raw_text.rfind("}") + 1
    if start >= 0 and end > start:
        try:
            return json.loads(raw_text[start:end])
        except json.JSONDecodeError:
            # Malformed JSON is reported through the error dict below.
            pass

    return {"error": "Could not parse evaluation", "raw": raw_text}
# =============================================================================
# MAIN EVALUATION
# =============================================================================
def run_aggressive_evaluation(messages: list[dict], mode: str) -> EvalResult:
    """Send ``messages`` through the aggressive Headroom client and measure the call.

    Same measurement as the default-client evaluation, but the request goes
    through ``aggressive_client``.

    Args:
        messages: Conversation to send, in Anthropic message format.
        mode: Value forwarded as ``headroom_mode``.

    Returns:
        An ``EvalResult`` with the token counts, latency, reply text and
        estimated cost of the single request.
    """
    model = "claude-3-5-haiku-latest"
    # Rates applied below, in USD per one million tokens.
    # NOTE(review): confirm these against the provider's current price list.
    input_usd_per_mtok = 0.80
    output_usd_per_mtok = 4.00

    # Local estimate of input tokens; only used when the response carries no
    # usage information.
    tokenizer = provider.get_token_counter(model)
    tokens_input = sum(tokenizer.count_message(m) for m in messages)

    # perf_counter is monotonic, so the interval cannot be skewed by system
    # clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    response = aggressive_client.messages.create(
        model=model,
        messages=messages,
        max_tokens=1000,
        headroom_mode=mode,
    )
    latency_ms = (time.perf_counter() - start_time) * 1000

    response_text = response.content[0].text if response.content else ""
    tokens_output = response.usage.output_tokens if response.usage else 0
    # Prefer the API-reported input count over the local estimate.
    actual_input = response.usage.input_tokens if response.usage else tokens_input

    cost = (actual_input / 1_000_000) * input_usd_per_mtok + (
        tokens_output / 1_000_000
    ) * output_usd_per_mtok

    return EvalResult(
        mode=mode,
        tokens_input=actual_input,
        tokens_output=tokens_output,
        total_tokens=actual_input + tokens_output,
        latency_ms=latency_ms,
        response=response_text,
        cost_estimate=cost,
    )
def main() -> None:
    """Run the whole evaluation and print a report to stdout.

    Steps, in order:
      1. Build the scripted conversation.
      2. Preview token savings with ``messages.simulate`` on both clients.
      3. Make three real calls: baseline ("audit"), conservative ("optimize"
         on the default client) and aggressive ("optimize" on the aggressive
         client).
      4. Print a comparison table and the savings relative to the baseline.
      5. Ask Claude to score the conservative and aggressive replies against
         the baseline reply.
      6. Print a boxed summary.
    """
    model = "claude-3-5-haiku-latest"
    rule_width = 70

    def _banner(title: str, rule: str) -> None:
        # Title framed by two horizontal rules.
        line = rule * rule_width
        print(line)
        print(title)
        print(line)

    def _pct(part: float, whole: float) -> float:
        # Percentage of `whole`; 0 when `whole` is not positive, so an empty
        # measurement cannot raise ZeroDivisionError.
        return (part / whole) * 100 if whole > 0 else 0.0

    def _report_run(result: EvalResult) -> None:
        # One-line metrics plus the first 300 characters of the reply.
        print(
            f"Input: {result.tokens_input:,} tokens | Cost: ${result.cost_estimate:.4f} | Latency: {result.latency_ms:.0f}ms"
        )
        print(f"Response: {result.response[:300]}...")
        print()

    def _score(report: dict, side: str, criterion: str) -> str:
        # The judge's JSON is free-form model output: tolerate a missing side,
        # a missing criterion or a null score instead of raising.
        scores = report.get(side)
        if not isinstance(scores, dict):
            return "N/A"
        return str(scores.get(criterion, "N/A"))

    _banner("HEADROOM SDK - REAL WORLD EVALUATION", "=")
    print()

    # Build the conversation
    messages = build_agentic_conversation()
    print(f"Scenario: Research Assistant with {len(messages)} turns")
    # search x2 + fetch x2 + analytics x1 = 5 tool calls
    print("Tool calls: 5 (search x2, fetch x2, analytics x1)")
    print("Tool outputs: Large JSON payloads (~50KB total)")
    print()

    # =========================================================================
    # SIMULATION: Preview all optimization levels
    # =========================================================================
    _banner("SIMULATIONS (Preview of Optimizations)", "-")

    # Conservative (default)
    sim_default = headroom_client.messages.simulate(
        model=model,
        messages=messages,
    )
    # Aggressive
    sim_aggressive = aggressive_client.messages.simulate(
        model=model,
        messages=messages,
    )

    print(f"\n{'Mode':<20} {'Before':>10} {'After':>10} {'Saved':>10} {'%':>8}")
    print("-" * 60)
    for label, sim in (("Conservative", sim_default), ("Aggressive", sim_aggressive)):
        saved_pct = _pct(sim.tokens_saved, sim.tokens_before)
        print(
            f"{label:<20} {sim.tokens_before:>10,} {sim.tokens_after:>10,} {sim.tokens_saved:>10,} {saved_pct:>7.1f}%"
        )
    print()
    print(f"Conservative transforms: {sim_default.transforms}")
    print(f"Aggressive transforms: {sim_aggressive.transforms}")
    print()

    # =========================================================================
    # ACTUAL API CALLS: Compare Baseline vs Conservative vs Aggressive
    # =========================================================================
    _banner("1. BASELINE (No Optimization)", "-")
    baseline = run_evaluation(messages, "audit")
    _report_run(baseline)

    _banner("2. CONSERVATIVE OPTIMIZATION (Default Settings)", "-")
    conservative = run_evaluation(messages, "optimize")
    _report_run(conservative)

    _banner("3. AGGRESSIVE OPTIMIZATION (max_array=3, max_string=200, max_depth=3)", "-")
    aggressive = run_aggressive_evaluation(messages, "optimize")
    _report_run(aggressive)

    # =========================================================================
    # COMPARISON TABLE
    # =========================================================================
    _banner("COMPARISON TABLE", "=")
    print(f"\n{'Metric':<25} {'Baseline':>12} {'Conservative':>12} {'Aggressive':>12}")
    print("-" * 65)
    print(
        f"{'Input Tokens':<25} {baseline.tokens_input:>12,} {conservative.tokens_input:>12,} {aggressive.tokens_input:>12,}"
    )
    print(
        f"{'Output Tokens':<25} {baseline.tokens_output:>12,} {conservative.tokens_output:>12,} {aggressive.tokens_output:>12,}"
    )
    print(
        f"{'Cost':<25} ${baseline.cost_estimate:>11.4f} ${conservative.cost_estimate:>11.4f} ${aggressive.cost_estimate:>11.4f}"
    )
    print(
        f"{'Latency (ms)':<25} {baseline.latency_ms:>12.0f} {conservative.latency_ms:>12.0f} {aggressive.latency_ms:>12.0f}"
    )

    # Savings vs baseline
    cons_savings = baseline.tokens_input - conservative.tokens_input
    cons_pct = _pct(cons_savings, baseline.tokens_input)
    aggr_savings = baseline.tokens_input - aggressive.tokens_input
    aggr_pct = _pct(aggr_savings, baseline.tokens_input)
    print()
    print(
        f"{'Token Savings vs Baseline':<25} {'-':>12} {cons_savings:>10,} ({cons_pct:.0f}%) {aggr_savings:>10,} ({aggr_pct:.0f}%)"
    )
    cons_cost_save = baseline.cost_estimate - conservative.cost_estimate
    aggr_cost_save = baseline.cost_estimate - aggressive.cost_estimate
    print(
        f"{'Cost Savings vs Baseline':<25} {'-':>12} ${cons_cost_save:>10.4f} ${aggr_cost_save:>10.4f}"
    )
    print()

    # =========================================================================
    # QUALITY EVALUATION: All three responses
    # =========================================================================
    _banner("QUALITY EVALUATION (Claude as Judge)", "-")
    # Conservative vs Baseline
    qual_cons = evaluate_response_quality(baseline.response, conservative.response)
    # Aggressive vs Baseline
    qual_aggr = evaluate_response_quality(baseline.response, aggressive.response)

    if "error" not in qual_cons and "error" not in qual_aggr:
        print(f"\n{'Criterion':<20} {'Baseline':>10} {'Conservative':>12} {'Aggressive':>12}")
        print("-" * 55)
        for criterion in ["completeness", "accuracy", "clarity", "actionability"]:
            # Baseline scores are taken from the conservative comparison.
            b_score = _score(qual_cons, "baseline", criterion)
            c_score = _score(qual_cons, "optimized", criterion)
            a_score = _score(qual_aggr, "optimized", criterion)
            print(f"{criterion.title():<20} {b_score:>10} {c_score:>12} {a_score:>12}")
        print()
        print(f"Baseline vs Conservative: {qual_cons.get('winner', 'N/A')}")
        print(f"Baseline vs Aggressive: {qual_aggr.get('winner', 'N/A')}")
    else:
        print(f"Conservative eval: {qual_cons}")
        print(f"Aggressive eval: {qual_aggr}")

    # =========================================================================
    # FINAL SUMMARY
    # =========================================================================
    print()
    _banner("SUMMARY: HEADROOM SDK OPTIMIZATION RESULTS", "=")

    # Box rows are padded to a fixed width so the right border stays aligned
    # whatever the width of the interpolated numbers.
    box_inner = 68
    text_width = box_inner - 2

    def _row(text: str) -> str:
        return f"│ {text:<{text_width}} │"

    # Monthly figure = per-request saving x 1,000 requests/day x 30 days.
    requests_per_month = 30000
    summary_lines = [
        "Real-world agentic scenario with 5 large tool outputs:",
        "┌" + "─" * box_inner + "┐",
        _row("CONSERVATIVE MODE (Safe for Production)"),
        _row(f"Token Reduction: {cons_savings:,} tokens ({cons_pct:.1f}%)"),
        _row(
            f"Cost Savings: ${cons_cost_save:.4f}/request (${cons_cost_save * requests_per_month:.2f}/month @ 1K/day)"
        ),
        _row("Quality Impact: Minimal"),
        "├" + "─" * box_inner + "┤",
        _row("AGGRESSIVE MODE (Maximum Savings)"),
        _row(f"Token Reduction: {aggr_savings:,} tokens ({aggr_pct:.1f}%)"),
        _row(
            f"Cost Savings: ${aggr_cost_save:.4f}/request (${aggr_cost_save * requests_per_month:.2f}/month @ 1K/day)"
        ),
        _row("Quality Impact: Slight reduction in detail"),
        "└" + "─" * box_inner + "┘",
        "Transforms Applied:",
        "- Tool Crusher: Compressed JSON arrays, truncated long strings",
        "- Cache Aligner: (Only applies to system prompts with dates)",
        "- Rolling Window: (Only applies when near context limit)",
    ]
    # Leading and trailing newline keep a blank line around the summary.
    print("\n" + "\n".join(summary_lines) + "\n")
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()