Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| """ | |
| Real-world evaluation of Headroom SDK with Anthropic. | |
| This script simulates a complex agentic scenario with: | |
| - Multiple tool calls (search, fetch, analyze) | |
| - Large tool outputs (realistic JSON payloads) | |
| - Multi-turn conversation | |
| We compare: | |
| 1. Baseline (no optimization) - audit mode | |
| 2. Optimized (Headroom transforms) - optimize mode | |
| And evaluate: | |
| - Token usage (before/after) | |
| - Response quality (Claude-as-judge rubric scores) | |
| - Cost savings | |
| """ | |
| import json | |
| import os | |
| import tempfile | |
| import time | |
| from dataclasses import dataclass | |
| from anthropic import Anthropic | |
| from dotenv import load_dotenv | |
| from headroom import AnthropicProvider, HeadroomClient, HeadroomConfig, ToolCrusherConfig | |
# Load environment variables from the local env file before any client is
# constructed, so ANTHROPIC_API_KEY is available to the lookup below.
load_dotenv(".env.local")

# Initialize clients
base_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
provider = AnthropicProvider()

# AGGRESSIVE optimization config
aggressive_tool_crusher = ToolCrusherConfig(
    enabled=True,
    min_tokens_to_crush=100,  # Crush smaller outputs
    max_array_items=3,  # Only keep first 3 items (was 10)
    max_string_length=200,  # Truncate strings > 200 chars (was 1000)
    max_depth=3,  # Limit nesting to 3 levels (was 5)
)

db_path = os.path.join(tempfile.gettempdir(), "headroom_eval.db")

# Default (conservative) client
headroom_client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path}",
    default_mode="audit",
)

# Aggressive optimization client
aggressive_config = HeadroomConfig()
aggressive_config.tool_crusher = aggressive_tool_crusher

db_path_aggressive = os.path.join(tempfile.gettempdir(), "headroom_eval_aggressive.db")
aggressive_client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path_aggressive}",
    default_mode="audit",
)

# Manually set aggressive config on pipeline.
# A regular import replaces the former ``__import__(..., fromlist=[...])``
# one-liner; it resolves the same ``headroom.transforms.TransformPipeline``.
from headroom.transforms import TransformPipeline  # noqa: E402

# NOTE(review): this overwrites private attributes of HeadroomClient. If the
# SDK exposes a public way to pass a HeadroomConfig at construction time,
# prefer that - TODO confirm against the headroom API.
aggressive_client._config = aggressive_config
aggressive_client._pipeline = TransformPipeline(aggressive_config, provider=provider)
| # ============================================================================= | |
| # REALISTIC AGENTIC SCENARIO: Research Assistant | |
| # ============================================================================= | |
def generate_search_results(query: str, count: int = 25) -> str:
    """Return a JSON string holding ``count`` synthetic search hits for ``query``.

    The payload has three top-level keys: ``results`` (one dict per hit),
    ``total_count`` (equal to ``count``) and ``query`` (echoed back).
    Every numeric field of a hit is derived from its index, so the output is
    deterministic for a given ``(query, count)`` pair.
    """
    url_slug = query.replace(" ", "-")
    title_text = query.title()
    filler_abstract = "Lorem ipsum dolor sit amet. " * 20

    def make_hit(idx: int) -> dict:
        # One synthetic paper; sample size and citation counts grow with idx.
        participants = 1000 + idx * 100
        snippet_parts = (
            f"This comprehensive study examines {query} through multiple methodologies. ",
            "Key findings include significant correlations between variables A and B, ",
            f"with p-values < 0.05. The sample size of {participants} participants ",
            "provides robust statistical power. Methods included: surveys, interviews, ",
            f"longitudinal tracking, and meta-analysis of {50 + idx * 10} prior studies.",
        )
        return {
            "id": f"doc_{idx:04d}",
            "title": f"Research Paper: {title_text} - Study {idx + 1}",
            "url": f"https://research.example.com/papers/{url_slug}/{idx}",
            "snippet": "".join(snippet_parts),
            "citations": 150 + idx * 23,
            "year": 2020 + (idx % 5),
            "authors": [
                {"name": f"Dr. Smith{idx}", "affiliation": "MIT"},
                {"name": f"Prof. Jones{idx}", "affiliation": "Stanford"},
                {"name": f"Dr. Williams{idx}", "affiliation": "Harvard"},
            ],
            "keywords": ["machine learning", "data science", query, "research", "analysis"],
            "abstract": f"Abstract for paper {idx}: " + filler_abstract,
            "methodology": {
                "type": "mixed-methods",
                "sample_size": participants,
                "duration_months": 12 + idx,
                "instruments": ["survey", "interview", "observation"],
            },
        }

    payload = {
        "results": [make_hit(idx) for idx in range(count)],
        "total_count": count,
        "query": query,
    }
    return json.dumps(payload)
def generate_document_content(doc_id: str) -> str:
    """Return a JSON string describing one synthetic full-text document.

    The payload carries ``id`` (the given ``doc_id``), ``full_text`` (a fixed
    five-part paper body repeated three times to bulk it up), ``metadata``
    and a ``sections`` table of contents.
    """
    paper_lines = (
        "Introduction:",
        "This research investigates the complex interplay between artificial intelligence",
        "and human decision-making processes. Our longitudinal study spanning 36 months",
        "collected data from 5,000 participants across 12 countries.",
        "Methodology:",
        "We employed a mixed-methods approach combining quantitative surveys (n=4,500)",
        "with qualitative interviews (n=500). Statistical analysis included regression",
        "modeling, factor analysis, and structural equation modeling.",
        "Results:",
        "Key findings indicate that AI-assisted decision making improved accuracy by 34%",
        "while reducing cognitive load by 28%. However, over-reliance on AI recommendations",
        "correlated with decreased critical thinking skills (r=-0.42, p<0.001).",
        "Discussion:",
        "These findings have significant implications for the design of AI systems in",
        "high-stakes environments. We recommend a balanced approach that leverages AI",
        "capabilities while maintaining human oversight and skill development.",
        "Conclusion:",
        "The integration of AI in decision-making processes offers substantial benefits",
        "but requires careful implementation to avoid potential negative outcomes.",
    )
    # The body opens and closes with a newline, exactly like a triple-quoted
    # literal whose content starts on the line after the opening quotes.
    single_copy = "\n" + "\n".join(paper_lines) + "\n"

    outline = (
        ("Introduction", 1, 2000),
        ("Literature Review", 5, 4000),
        ("Methodology", 15, 3000),
        ("Results", 22, 3500),
        ("Discussion", 32, 2000),
        ("Conclusion", 40, 500),
    )

    document = {
        "id": doc_id,
        "full_text": single_copy * 3,  # Make it longer
        "metadata": {
            "word_count": 15000,
            "pages": 45,
            "figures": 12,
            "tables": 8,
            "references": 150,
        },
        "sections": [
            {"title": heading, "page": first_page, "word_count": words}
            for heading, first_page, words in outline
        ],
    }
    return json.dumps(document)
def generate_analytics_data() -> str:
    """Return a JSON string of fixed, synthetic citation analytics.

    Top-level keys, in order: ``summary_statistics``, ``trend_analysis``
    (one row per year 2020-2024), ``citation_distribution``, ``top_authors``
    (three authors repeated five times) and ``collaboration_network``.
    """
    summary = {
        "total_papers_analyzed": 500,
        "date_range": {"start": "2020-01-01", "end": "2024-12-31"},
        "avg_citations": 45.7,
        "median_citations": 32,
        "std_dev": 28.3,
    }

    # (year, papers, average citations, top keywords)
    yearly_rows = (
        (2020, 80, 52.3, ["covid", "remote", "digital"]),
        (2021, 95, 48.1, ["hybrid", "adaptation", "resilience"]),
        (2022, 110, 44.2, ["AI", "automation", "efficiency"]),
        (2023, 120, 38.5, ["LLM", "generative", "ethics"]),
        (2024, 95, 25.1, ["agents", "multimodal", "safety"]),
    )
    trends = [
        {"year": year, "papers": papers, "avg_citations": avg, "top_keywords": keywords}
        for year, papers, avg, keywords in yearly_rows
    ]

    histogram = {
        "0-10": 150,
        "11-25": 120,
        "26-50": 100,
        "51-100": 80,
        "101-200": 35,
        "200+": 15,
    }

    author_seed = [
        {"name": "Dr. Smith", "papers": 25, "total_citations": 1250, "h_index": 18},
        {"name": "Prof. Jones", "papers": 22, "total_citations": 980, "h_index": 15},
        {"name": "Dr. Williams", "papers": 20, "total_citations": 890, "h_index": 14},
    ]

    network = {
        "nodes": 150,
        "edges": 450,
        "avg_degree": 6.0,
        "clustering_coefficient": 0.45,
    }

    report = {
        "summary_statistics": summary,
        "trend_analysis": trends,
        "citation_distribution": histogram,
        "top_authors": author_seed * 5,  # More authors
        "collaboration_network": network,
    }
    return json.dumps(report)
| # ============================================================================= | |
| # BUILD COMPLEX AGENTIC CONVERSATION | |
| # ============================================================================= | |
def build_agentic_conversation() -> list[dict]:
    """Assemble the fixed 11-message research-assistant transcript.

    The transcript alternates user and assistant turns and embeds five tool
    results (two searches, two document fetches, one analytics call) produced
    by the generator functions in this module. The last message is a user
    request, so the list can be sent to the API as-is.
    """
    # Note: Anthropic doesn't use system role in messages array
    # System prompt is passed separately, so CacheAligner won't trigger here
    # But we'll include date in the first user message context

    def say(text: str) -> dict:
        # Text content block.
        return {"type": "text", "text": text}

    def call_tool(call_id: str, tool_name: str, arguments: dict) -> dict:
        # tool_use content block emitted by the assistant.
        return {"type": "tool_use", "id": call_id, "name": tool_name, "input": arguments}

    def tool_output(call_id: str, payload: str) -> dict:
        # tool_result content block answering the tool_use with the same id.
        return {"type": "tool_result", "tool_use_id": call_id, "content": payload}

    def user(content) -> dict:
        return {"role": "user", "content": content}

    def assistant(content) -> dict:
        return {"role": "assistant", "content": content}

    first_query = "AI impact workplace productivity"
    followup_query = "AI adoption success factors workplace training"

    # Turn 1: User asks for research (with date context that CacheAligner would detect)
    turn_1 = user(
        "Current Date: 2024-12-15. I need you to research the impact of AI on workplace productivity. "
        "Search for recent papers, analyze the top results, and give me a summary."
    )

    # Turn 2: Assistant decides to search
    turn_2 = assistant(
        [
            say(
                "I'll help you research AI's impact on workplace productivity. Let me search for recent academic papers on this topic."
            ),
            call_tool("search_1", "academic_search", {"query": first_query, "limit": 25}),
        ]
    )

    # Turn 3: Tool result - large search results
    turn_3 = user([tool_output("search_1", generate_search_results(first_query, 25))])

    # Turn 4: Assistant analyzes and fetches more
    turn_4 = assistant(
        [
            say(
                "I found 25 relevant papers. Let me fetch the full content of the top 3 most cited papers and get analytics on the overall trends."
            ),
            call_tool("fetch_1", "fetch_document", {"doc_id": "doc_0001"}),
            call_tool("fetch_2", "fetch_document", {"doc_id": "doc_0002"}),
            call_tool("analytics_1", "get_analytics", {"topic": "AI workplace productivity"}),
        ]
    )

    # Turn 5: Multiple tool results
    turn_5 = user(
        [
            tool_output("fetch_1", generate_document_content("doc_0001")),
            tool_output("fetch_2", generate_document_content("doc_0002")),
            tool_output("analytics_1", generate_analytics_data()),
        ]
    )

    # Turn 6: Assistant provides initial summary
    turn_6 = assistant(
        "Based on my analysis of 25 papers and detailed review of the top cited works, here's what the research shows:\n\n"
        "**Key Findings:**\n"
        "1. AI tools improve productivity by 25-40% on average\n"
        "2. Biggest gains in data analysis and routine tasks\n"
        "3. Mixed results for creative work\n"
        "4. Training and adoption are critical success factors\n\n"
        "Would you like me to dive deeper into any specific aspect?"
    )

    # Turn 7: User asks follow-up
    turn_7 = user(
        "Yes, tell me more about the training and adoption factors. What makes AI adoption successful?"
    )

    # Turn 8: Assistant searches for more specific info
    turn_8 = assistant(
        [
            say(
                "Great question. Let me search for papers specifically about AI adoption success factors."
            ),
            call_tool("search_2", "academic_search", {"query": followup_query, "limit": 20}),
        ]
    )

    # Turn 9: More search results
    turn_9 = user([tool_output("search_2", generate_search_results(followup_query, 20))])

    # Turn 10: Assistant synthesizes
    turn_10 = assistant(
        "Based on the additional research on AI adoption success factors:\n\n"
        "**Critical Success Factors:**\n\n"
        "1. **Leadership Support** - Executive buy-in correlates with 3x higher adoption rates\n"
        "2. **Gradual Rollout** - Phased implementation outperforms big-bang approaches\n"
        "3. **Training Investment** - Organizations spending >40 hours on training see 50% better outcomes\n"
        "4. **Change Management** - Addressing employee concerns reduces resistance by 60%\n"
        "5. **Clear Use Cases** - Specific, measurable goals improve success rates\n\n"
        "The research consistently shows that technical capabilities matter less than organizational readiness."
    )

    # Turn 11: User asks for final deliverable
    turn_11 = user(
        "Perfect. Now synthesize everything into a brief executive summary I can share with my team."
    )

    return [
        turn_1,
        turn_2,
        turn_3,
        turn_4,
        turn_5,
        turn_6,
        turn_7,
        turn_8,
        turn_9,
        turn_10,
        turn_11,
    ]
| # ============================================================================= | |
| # EVALUATION FRAMEWORK | |
| # ============================================================================= | |
@dataclass
class EvalResult:
    """Results from a single evaluation run.

    The class must be a dataclass: the evaluation functions in this file
    build it with keyword arguments, which a plain class holding only bare
    annotations would reject with ``TypeError``.
    """

    # Headroom mode the request was sent with (callers pass "audit" or "optimize").
    mode: str
    # Input token count for the request.
    tokens_input: int
    # Output token count for the response.
    tokens_output: int
    # Sum of tokens_input and tokens_output.
    total_tokens: int
    # Elapsed time around the API call, in milliseconds.
    latency_ms: float
    # Text of the model response.
    response: str
    # Estimated request cost in USD.
    cost_estimate: float
def run_evaluation(messages: list[dict], mode: str) -> EvalResult:
    """Send ``messages`` through the default Headroom client and measure the call.

    Args:
        messages: Conversation in Anthropic Messages API format.
        mode: Value forwarded as ``headroom_mode`` (callers in this file pass
            ``"audit"`` or ``"optimize"``).

    Returns:
        An ``EvalResult`` with token counts, latency in milliseconds, the
        response text and an estimated cost in USD.
    """
    model = "claude-3-5-haiku-latest"

    # Local estimate, used only when the API response carries no usage data.
    tokenizer = provider.get_token_counter(model)
    estimated_input = sum(tokenizer.count_message(m) for m in messages)

    # perf_counter is monotonic, so the latency cannot be skewed by
    # wall-clock adjustments during the request.
    start_time = time.perf_counter()
    response = headroom_client.messages.create(
        model=model,
        messages=messages,
        max_tokens=1000,
        headroom_mode=mode,
    )
    latency_ms = (time.perf_counter() - start_time) * 1000

    # Take the first block that actually carries text instead of assuming
    # that block 0 is a text block.
    response_text = next(
        (block.text for block in (response.content or []) if hasattr(block, "text")),
        "",
    )

    usage = response.usage
    tokens_output = usage.output_tokens if usage else 0
    # Get actual input tokens from response (more accurate)
    actual_input = usage.input_tokens if usage else estimated_input

    # Estimate cost at $0.80 per 1M input tokens and $4.00 per 1M output
    # tokens (Claude 3.5 Haiku list price - verify against current pricing).
    cost = (actual_input / 1_000_000) * 0.80 + (tokens_output / 1_000_000) * 4.00

    return EvalResult(
        mode=mode,
        tokens_input=actual_input,
        tokens_output=tokens_output,
        total_tokens=actual_input + tokens_output,
        latency_ms=latency_ms,
        response=response_text,
        cost_estimate=cost,
    )
def evaluate_response_quality(baseline: str, optimized: str) -> dict:
    """Use Claude to evaluate response quality between baseline and optimized.

    Both responses are truncated to their first 2000 characters, embedded in
    a grading prompt and sent through ``base_client``. The reply is expected
    to contain one JSON object.

    Args:
        baseline: Response text from the unoptimized run.
        optimized: Response text from the optimized run.

    Returns:
        The parsed JSON object on success. Otherwise a dict with an
        ``"error"`` message and the judge's unparsed reply under ``"raw"``
        (an empty string when the reply had no text block).
    """
    eval_prompt = f"""Compare these two AI responses to the same question.
Rate each on a scale of 1-10 for:
1. Completeness - Does it fully answer the question?
2. Accuracy - Is the information correct and well-sourced?
3. Clarity - Is it well-organized and easy to understand?
4. Actionability - Does it provide useful, actionable insights?
Response A (Baseline):
{baseline[:2000]}
Response B (Optimized):
{optimized[:2000]}
Provide scores in this exact JSON format:
{{"baseline": {{"completeness": X, "accuracy": X, "clarity": X, "actionability": X}},
"optimized": {{"completeness": X, "accuracy": X, "clarity": X, "actionability": X}},
"winner": "baseline" or "optimized" or "tie",
"reasoning": "brief explanation"}}"""

    response = base_client.messages.create(
        model="claude-3-5-haiku-latest",
        max_tokens=500,
        messages=[{"role": "user", "content": eval_prompt}],
    )

    # Read the reply text once, tolerating an empty content list. The old
    # fallback indexed content[0] again and re-raised the IndexError that
    # the except clause had just swallowed.
    raw_text = next(
        (block.text for block in (response.content or []) if hasattr(block, "text")),
        "",
    )

    # Find JSON in response: span from the first "{" to the last "}".
    start = raw_text.find("{")
    end = raw_text.rfind("}") + 1
    if 0 <= start < end:
        try:
            parsed = json.loads(raw_text[start:end])
        except json.JSONDecodeError:
            parsed = None
        # Callers index the result by key, so only a JSON object is usable.
        if isinstance(parsed, dict):
            return parsed

    return {"error": "Could not parse evaluation", "raw": raw_text}
| # ============================================================================= | |
| # MAIN EVALUATION | |
| # ============================================================================= | |
def run_aggressive_evaluation(messages: list[dict], mode: str) -> EvalResult:
    """Send ``messages`` through the aggressive Headroom client and measure the call.

    Mirrors ``run_evaluation`` but issues the request via
    ``aggressive_client``.

    Args:
        messages: Conversation in Anthropic Messages API format.
        mode: Value forwarded as ``headroom_mode``.

    Returns:
        An ``EvalResult`` with token counts, latency in milliseconds, the
        response text and an estimated cost in USD.
    """
    model = "claude-3-5-haiku-latest"

    # Local estimate, used only when the API response carries no usage data.
    tokenizer = provider.get_token_counter(model)
    estimated_input = sum(tokenizer.count_message(m) for m in messages)

    # perf_counter is monotonic, so the latency cannot be skewed by
    # wall-clock adjustments during the request.
    start_time = time.perf_counter()
    response = aggressive_client.messages.create(
        model=model,
        messages=messages,
        max_tokens=1000,
        headroom_mode=mode,
    )
    latency_ms = (time.perf_counter() - start_time) * 1000

    # Take the first block that actually carries text instead of assuming
    # that block 0 is a text block.
    response_text = next(
        (block.text for block in (response.content or []) if hasattr(block, "text")),
        "",
    )

    usage = response.usage
    tokens_output = usage.output_tokens if usage else 0
    actual_input = usage.input_tokens if usage else estimated_input

    # Estimate cost at $0.80 per 1M input tokens and $4.00 per 1M output
    # tokens (Claude 3.5 Haiku list price - verify against current pricing).
    cost = (actual_input / 1_000_000) * 0.80 + (tokens_output / 1_000_000) * 4.00

    return EvalResult(
        mode=mode,
        tokens_input=actual_input,
        tokens_output=tokens_output,
        total_tokens=actual_input + tokens_output,
        latency_ms=latency_ms,
        response=response_text,
        cost_estimate=cost,
    )
def _percent(part: float, whole: float) -> float:
    """Return ``part`` as a percentage of ``whole``, or 0.0 when ``whole`` is not positive."""
    return part / whole * 100 if whole > 0 else 0.0


def _summary_box(sections: list[list[str]], width: int = 69) -> str:
    """Render groups of text rows as one box with a horizontal rule between groups."""
    rule = "─" * width
    inner = width - 2  # one space of padding on each side of the row text
    lines = [f"┌{rule}┐"]
    for index, rows in enumerate(sections):
        if index:
            lines.append(f"├{rule}┤")
        lines.extend(f"│ {row:<{inner}} │" for row in rows)
    lines.append(f"└{rule}┘")
    return "\n".join(lines)


def main():
    """Run the full evaluation and print a report to stdout.

    Steps: build the scripted conversation, preview token savings with
    ``simulate`` on both clients, make three live API calls (baseline,
    conservative, aggressive), print a comparison table, have Claude judge
    the optimized responses against the baseline, and print a summary.
    """
    model = "claude-3-5-haiku-latest"

    print("=" * 70)
    print("HEADROOM SDK - REAL WORLD EVALUATION")
    print("=" * 70)
    print()

    # Build the conversation
    messages = build_agentic_conversation()
    print(f"Scenario: Research Assistant with {len(messages)} turns")
    # 2 searches + 2 fetches + 1 analytics call = 5 (the old text said 4).
    print("Tool calls: 5 (search x2, fetch x2, analytics x1)")
    print("Tool outputs: Large JSON payloads (~50KB total)")
    print()

    # =========================================================================
    # SIMULATION: Preview all optimization levels
    # =========================================================================
    print("-" * 70)
    print("SIMULATIONS (Preview of Optimizations)")
    print("-" * 70)

    # Conservative (default)
    sim_default = headroom_client.messages.simulate(model=model, messages=messages)
    # Aggressive
    sim_aggressive = aggressive_client.messages.simulate(model=model, messages=messages)

    print(f"\n{'Mode':<20} {'Before':>10} {'After':>10} {'Saved':>10} {'%':>8}")
    print("-" * 60)
    for label, sim in (("Conservative", sim_default), ("Aggressive", sim_aggressive)):
        # Guarded like the later percentages, so an empty simulation cannot
        # raise ZeroDivisionError.
        saved_pct = _percent(sim.tokens_saved, sim.tokens_before)
        print(
            f"{label:<20} {sim.tokens_before:>10,} {sim.tokens_after:>10,} {sim.tokens_saved:>10,} {saved_pct:>7.1f}%"
        )
    print()
    print(f"Conservative transforms: {sim_default.transforms}")
    print(f"Aggressive transforms: {sim_aggressive.transforms}")
    print()

    # =========================================================================
    # ACTUAL API CALLS: Compare Baseline vs Conservative vs Aggressive
    # =========================================================================
    runs = (
        ("1. BASELINE (No Optimization)", run_evaluation, "audit"),
        ("2. CONSERVATIVE OPTIMIZATION (Default Settings)", run_evaluation, "optimize"),
        (
            "3. AGGRESSIVE OPTIMIZATION (max_array=3, max_string=200, max_depth=3)",
            run_aggressive_evaluation,
            "optimize",
        ),
    )
    results = []
    for title, runner, mode in runs:
        # Print the heading first so it is visible while the API call runs.
        print("-" * 70)
        print(title)
        print("-" * 70)
        result = runner(messages, mode)
        print(
            f"Input: {result.tokens_input:,} tokens | Cost: ${result.cost_estimate:.4f} | Latency: {result.latency_ms:.0f}ms"
        )
        print(f"Response: {result.response[:300]}...")
        print()
        results.append(result)
    baseline, conservative, aggressive = results

    # =========================================================================
    # COMPARISON TABLE
    # =========================================================================
    print("=" * 70)
    print("COMPARISON TABLE")
    print("=" * 70)
    print(f"\n{'Metric':<25} {'Baseline':>12} {'Conservative':>12} {'Aggressive':>12}")
    print("-" * 65)
    print(
        f"{'Input Tokens':<25} {baseline.tokens_input:>12,} {conservative.tokens_input:>12,} {aggressive.tokens_input:>12,}"
    )
    print(
        f"{'Output Tokens':<25} {baseline.tokens_output:>12,} {conservative.tokens_output:>12,} {aggressive.tokens_output:>12,}"
    )
    print(
        f"{'Cost':<25} ${baseline.cost_estimate:>11.4f} ${conservative.cost_estimate:>11.4f} ${aggressive.cost_estimate:>11.4f}"
    )
    print(
        f"{'Latency (ms)':<25} {baseline.latency_ms:>12.0f} {conservative.latency_ms:>12.0f} {aggressive.latency_ms:>12.0f}"
    )

    # Savings vs baseline
    cons_savings = baseline.tokens_input - conservative.tokens_input
    cons_pct = _percent(cons_savings, baseline.tokens_input)
    aggr_savings = baseline.tokens_input - aggressive.tokens_input
    aggr_pct = _percent(aggr_savings, baseline.tokens_input)
    print()
    print(
        f"{'Token Savings vs Baseline':<25} {'-':>12} {cons_savings:>10,} ({cons_pct:.0f}%) {aggr_savings:>10,} ({aggr_pct:.0f}%)"
    )
    cons_cost_save = baseline.cost_estimate - conservative.cost_estimate
    aggr_cost_save = baseline.cost_estimate - aggressive.cost_estimate
    print(
        f"{'Cost Savings vs Baseline':<25} {'-':>12} ${cons_cost_save:>10.4f} ${aggr_cost_save:>10.4f}"
    )
    print()

    # =========================================================================
    # QUALITY EVALUATION: All three responses
    # =========================================================================
    print("-" * 70)
    print("QUALITY EVALUATION (Claude as Judge)")
    print("-" * 70)

    # Conservative vs Baseline
    qual_cons = evaluate_response_quality(baseline.response, conservative.response)
    # Aggressive vs Baseline
    qual_aggr = evaluate_response_quality(baseline.response, aggressive.response)

    if "error" not in qual_cons and "error" not in qual_aggr:
        print(f"\n{'Criterion':<20} {'Baseline':>10} {'Conservative':>12} {'Aggressive':>12}")
        print("-" * 55)
        # The judge's JSON is free-form model output; tolerate missing keys
        # rather than crashing at the end of a paid run.
        base_scores = qual_cons.get("baseline") or {}
        cons_scores = qual_cons.get("optimized") or {}
        aggr_scores = qual_aggr.get("optimized") or {}
        for criterion in ["completeness", "accuracy", "clarity", "actionability"]:
            b_score = base_scores.get(criterion, "N/A")
            c_score = cons_scores.get(criterion, "N/A")
            a_score = aggr_scores.get(criterion, "N/A")
            # !s keeps the alignment spec valid for any JSON value type.
            print(f"{criterion.title():<20} {b_score!s:>10} {c_score!s:>12} {a_score!s:>12}")
        print()
        print(f"Baseline vs Conservative: {qual_cons.get('winner', 'N/A')}")
        print(f"Baseline vs Aggressive: {qual_aggr.get('winner', 'N/A')}")
    else:
        print(f"Conservative eval: {qual_cons}")
        print(f"Aggressive eval: {qual_aggr}")

    # =========================================================================
    # FINAL SUMMARY
    # =========================================================================
    print()
    print("=" * 70)
    print("SUMMARY: HEADROOM SDK OPTIMIZATION RESULTS")
    print("=" * 70)

    # The box is padded at runtime so its right edge stays aligned whatever
    # the width of the interpolated numbers. Monthly figures assume 1,000
    # requests/day for 30 days (x30000).
    box = _summary_box(
        [
            [
                "CONSERVATIVE MODE (Safe for Production)",
                f"Token Reduction: {cons_savings:,} tokens ({cons_pct:.1f}%)",
                f"Cost Savings: ${cons_cost_save:.4f}/request (${cons_cost_save * 30000:.2f}/month @ 1K/day)",
                "Quality Impact: Minimal",
            ],
            [
                "AGGRESSIVE MODE (Maximum Savings)",
                f"Token Reduction: {aggr_savings:,} tokens ({aggr_pct:.1f}%)",
                f"Cost Savings: ${aggr_cost_save:.4f}/request (${aggr_cost_save * 30000:.2f}/month @ 1K/day)",
                "Quality Impact: Slight reduction in detail",
            ],
        ]
    )
    summary_lines = [
        "",
        "Real-world agentic scenario with 5 large tool outputs:",
        box,
        "Transforms Applied:",
        "- Tool Crusher: Compressed JSON arrays, truncated long strings",
        "- Cache Aligner: (Only applies to system prompts with dates)",
        "- Rolling Window: (Only applies when near context limit)",
        "",
    ]
    print("\n".join(summary_lines))
| if __name__ == "__main__": | |
| main() | |