#!/usr/bin/env python3
"""
Real-world OpenAI Agentic Evaluation for Headroom SDK.

Scenario: DevOps/SRE Agent investigating a production incident
- This is NOT hand-crafted for Headroom optimizations
- Tool outputs are realistic sizes based on actual tool responses
- The conversation flow mirrors real incident response

Tools used:
1. query_metrics - Prometheus/Datadog style metrics
2. search_logs - ELK/Splunk style log search
3. get_service_status - Health check endpoints
4. query_deployments - CI/CD deployment history
5. get_runbook - Documentation lookup
"""

import json
import os
import tempfile
import time
from dataclasses import dataclass
from datetime import datetime, timedelta

from dotenv import load_dotenv
from openai import OpenAI

from headroom import HeadroomClient, OpenAIProvider, ToolCrusherConfig
from headroom.config import HeadroomConfig
from headroom.transforms import TransformPipeline

load_dotenv(".env.local")

# Initialize
base_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
provider = OpenAIProvider()

db_path = os.path.join(tempfile.gettempdir(), "headroom_openai_eval.db")
client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path}",
    default_mode="audit",
)

# Aggressive config
aggressive_config = HeadroomConfig()
aggressive_config.tool_crusher = ToolCrusherConfig(
    min_tokens_to_crush=100,
    max_array_items=5,
    max_string_length=300,
    max_depth=4,
)

db_path_agg = os.path.join(tempfile.gettempdir(), "headroom_openai_eval_agg.db")
aggressive_client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path_agg}",
    default_mode="audit",
)
# NOTE(review): this overwrites private attributes of HeadroomClient to swap in
# the aggressive config; confirm whether the SDK offers a public way to do this.
aggressive_client._config = aggressive_config
aggressive_client._pipeline = TransformPipeline(aggressive_config, provider=provider)


# =============================================================================
# REALISTIC TOOL OUTPUTS - Based on actual production systems
# =============================================================================


def _metric_series(base_time, value_at, cap, labels, points=60):
    """Build one per-minute time series of ``points`` samples.

    ``value_at(i)`` gives the raw value for minute ``i``; each sample is
    clamped to ``cap`` with ``min`` so int/float types are left untouched
    (this matters for the JSON text that is produced downstream).
    """
    return [
        {
            "timestamp": (base_time + timedelta(minutes=i)).isoformat(),
            "value": min(value_at(i), cap),
            "labels": dict(labels),
        }
        for i in range(points)
    ]


def generate_metrics_response() -> str:
    """
    Realistic Prometheus/Datadog metrics query response.

    Returns a JSON string holding four series (CPU, memory, p99 latency,
    error rate) of 60 one-minute samples each, starting one hour before now.
    CPU, latency and error rate change regime at minute 45 to simulate the
    start of the incident.
    """
    base_time = datetime.now() - timedelta(hours=1)
    api_labels = {"instance": "prod-api-1", "job": "api-server"}

    # CPU metrics - 60 data points (1 per minute for 1 hour), spike at minute 45
    cpu_data = _metric_series(
        base_time,
        lambda i: 45 + (i * 0.5) if i < 45 else 85 + (i - 45) * 2,
        98,
        api_labels,
    )

    # Memory metrics - slow linear growth
    memory_data = _metric_series(base_time, lambda i: 62 + (i * 0.3), 89, api_labels)

    # Request latency (p99)
    latency_data = _metric_series(
        base_time,
        lambda i: 120 if i < 45 else 450 + (i - 45) * 50,
        2500,
        {"instance": "prod-api-1", "endpoint": "/api/v1/users"},
    )

    # Error rate
    error_data = _metric_series(
        base_time,
        lambda i: 0.1 if i < 45 else 2.5 + (i - 45) * 0.5,
        15,
        {"instance": "prod-api-1", "status_code": "5xx"},
    )

    return json.dumps(
        {
            "status": "success",
            "data": {
                "resultType": "matrix",
                "result": [
                    {"metric": {"__name__": "cpu_usage_percent"}, "values": cpu_data},
                    {"metric": {"__name__": "memory_usage_percent"}, "values": memory_data},
                    {
                        "metric": {"__name__": "http_request_duration_p99_ms"},
                        "values": latency_data,
                    },
                    {"metric": {"__name__": "http_errors_rate_percent"}, "values": error_data},
                ],
            },
            "query_time_ms": 127,
        }
    )


def generate_logs_response() -> str:
    """
    Realistic ELK/Splunk log search response.

    Returns a JSON string with 40 log hits spaced 45 seconds apart, starting
    30 minutes before now. Entries cycle through a fixed list of
    (level, message, service) templates.
    """
    base_time = datetime.now() - timedelta(minutes=30)

    log_templates = [
        ("ERROR", "Connection pool exhausted, waiting for available connection", "api-server"),
        (
            "WARN",
            "Slow query detected: SELECT * FROM users WHERE status = 'active' took 2.3s",
            "api-server",
        ),
        ("ERROR", "Database connection timeout after 30000ms", "api-server"),
        ("INFO", "Retry attempt 1/3 for database connection", "api-server"),
        ("ERROR", "Max retries exceeded for database operation", "api-server"),
        ("WARN", "Circuit breaker OPEN for database-primary", "api-server"),
        ("ERROR", "Failed to process request: upstream connect error", "api-server"),
        ("INFO", "Health check failed for database-primary", "health-checker"),
        ("ERROR", "PostgreSQL: too many connections for role 'api_user'", "database"),
        ("WARN", "Connection refused to database-primary:5432", "api-server"),
        ("ERROR", "Request timeout: /api/v1/users after 30s", "nginx"),
        ("INFO", "Scaling up api-server replicas from 3 to 5", "autoscaler"),
        ("ERROR", "OOM killed process api-server (pid 12345)", "kernel"),
        ("WARN", "Memory pressure detected, initiating garbage collection", "jvm"),
        ("ERROR", "Unhandled exception in request handler", "api-server"),
    ]

    logs = []
    for i in range(40):  # 40 log entries
        ts = base_time + timedelta(seconds=i * 45)
        level, msg, source = log_templates[i % len(log_templates)]
        logs.append(
            {
                "@timestamp": ts.isoformat(),
                "level": level,
                "message": msg,
                "service": source,
                "trace_id": f"trace-{1000 + i:04d}-abcd-{i:04d}",
                "span_id": f"span-{i:04d}",
                "host": f"prod-{source}-{i % 3 + 1}",
                "environment": "production",
                "version": "2.4.1",
                "kubernetes": {
                    "namespace": "production",
                    "pod": f"{source}-{i % 5 + 1}-abc123",
                    "container": source,
                    "node": f"node-{i % 3 + 1}.prod.internal",
                },
                "request": {
                    "method": "GET" if i % 2 == 0 else "POST",
                    "path": "/api/v1/users" if i % 3 == 0 else "/api/v1/orders",
                    "status_code": 500 if level == "ERROR" else 200,
                    # ERROR entries report a fixed 30s duration.
                    "duration_ms": (150 + (i * 100)) if level != "ERROR" else 30000,
                },
            }
        )

    return json.dumps(
        {
            "took": 234,
            "timed_out": False,
            "hits": {
                "total": {"value": len(logs), "relation": "eq"},
                "max_score": 1.0,
                "hits": logs,
            },
        }
    )


def generate_service_status() -> str:
    """
    Realistic health check / service status response.

    Returns a JSON string describing five services and their instances. A
    single timestamp is taken once and reused for every ``last_check`` field
    and the top-level ``timestamp`` so the snapshot is self-consistent.
    """
    checked_at = datetime.now().isoformat()
    return json.dumps(
        {
            "services": [
                {
                    "name": "api-server",
                    "status": "degraded",
                    "instances": [
                        {
                            "id": "api-1",
                            "status": "unhealthy",
                            "cpu": 94,
                            "memory": 87,
                            "connections": 500,
                        },
                        {
                            "id": "api-2",
                            "status": "healthy",
                            "cpu": 45,
                            "memory": 62,
                            "connections": 150,
                        },
                        {
                            "id": "api-3",
                            "status": "unhealthy",
                            "cpu": 91,
                            "memory": 85,
                            "connections": 480,
                        },
                    ],
                    "last_check": checked_at,
                    "error_rate": 12.5,
                    "p99_latency_ms": 2100,
                },
                {
                    "name": "database-primary",
                    "status": "critical",
                    "instances": [
                        {
                            "id": "db-primary",
                            "status": "unhealthy",
                            "connections": 500,
                            "max_connections": 500,
                            "replication_lag_ms": 0,
                            "disk_usage_percent": 78,
                        }
                    ],
                    "last_check": checked_at,
                    "active_queries": 487,
                    "blocked_queries": 52,
                },
                {
                    "name": "database-replica",
                    "status": "healthy",
                    "instances": [
                        {
                            "id": "db-replica-1",
                            "status": "healthy",
                            "connections": 120,
                            "max_connections": 500,
                            "replication_lag_ms": 150,
                            "disk_usage_percent": 76,
                        },
                        {
                            "id": "db-replica-2",
                            "status": "healthy",
                            "connections": 115,
                            "max_connections": 500,
                            "replication_lag_ms": 180,
                            "disk_usage_percent": 77,
                        },
                    ],
                    "last_check": checked_at,
                },
                {
                    "name": "redis-cache",
                    "status": "healthy",
                    "instances": [
                        {
                            "id": "redis-1",
                            "status": "healthy",
                            "memory_used_mb": 2048,
                            "memory_max_mb": 4096,
                            "connected_clients": 45,
                            "hit_rate": 0.94,
                        }
                    ],
                    "last_check": checked_at,
                },
                {
                    "name": "nginx-ingress",
                    "status": "healthy",
                    "instances": [
                        {
                            "id": "nginx-1",
                            "status": "healthy",
                            "active_connections": 1250,
                            "requests_per_sec": 450,
                        },
                        {
                            "id": "nginx-2",
                            "status": "healthy",
                            "active_connections": 1180,
                            "requests_per_sec": 420,
                        },
                    ],
                    "last_check": checked_at,
                },
            ],
            "overall_status": "critical",
            "timestamp": checked_at,
        }
    )


def generate_deployments_response() -> str:
    """
    Realistic CI/CD deployment history.

    Returns a JSON string with 15 deployment records, newest first, spaced
    four hours apart going back from now. Only the newest record has an
    in-progress canary.
    """
    base_time = datetime.now()
    commit_messages = [
        "feat: Add new user endpoint",
        "fix: Connection pool sizing",
        "chore: Update dependencies",
        "feat: Implement caching layer",
        "fix: Memory leak in request handler",
    ]

    deployments = []
    for i in range(15):
        ts = base_time - timedelta(hours=i * 4)
        deployments.append(
            {
                "id": f"deploy-{1000 - i}",
                "service": "api-server" if i % 3 != 2 else "database-migration",
                "version": f"2.4.{15 - i}",
                # Every deployment, including the recent one, reports success.
                "status": "success",
                "timestamp": ts.isoformat(),
                "commit": f"abc{i:04d}def",
                "author": f"dev{i % 5 + 1}@company.com",
                "message": commit_messages[i % 5],
                "changes": {
                    "files_changed": 5 + i,
                    "insertions": 100 + i * 20,
                    "deletions": 30 + i * 5,
                },
                "rollback_available": True,
                "canary_status": "completed" if i > 0 else "in_progress",
            }
        )

    return json.dumps(
        {"deployments": deployments, "total_count": len(deployments), "page": 1, "per_page": 20}
    )


def generate_runbook_response() -> str:
    """
    Realistic runbook/documentation lookup.

    Returns a JSON string with a single static runbook document.
    """
    return json.dumps(
        {
            "runbook": {
                "title": "Database Connection Pool Exhaustion",
                "id": "RUN-DB-001",
                "severity": "P1",
                "last_updated": "2024-11-15",
                "owner": "platform-team",
                "symptoms": [
                    "High error rate on API endpoints",
                    "Connection timeout errors in logs",
                    "Database showing max connections reached",
                    "Increased latency across all services",
                ],
                "diagnosis_steps": [
                    "1. Check current connection count: SELECT count(*) FROM pg_stat_activity",
                    "2. Identify connection holders: SELECT * FROM pg_stat_activity WHERE state != 'idle'",
                    "3. Check for long-running queries: SELECT * FROM pg_stat_activity WHERE state = 'active' AND query_start < now() - interval '1 minute'",
                    "4. Verify connection pool settings in application config",
                    "5. Check for connection leaks in recent deployments",
                ],
                "remediation_steps": [
                    "1. IMMEDIATE: Kill idle connections older than 10 minutes",
                    "2. IMMEDIATE: Scale up API server replicas to distribute load",
                    "3. SHORT-TERM: Increase max_connections on database (requires restart)",
                    "4. SHORT-TERM: Review and optimize connection pool settings",
                    "5. LONG-TERM: Implement connection pooler (PgBouncer)",
                ],
                "commands": {
                    "kill_idle_connections": "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND query_start < now() - interval '10 minutes'",
                    "check_pool_settings": "kubectl get configmap api-server-config -o yaml | grep -A5 'database'",
                    "scale_replicas": "kubectl scale deployment api-server --replicas=5",
                },
                "related_incidents": ["INC-2024-089", "INC-2024-056", "INC-2024-023"],
                "escalation_path": ["on-call-sre", "database-team", "platform-lead"],
            }
        }
    )


# =============================================================================
# BUILD REALISTIC INCIDENT RESPONSE CONVERSATION
# =============================================================================


def _assistant_tool_call(content, call_id: str, name: str, arguments: dict) -> dict:
    """Build an assistant message that issues a single function tool call."""
    return {
        "role": "assistant",
        "content": content,
        "tool_calls": [
            {
                "id": call_id,
                "type": "function",
                "function": {"name": name, "arguments": json.dumps(arguments)},
            }
        ],
    }


def _tool_result(call_id: str, content: str) -> dict:
    """Build the tool message that answers the call identified by ``call_id``."""
    return {"role": "tool", "tool_call_id": call_id, "content": content}


def build_incident_conversation() -> list[dict]:
    """
    Build a realistic incident response agentic conversation.

    The returned message list contains a system prompt, the user's incident
    report, five assistant tool calls each followed by its tool result
    (metrics, logs, service status, deployments, runbook), the assistant's
    summary, and a final user request for commands and a post-mortem.
    """
    messages = [
        # System prompt (realistic for an SRE agent)
        {
            "role": "system",
            "content": """You are an SRE incident response assistant. You help investigate and resolve production incidents.

Current Date: 2024-12-15T14:30:00Z
On-call: @platform-team
Incident Channel: #inc-2024-142

You have access to the following tools:
- query_metrics: Query Prometheus metrics
- search_logs: Search ELK logs
- get_service_status: Get current service health
- query_deployments: Get recent deployment history
- get_runbook: Lookup runbook documentation

Always be concise and focus on actionable insights.""",
        },
        # User reports incident
        {
            "role": "user",
            "content": "We're seeing elevated error rates on the API. Users reporting timeouts. Can you investigate?",
        },
        # Agent starts investigation - queries metrics
        _assistant_tool_call(
            None,
            "call_metrics_1",
            "query_metrics",
            {
                "query": "api_error_rate, cpu_usage, memory_usage, latency_p99",
                "time_range": "1h",
            },
        ),
        # Metrics response
        _tool_result("call_metrics_1", generate_metrics_response()),
        # Agent analyzes and queries logs
        _assistant_tool_call(
            "I see a clear pattern: around 45 minutes ago, CPU spiked to 95%, latency jumped from 120ms to 2500ms, and error rate went from 0.1% to 15%. Let me check the logs for that timeframe.",
            "call_logs_1",
            "search_logs",
            {
                "query": "level:ERROR OR level:WARN",
                "time_range": "30m",
                "services": ["api-server", "database"],
            },
        ),
        # Logs response
        _tool_result("call_logs_1", generate_logs_response()),
        # Agent identifies root cause, checks service status
        _assistant_tool_call(
            "The logs show database connection issues: 'Connection pool exhausted', 'too many connections', and 'connection timeout'. This is a database connection saturation issue. Let me check current service status.",
            "call_status_1",
            "get_service_status",
            {},
        ),
        # Service status response
        _tool_result("call_status_1", generate_service_status()),
        # Agent checks for recent deployments
        _assistant_tool_call(
            "Confirmed: database-primary is CRITICAL with 500/500 connections (100% saturated), 52 blocked queries. API servers are degraded. Let me check if a recent deployment might have caused this.",
            "call_deploys_1",
            "query_deployments",
            {"service": "api-server", "limit": 15},
        ),
        # Deployments response
        _tool_result("call_deploys_1", generate_deployments_response()),
        # Agent gets runbook
        _assistant_tool_call(
            "I see deploy-999 (version 2.4.14) was deployed 4 hours ago with 'fix: Connection pool sizing' - this might have changed pool settings. There's also a canary deployment in progress. Let me get the runbook for this issue.",
            "call_runbook_1",
            "get_runbook",
            {"search": "database connection pool exhaustion"},
        ),
        # Runbook response
        _tool_result("call_runbook_1", generate_runbook_response()),
        # Agent provides final analysis
        {
            "role": "assistant",
            "content": """**Incident Summary: Database Connection Pool Exhaustion**

**Root Cause:** The database primary is at 100% connection capacity (500/500). This started ~45 minutes ago, likely related to deploy-999 which modified connection pool settings.

**Impact:**
- API error rate: 15% (up from 0.1%)
- P99 latency: 2500ms (up from 120ms)
- 2 of 3 API instances unhealthy

**Immediate Actions Recommended:**
1. Kill idle database connections (per runbook RUN-DB-001)
2. Scale API servers to 5 replicas to distribute load
3. Consider rolling back deploy-999 if connection pool change is confirmed

Shall I proceed with the remediation steps?""",
        },
        # User asks for action
        {
            "role": "user",
            "content": "Yes, give me the exact commands to run and summarize the incident for the post-mortem.",
        },
    ]
    return messages


# =============================================================================
# EVALUATION
# =============================================================================

# Criteria the judge is asked to score, in display order.
_JUDGE_CRITERIA = ("correctness", "completeness", "incident_summary", "actionability")


@dataclass
class EvalResult:
    """Outcome of one chat-completion run made by ``run_eval``."""

    mode: str  # headroom_mode passed to the client
    tokens_input: int  # prompt tokens reported by the API (0 if usage missing)
    tokens_output: int  # completion tokens reported by the API (0 if usage missing)
    latency_ms: float  # elapsed time around the API call, in milliseconds
    response: str  # assistant message text ("" if the API returned none)
    cost_estimate: float  # USD estimate from the hard-coded per-token prices


def evaluate_response_quality(
    baseline_response: str, optimized_response: str, optimization_level: str
) -> dict:
    """
    Use GPT-4o as judge to evaluate if the optimized response maintains quality.

    Args:
        baseline_response: Response text produced without optimization.
        optimized_response: Response text produced with optimization.
        optimization_level: Label inserted into the prompt (e.g. "default").

    Returns:
        The judge's parsed JSON object. If the reply is empty, is not valid
        JSON, or is not a JSON object, returns
        ``{"error": "Failed to parse judge response", "raw": <reply>}``.
    """
    judge_prompt = f"""You are evaluating two AI assistant responses to an SRE incident investigation request.

The user asked: "Give me the exact commands to run and summarize the incident for the post-mortem."

BASELINE RESPONSE (no optimization):
{baseline_response}

OPTIMIZED RESPONSE ({optimization_level} optimization):
{optimized_response}

Evaluate the optimized response compared to baseline on these criteria:

1. CORRECTNESS: Are the SQL/kubectl commands identical and correct? (1-5)
2. COMPLETENESS: Does it include all necessary remediation steps? (1-5)
3. INCIDENT SUMMARY: Is the post-mortem summary accurate and complete? (1-5)
4. ACTIONABILITY: Is the response equally actionable for an SRE? (1-5)

For each criterion, score 1-5 where:
- 5 = Equivalent or better than baseline
- 4 = Minor differences, still fully usable
- 3 = Some information missing but acceptable
- 2 = Noticeable quality degradation
- 1 = Critical information missing

Respond in this exact JSON format:
{{
    "correctness": {{"score": N, "reason": "..."}},
    "completeness": {{"score": N, "reason": "..."}},
    "incident_summary": {{"score": N, "reason": "..."}},
    "actionability": {{"score": N, "reason": "..."}},
    "overall_score": N,
    "verdict": "PASS" or "FAIL",
    "summary": "One sentence summary"
}}

PASS means overall_score >= 4.0, FAIL means < 4.0."""

    response = base_client.chat.completions.create(
        model="gpt-4o",  # Using stronger model for judging
        messages=[{"role": "user", "content": judge_prompt}],
        max_tokens=1000,
        response_format={"type": "json_object"},
    )
    raw = response.choices[0].message.content

    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a missing (None) message content.
        parsed = None

    # Callers test `"error" in result`, so anything but a dict is a failure.
    if not isinstance(parsed, dict):
        return {"error": "Failed to parse judge response", "raw": raw}
    return parsed


def run_eval(messages: list[dict], mode: str, use_aggressive: bool = False) -> EvalResult:
    """Run one chat completion and collect token, latency and cost figures.

    Args:
        messages: Conversation to send.
        mode: Value forwarded as ``headroom_mode``.
        use_aggressive: Use ``aggressive_client`` instead of ``client``.

    Returns:
        An ``EvalResult`` for the call.
    """
    c = aggressive_client if use_aggressive else client

    # perf_counter is monotonic, so the interval is immune to clock adjustments.
    start_time = time.perf_counter()
    response = c.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=1000,
        headroom_mode=mode,
    )
    latency_ms = (time.perf_counter() - start_time) * 1000

    response_text = response.choices[0].message.content or ""
    tokens_in = response.usage.prompt_tokens if response.usage else 0
    tokens_out = response.usage.completion_tokens if response.usage else 0

    # GPT-4o-mini pricing: $0.15/1M input, $0.60/1M output
    cost = (tokens_in / 1_000_000) * 0.15 + (tokens_out / 1_000_000) * 0.60

    return EvalResult(
        mode=mode,
        tokens_input=tokens_in,
        tokens_output=tokens_out,
        latency_ms=latency_ms,
        response=response_text,
        cost_estimate=cost,
    )


def _pct(part, whole) -> float:
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return (part / whole * 100) if whole else 0.0


def _banner(*titles: str, rule: str = "-") -> None:
    """Print ``titles`` framed above and below by a 70-character rule."""
    print(rule * 70)
    for title in titles:
        print(title)
    print(rule * 70)


def _print_simulation_row(label: str, sim) -> None:
    """Print one row of the simulation table for a simulate() result."""
    saved_pct = _pct(sim.tokens_saved, sim.tokens_before)
    print(
        f"{label:<15} {sim.tokens_before:>10,} {sim.tokens_after:>10,} "
        f"{sim.tokens_saved:>10,} {saved_pct:>7.1f}%"
    )


def _print_run(result: EvalResult) -> None:
    """Print the token, cost and latency lines for one run."""
    print(f" Tokens: {result.tokens_input:,} in / {result.tokens_output:,} out")
    print(f" Cost: ${result.cost_estimate:.6f} | Latency: {result.latency_ms:.0f}ms")


def _criterion_field(evaluation: dict, criterion: str, field: str):
    """Read ``evaluation[criterion][field]``, falling back to "N/A".

    The judge is an LLM, so a criterion entry may be absent or not a dict.
    """
    entry = evaluation.get(criterion)
    if not isinstance(entry, dict):
        return "N/A"
    return entry.get(field, "N/A")


def main() -> None:
    """Run the simulations, the three API calls, the judge, and print a report."""
    _banner(
        "REAL-WORLD OPENAI AGENTIC EVALUATION",
        "Scenario: SRE Incident Response Investigation",
        rule="=",
    )
    print()

    messages = build_incident_conversation()

    # Count tool outputs
    tool_messages = [m for m in messages if m.get("role") == "tool"]
    print(f"Conversation: {len(messages)} messages")
    print(f"Tool calls: {len(tool_messages)} (metrics, logs, status, deployments, runbook)")
    print()

    # Simulations
    _banner("SIMULATIONS")

    sim_default = client.chat.completions.simulate(model="gpt-4o-mini", messages=messages)
    sim_aggressive = aggressive_client.chat.completions.simulate(
        model="gpt-4o-mini", messages=messages
    )

    print(f"\n{'Mode':<15} {'Before':>10} {'After':>10} {'Saved':>10} {'%':>8}")
    print("-" * 55)
    _print_simulation_row("Default", sim_default)
    _print_simulation_row("Aggressive", sim_aggressive)
    print(f"\nTransforms: {sim_default.transforms}")
    print()

    # Actual API calls
    _banner("ACTUAL API CALLS")

    print("\n1. BASELINE (No optimization)...")
    baseline = run_eval(messages, "audit")
    _print_run(baseline)

    print("\n2. DEFAULT OPTIMIZATION...")
    default_opt = run_eval(messages, "optimize")
    _print_run(default_opt)

    print("\n3. AGGRESSIVE OPTIMIZATION...")
    aggressive_opt = run_eval(messages, "optimize", use_aggressive=True)
    _print_run(aggressive_opt)

    # Results table
    print()
    _banner("RESULTS COMPARISON", rule="=")

    def_savings = baseline.tokens_input - default_opt.tokens_input
    def_pct = _pct(def_savings, baseline.tokens_input)
    agg_savings = baseline.tokens_input - aggressive_opt.tokens_input
    agg_pct = _pct(agg_savings, baseline.tokens_input)

    print(f"\n{'Metric':<20} {'Baseline':>12} {'Default Opt':>12} {'Aggressive':>12}")
    print("-" * 60)
    print(
        f"{'Input Tokens':<20} {baseline.tokens_input:>12,} "
        f"{default_opt.tokens_input:>12,} {aggressive_opt.tokens_input:>12,}"
    )
    print(
        f"{'Output Tokens':<20} {baseline.tokens_output:>12,} "
        f"{default_opt.tokens_output:>12,} {aggressive_opt.tokens_output:>12,}"
    )
    print(
        f"{'Cost':<20} ${baseline.cost_estimate:>11.6f} "
        f"${default_opt.cost_estimate:>11.6f} ${aggressive_opt.cost_estimate:>11.6f}"
    )
    print(
        f"{'Latency (ms)':<20} {baseline.latency_ms:>12.0f} "
        f"{default_opt.latency_ms:>12.0f} {aggressive_opt.latency_ms:>12.0f}"
    )
    print()
    print(
        f"{'Token Savings':<20} {'-':>12} {def_savings:>10,} ({def_pct:.0f}%) "
        f"{agg_savings:>10,} ({agg_pct:.0f}%)"
    )

    # Show responses (first 800 characters of each)
    print()
    _banner("RESPONSE COMPARISON")

    print("\n[BASELINE RESPONSE]")
    print(baseline.response[:800])
    print("..." if len(baseline.response) > 800 else "")

    print("\n[AGGRESSIVE OPTIMIZATION RESPONSE]")
    print(aggressive_opt.response[:800])
    print("..." if len(aggressive_opt.response) > 800 else "")

    # Quality Evaluation with LLM Judge
    print()
    _banner("QUALITY EVALUATION (GPT-4o as Judge)")

    print("\nEvaluating DEFAULT optimization vs Baseline...")
    default_eval = evaluate_response_quality(baseline.response, default_opt.response, "default")

    print("\nEvaluating AGGRESSIVE optimization vs Baseline...")
    aggressive_eval = evaluate_response_quality(
        baseline.response, aggressive_opt.response, "aggressive"
    )

    default_ok = "error" not in default_eval
    aggressive_ok = "error" not in aggressive_eval

    print(f"\n{'Criterion':<20} {'Default':>12} {'Aggressive':>12}")
    print("-" * 46)

    if default_ok and aggressive_ok:
        # `!s` keeps the alignment spec valid even if the judge returns null.
        for criterion in _JUDGE_CRITERIA:
            d_score = _criterion_field(default_eval, criterion, "score")
            a_score = _criterion_field(aggressive_eval, criterion, "score")
            label = criterion.replace("_", " ").title()
            print(f"{label:<20} {d_score!s:>12}/5 {a_score!s:>12}/5")

        print("-" * 46)
        d_overall = default_eval.get("overall_score", "N/A")
        a_overall = aggressive_eval.get("overall_score", "N/A")
        print(f"{'OVERALL SCORE':<20} {d_overall!s:>12}/5 {a_overall!s:>12}/5")

        table_d_verdict = default_eval.get("verdict", "N/A")
        table_a_verdict = aggressive_eval.get("verdict", "N/A")
        print(f"{'VERDICT':<20} {table_d_verdict!s:>12} {table_a_verdict!s:>12}")

        print("\n[Default Optimization Judge Summary]")
        print(f" {default_eval.get('summary', 'N/A')}")

        print("\n[Aggressive Optimization Judge Summary]")
        print(f" {aggressive_eval.get('summary', 'N/A')}")

        # Detailed reasoning
        print("\n[Detailed Evaluation - Aggressive]")
        for criterion in _JUDGE_CRITERIA:
            reason = _criterion_field(aggressive_eval, criterion, "reason")
            print(f" {criterion.title()}: {reason}")
    else:
        print("Error in evaluation:")
        print(f" Default: {default_eval}")
        print(f" Aggressive: {aggressive_eval}")

    # Summary: per-request saving scaled to 1000 requests/day over 30 days.
    cost_save_monthly = (baseline.cost_estimate - aggressive_opt.cost_estimate) * 1000 * 30

    print()
    _banner("SUMMARY", rule="=")

    # Get verdicts for summary
    d_verdict = default_eval.get("verdict", "N/A") if default_ok else "ERROR"
    a_verdict = aggressive_eval.get("verdict", "N/A") if aggressive_ok else "ERROR"
    d_score = default_eval.get("overall_score", 0) if default_ok else 0
    a_score = aggressive_eval.get("overall_score", 0) if aggressive_ok else 0

    if a_verdict == "PASS":
        conclusion = (
            "✓ Headroom achieves "
            + f"{agg_pct:.0f}% token reduction with PASSING quality scores."
        )
        conclusion_detail = (
            " The compressed context maintains semantic equivalence for model reasoning."
        )
    else:
        conclusion = (
            "⚠ Aggressive optimization may degrade response quality - use conservative settings."
        )
        conclusion_detail = ""

    print(f"""
Real-world SRE incident investigation with 5 tool calls:

Tool Outputs:
- Metrics query: ~240 data points (60 per metric x 4 metrics)
- Log search: 40 log entries with full metadata
- Service status: 5 services with instance details
- Deployments: 15 deployment records
- Runbook: Structured documentation

Token Savings:
- Default optimization: {def_savings:,} tokens saved ({def_pct:.1f}%)
- Aggressive optimization: {agg_savings:,} tokens saved ({agg_pct:.1f}%)

Quality Evaluation (GPT-4o Judge):
- Default: {d_verdict} (Score: {d_score}/5)
- Aggressive: {a_verdict} (Score: {a_score}/5)

Cost Impact @ 1K requests/day:
- Monthly savings: ${cost_save_monthly:.2f}

CONCLUSION:
{conclusion}
{conclusion_detail}
""")


if __name__ == "__main__":
    main()