Spaces:

minhtudragon
/

headroom

Build error

App Files Files Community

headroom / examples /real_world_openai_eval.py

chopratejas

Fix all ruff lint and format errors for CI

e4a41fa 6 months ago

Raw

History Blame

33.6 kB

	#!/usr/bin/env python3
	"""
	Real-world OpenAI Agentic Evaluation for Headroom SDK.

	Scenario: DevOps/SRE Agent investigating a production incident
	- This is NOT hand-crafted for Headroom optimizations
	- Tool outputs are realistic sizes based on actual tool responses
	- The conversation flow mirrors real incident response

	Tools used:
	1. query_metrics - Prometheus/Datadog style metrics
	2. search_logs - ELK/Splunk style log search
	3. get_service_status - Health check endpoints
	4. query_deployments - CI/CD deployment history
	5. get_runbook - Documentation lookup
	"""

	import json
	import os
	import tempfile
	import time
	from dataclasses import dataclass
	from datetime import datetime, timedelta

	from dotenv import load_dotenv
	from openai import OpenAI

	from headroom import HeadroomClient, OpenAIProvider, ToolCrusherConfig
	from headroom.config import HeadroomConfig
	from headroom.transforms import TransformPipeline

	# Read environment variables from a local dotenv file before any client is built
	# (presumably this is where OPENAI_API_KEY lives - confirm against the repo's setup docs).
	load_dotenv(".env.local")

	# Initialize
	# The raw OpenAI client is wrapped by both Headroom clients below and is also
	# used directly as the LLM judge in evaluate_response_quality().
	base_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
	provider = OpenAIProvider()

	# Default-settings client; its store URL points at a SQLite file in the system temp directory.
	db_path = os.path.join(tempfile.gettempdir(), "headroom_openai_eval.db")
	client = HeadroomClient(
	original_client=base_client,
	provider=provider,
	store_url=f"sqlite:///{db_path}",
	default_mode="audit",
	)

	# Aggressive config
	# Starts from a default HeadroomConfig and replaces only the tool_crusher section.
	aggressive_config = HeadroomConfig()
	aggressive_config.tool_crusher = ToolCrusherConfig(
	min_tokens_to_crush=100,
	max_array_items=5,
	max_string_length=300,
	max_depth=4,
	)

	# Second client with its own SQLite file so the two runs do not share a store.
	db_path_agg = os.path.join(tempfile.gettempdir(), "headroom_openai_eval_agg.db")
	aggressive_client = HeadroomClient(
	original_client=base_client,
	provider=provider,
	store_url=f"sqlite:///{db_path_agg}",
	default_mode="audit",
	)
	# NOTE(review): the config and pipeline are swapped in through private attributes after
	# construction; presumably HeadroomClient offers no public way to inject them - confirm.
	aggressive_client._config = aggressive_config
	aggressive_client._pipeline = TransformPipeline(aggressive_config, provider=provider)


	# =============================================================================
	# REALISTIC TOOL OUTPUTS - Based on actual production systems
	# =============================================================================


	def generate_metrics_response() -> str:
	    """
	    Build a Prometheus/Datadog-style metrics query payload as a JSON string.

	    Four series are produced for the hour ending now, one sample per minute
	    (60 samples each): CPU, memory, p99 latency and 5xx error rate. CPU,
	    latency and error rate switch regime at minute 45, and every value is
	    clamped to a per-series ceiling.
	    """
	    window_start = datetime.now() - timedelta(hours=1)
	    minutes = range(60)

	    def sample(minute, raw, ceiling, labels):
	        # One data point: ISO timestamp, clamped value, and the series labels.
	        return {
	            "timestamp": (window_start + timedelta(minutes=minute)).isoformat(),
	            "value": min(raw, ceiling),
	            "labels": labels,
	        }

	    # CPU: slow climb, then a jump to 85 that keeps rising until it hits the 98 cap.
	    cpu_series = [
	        sample(
	            m,
	            45 + (m * 0.5) if m < 45 else 85 + (m - 45) * 2,
	            98,
	            {"instance": "prod-api-1", "job": "api-server"},
	        )
	        for m in minutes
	    ]

	    # Memory: steady linear growth, capped at 89.
	    memory_series = [
	        sample(m, 62 + (m * 0.3), 89, {"instance": "prod-api-1", "job": "api-server"})
	        for m in minutes
	    ]

	    # p99 latency: flat 120 until minute 45, then 450 plus 50 per minute (cap 2500).
	    latency_series = [
	        sample(
	            m,
	            120 if m < 45 else 450 + (m - 45) * 50,
	            2500,
	            {"instance": "prod-api-1", "endpoint": "/api/v1/users"},
	        )
	        for m in minutes
	    ]

	    # Error rate: 0.1 until minute 45, then 2.5 plus 0.5 per minute (cap 15).
	    error_series = [
	        sample(
	            m,
	            0.1 if m < 45 else 2.5 + (m - 45) * 0.5,
	            15,
	            {"instance": "prod-api-1", "status_code": "5xx"},
	        )
	        for m in minutes
	    ]

	    matrix = [
	        {"metric": {"__name__": "cpu_usage_percent"}, "values": cpu_series},
	        {"metric": {"__name__": "memory_usage_percent"}, "values": memory_series},
	        {"metric": {"__name__": "http_request_duration_p99_ms"}, "values": latency_series},
	        {"metric": {"__name__": "http_errors_rate_percent"}, "values": error_series},
	    ]
	    envelope = {
	        "status": "success",
	        "data": {"resultType": "matrix", "result": matrix},
	        "query_time_ms": 127,
	    }
	    return json.dumps(envelope)


	def generate_logs_response() -> str:
	    """
	    Build an ELK/Splunk-style log search payload as a JSON string.

	    Forty hits are emitted, 45 seconds apart, starting 30 minutes before now.
	    Level, message and service cycle through a fixed list of templates; the
	    remaining fields are derived from the hit's position.
	    """
	    search_start = datetime.now() - timedelta(minutes=30)

	    # (level, message, emitting service) triples, reused round-robin.
	    templates = (
	        ("ERROR", "Connection pool exhausted, waiting for available connection", "api-server"),
	        (
	            "WARN",
	            "Slow query detected: SELECT * FROM users WHERE status = 'active' took 2.3s",
	            "api-server",
	        ),
	        ("ERROR", "Database connection timeout after 30000ms", "api-server"),
	        ("INFO", "Retry attempt 1/3 for database connection", "api-server"),
	        ("ERROR", "Max retries exceeded for database operation", "api-server"),
	        ("WARN", "Circuit breaker OPEN for database-primary", "api-server"),
	        ("ERROR", "Failed to process request: upstream connect error", "api-server"),
	        ("INFO", "Health check failed for database-primary", "health-checker"),
	        ("ERROR", "PostgreSQL: too many connections for role 'api_user'", "database"),
	        ("WARN", "Connection refused to database-primary:5432", "api-server"),
	        ("ERROR", "Request timeout: /api/v1/users after 30s", "nginx"),
	        ("INFO", "Scaling up api-server replicas from 3 to 5", "autoscaler"),
	        ("ERROR", "OOM killed process api-server (pid 12345)", "kernel"),
	        ("WARN", "Memory pressure detected, initiating garbage collection", "jvm"),
	        ("ERROR", "Unhandled exception in request handler", "api-server"),
	    )

	    def hit(n):
	        # One search hit; ERROR entries report HTTP 500 and a 30 s duration.
	        severity, text, origin = templates[n % len(templates)]
	        failed = severity == "ERROR"
	        return {
	            "@timestamp": (search_start + timedelta(seconds=n * 45)).isoformat(),
	            "level": severity,
	            "message": text,
	            "service": origin,
	            "trace_id": f"trace-{1000 + n:04d}-abcd-{n:04d}",
	            "span_id": f"span-{n:04d}",
	            "host": f"prod-{origin}-{n % 3 + 1}",
	            "environment": "production",
	            "version": "2.4.1",
	            "kubernetes": {
	                "namespace": "production",
	                "pod": f"{origin}-{n % 5 + 1}-abc123",
	                "container": origin,
	                "node": f"node-{n % 3 + 1}.prod.internal",
	            },
	            "request": {
	                "method": "POST" if n % 2 else "GET",
	                "path": "/api/v1/orders" if n % 3 else "/api/v1/users",
	                "status_code": 500 if failed else 200,
	                "duration_ms": 30000 if failed else 150 + (n * 100),
	            },
	        }

	    entries = [hit(n) for n in range(40)]  # 40 log entries

	    return json.dumps(
	        {
	            "took": 234,
	            "timed_out": False,
	            "hits": {
	                "total": {"value": len(entries), "relation": "eq"},
	                "max_score": 1.0,
	                "hits": entries,
	            },
	        }
	    )


	def generate_service_status() -> str:
	    """
	    Build a health-check / service-status payload as a JSON string.

	    Five services are reported (API servers, primary database, replicas,
	    Redis and nginx ingress). Only the timestamps vary between calls.
	    """

	    def stamp():
	        # A fresh wall-clock reading per field, so each "last_check" is taken separately.
	        return datetime.now().isoformat()

	    api_server = {
	        "name": "api-server",
	        "status": "degraded",
	        "instances": [
	            {"id": "api-1", "status": "unhealthy", "cpu": 94, "memory": 87, "connections": 500},
	            {"id": "api-2", "status": "healthy", "cpu": 45, "memory": 62, "connections": 150},
	            {"id": "api-3", "status": "unhealthy", "cpu": 91, "memory": 85, "connections": 480},
	        ],
	        "last_check": stamp(),
	        "error_rate": 12.5,
	        "p99_latency_ms": 2100,
	    }

	    # The primary reports connections equal to max_connections.
	    db_primary = {
	        "name": "database-primary",
	        "status": "critical",
	        "instances": [
	            {
	                "id": "db-primary",
	                "status": "unhealthy",
	                "connections": 500,
	                "max_connections": 500,
	                "replication_lag_ms": 0,
	                "disk_usage_percent": 78,
	            }
	        ],
	        "last_check": stamp(),
	        "active_queries": 487,
	        "blocked_queries": 52,
	    }

	    db_replica = {
	        "name": "database-replica",
	        "status": "healthy",
	        "instances": [
	            {
	                "id": "db-replica-1",
	                "status": "healthy",
	                "connections": 120,
	                "max_connections": 500,
	                "replication_lag_ms": 150,
	                "disk_usage_percent": 76,
	            },
	            {
	                "id": "db-replica-2",
	                "status": "healthy",
	                "connections": 115,
	                "max_connections": 500,
	                "replication_lag_ms": 180,
	                "disk_usage_percent": 77,
	            },
	        ],
	        "last_check": stamp(),
	    }

	    redis_cache = {
	        "name": "redis-cache",
	        "status": "healthy",
	        "instances": [
	            {
	                "id": "redis-1",
	                "status": "healthy",
	                "memory_used_mb": 2048,
	                "memory_max_mb": 4096,
	                "connected_clients": 45,
	                "hit_rate": 0.94,
	            }
	        ],
	        "last_check": stamp(),
	    }

	    nginx_ingress = {
	        "name": "nginx-ingress",
	        "status": "healthy",
	        "instances": [
	            {
	                "id": "nginx-1",
	                "status": "healthy",
	                "active_connections": 1250,
	                "requests_per_sec": 450,
	            },
	            {
	                "id": "nginx-2",
	                "status": "healthy",
	                "active_connections": 1180,
	                "requests_per_sec": 420,
	            },
	        ],
	        "last_check": stamp(),
	    }

	    report = {
	        "services": [api_server, db_primary, db_replica, redis_cache, nginx_ingress],
	        "overall_status": "critical",
	        "timestamp": stamp(),
	    }
	    return json.dumps(report)


	def generate_deployments_response() -> str:
	    """
	    Build a CI/CD deployment-history payload as a JSON string.

	    Fifteen records are listed newest first, four hours apart, ending now.
	    Every third record (offset 2) is a database migration; the newest record
	    is the only one whose canary is still in progress.
	    """
	    now = datetime.now()

	    commit_subjects = (
	        "feat: Add new user endpoint",
	        "fix: Connection pool sizing",
	        "chore: Update dependencies",
	        "feat: Implement caching layer",
	        "fix: Memory leak in request handler",
	    )

	    def record(age):
	        # `age` counts deployments back from the most recent one (0 = newest).
	        return {
	            "id": f"deploy-{1000 - age}",
	            "service": "database-migration" if age % 3 == 2 else "api-server",
	            "version": f"2.4.{15 - age}",
	            # Every record reports success, including the recent deploy.
	            "status": "success",
	            "timestamp": (now - timedelta(hours=age * 4)).isoformat(),
	            "commit": f"abc{age:04d}def",
	            "author": f"dev{age % 5 + 1}@company.com",
	            "message": commit_subjects[age % 5],
	            "changes": {
	                "files_changed": 5 + age,
	                "insertions": 100 + age * 20,
	                "deletions": 30 + age * 5,
	            },
	            "rollback_available": True,
	            "canary_status": "completed" if age > 0 else "in_progress",
	        }

	    history = [record(age) for age in range(15)]

	    page = {"deployments": history, "total_count": len(history), "page": 1, "per_page": 20}
	    return json.dumps(page)


	def generate_runbook_response() -> str:
	    """
	    Build a runbook/documentation lookup payload as a JSON string.

	    The content is static: one runbook (RUN-DB-001) with symptoms, diagnosis
	    steps, remediation steps, ready-to-run commands, related incidents and an
	    escalation path.
	    """
	    return json.dumps(
	        {
	            "runbook": {
	                "title": "Database Connection Pool Exhaustion",
	                "id": "RUN-DB-001",
	                "severity": "P1",
	                "last_updated": "2024-11-15",
	                "owner": "platform-team",
	                "symptoms": [
	                    "High error rate on API endpoints",
	                    "Connection timeout errors in logs",
	                    "Database showing max connections reached",
	                    "Increased latency across all services",
	                ],
	                "diagnosis_steps": [
	                    "1. Check current connection count: SELECT count(*) FROM pg_stat_activity",
	                    "2. Identify connection holders: SELECT * FROM pg_stat_activity WHERE state != 'idle'",
	                    "3. Check for long-running queries: SELECT * FROM pg_stat_activity WHERE state = 'active' AND query_start < now() - interval '1 minute'",
	                    "4. Verify connection pool settings in application config",
	                    "5. Check for connection leaks in recent deployments",
	                ],
	                "remediation_steps": [
	                    "1. IMMEDIATE: Kill idle connections older than 10 minutes",
	                    "2. IMMEDIATE: Scale up API server replicas to distribute load",
	                    "3. SHORT-TERM: Increase max_connections on database (requires restart)",
	                    "4. SHORT-TERM: Review and optimize connection pool settings",
	                    "5. LONG-TERM: Implement connection pooler (PgBouncer)",
	                ],
	                "commands": {
	                    "kill_idle_connections": "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND query_start < now() - interval '10 minutes'",
	                    # A plain "|" is required here: "\|" is an invalid escape that leaves a
	                    # literal backslash in the command, so the shell would not pipe into grep.
	                    "check_pool_settings": "kubectl get configmap api-server-config -o yaml | grep -A5 'database'",
	                    "scale_replicas": "kubectl scale deployment api-server --replicas=5",
	                },
	                "related_incidents": ["INC-2024-089", "INC-2024-056", "INC-2024-023"],
	                "escalation_path": ["on-call-sre", "database-team", "platform-lead"],
	            }
	        }
	    )


	# =============================================================================
	# BUILD REALISTIC INCIDENT RESPONSE CONVERSATION
	# =============================================================================


	def build_incident_conversation() -> list[dict]:
	    """
	    Assemble the scripted incident-response conversation.

	    The transcript is a system prompt, the user's report, five assistant
	    turns that each issue one tool call (metrics, logs, service status,
	    deployments, runbook) followed by that tool's output, the assistant's
	    summary, and a closing user request. Tool outputs come from the
	    generate_* functions in this file, called in transcript order.
	    """

	    def agent_call(text, call_id, tool, arguments):
	        # Assistant message carrying exactly one function call; arguments are JSON-encoded.
	        return {
	            "role": "assistant",
	            "content": text,
	            "tool_calls": [
	                {
	                    "id": call_id,
	                    "type": "function",
	                    "function": {"name": tool, "arguments": json.dumps(arguments)},
	                }
	            ],
	        }

	    def tool_output(call_id, payload):
	        # Tool message answering the call with the matching id.
	        return {"role": "tool", "tool_call_id": call_id, "content": payload}

	    # System prompt (realistic for an SRE agent)
	    system_prompt = """You are an SRE incident response assistant. You help investigate and resolve production incidents.

	Current Date: 2024-12-15T14:30:00Z
	On-call: @platform-team
	Incident Channel: #inc-2024-142

	You have access to the following tools:
	- query_metrics: Query Prometheus metrics
	- search_logs: Search ELK logs
	- get_service_status: Get current service health
	- query_deployments: Get recent deployment history
	- get_runbook: Lookup runbook documentation

	Always be concise and focus on actionable insights."""

	    # Final analysis the agent presents before the user asks for commands.
	    incident_summary = """Incident Summary: Database Connection Pool Exhaustion

	Root Cause: The database primary is at 100% connection capacity (500/500). This started ~45 minutes ago, likely related to deploy-999 which modified connection pool settings.

	Impact:
	- API error rate: 15% (up from 0.1%)
	- P99 latency: 2500ms (up from 120ms)
	- 2 of 3 API instances unhealthy

	Immediate Actions Recommended:
	1. Kill idle database connections (per runbook RUN-DB-001)
	2. Scale API servers to 5 replicas to distribute load
	3. Consider rolling back deploy-999 if connection pool change is confirmed

	Shall I proceed with the remediation steps?"""

	    transcript = [
	        {"role": "system", "content": system_prompt},
	        # User reports incident
	        {
	            "role": "user",
	            "content": "We're seeing elevated error rates on the API. Users reporting timeouts. Can you investigate?",
	        },
	        # Step 1: metrics (the first assistant turn has no text, only the call)
	        agent_call(
	            None,
	            "call_metrics_1",
	            "query_metrics",
	            {
	                "query": "api_error_rate, cpu_usage, memory_usage, latency_p99",
	                "time_range": "1h",
	            },
	        ),
	        tool_output("call_metrics_1", generate_metrics_response()),
	        # Step 2: logs
	        agent_call(
	            "I see a clear pattern: around 45 minutes ago, CPU spiked to 95%, latency jumped from 120ms to 2500ms, and error rate went from 0.1% to 15%. Let me check the logs for that timeframe.",
	            "call_logs_1",
	            "search_logs",
	            {
	                "query": "level:ERROR OR level:WARN",
	                "time_range": "30m",
	                "services": ["api-server", "database"],
	            },
	        ),
	        tool_output("call_logs_1", generate_logs_response()),
	        # Step 3: current service status
	        agent_call(
	            "The logs show database connection issues: 'Connection pool exhausted', 'too many connections', and 'connection timeout'. This is a database connection saturation issue. Let me check current service status.",
	            "call_status_1",
	            "get_service_status",
	            {},
	        ),
	        tool_output("call_status_1", generate_service_status()),
	        # Step 4: recent deployments
	        agent_call(
	            "Confirmed: database-primary is CRITICAL with 500/500 connections (100% saturated), 52 blocked queries. API servers are degraded. Let me check if a recent deployment might have caused this.",
	            "call_deploys_1",
	            "query_deployments",
	            {"service": "api-server", "limit": 15},
	        ),
	        tool_output("call_deploys_1", generate_deployments_response()),
	        # Step 5: runbook lookup
	        agent_call(
	            "I see deploy-999 (version 2.4.14) was deployed 4 hours ago with 'fix: Connection pool sizing' - this might have changed pool settings. There's also a canary deployment in progress. Let me get the runbook for this issue.",
	            "call_runbook_1",
	            "get_runbook",
	            {"search": "database connection pool exhaustion"},
	        ),
	        tool_output("call_runbook_1", generate_runbook_response()),
	        # Agent provides final analysis
	        {"role": "assistant", "content": incident_summary},
	        # User asks for action
	        {
	            "role": "user",
	            "content": "Yes, give me the exact commands to run and summarize the incident for the post-mortem.",
	        },
	    ]

	    return transcript


	# =============================================================================
	# EVALUATION
	# =============================================================================


	@dataclass
	class EvalResult:
	    """Measurements and output captured from one evaluated chat-completion call."""

	    # Headroom mode the request was sent with (run_eval passes it as headroom_mode).
	    mode: str
	    # Prompt tokens reported in the API usage block; 0 when usage is absent.
	    tokens_input: int
	    # Completion tokens reported in the API usage block; 0 when usage is absent.
	    tokens_output: int
	    # Elapsed time of the request in milliseconds, measured around the call.
	    latency_ms: float
	    # Text of the first choice, or "" when the model returned no content.
	    response: str
	    # Estimated dollar cost computed from the token counts.
	    cost_estimate: float


	def evaluate_response_quality(
	    baseline_response: str, optimized_response: str, optimization_level: str
	) -> dict:
	    """
	    Use GPT-4o as judge to evaluate if the optimized response maintains quality.

	    Args:
	        baseline_response: Answer produced without optimization.
	        optimized_response: Answer produced with Headroom optimization applied.
	        optimization_level: Label inserted into the prompt (e.g. "default", "aggressive").

	    Returns:
	        The judge's parsed JSON object. If the reply is missing, is not valid
	        JSON, or is not a JSON object, a dict with the keys "error" and "raw"
	        is returned instead, so callers can test `"error" in result`.
	    """
	    judge_prompt = f"""You are evaluating two AI assistant responses to an SRE incident investigation request.
	The user asked: "Give me the exact commands to run and summarize the incident for the post-mortem."

	BASELINE RESPONSE (no optimization):
	{baseline_response}

	OPTIMIZED RESPONSE ({optimization_level} optimization):
	{optimized_response}

	Evaluate the optimized response compared to baseline on these criteria:
	1. CORRECTNESS: Are the SQL/kubectl commands identical and correct? (1-5)
	2. COMPLETENESS: Does it include all necessary remediation steps? (1-5)
	3. INCIDENT SUMMARY: Is the post-mortem summary accurate and complete? (1-5)
	4. ACTIONABILITY: Is the response equally actionable for an SRE? (1-5)

	For each criterion, score 1-5 where:
	- 5 = Equivalent or better than baseline
	- 4 = Minor differences, still fully usable
	- 3 = Some information missing but acceptable
	- 2 = Noticeable quality degradation
	- 1 = Critical information missing

	Respond in this exact JSON format:
	{{
	"correctness": {{"score": N, "reason": "..."}},
	"completeness": {{"score": N, "reason": "..."}},
	"incident_summary": {{"score": N, "reason": "..."}},
	"actionability": {{"score": N, "reason": "..."}},
	"overall_score": N,
	"verdict": "PASS" or "FAIL",
	"summary": "One sentence summary"
	}}

	PASS means overall_score >= 4.0, FAIL means < 4.0."""

	    response = base_client.chat.completions.create(
	        model="gpt-4o",  # Using stronger model for judging
	        messages=[{"role": "user", "content": judge_prompt}],
	        max_tokens=1000,
	        response_format={"type": "json_object"},
	    )
	    raw = response.choices[0].message.content

	    try:
	        verdict = json.loads(raw)
	    except (TypeError, ValueError):
	        # TypeError: the message had no content (None).
	        # ValueError (json.JSONDecodeError): the reply was not valid JSON, e.g. cut off by max_tokens.
	        verdict = None

	    # Callers use `in` and `.get`, so anything other than a JSON object counts as a failure.
	    if not isinstance(verdict, dict):
	        return {
	            "error": "Failed to parse judge response",
	            "raw": raw,
	        }
	    return verdict


	def run_eval(messages: list[dict], mode: str, use_aggressive: bool = False) -> EvalResult:
	    """
	    Send the conversation to gpt-4o-mini through a Headroom client and record the outcome.

	    Args:
	        messages: Chat messages to send.
	        mode: Value forwarded as `headroom_mode` (this script uses "audit" and "optimize").
	        use_aggressive: When True, route through `aggressive_client` instead of `client`.

	    Returns:
	        An EvalResult with token counts (0 when the response has no usage block),
	        elapsed milliseconds, the first choice's text ("" when empty) and a cost estimate.
	    """
	    # GPT-4o-mini pricing: $0.15/1M input, $0.60/1M output
	    input_usd_per_million = 0.15
	    output_usd_per_million = 0.60

	    c = aggressive_client if use_aggressive else client

	    # perf_counter is monotonic, so the measured latency cannot be skewed by wall-clock changes.
	    start_time = time.perf_counter()
	    response = c.chat.completions.create(
	        model="gpt-4o-mini",
	        messages=messages,
	        max_tokens=1000,
	        headroom_mode=mode,
	    )
	    latency_ms = (time.perf_counter() - start_time) * 1000

	    response_text = response.choices[0].message.content or ""
	    tokens_in = response.usage.prompt_tokens if response.usage else 0
	    tokens_out = response.usage.completion_tokens if response.usage else 0

	    cost = (tokens_in / 1_000_000) * input_usd_per_million + (
	        tokens_out / 1_000_000
	    ) * output_usd_per_million

	    return EvalResult(
	        mode=mode,
	        tokens_input=tokens_in,
	        tokens_output=tokens_out,
	        latency_ms=latency_ms,
	        response=response_text,
	        cost_estimate=cost,
	    )


	def main():
	    """
	    Run the whole evaluation and print a report to stdout.

	    Steps: build the scripted conversation, print Headroom's simulated token
	    savings for both clients, make three real API calls (baseline, default
	    optimization, aggressive optimization), compare tokens/cost/latency, have
	    GPT-4o judge both optimized answers against the baseline, and print a
	    summary.
	    """

	    def pct(saved, total):
	        # Share of `total` that was saved, as a percentage; 0 when there is nothing to divide by.
	        return saved / total * 100 if total else 0.0

	    print("=" * 70)
	    print("REAL-WORLD OPENAI AGENTIC EVALUATION")
	    print("Scenario: SRE Incident Response Investigation")
	    print("=" * 70)
	    print()

	    messages = build_incident_conversation()

	    # Count tool outputs
	    tool_messages = [m for m in messages if m.get("role") == "tool"]
	    print(f"Conversation: {len(messages)} messages")
	    print(f"Tool calls: {len(tool_messages)} (metrics, logs, status, deployments, runbook)")
	    print()

	    # Simulations
	    print("-" * 70)
	    print("SIMULATIONS")
	    print("-" * 70)

	    sim_default = client.chat.completions.simulate(model="gpt-4o-mini", messages=messages)
	    sim_aggressive = aggressive_client.chat.completions.simulate(
	        model="gpt-4o-mini", messages=messages
	    )

	    # Guarded like the later savings percentages, so a zero token count cannot raise.
	    sim_default_pct = pct(sim_default.tokens_saved, sim_default.tokens_before)
	    sim_aggressive_pct = pct(sim_aggressive.tokens_saved, sim_aggressive.tokens_before)

	    print(f"\n{'Mode':<15} {'Before':>10} {'After':>10} {'Saved':>10} {'%':>8}")
	    print("-" * 55)
	    print(
	        f"{'Default':<15} {sim_default.tokens_before:>10,} {sim_default.tokens_after:>10,} {sim_default.tokens_saved:>10,} {sim_default_pct:>7.1f}%"
	    )
	    print(
	        f"{'Aggressive':<15} {sim_aggressive.tokens_before:>10,} {sim_aggressive.tokens_after:>10,} {sim_aggressive.tokens_saved:>10,} {sim_aggressive_pct:>7.1f}%"
	    )
	    print(f"\nTransforms: {sim_default.transforms}")
	    print()

	    # Actual API calls
	    print("-" * 70)
	    print("ACTUAL API CALLS")
	    print("-" * 70)

	    print("\n1. BASELINE (No optimization)...")
	    baseline = run_eval(messages, "audit")
	    print(f" Tokens: {baseline.tokens_input:,} in / {baseline.tokens_output:,} out")
	    print(f" Cost: ${baseline.cost_estimate:.6f} | Latency: {baseline.latency_ms:.0f}ms")

	    print("\n2. DEFAULT OPTIMIZATION...")
	    default_opt = run_eval(messages, "optimize")
	    print(f" Tokens: {default_opt.tokens_input:,} in / {default_opt.tokens_output:,} out")
	    print(f" Cost: ${default_opt.cost_estimate:.6f} | Latency: {default_opt.latency_ms:.0f}ms")

	    print("\n3. AGGRESSIVE OPTIMIZATION...")
	    aggressive_opt = run_eval(messages, "optimize", use_aggressive=True)
	    print(f" Tokens: {aggressive_opt.tokens_input:,} in / {aggressive_opt.tokens_output:,} out")
	    print(
	        f" Cost: ${aggressive_opt.cost_estimate:.6f} | Latency: {aggressive_opt.latency_ms:.0f}ms"
	    )

	    # Results table
	    print()
	    print("=" * 70)
	    print("RESULTS COMPARISON")
	    print("=" * 70)

	    # Savings are measured against the baseline call's reported prompt tokens.
	    def_savings = baseline.tokens_input - default_opt.tokens_input
	    def_pct = pct(def_savings, baseline.tokens_input)
	    agg_savings = baseline.tokens_input - aggressive_opt.tokens_input
	    agg_pct = pct(agg_savings, baseline.tokens_input)

	    print(f"\n{'Metric':<20} {'Baseline':>12} {'Default Opt':>12} {'Aggressive':>12}")
	    print("-" * 60)
	    print(
	        f"{'Input Tokens':<20} {baseline.tokens_input:>12,} {default_opt.tokens_input:>12,} {aggressive_opt.tokens_input:>12,}"
	    )
	    print(
	        f"{'Output Tokens':<20} {baseline.tokens_output:>12,} {default_opt.tokens_output:>12,} {aggressive_opt.tokens_output:>12,}"
	    )
	    print(
	        f"{'Cost':<20} ${baseline.cost_estimate:>11.6f} ${default_opt.cost_estimate:>11.6f} ${aggressive_opt.cost_estimate:>11.6f}"
	    )
	    print(
	        f"{'Latency (ms)':<20} {baseline.latency_ms:>12.0f} {default_opt.latency_ms:>12.0f} {aggressive_opt.latency_ms:>12.0f}"
	    )
	    print()
	    print(
	        f"{'Token Savings':<20} {'-':>12} {def_savings:>10,} ({def_pct:.0f}%) {agg_savings:>10,} ({agg_pct:.0f}%)"
	    )

	    # Show responses
	    print()
	    print("-" * 70)
	    print("RESPONSE COMPARISON")
	    print("-" * 70)

	    # Only the first 800 characters are shown; "..." marks a truncated response.
	    print("\n[BASELINE RESPONSE]")
	    print(baseline.response[:800])
	    print("..." if len(baseline.response) > 800 else "")

	    print("\n[AGGRESSIVE OPTIMIZATION RESPONSE]")
	    print(aggressive_opt.response[:800])
	    print("..." if len(aggressive_opt.response) > 800 else "")

	    # Quality Evaluation with LLM Judge
	    print()
	    print("-" * 70)
	    print("QUALITY EVALUATION (GPT-4o as Judge)")
	    print("-" * 70)

	    print("\nEvaluating DEFAULT optimization vs Baseline...")
	    default_eval = evaluate_response_quality(baseline.response, default_opt.response, "default")

	    print("\nEvaluating AGGRESSIVE optimization vs Baseline...")
	    aggressive_eval = evaluate_response_quality(
	        baseline.response, aggressive_opt.response, "aggressive"
	    )

	    print(f"\n{'Criterion':<20} {'Default':>12} {'Aggressive':>12}")
	    print("-" * 46)

	    if "error" not in default_eval and "error" not in aggressive_eval:
	        for criterion in ["correctness", "completeness", "incident_summary", "actionability"]:
	            d_score = default_eval.get(criterion, {}).get("score", "N/A")
	            a_score = aggressive_eval.get(criterion, {}).get("score", "N/A")
	            print(f"{criterion.replace('_', ' ').title():<20} {d_score:>12}/5 {a_score:>12}/5")

	        print("-" * 46)
	        d_overall = default_eval.get("overall_score", "N/A")
	        a_overall = aggressive_eval.get("overall_score", "N/A")
	        print(f"{'OVERALL SCORE':<20} {d_overall:>12}/5 {a_overall:>12}/5")

	        d_verdict = default_eval.get("verdict", "N/A")
	        a_verdict = aggressive_eval.get("verdict", "N/A")
	        print(f"{'VERDICT':<20} {d_verdict:>12} {a_verdict:>12}")

	        print("\n[Default Optimization Judge Summary]")
	        print(f" {default_eval.get('summary', 'N/A')}")

	        print("\n[Aggressive Optimization Judge Summary]")
	        print(f" {aggressive_eval.get('summary', 'N/A')}")

	        # Detailed reasoning
	        print("\n[Detailed Evaluation - Aggressive]")
	        for criterion in ["correctness", "completeness", "incident_summary", "actionability"]:
	            reason = aggressive_eval.get(criterion, {}).get("reason", "N/A")
	            print(f" {criterion.title()}: {reason}")
	    else:
	        print("Error in evaluation:")
	        print(f" Default: {default_eval}")
	        print(f" Aggressive: {aggressive_eval}")

	    # Summary
	    # Per-request saving scaled to 1,000 requests a day over 30 days.
	    cost_save_monthly = (baseline.cost_estimate - aggressive_opt.cost_estimate) * 1000 * 30

	    print()
	    print("=" * 70)
	    print("SUMMARY")
	    print("=" * 70)
	    # Get verdicts for summary
	    d_verdict = default_eval.get("verdict", "N/A") if "error" not in default_eval else "ERROR"
	    a_verdict = aggressive_eval.get("verdict", "N/A") if "error" not in aggressive_eval else "ERROR"
	    d_score = default_eval.get("overall_score", 0) if "error" not in default_eval else 0
	    a_score = aggressive_eval.get("overall_score", 0) if "error" not in aggressive_eval else 0

	    print(f"""
	Real-world SRE incident investigation with 5 tool calls:

	Tool Outputs:
	- Metrics query: ~240 data points (60 per metric x 4 metrics)
	- Log search: 40 log entries with full metadata
	- Service status: 5 services with instance details
	- Deployments: 15 deployment records
	- Runbook: Structured documentation

	Token Savings:
	- Default optimization: {def_savings:,} tokens saved ({def_pct:.1f}%)
	- Aggressive optimization: {agg_savings:,} tokens saved ({agg_pct:.1f}%)

	Quality Evaluation (GPT-4o Judge):
	- Default: {d_verdict} (Score: {d_score}/5)
	- Aggressive: {a_verdict} (Score: {a_score}/5)

	Cost Impact @ 1K requests/day:
	- Monthly savings: ${cost_save_monthly:.2f}

	CONCLUSION:
	{"✓ Headroom achieves " + f"{agg_pct:.0f}% token reduction with PASSING quality scores." if a_verdict == "PASS" else "⚠ Aggressive optimization may degrade response quality - use conservative settings."}
	{" The compressed context maintains semantic equivalence for model reasoning." if a_verdict == "PASS" else ""}
	""")


	if __name__ == "__main__":
	    main()