#!/usr/bin/env python3
"""
Real-world OpenAI Agentic Evaluation for Headroom SDK.

Scenario: DevOps/SRE Agent investigating a production incident
- This is NOT hand-crafted for Headroom optimizations
- Tool outputs are realistic sizes based on actual tool responses
- The conversation flow mirrors real incident response

Tools used:
1. query_metrics - Prometheus/Datadog style metrics
2. search_logs - ELK/Splunk style log search
3. get_service_status - Health check endpoints
4. query_deployments - CI/CD deployment history
5. get_runbook - Documentation lookup
"""

import json
import os
import tempfile
import time
from dataclasses import dataclass
from datetime import datetime, timedelta

from dotenv import load_dotenv
from openai import OpenAI

from headroom import HeadroomClient, OpenAIProvider, ToolCrusherConfig
from headroom.config import HeadroomConfig
from headroom.transforms import TransformPipeline

# Read environment settings from ".env.local" before anything looks up
# OPENAI_API_KEY below.
load_dotenv(".env.local")

# Initialize
# One raw OpenAI client and one provider are shared by both Headroom clients
# below; `base_client` is also used directly (unwrapped) for the judge call.
base_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
provider = OpenAIProvider()

# Default-configuration client, backed by its own SQLite file in the system
# temp directory. Its default mode is "audit"; callers override the mode per
# request via `headroom_mode`.
db_path = os.path.join(tempfile.gettempdir(), "headroom_openai_eval.db")
client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path}",
    default_mode="audit",
)

# Aggressive config
# Same base HeadroomConfig, but with tool-output crushing limited to 5 array
# items, 300-character strings and depth 4, kicking in from 100 tokens.
aggressive_config = HeadroomConfig()
aggressive_config.tool_crusher = ToolCrusherConfig(
    min_tokens_to_crush=100,
    max_array_items=5,
    max_string_length=300,
    max_depth=4,
)

# Second client with a separate SQLite file so the two runs do not share a store.
db_path_agg = os.path.join(tempfile.gettempdir(), "headroom_openai_eval_agg.db")
aggressive_client = HeadroomClient(
    original_client=base_client,
    provider=provider,
    store_url=f"sqlite:///{db_path_agg}",
    default_mode="audit",
)
# The aggressive settings are injected after construction by overwriting two
# underscore-prefixed (non-public) attributes. The pipeline is rebuilt from the
# same config object so both attributes agree.
# NOTE(review): this relies on HeadroomClient internals that are not visible in
# this file — confirm the attribute names still exist when upgrading headroom.
aggressive_client._config = aggressive_config
aggressive_client._pipeline = TransformPipeline(aggressive_config, provider=provider)


# =============================================================================
# REALISTIC TOOL OUTPUTS - Based on actual production systems
# =============================================================================


def generate_metrics_response() -> str:
    """
    Build a Prometheus/Datadog-style matrix query result as a JSON string.

    The payload holds four series (CPU %, memory %, p99 latency in ms and
    5xx error rate %), each with 60 samples spaced one minute apart and
    starting one hour before the current local time. CPU, latency and error
    rate switch to an elevated regime from sample 45 onwards; memory ramps
    linearly. Every value is capped at a per-series ceiling.
    """
    window_start = datetime.now() - timedelta(hours=1)

    def series(value_at, ceiling, labels):
        # One sample per minute; each sample carries its own copy of the labels.
        return [
            {
                "timestamp": (window_start + timedelta(minutes=minute)).isoformat(),
                "value": min(value_at(minute), ceiling),
                "labels": dict(labels),
            }
            for minute in range(60)
        ]

    # Gentle climb, then a jump to 85% that keeps rising until it hits the cap.
    cpu_samples = series(
        lambda m: 45 + (m * 0.5) if m < 45 else 85 + (m - 45) * 2,
        98,
        {"instance": "prod-api-1", "job": "api-server"},
    )
    # Steady linear growth, no regime change.
    memory_samples = series(
        lambda m: 62 + (m * 0.3),
        89,
        {"instance": "prod-api-1", "job": "api-server"},
    )
    # Flat 120 ms until the incident, then +50 ms per minute from 450 ms.
    latency_samples = series(
        lambda m: 120 if m < 45 else 450 + (m - 45) * 50,
        2500,
        {"instance": "prod-api-1", "endpoint": "/api/v1/users"},
    )
    # Background 0.1% until the incident, then +0.5 points per minute from 2.5%.
    error_samples = series(
        lambda m: 0.1 if m < 45 else 2.5 + (m - 45) * 0.5,
        15,
        {"instance": "prod-api-1", "status_code": "5xx"},
    )

    named_series = [
        ("cpu_usage_percent", cpu_samples),
        ("memory_usage_percent", memory_samples),
        ("http_request_duration_p99_ms", latency_samples),
        ("http_errors_rate_percent", error_samples),
    ]
    payload = {
        "status": "success",
        "data": {
            "resultType": "matrix",
            "result": [
                {"metric": {"__name__": metric_name}, "values": samples}
                for metric_name, samples in named_series
            ],
        },
        "query_time_ms": 127,
    }
    return json.dumps(payload)


def generate_logs_response() -> str:
    """
    Build an ELK/Splunk-style log search result as a JSON string.

    Forty hits are produced, 45 seconds apart, starting 30 minutes before the
    current local time. Level, message and emitting service are taken from a
    fixed table of 15 templates that is cycled through in order; the remaining
    fields (trace ids, host, Kubernetes metadata, request details) are derived
    from the hit's position.
    """
    window_start = datetime.now() - timedelta(minutes=30)

    # (level, message, emitting service)
    templates = [
        ("ERROR", "Connection pool exhausted, waiting for available connection", "api-server"),
        (
            "WARN",
            "Slow query detected: SELECT * FROM users WHERE status = 'active' took 2.3s",
            "api-server",
        ),
        ("ERROR", "Database connection timeout after 30000ms", "api-server"),
        ("INFO", "Retry attempt 1/3 for database connection", "api-server"),
        ("ERROR", "Max retries exceeded for database operation", "api-server"),
        ("WARN", "Circuit breaker OPEN for database-primary", "api-server"),
        ("ERROR", "Failed to process request: upstream connect error", "api-server"),
        ("INFO", "Health check failed for database-primary", "health-checker"),
        ("ERROR", "PostgreSQL: too many connections for role 'api_user'", "database"),
        ("WARN", "Connection refused to database-primary:5432", "api-server"),
        ("ERROR", "Request timeout: /api/v1/users after 30s", "nginx"),
        ("INFO", "Scaling up api-server replicas from 3 to 5", "autoscaler"),
        ("ERROR", "OOM killed process api-server (pid 12345)", "kernel"),
        ("WARN", "Memory pressure detected, initiating garbage collection", "jvm"),
        ("ERROR", "Unhandled exception in request handler", "api-server"),
    ]

    def build_hit(seq: int) -> dict:
        severity, text, origin = templates[seq % len(templates)]
        is_error = severity == "ERROR"
        return {
            "@timestamp": (window_start + timedelta(seconds=seq * 45)).isoformat(),
            "level": severity,
            "message": text,
            "service": origin,
            "trace_id": f"trace-{1000 + seq:04d}-abcd-{seq:04d}",
            "span_id": f"span-{seq:04d}",
            "host": f"prod-{origin}-{seq % 3 + 1}",
            "environment": "production",
            "version": "2.4.1",
            "kubernetes": {
                "namespace": "production",
                "pod": f"{origin}-{seq % 5 + 1}-abc123",
                "container": origin,
                "node": f"node-{seq % 3 + 1}.prod.internal",
            },
            "request": {
                "method": "POST" if seq % 2 else "GET",
                "path": "/api/v1/orders" if seq % 3 else "/api/v1/users",
                # Errors are reported as a 500 that ran into the 30 s timeout;
                # everything else is a 200 whose duration grows with position.
                "status_code": 500 if is_error else 200,
                "duration_ms": 30000 if is_error else 150 + (seq * 100),
            },
        }

    entries = [build_hit(seq) for seq in range(40)]  # 40 log entries

    envelope = {
        "took": 234,
        "timed_out": False,
        "hits": {
            "total": {"value": len(entries), "relation": "eq"},
            "max_score": 1.0,
            "hits": entries,
        },
    }
    return json.dumps(envelope)


def generate_service_status() -> str:
    """
    Build a health-check style status report as a JSON string.

    Five services are listed (api-server, database-primary, database-replica,
    redis-cache, nginx-ingress), each with its instances and a ``last_check``
    stamp taken from the current local time when that service entry is built.
    The report closes with an overall status and a report-level timestamp.
    """

    def instance(instance_id: str, status: str, **readings) -> dict:
        # Keyword order is preserved, so readings appear after id/status as given.
        return {"id": instance_id, "status": status, **readings}

    def service(name: str, status: str, instances: list, **extras) -> dict:
        # Service-level extras (error rate, query counts, ...) follow last_check.
        return {
            "name": name,
            "status": status,
            "instances": instances,
            "last_check": datetime.now().isoformat(),
            **extras,
        }

    services = [
        service(
            "api-server",
            "degraded",
            [
                instance("api-1", "unhealthy", cpu=94, memory=87, connections=500),
                instance("api-2", "healthy", cpu=45, memory=62, connections=150),
                instance("api-3", "unhealthy", cpu=91, memory=85, connections=480),
            ],
            error_rate=12.5,
            p99_latency_ms=2100,
        ),
        service(
            "database-primary",
            "critical",
            [
                instance(
                    "db-primary",
                    "unhealthy",
                    connections=500,
                    max_connections=500,
                    replication_lag_ms=0,
                    disk_usage_percent=78,
                ),
            ],
            active_queries=487,
            blocked_queries=52,
        ),
        service(
            "database-replica",
            "healthy",
            [
                instance(
                    "db-replica-1",
                    "healthy",
                    connections=120,
                    max_connections=500,
                    replication_lag_ms=150,
                    disk_usage_percent=76,
                ),
                instance(
                    "db-replica-2",
                    "healthy",
                    connections=115,
                    max_connections=500,
                    replication_lag_ms=180,
                    disk_usage_percent=77,
                ),
            ],
        ),
        service(
            "redis-cache",
            "healthy",
            [
                instance(
                    "redis-1",
                    "healthy",
                    memory_used_mb=2048,
                    memory_max_mb=4096,
                    connected_clients=45,
                    hit_rate=0.94,
                ),
            ],
        ),
        service(
            "nginx-ingress",
            "healthy",
            [
                instance("nginx-1", "healthy", active_connections=1250, requests_per_sec=450),
                instance("nginx-2", "healthy", active_connections=1180, requests_per_sec=420),
            ],
        ),
    ]

    report = {
        "services": services,
        "overall_status": "critical",
        "timestamp": datetime.now().isoformat(),
    }
    return json.dumps(report)


def generate_deployments_response() -> str:
    """
    Build a CI/CD deployment history page as a JSON string.

    Fifteen records are returned newest first, four hours apart, counting back
    from the current local time. Every third record (positions 2, 5, 8, ...)
    is a database migration; the rest are api-server deploys. All records
    report success, and only the newest one still has its canary in progress.
    """
    newest = datetime.now()

    # Commit subjects are reused in a fixed five-step rotation.
    commit_subjects = (
        "feat: Add new user endpoint",
        "fix: Connection pool sizing",
        "chore: Update dependencies",
        "feat: Implement caching layer",
        "fix: Memory leak in request handler",
    )

    def build_record(age: int) -> dict:
        # `age` counts deployments back from the newest (0) to the oldest (14).
        return {
            "id": f"deploy-{1000 - age}",
            "service": "database-migration" if age % 3 == 2 else "api-server",
            "version": f"2.4.{15 - age}",
            "status": "success",
            "timestamp": (newest - timedelta(hours=age * 4)).isoformat(),
            "commit": f"abc{age:04d}def",
            "author": f"dev{age % 5 + 1}@company.com",
            "message": commit_subjects[age % 5],
            "changes": {
                "files_changed": 5 + age,
                "insertions": 100 + age * 20,
                "deletions": 30 + age * 5,
            },
            "rollback_available": True,
            "canary_status": "in_progress" if age == 0 else "completed",
        }

    history = [build_record(age) for age in range(15)]
    page = {"deployments": history, "total_count": len(history), "page": 1, "per_page": 20}
    return json.dumps(page)


def generate_runbook_response() -> str:
    """
    Return the canned "Database Connection Pool Exhaustion" runbook as JSON.

    The payload is entirely static: header fields, symptom list, numbered
    diagnosis and remediation steps, ready-to-run commands, related incident
    ids and the escalation path, all nested under a single ``runbook`` key.
    """
    symptoms = [
        "High error rate on API endpoints",
        "Connection timeout errors in logs",
        "Database showing max connections reached",
        "Increased latency across all services",
    ]

    diagnosis_steps = [
        "1. Check current connection count: SELECT count(*) FROM pg_stat_activity",
        "2. Identify connection holders: SELECT * FROM pg_stat_activity WHERE state != 'idle'",
        "3. Check for long-running queries: SELECT * FROM pg_stat_activity WHERE state = 'active' AND query_start < now() - interval '1 minute'",
        "4. Verify connection pool settings in application config",
        "5. Check for connection leaks in recent deployments",
    ]

    remediation_steps = [
        "1. IMMEDIATE: Kill idle connections older than 10 minutes",
        "2. IMMEDIATE: Scale up API server replicas to distribute load",
        "3. SHORT-TERM: Increase max_connections on database (requires restart)",
        "4. SHORT-TERM: Review and optimize connection pool settings",
        "5. LONG-TERM: Implement connection pooler (PgBouncer)",
    ]

    commands = {
        "kill_idle_connections": "SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE state = 'idle' AND query_start < now() - interval '10 minutes'",
        "check_pool_settings": "kubectl get configmap api-server-config -o yaml | grep -A5 'database'",
        "scale_replicas": "kubectl scale deployment api-server --replicas=5",
    }

    # Key order below is the order in which the fields are serialized.
    runbook = {
        "title": "Database Connection Pool Exhaustion",
        "id": "RUN-DB-001",
        "severity": "P1",
        "last_updated": "2024-11-15",
        "owner": "platform-team",
        "symptoms": symptoms,
        "diagnosis_steps": diagnosis_steps,
        "remediation_steps": remediation_steps,
        "commands": commands,
        "related_incidents": ["INC-2024-089", "INC-2024-056", "INC-2024-023"],
        "escalation_path": ["on-call-sre", "database-team", "platform-lead"],
    }
    return json.dumps({"runbook": runbook})


# =============================================================================
# BUILD REALISTIC INCIDENT RESPONSE CONVERSATION
# =============================================================================


def build_incident_conversation() -> list[dict]:
    """
    Assemble the scripted incident-response transcript used by the evaluation.

    The transcript is a list of chat messages: a system prompt, the user's
    incident report, five assistant turns that each issue one tool call
    (metrics, logs, service status, deployments, runbook) followed by the
    matching tool result, the assistant's incident summary, and a closing
    user request for commands and a post-mortem write-up. Tool results are
    generated fresh on every call, in transcript order.
    """

    def assistant_call(content, call_id: str, tool_name: str, arguments: dict) -> dict:
        # Assistant message carrying exactly one function-style tool call.
        return {
            "role": "assistant",
            "content": content,
            "tool_calls": [
                {
                    "id": call_id,
                    "type": "function",
                    "function": {"name": tool_name, "arguments": json.dumps(arguments)},
                }
            ],
        }

    def tool_result(call_id: str, payload: str) -> dict:
        # Tool message answering the call with the given id.
        return {"role": "tool", "tool_call_id": call_id, "content": payload}

    # System prompt (realistic for an SRE agent)
    system_prompt = """You are an SRE incident response assistant. You help investigate and resolve production incidents.

Current Date: 2024-12-15T14:30:00Z
On-call: @platform-team
Incident Channel: #inc-2024-142

You have access to the following tools:
- query_metrics: Query Prometheus metrics
- search_logs: Search ELK logs
- get_service_status: Get current service health
- query_deployments: Get recent deployment history
- get_runbook: Lookup runbook documentation

Always be concise and focus on actionable insights."""

    # The assistant's wrap-up once all five tools have been consulted.
    incident_summary = """**Incident Summary: Database Connection Pool Exhaustion**

**Root Cause:** The database primary is at 100% connection capacity (500/500). This started ~45 minutes ago, likely related to deploy-999 which modified connection pool settings.

**Impact:**
- API error rate: 15% (up from 0.1%)
- P99 latency: 2500ms (up from 120ms)
- 2 of 3 API instances unhealthy

**Immediate Actions Recommended:**
1. Kill idle database connections (per runbook RUN-DB-001)
2. Scale API servers to 5 replicas to distribute load
3. Consider rolling back deploy-999 if connection pool change is confirmed

Shall I proceed with the remediation steps?"""

    transcript = [
        {"role": "system", "content": system_prompt},
        # The user opens the incident.
        {
            "role": "user",
            "content": "We're seeing elevated error rates on the API. Users reporting timeouts. Can you investigate?",
        },
        # Step 1: metrics. The assistant calls the tool without any commentary.
        assistant_call(
            None,
            "call_metrics_1",
            "query_metrics",
            {
                "query": "api_error_rate, cpu_usage, memory_usage, latency_p99",
                "time_range": "1h",
            },
        ),
        tool_result("call_metrics_1", generate_metrics_response()),
        # Step 2: logs for the window in which the metrics degraded.
        assistant_call(
            "I see a clear pattern: around 45 minutes ago, CPU spiked to 95%, latency jumped from 120ms to 2500ms, and error rate went from 0.1% to 15%. Let me check the logs for that timeframe.",
            "call_logs_1",
            "search_logs",
            {
                "query": "level:ERROR OR level:WARN",
                "time_range": "30m",
                "services": ["api-server", "database"],
            },
        ),
        tool_result("call_logs_1", generate_logs_response()),
        # Step 3: current service health, to confirm what the logs suggest.
        assistant_call(
            "The logs show database connection issues: 'Connection pool exhausted', 'too many connections', and 'connection timeout'. This is a database connection saturation issue. Let me check current service status.",
            "call_status_1",
            "get_service_status",
            {},
        ),
        tool_result("call_status_1", generate_service_status()),
        # Step 4: recent deployments that could have triggered the saturation.
        assistant_call(
            "Confirmed: database-primary is CRITICAL with 500/500 connections (100% saturated), 52 blocked queries. API servers are degraded. Let me check if a recent deployment might have caused this.",
            "call_deploys_1",
            "query_deployments",
            {"service": "api-server", "limit": 15},
        ),
        tool_result("call_deploys_1", generate_deployments_response()),
        # Step 5: the runbook for the suspected failure mode.
        assistant_call(
            "I see deploy-999 (version 2.4.14) was deployed 4 hours ago with 'fix: Connection pool sizing' - this might have changed pool settings. There's also a canary deployment in progress. Let me get the runbook for this issue.",
            "call_runbook_1",
            "get_runbook",
            {"search": "database connection pool exhaustion"},
        ),
        tool_result("call_runbook_1", generate_runbook_response()),
        # Final analysis, then the request the model under test must answer.
        {"role": "assistant", "content": incident_summary},
        {
            "role": "user",
            "content": "Yes, give me the exact commands to run and summarize the incident for the post-mortem.",
        },
    ]

    return transcript


# =============================================================================
# EVALUATION
# =============================================================================


@dataclass
class EvalResult:
    """Measurements recorded for one chat-completion run (see ``run_eval``)."""

    mode: str  # Headroom mode the request was sent with, e.g. "audit" or "optimize"
    tokens_input: int  # prompt tokens reported by the API; 0 when no usage was returned
    tokens_output: int  # completion tokens reported by the API; 0 when no usage was returned
    latency_ms: float  # elapsed time around the completion call, in milliseconds
    response: str  # text of the first choice; empty string when the content was None
    cost_estimate: float  # USD estimate derived from the token counts at GPT-4o-mini rates


def evaluate_response_quality(
    baseline_response: str, optimized_response: str, optimization_level: str
) -> dict:
    """
    Use GPT-4o as judge to evaluate if the optimized response maintains quality.

    The judge is shown both answers and asked to score the optimized one
    against the baseline on correctness, completeness, incident summary and
    actionability (1-5 each), plus an overall score and a PASS/FAIL verdict.
    The call goes through the unwrapped ``base_client``.

    Args:
        baseline_response: Answer produced without optimization.
        optimized_response: Answer produced with optimization enabled.
        optimization_level: Label inserted into the prompt (e.g. "default",
            "aggressive"); it is not interpreted by this function.

    Returns:
        The judge's reply parsed into a dict. If the reply is missing, is not
        valid JSON, or is valid JSON but not an object, a dict of the form
        ``{"error": "Failed to parse judge response", "raw": <reply>}`` is
        returned instead, so callers can always use ``"error" in result`` and
        ``result.get(...)``.
    """
    judge_prompt = f"""You are evaluating two AI assistant responses to an SRE incident investigation request.
The user asked: "Give me the exact commands to run and summarize the incident for the post-mortem."

BASELINE RESPONSE (no optimization):
{baseline_response}

OPTIMIZED RESPONSE ({optimization_level} optimization):
{optimized_response}

Evaluate the optimized response compared to baseline on these criteria:
1. CORRECTNESS: Are the SQL/kubectl commands identical and correct? (1-5)
2. COMPLETENESS: Does it include all necessary remediation steps? (1-5)
3. INCIDENT SUMMARY: Is the post-mortem summary accurate and complete? (1-5)
4. ACTIONABILITY: Is the response equally actionable for an SRE? (1-5)

For each criterion, score 1-5 where:
- 5 = Equivalent or better than baseline
- 4 = Minor differences, still fully usable
- 3 = Some information missing but acceptable
- 2 = Noticeable quality degradation
- 1 = Critical information missing

Respond in this exact JSON format:
{{
    "correctness": {{"score": N, "reason": "..."}},
    "completeness": {{"score": N, "reason": "..."}},
    "incident_summary": {{"score": N, "reason": "..."}},
    "actionability": {{"score": N, "reason": "..."}},
    "overall_score": N,
    "verdict": "PASS" or "FAIL",
    "summary": "One sentence summary"
}}

PASS means overall_score >= 4.0, FAIL means < 4.0."""

    response = base_client.chat.completions.create(
        model="gpt-4o",  # Using stronger model for judging
        messages=[{"role": "user", "content": judge_prompt}],
        max_tokens=1000,
        response_format={"type": "json_object"},
    )
    raw_reply = response.choices[0].message.content

    def parse_failure() -> dict:
        return {"error": "Failed to parse judge response", "raw": raw_reply}

    try:
        parsed = json.loads(raw_reply)
    except (TypeError, ValueError):
        # TypeError: the message had no content (None).
        # ValueError: covers json.JSONDecodeError, e.g. a reply cut off by max_tokens.
        return parse_failure()

    # Callers treat the result as a mapping; anything else counts as a parse failure.
    if not isinstance(parsed, dict):
        return parse_failure()
    return parsed


def run_eval(messages: list[dict], mode: str, use_aggressive: bool = False) -> EvalResult:
    """
    Send the conversation to gpt-4o-mini once and record tokens, latency and cost.

    Args:
        messages: Chat messages to send.
        mode: Value passed through as ``headroom_mode`` (the script uses
            "audit" for the baseline and "optimize" otherwise).
        use_aggressive: When true, the request goes through the module-level
            ``aggressive_client``; otherwise through ``client``.

    Returns:
        An ``EvalResult`` with the reported token usage (0 if the response
        carries no usage), the elapsed time of the call in milliseconds, the
        text of the first choice (empty string if it has no content) and a
        cost estimate in USD.
    """
    c = aggressive_client if use_aggressive else client

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments while the request is in flight.
    started = time.perf_counter()
    response = c.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        max_tokens=1000,
        headroom_mode=mode,
    )
    latency_ms = (time.perf_counter() - started) * 1000

    response_text = response.choices[0].message.content or ""
    tokens_in = response.usage.prompt_tokens if response.usage else 0
    tokens_out = response.usage.completion_tokens if response.usage else 0

    # GPT-4o-mini pricing: $0.15/1M input, $0.60/1M output
    cost = (tokens_in / 1_000_000) * 0.15 + (tokens_out / 1_000_000) * 0.60

    return EvalResult(
        mode=mode,
        tokens_input=tokens_in,
        tokens_output=tokens_out,
        latency_ms=latency_ms,
        response=response_text,
        cost_estimate=cost,
    )


def main():
    """
    Run the whole evaluation and print a report to stdout.

    In order: build the incident conversation, simulate it with the default
    and aggressive clients, make three real completion calls (baseline in
    "audit" mode, then "optimize" with each client), ask the judge model to
    score both optimized answers against the baseline, and print comparison
    tables plus a closing summary. Nothing is returned.
    """

    def percent(part, whole):
        # An empty denominator yields 0 instead of raising ZeroDivisionError.
        return (part / whole * 100) if whole else 0

    def criterion_label(key: str) -> str:
        # "incident_summary" -> "Incident Summary"
        return key.replace("_", " ").title()

    def criterion_field(verdict: dict, key: str, field: str):
        # The judge is asked for {"score": ..., "reason": ...} per criterion,
        # but its reply is not validated; fall back to "N/A" on any other shape.
        entry = verdict.get(key)
        return entry.get(field, "N/A") if isinstance(entry, dict) else "N/A"

    criteria = ["correctness", "completeness", "incident_summary", "actionability"]
    preview_chars = 800

    print("=" * 70)
    print("REAL-WORLD OPENAI AGENTIC EVALUATION")
    print("Scenario: SRE Incident Response Investigation")
    print("=" * 70)
    print()

    messages = build_incident_conversation()

    # Count tool outputs
    tool_messages = [m for m in messages if m.get("role") == "tool"]
    print(f"Conversation: {len(messages)} messages")
    print(f"Tool calls: {len(tool_messages)} (metrics, logs, status, deployments, runbook)")
    print()

    # Simulations
    print("-" * 70)
    print("SIMULATIONS")
    print("-" * 70)

    sim_default = client.chat.completions.simulate(model="gpt-4o-mini", messages=messages)
    sim_aggressive = aggressive_client.chat.completions.simulate(
        model="gpt-4o-mini", messages=messages
    )

    print(f"\n{'Mode':<15} {'Before':>10} {'After':>10} {'Saved':>10} {'%':>8}")
    print("-" * 55)
    for label, sim in (("Default", sim_default), ("Aggressive", sim_aggressive)):
        saved_pct = percent(sim.tokens_saved, sim.tokens_before)
        print(
            f"{label:<15} {sim.tokens_before:>10,} {sim.tokens_after:>10,} {sim.tokens_saved:>10,} {saved_pct:>7.1f}%"
        )
    print(f"\nTransforms: {sim_default.transforms}")
    print()

    # Actual API calls
    print("-" * 70)
    print("ACTUAL API CALLS")
    print("-" * 70)

    print("\n1. BASELINE (No optimization)...")
    baseline = run_eval(messages, "audit")
    print(f"   Tokens: {baseline.tokens_input:,} in / {baseline.tokens_output:,} out")
    print(f"   Cost: ${baseline.cost_estimate:.6f} | Latency: {baseline.latency_ms:.0f}ms")

    print("\n2. DEFAULT OPTIMIZATION...")
    default_opt = run_eval(messages, "optimize")
    print(f"   Tokens: {default_opt.tokens_input:,} in / {default_opt.tokens_output:,} out")
    print(f"   Cost: ${default_opt.cost_estimate:.6f} | Latency: {default_opt.latency_ms:.0f}ms")

    print("\n3. AGGRESSIVE OPTIMIZATION...")
    aggressive_opt = run_eval(messages, "optimize", use_aggressive=True)
    print(f"   Tokens: {aggressive_opt.tokens_input:,} in / {aggressive_opt.tokens_output:,} out")
    print(
        f"   Cost: ${aggressive_opt.cost_estimate:.6f} | Latency: {aggressive_opt.latency_ms:.0f}ms"
    )

    # Results table
    print()
    print("=" * 70)
    print("RESULTS COMPARISON")
    print("=" * 70)

    # Savings are measured on the input tokens the API actually billed.
    def_savings = baseline.tokens_input - default_opt.tokens_input
    def_pct = percent(def_savings, baseline.tokens_input)
    agg_savings = baseline.tokens_input - aggressive_opt.tokens_input
    agg_pct = percent(agg_savings, baseline.tokens_input)

    print(f"\n{'Metric':<20} {'Baseline':>12} {'Default Opt':>12} {'Aggressive':>12}")
    print("-" * 60)
    print(
        f"{'Input Tokens':<20} {baseline.tokens_input:>12,} {default_opt.tokens_input:>12,} {aggressive_opt.tokens_input:>12,}"
    )
    print(
        f"{'Output Tokens':<20} {baseline.tokens_output:>12,} {default_opt.tokens_output:>12,} {aggressive_opt.tokens_output:>12,}"
    )
    print(
        f"{'Cost':<20} ${baseline.cost_estimate:>11.6f} ${default_opt.cost_estimate:>11.6f} ${aggressive_opt.cost_estimate:>11.6f}"
    )
    print(
        f"{'Latency (ms)':<20} {baseline.latency_ms:>12.0f} {default_opt.latency_ms:>12.0f} {aggressive_opt.latency_ms:>12.0f}"
    )
    print()
    print(
        f"{'Token Savings':<20} {'-':>12} {def_savings:>10,} ({def_pct:.0f}%) {agg_savings:>10,} ({agg_pct:.0f}%)"
    )

    # Show responses (first 800 characters of each, with an ellipsis if cut)
    print()
    print("-" * 70)
    print("RESPONSE COMPARISON")
    print("-" * 70)

    for heading, text in (
        ("[BASELINE RESPONSE]", baseline.response),
        ("[AGGRESSIVE OPTIMIZATION RESPONSE]", aggressive_opt.response),
    ):
        print(f"\n{heading}")
        print(text[:preview_chars])
        print("..." if len(text) > preview_chars else "")

    # Quality Evaluation with LLM Judge
    print()
    print("-" * 70)
    print("QUALITY EVALUATION (GPT-4o as Judge)")
    print("-" * 70)

    print("\nEvaluating DEFAULT optimization vs Baseline...")
    default_eval = evaluate_response_quality(baseline.response, default_opt.response, "default")

    print("\nEvaluating AGGRESSIVE optimization vs Baseline...")
    aggressive_eval = evaluate_response_quality(
        baseline.response, aggressive_opt.response, "aggressive"
    )

    print(f"\n{'Criterion':<20} {'Default':>12} {'Aggressive':>12}")
    print("-" * 46)

    default_failed = "error" in default_eval
    aggressive_failed = "error" in aggressive_eval

    if not default_failed and not aggressive_failed:
        # `!s` converts to text before aligning, so a null or otherwise
        # unexpected score from the judge cannot break the format spec.
        for criterion in criteria:
            d_score = criterion_field(default_eval, criterion, "score")
            a_score = criterion_field(aggressive_eval, criterion, "score")
            print(f"{criterion_label(criterion):<20} {d_score!s:>12}/5 {a_score!s:>12}/5")

        print("-" * 46)
        d_overall = default_eval.get("overall_score", "N/A")
        a_overall = aggressive_eval.get("overall_score", "N/A")
        print(f"{'OVERALL SCORE':<20} {d_overall!s:>12}/5 {a_overall!s:>12}/5")

        d_verdict = default_eval.get("verdict", "N/A")
        a_verdict = aggressive_eval.get("verdict", "N/A")
        print(f"{'VERDICT':<20} {d_verdict!s:>12} {a_verdict!s:>12}")

        print("\n[Default Optimization Judge Summary]")
        print(f"  {default_eval.get('summary', 'N/A')}")

        print("\n[Aggressive Optimization Judge Summary]")
        print(f"  {aggressive_eval.get('summary', 'N/A')}")

        # Detailed reasoning
        print("\n[Detailed Evaluation - Aggressive]")
        for criterion in criteria:
            reason = criterion_field(aggressive_eval, criterion, "reason")
            print(f"  {criterion_label(criterion)}: {reason}")
    else:
        print("Error in evaluation:")
        print(f"  Default: {default_eval}")
        print(f"  Aggressive: {aggressive_eval}")

    # Summary
    # Per-request saving scaled to 1,000 requests a day over 30 days.
    cost_save_monthly = (baseline.cost_estimate - aggressive_opt.cost_estimate) * 1000 * 30

    print()
    print("=" * 70)
    print("SUMMARY")
    print("=" * 70)
    # Get verdicts for summary
    d_verdict = "ERROR" if default_failed else default_eval.get("verdict", "N/A")
    a_verdict = "ERROR" if aggressive_failed else aggressive_eval.get("verdict", "N/A")
    d_score = 0 if default_failed else default_eval.get("overall_score", 0)
    a_score = 0 if aggressive_failed else aggressive_eval.get("overall_score", 0)

    # The conclusion depends only on the judge's verdict for the aggressive run.
    if a_verdict == "PASS":
        headline = (
            f"✓ Headroom achieves {agg_pct:.0f}% token reduction with PASSING quality scores."
        )
        footnote = "  The compressed context maintains semantic equivalence for model reasoning."
    else:
        headline = "⚠ Aggressive optimization may degrade response quality - use conservative settings."
        footnote = ""

    print(f"""
Real-world SRE incident investigation with 5 tool calls:

Tool Outputs:
  - Metrics query:     ~240 data points (60 per metric x 4 metrics)
  - Log search:        40 log entries with full metadata
  - Service status:    5 services with instance details
  - Deployments:       15 deployment records
  - Runbook:           Structured documentation

Token Savings:
  - Default optimization:    {def_savings:,} tokens saved ({def_pct:.1f}%)
  - Aggressive optimization: {agg_savings:,} tokens saved ({agg_pct:.1f}%)

Quality Evaluation (GPT-4o Judge):
  - Default:    {d_verdict} (Score: {d_score}/5)
  - Aggressive: {a_verdict} (Score: {a_score}/5)

Cost Impact @ 1K requests/day:
  - Monthly savings: ${cost_save_monthly:.2f}

CONCLUSION:
  {headline}
  {footnote}
""")


if __name__ == "__main__":
    main()