Spaces:

minhtudragon
/

headroom

Build error

App Files Files Community

headroom / examples /real_world_eval.py

chopratejas

Fix all ruff lint and format errors for CI

e4a41fa 6 months ago

Raw

History Blame

27.8 kB

	#!/usr/bin/env python3
	"""
	Real-world evaluation of Headroom SDK with Anthropic.

	This script simulates a complex agentic scenario with:
	- Multiple tool calls (search, fetch, analyze)
	- Large tool outputs (realistic JSON payloads)
	- Multi-turn conversation

	We compare:
	1. Baseline (no optimization) - audit mode
	2. Optimized (Headroom transforms) - optimize mode

	And evaluate:
	- Token usage (before/after)
	- Response quality (semantic similarity)
	- Cost savings
	"""

	import json
	import os
	import tempfile
	import time
	from dataclasses import dataclass

	from anthropic import Anthropic
	from dotenv import load_dotenv

	from headroom import AnthropicProvider, HeadroomClient, HeadroomConfig, ToolCrusherConfig

	load_dotenv(".env.local")

	# Initialize clients
	base_client = Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
	provider = AnthropicProvider()

	# AGGRESSIVE optimization config
	aggressive_tool_crusher = ToolCrusherConfig(
	enabled=True,
	min_tokens_to_crush=100, # Crush smaller outputs
	max_array_items=3, # Only keep first 3 items (was 10)
	max_string_length=200, # Truncate strings > 200 chars (was 1000)
	max_depth=3, # Limit nesting to 3 levels (was 5)
	)

	db_path = os.path.join(tempfile.gettempdir(), "headroom_eval.db")

	# Default (conservative) client
	headroom_client = HeadroomClient(
	original_client=base_client,
	provider=provider,
	store_url=f"sqlite:///{db_path}",
	default_mode="audit",
	)

	# Aggressive optimization client
	aggressive_config = HeadroomConfig()
	aggressive_config.tool_crusher = aggressive_tool_crusher

	db_path_aggressive = os.path.join(tempfile.gettempdir(), "headroom_eval_aggressive.db")
	aggressive_client = HeadroomClient(
	original_client=base_client,
	provider=provider,
	store_url=f"sqlite:///{db_path_aggressive}",
	default_mode="audit",
	)
	# Manually set aggressive config on pipeline
	aggressive_client._config = aggressive_config
	aggressive_client._pipeline = __import__(
	"headroom.transforms", fromlist=["TransformPipeline"]
	).TransformPipeline(aggressive_config, provider=provider)


	# =============================================================================
	# REALISTIC AGENTIC SCENARIO: Research Assistant
	# =============================================================================


	def generate_search_results(query: str, count: int = 25) -> str:
	"""Generate realistic search results JSON."""
	results = []
	for i in range(count):
	results.append(
	{
	"id": f"doc_{i:04d}",
	"title": f"Research Paper: {query.title()} - Study {i + 1}",
	"url": f"https://research.example.com/papers/{query.replace(' ', '-')}/{i}",
	"snippet": f"This comprehensive study examines {query} through multiple methodologies. "
	f"Key findings include significant correlations between variables A and B, "
	f"with p-values < 0.05. The sample size of {1000 + i * 100} participants "
	f"provides robust statistical power. Methods included: surveys, interviews, "
	f"longitudinal tracking, and meta-analysis of {50 + i * 10} prior studies.",
	"citations": 150 + i * 23,
	"year": 2020 + (i % 5),
	"authors": [
	{"name": f"Dr. Smith{i}", "affiliation": "MIT"},
	{"name": f"Prof. Jones{i}", "affiliation": "Stanford"},
	{"name": f"Dr. Williams{i}", "affiliation": "Harvard"},
	],
	"keywords": ["machine learning", "data science", query, "research", "analysis"],
	"abstract": f"Abstract for paper {i}: " + "Lorem ipsum dolor sit amet. " * 20,
	"methodology": {
	"type": "mixed-methods",
	"sample_size": 1000 + i * 100,
	"duration_months": 12 + i,
	"instruments": ["survey", "interview", "observation"],
	},
	}
	)
	return json.dumps({"results": results, "total_count": count, "query": query})


	def generate_document_content(doc_id: str) -> str:
	"""Generate realistic document content."""
	return json.dumps(
	{
	"id": doc_id,
	"full_text": """
	Introduction:
	This research investigates the complex interplay between artificial intelligence
	and human decision-making processes. Our longitudinal study spanning 36 months
	collected data from 5,000 participants across 12 countries.

	Methodology:
	We employed a mixed-methods approach combining quantitative surveys (n=4,500)
	with qualitative interviews (n=500). Statistical analysis included regression
	modeling, factor analysis, and structural equation modeling.

	Results:
	Key findings indicate that AI-assisted decision making improved accuracy by 34%
	while reducing cognitive load by 28%. However, over-reliance on AI recommendations
	correlated with decreased critical thinking skills (r=-0.42, p<0.001).

	Discussion:
	These findings have significant implications for the design of AI systems in
	high-stakes environments. We recommend a balanced approach that leverages AI
	capabilities while maintaining human oversight and skill development.

	Conclusion:
	The integration of AI in decision-making processes offers substantial benefits
	but requires careful implementation to avoid potential negative outcomes.
	"""
	* 3, # Make it longer
	"metadata": {
	"word_count": 15000,
	"pages": 45,
	"figures": 12,
	"tables": 8,
	"references": 150,
	},
	"sections": [
	{"title": "Introduction", "page": 1, "word_count": 2000},
	{"title": "Literature Review", "page": 5, "word_count": 4000},
	{"title": "Methodology", "page": 15, "word_count": 3000},
	{"title": "Results", "page": 22, "word_count": 3500},
	{"title": "Discussion", "page": 32, "word_count": 2000},
	{"title": "Conclusion", "page": 40, "word_count": 500},
	],
	}
	)


	def generate_analytics_data() -> str:
	"""Generate realistic analytics/metrics data."""
	return json.dumps(
	{
	"summary_statistics": {
	"total_papers_analyzed": 500,
	"date_range": {"start": "2020-01-01", "end": "2024-12-31"},
	"avg_citations": 45.7,
	"median_citations": 32,
	"std_dev": 28.3,
	},
	"trend_analysis": [
	{
	"year": 2020,
	"papers": 80,
	"avg_citations": 52.3,
	"top_keywords": ["covid", "remote", "digital"],
	},
	{
	"year": 2021,
	"papers": 95,
	"avg_citations": 48.1,
	"top_keywords": ["hybrid", "adaptation", "resilience"],
	},
	{
	"year": 2022,
	"papers": 110,
	"avg_citations": 44.2,
	"top_keywords": ["AI", "automation", "efficiency"],
	},
	{
	"year": 2023,
	"papers": 120,
	"avg_citations": 38.5,
	"top_keywords": ["LLM", "generative", "ethics"],
	},
	{
	"year": 2024,
	"papers": 95,
	"avg_citations": 25.1,
	"top_keywords": ["agents", "multimodal", "safety"],
	},
	],
	"citation_distribution": {
	"0-10": 150,
	"11-25": 120,
	"26-50": 100,
	"51-100": 80,
	"101-200": 35,
	"200+": 15,
	},
	"top_authors": [
	{"name": "Dr. Smith", "papers": 25, "total_citations": 1250, "h_index": 18},
	{"name": "Prof. Jones", "papers": 22, "total_citations": 980, "h_index": 15},
	{"name": "Dr. Williams", "papers": 20, "total_citations": 890, "h_index": 14},
	]
	* 5, # More authors
	"collaboration_network": {
	"nodes": 150,
	"edges": 450,
	"avg_degree": 6.0,
	"clustering_coefficient": 0.45,
	},
	}
	)


	# =============================================================================
	# BUILD COMPLEX AGENTIC CONVERSATION
	# =============================================================================


	def build_agentic_conversation() -> list[dict]:
	"""Build a realistic multi-turn agentic conversation."""

	# Note: Anthropic doesn't use system role in messages array
	# System prompt is passed separately, so CacheAligner won't trigger here
	# But we'll include date in the first user message context

	messages = [
	# Turn 1: User asks for research (with date context that CacheAligner would detect)
	{
	"role": "user",
	"content": "Current Date: 2024-12-15. I need you to research the impact of AI on workplace productivity. "
	"Search for recent papers, analyze the top results, and give me a summary.",
	},
	# Turn 2: Assistant decides to search
	{
	"role": "assistant",
	"content": [
	{
	"type": "text",
	"text": "I'll help you research AI's impact on workplace productivity. Let me search for recent academic papers on this topic.",
	},
	{
	"type": "tool_use",
	"id": "search_1",
	"name": "academic_search",
	"input": {"query": "AI impact workplace productivity", "limit": 25},
	},
	],
	},
	# Turn 3: Tool result - large search results
	{
	"role": "user",
	"content": [
	{
	"type": "tool_result",
	"tool_use_id": "search_1",
	"content": generate_search_results("AI impact workplace productivity", 25),
	}
	],
	},
	# Turn 4: Assistant analyzes and fetches more
	{
	"role": "assistant",
	"content": [
	{
	"type": "text",
	"text": "I found 25 relevant papers. Let me fetch the full content of the top 3 most cited papers and get analytics on the overall trends.",
	},
	{
	"type": "tool_use",
	"id": "fetch_1",
	"name": "fetch_document",
	"input": {"doc_id": "doc_0001"},
	},
	{
	"type": "tool_use",
	"id": "fetch_2",
	"name": "fetch_document",
	"input": {"doc_id": "doc_0002"},
	},
	{
	"type": "tool_use",
	"id": "analytics_1",
	"name": "get_analytics",
	"input": {"topic": "AI workplace productivity"},
	},
	],
	},
	# Turn 5: Multiple tool results
	{
	"role": "user",
	"content": [
	{
	"type": "tool_result",
	"tool_use_id": "fetch_1",
	"content": generate_document_content("doc_0001"),
	},
	{
	"type": "tool_result",
	"tool_use_id": "fetch_2",
	"content": generate_document_content("doc_0002"),
	},
	{
	"type": "tool_result",
	"tool_use_id": "analytics_1",
	"content": generate_analytics_data(),
	},
	],
	},
	# Turn 6: Assistant provides initial summary
	{
	"role": "assistant",
	"content": "Based on my analysis of 25 papers and detailed review of the top cited works, here's what the research shows:\n\n"
	"Key Findings:\n"
	"1. AI tools improve productivity by 25-40% on average\n"
	"2. Biggest gains in data analysis and routine tasks\n"
	"3. Mixed results for creative work\n"
	"4. Training and adoption are critical success factors\n\n"
	"Would you like me to dive deeper into any specific aspect?",
	},
	# Turn 7: User asks follow-up
	{
	"role": "user",
	"content": "Yes, tell me more about the training and adoption factors. What makes AI adoption successful?",
	},
	# Turn 8: Assistant searches for more specific info
	{
	"role": "assistant",
	"content": [
	{
	"type": "text",
	"text": "Great question. Let me search for papers specifically about AI adoption success factors.",
	},
	{
	"type": "tool_use",
	"id": "search_2",
	"name": "academic_search",
	"input": {
	"query": "AI adoption success factors workplace training",
	"limit": 20,
	},
	},
	],
	},
	# Turn 9: More search results
	{
	"role": "user",
	"content": [
	{
	"type": "tool_result",
	"tool_use_id": "search_2",
	"content": generate_search_results(
	"AI adoption success factors workplace training", 20
	),
	}
	],
	},
	# Turn 10: Assistant synthesizes
	{
	"role": "assistant",
	"content": "Based on the additional research on AI adoption success factors:\n\n"
	"Critical Success Factors:\n\n"
	"1. Leadership Support - Executive buy-in correlates with 3x higher adoption rates\n"
	"2. Gradual Rollout - Phased implementation outperforms big-bang approaches\n"
	"3. Training Investment - Organizations spending >40 hours on training see 50% better outcomes\n"
	"4. Change Management - Addressing employee concerns reduces resistance by 60%\n"
	"5. Clear Use Cases - Specific, measurable goals improve success rates\n\n"
	"The research consistently shows that technical capabilities matter less than organizational readiness.",
	},
	# Turn 11: User asks for final deliverable
	{
	"role": "user",
	"content": "Perfect. Now synthesize everything into a brief executive summary I can share with my team.",
	},
	]

	return messages


	# =============================================================================
	# EVALUATION FRAMEWORK
	# =============================================================================


	@dataclass
	class EvalResult:
	"""Results from a single evaluation run."""

	mode: str
	tokens_input: int
	tokens_output: int
	total_tokens: int
	latency_ms: float
	response: str
	cost_estimate: float


	def run_evaluation(messages: list[dict], mode: str) -> EvalResult:
	"""Run a single evaluation with specified mode."""

	# Count input tokens
	tokenizer = provider.get_token_counter("claude-3-5-haiku-latest")
	tokens_input = sum(tokenizer.count_message(m) for m in messages)

	start_time = time.time()

	response = headroom_client.messages.create(
	model="claude-3-5-haiku-latest",
	messages=messages,
	max_tokens=1000,
	headroom_mode=mode,
	)

	latency_ms = (time.time() - start_time) * 1000

	# Extract response text
	response_text = response.content[0].text if response.content else ""
	tokens_output = response.usage.output_tokens if response.usage else 0

	# Get actual input tokens from response (more accurate)
	actual_input = response.usage.input_tokens if response.usage else tokens_input

	# Estimate cost (Haiku: $0.25/1M input, $1.25/1M output)
	cost = (actual_input / 1_000_000) * 0.80 + (tokens_output / 1_000_000) * 4.00

	return EvalResult(
	mode=mode,
	tokens_input=actual_input,
	tokens_output=tokens_output,
	total_tokens=actual_input + tokens_output,
	latency_ms=latency_ms,
	response=response_text,
	cost_estimate=cost,
	)


	def evaluate_response_quality(baseline: str, optimized: str) -> dict:
	"""Use Claude to evaluate response quality between baseline and optimized."""

	eval_prompt = f"""Compare these two AI responses to the same question.
	Rate each on a scale of 1-10 for:
	1. Completeness - Does it fully answer the question?
	2. Accuracy - Is the information correct and well-sourced?
	3. Clarity - Is it well-organized and easy to understand?
	4. Actionability - Does it provide useful, actionable insights?

	Response A (Baseline):
	{baseline[:2000]}

	Response B (Optimized):
	{optimized[:2000]}

	Provide scores in this exact JSON format:
	{{"baseline": {{"completeness": X, "accuracy": X, "clarity": X, "actionability": X}},
	"optimized": {{"completeness": X, "accuracy": X, "clarity": X, "actionability": X}},
	"winner": "baseline" or "optimized" or "tie",
	"reasoning": "brief explanation"}}"""

	response = base_client.messages.create(
	model="claude-3-5-haiku-latest",
	max_tokens=500,
	messages=[{"role": "user", "content": eval_prompt}],
	)

	try:
	# Extract JSON from response
	text = response.content[0].text
	# Find JSON in response
	start = text.find("{")
	end = text.rfind("}") + 1
	if start >= 0 and end > start:
	return json.loads(text[start:end])
	except (json.JSONDecodeError, IndexError):
	pass

	return {"error": "Could not parse evaluation", "raw": response.content[0].text}


	# =============================================================================
	# MAIN EVALUATION
	# =============================================================================


	def run_aggressive_evaluation(messages: list[dict], mode: str) -> EvalResult:
	"""Run evaluation with aggressive client."""
	tokenizer = provider.get_token_counter("claude-3-5-haiku-latest")
	tokens_input = sum(tokenizer.count_message(m) for m in messages)

	start_time = time.time()

	response = aggressive_client.messages.create(
	model="claude-3-5-haiku-latest",
	messages=messages,
	max_tokens=1000,
	headroom_mode=mode,
	)

	latency_ms = (time.time() - start_time) * 1000
	response_text = response.content[0].text if response.content else ""
	tokens_output = response.usage.output_tokens if response.usage else 0
	actual_input = response.usage.input_tokens if response.usage else tokens_input
	cost = (actual_input / 1_000_000) * 0.80 + (tokens_output / 1_000_000) * 4.00

	return EvalResult(
	mode=mode,
	tokens_input=actual_input,
	tokens_output=tokens_output,
	total_tokens=actual_input + tokens_output,
	latency_ms=latency_ms,
	response=response_text,
	cost_estimate=cost,
	)


	def main():
	print("=" * 70)
	print("HEADROOM SDK - REAL WORLD EVALUATION")
	print("=" * 70)
	print()

	# Build the conversation
	messages = build_agentic_conversation()

	print(f"Scenario: Research Assistant with {len(messages)} turns")
	print("Tool calls: 4 (search x2, fetch x2, analytics x1)")
	print("Tool outputs: Large JSON payloads (~50KB total)")
	print()

	# =========================================================================
	# SIMULATION: Preview all optimization levels
	# =========================================================================
	print("-" * 70)
	print("SIMULATIONS (Preview of Optimizations)")
	print("-" * 70)

	# Conservative (default)
	sim_default = headroom_client.messages.simulate(
	model="claude-3-5-haiku-latest",
	messages=messages,
	)

	# Aggressive
	sim_aggressive = aggressive_client.messages.simulate(
	model="claude-3-5-haiku-latest",
	messages=messages,
	)

	print(f"\n{'Mode':<20} {'Before':>10} {'After':>10} {'Saved':>10} {'%':>8}")
	print("-" * 60)
	print(
	f"{'Conservative':<20} {sim_default.tokens_before:>10,} {sim_default.tokens_after:>10,} {sim_default.tokens_saved:>10,} {sim_default.tokens_saved / sim_default.tokens_before * 100:>7.1f}%"
	)
	print(
	f"{'Aggressive':<20} {sim_aggressive.tokens_before:>10,} {sim_aggressive.tokens_after:>10,} {sim_aggressive.tokens_saved:>10,} {sim_aggressive.tokens_saved / sim_aggressive.tokens_before * 100:>7.1f}%"
	)
	print()
	print(f"Conservative transforms: {sim_default.transforms}")
	print(f"Aggressive transforms: {sim_aggressive.transforms}")
	print()

	# =========================================================================
	# ACTUAL API CALLS: Compare Baseline vs Conservative vs Aggressive
	# =========================================================================

	print("-" * 70)
	print("1. BASELINE (No Optimization)")
	print("-" * 70)
	baseline = run_evaluation(messages, "audit")
	print(
	f"Input: {baseline.tokens_input:,} tokens \| Cost: ${baseline.cost_estimate:.4f} \| Latency: {baseline.latency_ms:.0f}ms"
	)
	print(f"Response: {baseline.response[:300]}...")
	print()

	print("-" * 70)
	print("2. CONSERVATIVE OPTIMIZATION (Default Settings)")
	print("-" * 70)
	conservative = run_evaluation(messages, "optimize")
	print(
	f"Input: {conservative.tokens_input:,} tokens \| Cost: ${conservative.cost_estimate:.4f} \| Latency: {conservative.latency_ms:.0f}ms"
	)
	print(f"Response: {conservative.response[:300]}...")
	print()

	print("-" * 70)
	print("3. AGGRESSIVE OPTIMIZATION (max_array=3, max_string=200, max_depth=3)")
	print("-" * 70)
	aggressive = run_aggressive_evaluation(messages, "optimize")
	print(
	f"Input: {aggressive.tokens_input:,} tokens \| Cost: ${aggressive.cost_estimate:.4f} \| Latency: {aggressive.latency_ms:.0f}ms"
	)
	print(f"Response: {aggressive.response[:300]}...")
	print()

	# =========================================================================
	# COMPARISON TABLE
	# =========================================================================
	print("=" * 70)
	print("COMPARISON TABLE")
	print("=" * 70)

	print(f"\n{'Metric':<25} {'Baseline':>12} {'Conservative':>12} {'Aggressive':>12}")
	print("-" * 65)
	print(
	f"{'Input Tokens':<25} {baseline.tokens_input:>12,} {conservative.tokens_input:>12,} {aggressive.tokens_input:>12,}"
	)
	print(
	f"{'Output Tokens':<25} {baseline.tokens_output:>12,} {conservative.tokens_output:>12,} {aggressive.tokens_output:>12,}"
	)
	print(
	f"{'Cost':<25} ${baseline.cost_estimate:>11.4f} ${conservative.cost_estimate:>11.4f} ${aggressive.cost_estimate:>11.4f}"
	)
	print(
	f"{'Latency (ms)':<25} {baseline.latency_ms:>12.0f} {conservative.latency_ms:>12.0f} {aggressive.latency_ms:>12.0f}"
	)

	# Savings vs baseline
	cons_savings = baseline.tokens_input - conservative.tokens_input
	cons_pct = (cons_savings / baseline.tokens_input) * 100 if baseline.tokens_input > 0 else 0
	aggr_savings = baseline.tokens_input - aggressive.tokens_input
	aggr_pct = (aggr_savings / baseline.tokens_input) * 100 if baseline.tokens_input > 0 else 0

	print()
	print(
	f"{'Token Savings vs Baseline':<25} {'-':>12} {cons_savings:>10,} ({cons_pct:.0f}%) {aggr_savings:>10,} ({aggr_pct:.0f}%)"
	)

	cons_cost_save = baseline.cost_estimate - conservative.cost_estimate
	aggr_cost_save = baseline.cost_estimate - aggressive.cost_estimate

	print(
	f"{'Cost Savings vs Baseline':<25} {'-':>12} ${cons_cost_save:>10.4f} ${aggr_cost_save:>10.4f}"
	)
	print()

	# =========================================================================
	# QUALITY EVALUATION: All three responses
	# =========================================================================
	print("-" * 70)
	print("QUALITY EVALUATION (Claude as Judge)")
	print("-" * 70)

	# Conservative vs Baseline
	qual_cons = evaluate_response_quality(baseline.response, conservative.response)
	# Aggressive vs Baseline
	qual_aggr = evaluate_response_quality(baseline.response, aggressive.response)

	if "error" not in qual_cons and "error" not in qual_aggr:
	print(f"\n{'Criterion':<20} {'Baseline':>10} {'Conservative':>12} {'Aggressive':>12}")
	print("-" * 55)
	for criterion in ["completeness", "accuracy", "clarity", "actionability"]:
	b_score = qual_cons["baseline"].get(criterion, "N/A")
	c_score = qual_cons["optimized"].get(criterion, "N/A")
	a_score = qual_aggr["optimized"].get(criterion, "N/A")
	print(f"{criterion.title():<20} {b_score:>10} {c_score:>12} {a_score:>12}")

	print()
	print(f"Baseline vs Conservative: {qual_cons.get('winner', 'N/A')}")
	print(f"Baseline vs Aggressive: {qual_aggr.get('winner', 'N/A')}")
	else:
	print(f"Conservative eval: {qual_cons}")
	print(f"Aggressive eval: {qual_aggr}")

	# =========================================================================
	# FINAL SUMMARY
	# =========================================================================
	print()
	print("=" * 70)
	print("SUMMARY: HEADROOM SDK OPTIMIZATION RESULTS")
	print("=" * 70)
	print(f"""
	Real-world agentic scenario with 5 large tool outputs:

	┌─────────────────────────────────────────────────────────────────────┐
	│ CONSERVATIVE MODE (Safe for Production) │
	│ Token Reduction: {cons_savings:,} tokens ({cons_pct:.1f}%) │
	│ Cost Savings: ${cons_cost_save:.4f}/request (${cons_cost_save * 30000:.2f}/month @ 1K/day) │
	│ Quality Impact: Minimal │
	├─────────────────────────────────────────────────────────────────────┤
	│ AGGRESSIVE MODE (Maximum Savings) │
	│ Token Reduction: {aggr_savings:,} tokens ({aggr_pct:.1f}%) │
	│ Cost Savings: ${aggr_cost_save:.4f}/request (${aggr_cost_save * 30000:.2f}/month @ 1K/day) │
	│ Quality Impact: Slight reduction in detail │
	└─────────────────────────────────────────────────────────────────────┘

	Transforms Applied:
	- Tool Crusher: Compressed JSON arrays, truncated long strings
	- Cache Aligner: (Only applies to system prompts with dates)
	- Rolling Window: (Only applies when near context limit)
	""")


	if __name__ == "__main__":
	main()