Spaces:

minhtudragon
/

headroom

Build error

App Files Files Community

headroom / examples /langchain_before_after.py

chopratejas

Fix all ruff lint and format errors for CI

e4a41fa 6 months ago

Raw

History Blame

20 kB

	#!/usr/bin/env python3
	"""
	LangChain + Headroom Integration: Before vs After Examples

	This script demonstrates the real-world impact of Headroom optimization
	on LangChain applications. Run with:

	python examples/langchain_before_after.py

	Requirements:
	pip install headroom[langchain] langchain-openai

	Note: Set OPENAI_API_KEY environment variable for live API tests.
	For dry-run mode (no API calls), the script shows simulated results.
	"""

	from __future__ import annotations

	import json
	import os
	import tempfile
	import time
	from dataclasses import dataclass

	# Optional-dependency probes. Each block sets a module-level *_AVAILABLE flag
	# and prints an install hint instead of failing at import time.
	# LANGCHAIN_AVAILABLE is checked by main() before any scenario runs.
	try:
	from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage

	LANGCHAIN_AVAILABLE = True
	except ImportError:
	LANGCHAIN_AVAILABLE = False
	print("LangChain not installed. Install with: pip install langchain-core")

	# ChatOpenAI is imported only to detect whether langchain-openai is installed
	# (hence the unused-import suppression); scenario_live_api() checks the flag.
	try:
	from langchain_openai import ChatOpenAI # noqa: F401

	OPENAI_AVAILABLE = True
	except ImportError:
	OPENAI_AVAILABLE = False
	print("langchain-openai not installed. Install with: pip install langchain-openai")

	# Headroom itself. HEADROOM_AVAILABLE is checked by main(); not every imported
	# name is used in this file (hence the unused-import suppression).
	try:
	from headroom import ( # noqa: F401
	HeadroomClient,
	HeadroomConfig,
	HeadroomMode,
	OpenAIProvider,
	)

	HEADROOM_AVAILABLE = True
	except ImportError:
	HEADROOM_AVAILABLE = False
	print("Headroom not installed. Install with: pip install headroom")


	@dataclass
	class ComparisonResult:
	    """Before/after measurements for a single optimization scenario.

	    Instances are built by the ``scenario_*`` functions and rendered by
	    ``print_comparison``.
	    """

	    # Human-readable scenario title shown in the summary.
	    scenario: str
	    # Prompt token counts without and with optimization, and their difference.
	    tokens_before: int
	    tokens_after: int
	    tokens_saved: int
	    # tokens_saved as a percentage of tokens_before.
	    savings_percent: float
	    # Request latency in milliseconds; None when no request was timed
	    # (the simulated scenarios pass None for both).
	    latency_before_ms: float | None
	    latency_after_ms: float | None
	    # Estimated USD cost for the before/after/saved token counts.
	    cost_before_usd: float
	    cost_after_usd: float
	    cost_saved_usd: float


	def estimate_cost(tokens: int, model: str = "gpt-4o") -> float:
	    """Return the estimated input-token cost in USD for ``tokens`` tokens.

	    Models listed in the table are priced per input token (for example
	    GPT-4o at $2.50 per 1M tokens). Any model name that is not listed is
	    priced at the GPT-4o rate.
	    """
	    gpt_4o_rate = 2.50 / 1_000_000
	    usd_per_token = {
	        "gpt-4o": gpt_4o_rate,
	        "gpt-4o-mini": 0.15 / 1_000_000,
	        "claude-3-5-sonnet": 3.00 / 1_000_000,
	    }
	    # Unknown models fall back to the GPT-4o rate rather than raising.
	    if model in usd_per_token:
	        rate = usd_per_token[model]
	    else:
	        rate = gpt_4o_rate
	    return tokens * rate


	def print_comparison(result: ComparisonResult) -> None:
	    """Print a formatted token, cost and (optionally) latency report.

	    Args:
	        result: The scenario measurements to render.

	    The latency section is printed only when both latency values are
	    present (not None).
	    """
	    rule = "=" * 60
	    print(f"\n{rule}")
	    print(f"Scenario: {result.scenario}")
	    print(f"{rule}")
	    print("\n[Token Comparison]")
	    print(f" Before: {result.tokens_before:,} tokens")
	    print(f" After: {result.tokens_after:,} tokens")
	    print(f" Saved: {result.tokens_saved:,} tokens ({result.savings_percent:.1f}%)")

	    print("\n[Cost Impact] (GPT-4o pricing)")
	    print(f" Before: ${result.cost_before_usd:.4f}")
	    print(f" After: ${result.cost_after_usd:.4f}")
	    print(f" Saved: ${result.cost_saved_usd:.4f}")

	    # Compare against None explicitly: a measured latency of 0.0 is falsy but
	    # is still a real measurement and must be reported.
	    if result.latency_before_ms is not None and result.latency_after_ms is not None:
	        print("\n[Latency]")
	        print(f" Before: {result.latency_before_ms:.0f}ms")
	        print(f" After: {result.latency_after_ms:.0f}ms")


	def langchain_to_openai_messages(messages: list) -> list[dict]:
	    """Convert LangChain message objects to OpenAI chat-format dicts.

	    Args:
	        messages: LangChain ``SystemMessage``, ``HumanMessage``, ``AIMessage``
	            and ``ToolMessage`` instances, in conversation order.

	    Returns:
	        One dict per recognised message, in the same order. Messages of any
	        other type are skipped.
	    """
	    openai_messages: list[dict] = []
	    for msg in messages:
	        if isinstance(msg, SystemMessage):
	            openai_messages.append({"role": "system", "content": msg.content})
	        elif isinstance(msg, HumanMessage):
	            openai_messages.append({"role": "user", "content": msg.content})
	        elif isinstance(msg, AIMessage):
	            msg_dict = {"role": "assistant", "content": msg.content}
	            tool_calls = getattr(msg, "tool_calls", None)
	            if tool_calls:
	                msg_dict["tool_calls"] = [
	                    {
	                        # A tool call may carry "id": None; `or` covers both a
	                        # missing key and a None/empty value, whereas a .get()
	                        # default only covers the missing key.
	                        "id": tc.get("id") or f"call_{i}",
	                        "type": "function",
	                        "function": {
	                            "name": tc.get("name", ""),
	                            # OpenAI expects arguments as a JSON string.
	                            "arguments": json.dumps(tc.get("args", {})),
	                        },
	                    }
	                    for i, tc in enumerate(tool_calls)
	                ]
	            openai_messages.append(msg_dict)
	        elif isinstance(msg, ToolMessage):
	            openai_messages.append(
	                {
	                    "role": "tool",
	                    "tool_call_id": msg.tool_call_id,
	                    "content": msg.content,
	                }
	            )
	    return openai_messages


	# ============================================================================
	# SCENARIO 1: Agentic Workflow with Large Tool Outputs
	# ============================================================================


	def scenario_agentic_workflow() -> ComparisonResult:
	    """
	    Scenario: AI agent that searches a database and processes results.

	    Common pattern: Tool returns 100+ records, but only 5-10 are relevant.
	    Without optimization, ALL records are sent to the LLM.

	    Builds a synthetic 100-record tool response, runs it through
	    ``client.chat.completions.simulate`` and reports the planned token
	    counts. No latency is measured, so both latency fields are None.

	    Returns:
	        ComparisonResult with token and estimated-cost figures from the plan.
	    """
	    print("\n" + "=" * 60)
	    print("SCENARIO 1: Agentic Workflow with Large Tool Outputs")
	    print("=" * 60)

	    # Simulate a database search tool that returns many results
	    search_results = [
	        {
	            "id": f"user-{i:04d}",
	            "name": f"User {i}",
	            "email": f"user{i}@example.com",
	            "department": ["Engineering", "Sales", "Marketing", "Support"][i % 4],
	            "status": "active" if i % 10 != 0 else "inactive",
	            "created_at": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}T10:00:00Z",
	            "last_login": f"2024-12-{(i % 28) + 1:02d}T{i % 24:02d}:00:00Z",
	            "metadata": {
	                "preferences": {"theme": "dark", "notifications": True},
	                "tags": ["premium", "verified"] if i % 5 == 0 else [],
	            },
	        }
	        for i in range(100)
	    ]
	    # Serialise once; the same payload is sent as the tool output and measured
	    # for the report below.
	    serialized_results = json.dumps(search_results)

	    # The conversation in LangChain format
	    lc_messages = [
	        SystemMessage(
	            content="""You are a helpful database assistant.
	When searching for users, analyze the results and provide a summary.
	Focus on active users in the Engineering department."""
	        ),
	        HumanMessage(content="Find users in the Engineering department"),
	        AIMessage(
	            content="I'll search the database for Engineering users.",
	            tool_calls=[
	                {"id": "call_1", "name": "search_users", "args": {"department": "Engineering"}}
	            ],
	        ),
	        ToolMessage(
	            content=serialized_results,  # 100 records!
	            tool_call_id="call_1",
	        ),
	    ]

	    # Convert to OpenAI format for Headroom
	    messages = langchain_to_openai_messages(lc_messages)

	    # Create Headroom client for simulation
	    from openai import OpenAI

	    db_path = os.path.join(tempfile.gettempdir(), "headroom_langchain_example.db")
	    # A placeholder key is used when OPENAI_API_KEY is unset.
	    base_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-fake-key"))
	    provider = OpenAIProvider()

	    client = HeadroomClient(
	        original_client=base_client,
	        provider=provider,
	        store_url=f"sqlite:///{db_path}",
	        default_mode="optimize",
	    )

	    try:
	        # Simulate optimization
	        plan = client.chat.completions.simulate(
	            model="gpt-4o",
	            messages=messages,
	        )

	        tokens_before = plan.tokens_before
	        tokens_after = plan.tokens_after
	        tokens_saved = plan.tokens_saved
	        # Guard against an empty prompt to avoid dividing by zero.
	        savings_percent = (tokens_saved / tokens_before * 100) if tokens_before > 0 else 0

	        print("\n[Before Optimization]")
	        print(" - System prompt + conversation")
	        print(f" - Tool output: 100 user records ({len(serialized_results)} chars)")

	        print("\n[After Optimization]")
	        print(" - SmartCrusher kept: first 3, last 2, + relevance matches")
	        print(" - Estimated ~15 items preserved (Engineering dept matches)")
	        print(f" - Transforms: {plan.transforms}")
	    finally:
	        # Always release the client, even when simulation raises.
	        client.close()

	    return ComparisonResult(
	        scenario="Agentic Workflow with Large Tool Outputs",
	        tokens_before=tokens_before,
	        tokens_after=tokens_after,
	        tokens_saved=tokens_saved,
	        savings_percent=savings_percent,
	        latency_before_ms=None,
	        latency_after_ms=None,
	        cost_before_usd=estimate_cost(tokens_before),
	        cost_after_usd=estimate_cost(tokens_after),
	        cost_saved_usd=estimate_cost(tokens_saved),
	    )


	# ============================================================================
	# SCENARIO 2: Long Conversation with Context Window Pressure
	# ============================================================================


	def scenario_long_conversation() -> ComparisonResult:
	    """
	    Scenario: Multi-turn conversation approaching context window limit.

	    Common pattern: Chatbot accumulates history, needs to drop old turns.
	    Without optimization, either hits context limit or loses coherence.

	    Builds a synthetic 50-turn support conversation, runs it through
	    ``client.chat.completions.simulate`` and reports the planned token
	    counts. No latency is measured, so both latency fields are None.

	    Returns:
	        ComparisonResult with token and estimated-cost figures from the plan.
	    """
	    print("\n" + "=" * 60)
	    print("SCENARIO 2: Long Conversation with Context Window Pressure")
	    print("=" * 60)

	    # Simulate 50-turn conversation in LangChain format
	    lc_messages = [
	        SystemMessage(
	            content="""You are a customer support agent for TechCorp.
	You have access to customer data and can help with:
	- Account issues
	- Billing questions
	- Technical support
	- Product information

	Current date: 2024-12-15
	Agent ID: support-agent-42
	"""
	        ),
	    ]

	    # Add 50 turns of conversation
	    topics = [
	        "I can't log into my account",
	        "What's my current subscription?",
	        "Can you explain the premium features?",
	        "I was charged twice this month",
	        "How do I reset my password?",
	    ]

	    for i in range(50):
	        topic = topics[i % len(topics)]
	        lc_messages.append(HumanMessage(content=f"Turn {i}: {topic}"))
	        # Adjacent string literals are joined before `*` applies, so the whole
	        # reply (including the "Response to turn" prefix) is repeated 3 times.
	        reply = (
	            f"Response to turn {i}: Thank you for reaching out about '{topic}'. "
	            f"I can help you with that. Here's what I found... "
	        ) * 3
	        lc_messages.append(AIMessage(content=reply))

	    # Convert to OpenAI format
	    messages = langchain_to_openai_messages(lc_messages)

	    # Create Headroom client for simulation
	    from openai import OpenAI

	    db_path = os.path.join(tempfile.gettempdir(), "headroom_langchain_example.db")
	    # A placeholder key is used when OPENAI_API_KEY is unset.
	    base_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-fake-key"))
	    provider = OpenAIProvider()

	    client = HeadroomClient(
	        original_client=base_client,
	        provider=provider,
	        store_url=f"sqlite:///{db_path}",
	        default_mode="optimize",
	    )

	    try:
	        # Simulate optimization
	        plan = client.chat.completions.simulate(
	            model="gpt-4o",
	            messages=messages,
	        )

	        tokens_before = plan.tokens_before
	        tokens_after = plan.tokens_after
	        tokens_saved = plan.tokens_saved
	        # Guard against an empty prompt to avoid dividing by zero.
	        savings_percent = (tokens_saved / tokens_before * 100) if tokens_before > 0 else 0

	        print("\n[Before Optimization]")
	        print(" - 50-turn conversation")
	        print(f" - ~{tokens_before:,} tokens total")

	        print("\n[After Optimization]")
	        print(" - RollingWindow kept system + last N turns")
	        print(" - CacheAligner moved date to dynamic tail")
	        print(f" - Transforms: {plan.transforms}")
	    finally:
	        # Always release the client, even when simulation raises.
	        client.close()

	    return ComparisonResult(
	        scenario="Long Conversation (50 turns)",
	        tokens_before=tokens_before,
	        tokens_after=tokens_after,
	        tokens_saved=tokens_saved,
	        savings_percent=savings_percent,
	        latency_before_ms=None,
	        latency_after_ms=None,
	        cost_before_usd=estimate_cost(tokens_before),
	        cost_after_usd=estimate_cost(tokens_after),
	        cost_saved_usd=estimate_cost(tokens_saved),
	    )


	# ============================================================================
	# SCENARIO 3: RAG with Retrieved Documents
	# ============================================================================


	def scenario_rag_pipeline() -> ComparisonResult:
	    """
	    Scenario: RAG pipeline that retrieves multiple documents.

	    Common pattern: Retriever returns 10 chunks, many are redundant.
	    Without optimization, all chunks consume tokens.

	    Builds 10 synthetic document chunks, embeds them in a single user
	    prompt, runs it through ``client.chat.completions.simulate`` and reports
	    the planned token counts. No latency is measured, so both latency fields
	    are None.

	    Returns:
	        ComparisonResult with token and estimated-cost figures from the plan.
	    """
	    print("\n" + "=" * 60)
	    print("SCENARIO 3: RAG Pipeline with Retrieved Documents")
	    print("=" * 60)

	    # Simulate retrieved document chunks
	    chunks = [
	        {
	            "content": f"Document {i} content: " + "This is relevant information. " * 50,
	            "source": f"doc_{i}.pdf",
	            "page": i + 1,
	            "relevance_score": 0.9 - (i * 0.05),
	            "metadata": {
	                "author": f"Author {i}",
	                "date": "2024-01-15",
	                "category": "Technical",
	            },
	        }
	        for i in range(10)
	    ]

	    # Only source, page and content reach the prompt; the score and metadata
	    # stay on the chunk dicts.
	    context = "\n\n".join(
	        f"[Source: {c['source']}, Page {c['page']}]\n{c['content']}" for c in chunks
	    )

	    # LangChain format
	    lc_messages = [
	        SystemMessage(content="You are a helpful assistant. Answer based on the provided context."),
	        HumanMessage(
	            content=f"""Based on the following retrieved documents:

	{context}

	Question: What are the key technical requirements?"""
	        ),
	    ]

	    # Convert to OpenAI format
	    messages = langchain_to_openai_messages(lc_messages)

	    # Create Headroom client for simulation
	    from openai import OpenAI

	    db_path = os.path.join(tempfile.gettempdir(), "headroom_langchain_example.db")
	    # A placeholder key is used when OPENAI_API_KEY is unset.
	    base_client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "sk-fake-key"))
	    provider = OpenAIProvider()

	    client = HeadroomClient(
	        original_client=base_client,
	        provider=provider,
	        store_url=f"sqlite:///{db_path}",
	        default_mode="optimize",
	    )

	    try:
	        # Simulate optimization
	        plan = client.chat.completions.simulate(
	            model="gpt-4o",
	            messages=messages,
	        )

	        tokens_before = plan.tokens_before
	        tokens_after = plan.tokens_after
	        tokens_saved = plan.tokens_saved
	        # Guard against an empty prompt to avoid dividing by zero.
	        savings_percent = (tokens_saved / tokens_before * 100) if tokens_before > 0 else 0

	        print("\n[Before Optimization]")
	        print(" - 10 retrieved document chunks")
	        print(f" - ~{tokens_before:,} tokens total")

	        print("\n[After Optimization]")
	        print(" - CacheAligner normalized whitespace")
	        print(f" - Transforms: {plan.transforms}")
	    finally:
	        # Always release the client, even when simulation raises.
	        client.close()

	    return ComparisonResult(
	        scenario="RAG Pipeline (10 chunks)",
	        tokens_before=tokens_before,
	        tokens_after=tokens_after,
	        tokens_saved=tokens_saved,
	        savings_percent=savings_percent,
	        latency_before_ms=None,
	        latency_after_ms=None,
	        cost_before_usd=estimate_cost(tokens_before),
	        cost_after_usd=estimate_cost(tokens_after),
	        cost_saved_usd=estimate_cost(tokens_saved),
	    )


	# ============================================================================
	# SCENARIO 4: Real API Comparison (if API key available)
	# ============================================================================


	def scenario_live_api() -> ComparisonResult | None:
	    """
	    Scenario: Live API comparison with actual timing.

	    Only runs if OPENAI_API_KEY is set and langchain-openai is installed.
	    Sends the same short prompt through the plain OpenAI client and through
	    the Headroom-wrapped client and times each call.

	    Returns:
	        ComparisonResult with measured latencies and prompt-token counts, or
	        None when the scenario is skipped.
	    """
	    if not os.environ.get("OPENAI_API_KEY"):
	        print("\n[!] Skipping live API test (OPENAI_API_KEY not set)")
	        return None

	    if not OPENAI_AVAILABLE:
	        print("\n[!] Skipping live API test (langchain-openai not installed)")
	        return None

	    print("\n" + "=" * 60)
	    print("SCENARIO 4: Live API Comparison")
	    print("=" * 60)

	    from openai import OpenAI

	    # Create base OpenAI client
	    base_client = OpenAI()

	    # Create Headroom-wrapped client
	    db_path = os.path.join(tempfile.gettempdir(), "headroom_langchain_live.db")
	    provider = OpenAIProvider()

	    headroom_client = HeadroomClient(
	        original_client=base_client,
	        provider=provider,
	        store_url=f"sqlite:///{db_path}",
	        default_mode="optimize",
	    )

	    # Test messages in OpenAI format
	    messages = [
	        {"role": "system", "content": "You are helpful. Be concise."},
	        {"role": "user", "content": "What is 2+2?"},
	    ]

	    try:
	        # Time the base client. perf_counter() is monotonic, so the interval
	        # is not distorted by wall-clock adjustments.
	        start = time.perf_counter()
	        base_response = base_client.chat.completions.create(
	            model="gpt-4o-mini",
	            messages=messages,
	            max_tokens=50,
	        )
	        latency_before = (time.perf_counter() - start) * 1000

	        # Time the Headroom-wrapped client
	        start = time.perf_counter()
	        optimized_response = headroom_client.chat.completions.create(
	            model="gpt-4o-mini",
	            messages=messages,
	            headroom_mode="optimize",
	            max_tokens=50,
	        )
	        latency_after = (time.perf_counter() - start) * 1000

	        # message.content may be None; fall back to "" so slicing cannot fail.
	        base_text = base_response.choices[0].message.content or ""
	        optimized_text = optimized_response.choices[0].message.content or ""
	        print(f"\n[Base Model Response] {base_text[:50]}...")
	        print(f"[Optimized Response] {optimized_text[:50]}...")
	        print(f"\n[Latency] {latency_before:.0f}ms -> {latency_after:.0f}ms")

	        # Get metrics (the return value is currently unused)
	        headroom_client.get_summary()
	    finally:
	        # Always release the wrapped client, even when an API call raises.
	        headroom_client.close()

	    # For this simple case, savings are minimal.
	    # 20 is a placeholder count used when the response carries no usage data.
	    tokens_before = base_response.usage.prompt_tokens if base_response.usage else 20
	    tokens_after = optimized_response.usage.prompt_tokens if optimized_response.usage else 20
	    tokens_saved = max(0, tokens_before - tokens_after)

	    return ComparisonResult(
	        scenario="Live API (Simple Query)",
	        tokens_before=tokens_before,
	        tokens_after=tokens_after,
	        tokens_saved=tokens_saved,
	        savings_percent=(tokens_saved / tokens_before * 100) if tokens_before > 0 else 0,
	        latency_before_ms=latency_before,
	        latency_after_ms=latency_after,
	        cost_before_usd=estimate_cost(tokens_before, "gpt-4o-mini"),
	        cost_after_usd=estimate_cost(tokens_after, "gpt-4o-mini"),
	        cost_saved_usd=estimate_cost(tokens_saved, "gpt-4o-mini"),
	    )


	# ============================================================================
	# MAIN: Run All Scenarios
	# ============================================================================


	def main():
	    """Run every scenario, print each comparison, then print combined totals."""
	    banner = "=" * 70
	    print("\n" + banner)
	    print(" HEADROOM + LANGCHAIN: Before vs After Comparison")
	    print(banner)

	    if not LANGCHAIN_AVAILABLE:
	        print("\n[X] Cannot run examples: LangChain not installed")
	        print(" Install with: pip install langchain-core")
	        return

	    if not HEADROOM_AVAILABLE:
	        print("\n[X] Cannot run examples: Headroom not installed")
	        return

	    # (label used in the failure message, scenario callable), in run order.
	    scenarios = (
	        ("Scenario 1", scenario_agentic_workflow),
	        ("Scenario 2", scenario_long_conversation),
	        ("Scenario 3", scenario_rag_pipeline),
	        ("Live API scenario", scenario_live_api),
	    )

	    results = []
	    for label, run_scenario in scenarios:
	        # A failing scenario is reported and skipped; the rest still run.
	        try:
	            outcome = run_scenario()
	        except Exception as exc:
	            print(f"\n[X] {label} failed: {exc}")
	        else:
	            # The live scenario returns None when it is skipped.
	            if outcome:
	                results.append(outcome)

	    # Print all results
	    print("\n\n" + banner)
	    print(" SUMMARY: All Scenarios")
	    print(banner)

	    for entry in results:
	        print_comparison(entry)

	    # Totals are only meaningful when at least one scenario produced a result.
	    if not results:
	        return

	    count = len(results)
	    total_saved = sum(entry.tokens_saved for entry in results)
	    total_cost_saved = sum(entry.cost_saved_usd for entry in results)
	    avg_savings = sum(entry.savings_percent for entry in results) / count

	    print("\n" + banner)
	    print(" TOTAL IMPACT")
	    print(banner)
	    print(f"\n[Results] Across {count} scenarios:")
	    print(f" Total tokens saved: {total_saved:,}")
	    print(f" Average savings: {avg_savings:.1f}%")
	    print(f" Total cost saved: ${total_cost_saved:.4f}")
	    print("\n[Projection] At scale (1M requests/month):")
	    # Mean cost saved per scenario, scaled to one million requests.
	    print(f" Estimated monthly savings: ${total_cost_saved * 1_000_000 / count:,.2f}")


	# Run the comparison suite only when executed as a script, not on import.
	if __name__ == "__main__":
	main()