"""Hard eval: cases where the LLM has NO reason to check compressed data.

The previous eval asked "are there failures?" — that's too easy, the LLM
will proactively check regardless of summary.

This eval tests the SUBTLE case: the user asks a DIFFERENT question, but the
answer is in the compressed data. The summary is the only hint.

Requires: ANTHROPIC_API_KEY in environment or .env file.
"""

from __future__ import annotations

import json
import os
from pathlib import Path
from typing import Any

import pytest


def _parse_env_text(text: str) -> dict[str, str]:
    """Parse ``KEY=VALUE`` lines from the contents of a .env file.

    Blank lines, ``#`` comment lines, lines without ``=`` and lines with an
    empty key are ignored. Keys and values are whitespace-stripped, and one
    pair of matching surrounding quotes is removed from the value so that
    ``KEY="secret"`` yields ``secret`` rather than ``"secret"``. When a key
    occurs more than once, the first occurrence wins.

    Args:
        text: Raw text of the .env file.

    Returns:
        Mapping of variable name to value, in file order.
    """
    parsed: dict[str, str] = {}
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        if not key:
            # A line such as "=value" has no usable variable name.
            continue
        value = value.strip()
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        parsed.setdefault(key, value)
    return parsed


# Load a .env file from the directory above this file's directory, if present.
# Variables that are already set in the real environment take precedence.
env_path = Path(__file__).parent.parent / ".env"
if env_path.exists():
    for _name, _value in _parse_env_text(env_path.read_text(encoding="utf-8")).items():
        os.environ.setdefault(_name, _value)

ANTHROPIC_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Every test in this module talks to the live API, so skip without a key.
pytestmark = pytest.mark.skipif(
    not ANTHROPIC_KEY,
    reason="ANTHROPIC_API_KEY not set",
)

ANTHROPIC_MESSAGES_URL = "https://api.anthropic.com/v1/messages"
CLAUDE_MODEL = "claude-sonnet-4-5-20250929"

# Tool definition offered to the model in every request made by these tests.
HEADROOM_RETRIEVE_TOOL = {
    "name": "headroom_retrieve",
    "description": "Retrieve uncompressed content. Pass a query to search within it.",
    "input_schema": {
        "type": "object",
        "properties": {
            "hash": {"type": "string"},
            "query": {"type": "string"},
        },
        "required": ["hash"],
    },
}


def _call_claude(messages, tools, max_tokens=300):
    """POST one request to the Anthropic Messages API and return the JSON body.

    Args:
        messages: List of message dicts sent as the request's ``messages``.
        tools: List of tool definitions sent as the request's ``tools``.
        max_tokens: Value sent as the request's ``max_tokens``.

    Returns:
        The decoded JSON response body.

    Raises:
        httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
            Without this check an error body (which has no ``content`` or
            ``stop_reason``) would look like "the model did not retrieve".
    """
    # Imported lazily so the module can be collected (and skipped) without httpx.
    import httpx

    resp = httpx.post(
        ANTHROPIC_MESSAGES_URL,
        headers={
            "X-Api-Key": ANTHROPIC_KEY,
            "anthropic-version": "2023-06-01",
            "Content-Type": "application/json",
        },
        json={
            "model": CLAUDE_MODEL,
            "max_tokens": max_tokens,
            "messages": messages,
            "tools": tools,
        },
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()


def _get_tool_calls(resp):
    """Return ``{"name", "input"}`` dicts for each ``tool_use`` content block."""
    return [
        {"name": block["name"], "input": block.get("input", {})}
        for block in resp.get("content", [])
        if block.get("type") == "tool_use"
    ]


def _get_text(resp):
    """Return the text of all ``text`` content blocks, joined by single spaces."""
    return " ".join(
        block.get("text", "")
        for block in resp.get("content", [])
        if block.get("type") == "text"
    )


def _production_configs() -> list[dict[str, str]]:
    """Return a fresh list of the production config rows left visible to the LLM."""
    return [
        {"env": "production", "key": "DATABASE_URL", "value": "postgres://prod-db:5432/app"},
        {"env": "production", "key": "REDIS_URL", "value": "redis://prod-cache:6379"},
        {"env": "production", "key": "API_RATE_LIMIT", "value": "1000"},
    ]


def _ask(prompt: str) -> dict[str, Any]:
    """Send ``prompt`` as a single user message with the retrieve tool available."""
    return _call_claude(
        [{"role": "user", "content": prompt}],
        [HEADROOM_RETRIEVE_TOOL],
    )


def _report(resp: dict[str, Any], summary: str | None = None) -> tuple[list[dict[str, Any]], str]:
    """Print a short trace of ``resp`` and return ``(tool_calls, text)``.

    The summary line is printed only when ``summary`` is not ``None``.
    """
    tool_calls = _get_tool_calls(resp)
    text = _get_text(resp)
    print()
    if summary is not None:
        print(f" Summary: {summary}")
    print(f" Stop reason: {resp.get('stop_reason')}")
    print(f" Tool calls: {tool_calls}")
    if text:
        print(f" Text: {text[:200]}")
    return tool_calls, text


def _first_query(tool_calls: list[dict[str, Any]]) -> str:
    """Return the lower-cased ``query`` argument of the first tool call ("" if absent)."""
    assert tool_calls, "stop_reason was 'tool_use' but the response has no tool_use block"
    return (tool_calls[0]["input"].get("query") or "").lower()


class TestHardCases:
    """Cases where the LLM wouldn't naturally check compressed data."""

    def test_config_lookup_with_summary(self):
        """User asks about a config value that's in compressed data.

        The visible items are all about 'production' env. The compressed
        items include 'staging' configs. Summary mentions this. LLM should
        retrieve.
        """
        from headroom.transforms.compression_summary import summarize_dropped_items

        visible = _production_configs()
        # Hidden in compressed: staging and development configs.
        staging = [
            {"env": "staging", "key": "DATABASE_URL", "value": "postgres://staging-db:5432/app"},
            {"env": "staging", "key": "REDIS_URL", "value": "redis://staging-cache:6379"},
            {"env": "staging", "key": "DEBUG_MODE", "value": "true"},
            {"env": "staging", "key": "LOG_LEVEL", "value": "debug"},
        ]
        development = [
            {"env": "development", "key": "DATABASE_URL", "value": "postgres://localhost:5432/dev"},
        ]
        all_items = visible + staging * 10 + development * 5

        summary = summarize_dropped_items(all_items, visible)
        compressed_output = json.dumps(visible, indent=2) + (
            f"\n[{len(all_items) - len(visible)} items compressed to {len(visible)}."
            f" Omitted: {summary}."
            ' Retrieve specific items: headroom_retrieve(hash="config_hash", query="search")]'
        )

        resp = _ask(
            f"Here are the application configs:\n\n{compressed_output}\n\n"
            "What is the staging database URL?"
        )
        tool_calls, text = _report(resp, summary)

        # WITH summary mentioning "staging" → should retrieve
        if resp.get("stop_reason") == "tool_use":
            query = _first_query(tool_calls)
            assert tool_calls[0]["name"] == "headroom_retrieve", tool_calls
            assert "staging" in query or "database" in query, (
                f"retrieve query does not target the staging DB config: {query!r}"
            )
            print(" RESULT: Retrieved staging config ✓")
        else:
            # If LLM didn't retrieve, it should at least mention the data is compressed
            lowered = text.lower()
            assert "compressed" in lowered or "staging" in lowered, (
                f"no retrieval and no mention of compressed/staging data: {text[:200]!r}"
            )
            print(" RESULT: Mentioned compressed data but didn't retrieve")

    def test_config_lookup_without_summary(self):
        """Same question, but NO summary. LLM only sees production configs.

        Baseline for comparison: the outcome is printed, not asserted.
        """
        visible = _production_configs()
        compressed_output = json.dumps(visible, indent=2)
        compressed_output += "\n[45 items compressed to 3. Retrieve more: hash=config_hash]"

        resp = _ask(
            f"Here are the application configs:\n\n{compressed_output}\n\n"
            "What is the staging database URL?"
        )
        _report(resp)

        if resp.get("stop_reason") == "tool_use":
            print(" RESULT: LLM proactively retrieved (smart)")
        else:
            print(" RESULT: LLM did NOT retrieve staging config")

    def test_specific_user_in_large_list_with_summary(self):
        """Find a specific user in a compressed user list.

        Summary mentions user roles. User asks about admins. The retrieve
        query is asserted only when the model chooses to call the tool;
        otherwise the outcome is just printed.
        """
        from headroom.transforms.compression_summary import summarize_dropped_items

        def member(i):
            return {"id": i, "name": f"user_{i}", "role": "member", "email": f"user{i}@co.com"}

        visible = [member(i) for i in range(5)]
        hidden_members = [member(i) for i in range(5, 95)]
        admins = [
            {"id": 96, "name": "admin_sarah", "role": "admin", "email": "sarah@co.com"},
            {"id": 97, "name": "admin_mike", "role": "admin", "email": "mike@co.com"},
            {"id": 98, "name": "superadmin_jane", "role": "superadmin", "email": "jane@co.com"},
        ]
        all_items = visible + hidden_members + admins

        summary = summarize_dropped_items(all_items, visible)
        compressed_output = json.dumps(visible, indent=2) + (
            f"\n[{len(all_items) - len(visible)} items compressed to {len(visible)}."
            f" Omitted: {summary}."
            ' Retrieve: headroom_retrieve(hash="users_hash", query="search")]'
        )

        resp = _ask(
            f"Here's our user list:\n\n{compressed_output}\n\n"
            "Who are the admin users? I need to contact them."
        )
        tool_calls, _text = _report(resp, summary)

        if resp.get("stop_reason") == "tool_use":
            query = _first_query(tool_calls)
            assert "admin" in query, f"retrieve query does not target admins: {query!r}"
            print(f" RESULT: Retrieved admin users (query='{query}') ✓")
        else:
            print(" RESULT: Did not retrieve admin users")