Spaces:
Build error
Build error
| #!/usr/bin/env python3 | |
| """Benchmark: LLM Extraction vs Embedding-Only Memory. | |
| Demonstrates the massive latency difference between: | |
| 1. OLD: LLM-based extraction (2-3 seconds) | |
| 2. NEW: Embedding-only storage (sub-100ms) | |
| Usage: | |
| export OPENAI_API_KEY="sk-..." | |
| python examples/memory_latency_benchmark.py | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import sys | |
| import tempfile | |
| import time | |
| from pathlib import Path | |
| from statistics import mean | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from openai import OpenAI | |
| from headroom.memory.extractor import MemoryExtractor | |
| from headroom.memory.fast_store import ( | |
| FastMemoryStore, | |
| create_local_embed_fn, | |
| create_openai_batch_embed_fn, | |
| create_openai_embed_fn, | |
| ) | |
| # Test messages with memory-worthy content | |
| TEST_MESSAGES = [ | |
| ("I prefer Python over JavaScript for backend development", "Great choice!"), | |
| ("I work at a fintech startup handling payment processing", "Interesting domain!"), | |
| ("Always use PostgreSQL for relational data, never MongoDB", "Solid preference!"), | |
| ("I'm migrating from monolith to microservices architecture", "Good luck!"), | |
| ("My email is test@example.com, contact me there only", "Noted!"), | |
| ] | |
| def benchmark_llm_extraction(client: OpenAI, num_runs: int = 5) -> list[float]: | |
| """Benchmark the OLD LLM-based extraction approach.""" | |
| print("\n" + "=" * 60) | |
| print("BENCHMARK: LLM-Based Extraction (OLD)") | |
| print("=" * 60) | |
| extractor = MemoryExtractor(client) | |
| latencies = [] | |
| for i, (query, response) in enumerate(TEST_MESSAGES[:num_runs]): | |
| start = time.perf_counter() | |
| memories = extractor.extract(query, response) | |
| elapsed = time.perf_counter() - start | |
| latencies.append(elapsed * 1000) # Convert to ms | |
| print(f" Run {i + 1}: {elapsed * 1000:.0f}ms - extracted {len(memories)} memories") | |
| return latencies | |
| def benchmark_embedding_store(client: OpenAI, num_runs: int = 5) -> list[float]: | |
| """Benchmark embedding-only approach with INDIVIDUAL API calls.""" | |
| print("\n" + "=" * 60) | |
| print("BENCHMARK: Embedding-Only, Individual Calls") | |
| print("=" * 60) | |
| with tempfile.TemporaryDirectory() as tmpdir: | |
| db_path = Path(tmpdir) / "fast_memory.db" | |
| embed_fn = create_openai_embed_fn(client) | |
| store = FastMemoryStore(db_path, embed_fn=embed_fn) | |
| latencies = [] | |
| for i, (query, response) in enumerate(TEST_MESSAGES[:num_runs]): | |
| start = time.perf_counter() | |
| # Store both messages (2 separate API calls) | |
| store.add("test_user", query, role="user") | |
| store.add("test_user", response, role="assistant") | |
| elapsed = time.perf_counter() - start | |
| latencies.append(elapsed * 1000) # Convert to ms | |
| print(f" Run {i + 1}: {elapsed * 1000:.0f}ms - stored 2 chunks (2 API calls)") | |
| return latencies | |
| def benchmark_batched_embedding(client: OpenAI, num_runs: int = 5) -> list[float]: | |
| """Benchmark embedding-only approach with BATCHED API calls.""" | |
| print("\n" + "=" * 60) | |
| print("BENCHMARK: Embedding-Only, BATCHED Calls") | |
| print("=" * 60) | |
| with tempfile.TemporaryDirectory() as tmpdir: | |
| db_path = Path(tmpdir) / "fast_memory.db" | |
| embed_fn = create_openai_embed_fn(client) | |
| batch_embed_fn = create_openai_batch_embed_fn(client) | |
| store = FastMemoryStore(db_path, embed_fn=embed_fn) | |
| latencies = [] | |
| for i, (query, response) in enumerate(TEST_MESSAGES[:num_runs]): | |
| start = time.perf_counter() | |
| # Store both messages in ONE API call | |
| store.add_turn_batched("test_user", query, response, batch_embed_fn) | |
| elapsed = time.perf_counter() - start | |
| latencies.append(elapsed * 1000) # Convert to ms | |
| print(f" Run {i + 1}: {elapsed * 1000:.0f}ms - stored 2 chunks (1 API call)") | |
| return latencies | |
| def benchmark_local_embedding(num_runs: int = 5) -> list[float]: | |
| """Benchmark embedding-only approach with LOCAL model (FASTEST).""" | |
| print("\n" + "=" * 60) | |
| print("BENCHMARK: LOCAL Embeddings (FASTEST - No API!)") | |
| print("=" * 60) | |
| # Load model once (this is slow, but only happens once) | |
| print(" Loading local model (one-time cost)...") | |
| start_load = time.perf_counter() | |
| embed_fn = create_local_embed_fn("all-MiniLM-L6-v2") | |
| load_time = time.perf_counter() - start_load | |
| print(f" Model loaded in {load_time:.1f}s") | |
| # Warmup runs to trigger JIT compilation | |
| print(" Warming up (JIT compilation)...") | |
| for _ in range(3): | |
| embed_fn("warmup text for compilation") | |
| with tempfile.TemporaryDirectory() as tmpdir: | |
| db_path = Path(tmpdir) / "fast_memory.db" | |
| store = FastMemoryStore(db_path, embed_fn=embed_fn, embedding_dim=384) | |
| latencies = [] | |
| for i, (query, response) in enumerate(TEST_MESSAGES[:num_runs]): | |
| start = time.perf_counter() | |
| # Store both messages | |
| store.add("test_user", query, role="user") | |
| store.add("test_user", response, role="assistant") | |
| elapsed = time.perf_counter() - start | |
| latencies.append(elapsed * 1000) # Convert to ms | |
| print(f" Run {i + 1}: {elapsed * 1000:.1f}ms - stored 2 chunks (LOCAL)") | |
| return latencies | |
| def benchmark_search_comparison(client: OpenAI) -> None: | |
| """Compare search latency: FTS5 vs Vector Similarity.""" | |
| print("\n" + "=" * 60) | |
| print("BENCHMARK: Search Latency") | |
| print("=" * 60) | |
| with tempfile.TemporaryDirectory() as tmpdir: | |
| db_path = Path(tmpdir) / "fast_memory.db" | |
| embed_fn = create_openai_embed_fn(client) | |
| store = FastMemoryStore(db_path, embed_fn=embed_fn) | |
| # Populate with test data | |
| print(" Populating store with 20 memories...") | |
| for query, response in TEST_MESSAGES * 4: | |
| store.add("test_user", query, role="user") | |
| store.add("test_user", response, role="assistant") | |
| # Benchmark searches | |
| search_queries = [ | |
| "What programming language?", | |
| "database recommendations", | |
| "architecture patterns", | |
| "contact information", | |
| ] | |
| print("\n Search latencies:") | |
| for query in search_queries: | |
| start = time.perf_counter() | |
| results = store.search("test_user", query, top_k=3) | |
| elapsed = time.perf_counter() - start | |
| top_match = results[0][0].text[:40] if results else "None" | |
| print(f" '{query}' -> {elapsed * 1000:.0f}ms ({len(results)} results)") | |
| print(f" Top match: '{top_match}...'") | |
| def main(): | |
| """Run all benchmarks.""" | |
| api_key = os.environ.get("OPENAI_API_KEY") | |
| if not api_key: | |
| print("ERROR: OPENAI_API_KEY environment variable not set") | |
| sys.exit(1) | |
| client = OpenAI(api_key=api_key) | |
| print("=" * 60) | |
| print("MEMORY LATENCY BENCHMARK") | |
| print("Comparing LLM Extraction vs Embedding-Only") | |
| print("=" * 60) | |
| # Run benchmarks | |
| llm_latencies = benchmark_llm_extraction(client, num_runs=3) | |
| embed_latencies = benchmark_embedding_store(client, num_runs=3) | |
| batched_latencies = benchmark_batched_embedding(client, num_runs=5) | |
| local_latencies = benchmark_local_embedding(num_runs=5) | |
| benchmark_search_comparison(client) | |
| # Summary | |
| print("\n" + "=" * 60) | |
| print("SUMMARY") | |
| print("=" * 60) | |
| llm_avg = mean(llm_latencies) | |
| embed_avg = mean(embed_latencies) | |
| batched_avg = mean(batched_latencies) | |
| local_avg = mean(local_latencies) | |
| print(f"\n{'Approach':<35} {'Avg Latency':<15} {'Speedup':<10}") | |
| print("-" * 60) | |
| print(f"{'LLM Extraction (OLD)':<35} {llm_avg:>10.0f}ms {'1.0x':>10}") | |
| print(f"{'Embedding (2 API calls)':<35} {embed_avg:>10.0f}ms {llm_avg / embed_avg:>9.1f}x") | |
| print( | |
| f"{'Embedding BATCHED (1 API call)':<35} {batched_avg:>10.0f}ms {llm_avg / batched_avg:>9.1f}x" | |
| ) | |
| print(f"{'LOCAL Embeddings (no API!)':<35} {local_avg:>10.1f}ms {llm_avg / local_avg:>9.0f}x") | |
| print(f"\n{'=' * 60}") | |
| print(f"BEST SPEEDUP: {llm_avg / local_avg:.0f}x FASTER with local embeddings!") | |
| print(f"{'=' * 60}") | |
| if local_avg < 100: | |
| print("\n✓ SUB-100ms ACHIEVED with local embeddings!") | |
| if local_avg < 50: | |
| print("✓ SUB-50ms ACHIEVED!") | |
| if local_avg < 20: | |
| print("✓ SUB-20ms ACHIEVED - GOAL MET!") | |
| if __name__ == "__main__": | |
| main() | |