# app/core/model_registry.py - Centralized Model Capability Registry
"""
Enterprise Model Capability Registry.

Decouples hardcoded model IDs from their specific capabilities
(Strict Mode, Tools, etc.) and keeps per-model routing metadata
(role, rate limits, context window) in one place.
Facilitates dynamic scaling across multiple providers
(Groq, OpenAI, Anthropic).
"""

from enum import Enum
from typing import Any, Dict, List, Optional


class Capability(str, Enum):
    """Feature flags a model may advertise in the registry."""

    STRICT_MODE = "strict_mode"  # Supports constrained decoding (strict: true)
    JSON_SCHEMA = "json_schema"  # Supports structured outputs (strict: false)
    JSON_OBJECT = "json_object"  # Only supports response_format: json_object
    TOOLS = "tools"  # Supports native tool calling
    CACHE = "prompt_cache"  # Supports prompt caching
    REASONING = "reasoning"  # Supports explicit reasoning (format/effort)
    PARALLEL_TOOLS = "parallel_tools"  # Supports executing multiple tools in one turn
    BUILT_IN_TOOLS = "built_in_tools"  # Supports Groq server-side tools (web_search, etc.)
    REMOTE_MCP = "remote_mcp"  # Supports connecting to external MCP servers
    VISIT_WEBSITE = "visit_website"  # Supports Groq native URL analysis
    BROWSER_AUTOMATION = "browser_automation"  # Supports interactive browser actions
    WOLFRAM_ALPHA = "wolfram_alpha"  # Supports Wolfram computational engine
    CODE_INTERPRETER = "code_interpreter"  # Supports Groq server-side Python (code_interpreter)


class ModelRegistry:
    """Manages model inventory across all providers.

    All state lives in class attributes and every public method is a
    classmethod, so the registry can be used with or without an instance.
    """

    # MASTER INVENTORY
    # Keyed by model id. Every entry has "provider", "capabilities" and "role";
    # Groq entries additionally carry a description, rate limits and a
    # context window. Insertion order matters: the role-based lookup in
    # get_preferred_model returns the first match.
    MODELS: Dict[str, Dict[str, Any]] = {
        # --- GROQ INFRASTRUCTURE ---
        "allam-2-7b": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_OBJECT,
                Capability.TOOLS,
                Capability.PARALLEL_TOOLS,
            ],
            "role": "FAST_CHAT",
            "description": "Arabic-optimized high-speed chat",
            "rpm": 30,
            "rpd": 7000,
            "tpm": 6000,
            "tpd": 500000,
            "context_window": 131072,
        },
        "openai/gpt-oss-20b": {
            "provider": "groq",
            "capabilities": [
                Capability.STRICT_MODE,
                Capability.JSON_SCHEMA,
                Capability.CACHE,
                Capability.REASONING,
                Capability.TOOLS,
                Capability.BUILT_IN_TOOLS,
                Capability.VISIT_WEBSITE,
                Capability.CODE_INTERPRETER,
            ],
            "role": "STRUCTURED_OUTPUT",
            "description": "High-precision forensic extraction (Supports Built-In Tools)",
            "rpm": 30,
            "rpd": 1000,
            "tpm": 8000,
            "tpd": 200000,
            "context_window": 131072,
        },
        "llama-3.3-70b-versatile": {
            "provider": "groq",
            "capabilities": [
                Capability.TOOLS,
                Capability.PARALLEL_TOOLS,
                Capability.REMOTE_MCP,
                Capability.JSON_OBJECT,
                Capability.JSON_SCHEMA,
            ],
            "role": "SMART_REASONING",
            "description": "Versatile reasoning and official JSON Schema support",
            "rpm": 30,
            "rpd": 1000,
            "tpm": 12000,
            "tpd": 100000,
            "context_window": 131072,
        },
        "openai/gpt-oss-120b": {
            "provider": "groq",
            "capabilities": [
                Capability.STRICT_MODE,
                Capability.JSON_SCHEMA,
                Capability.CACHE,
                Capability.REASONING,
                Capability.TOOLS,
                Capability.BUILT_IN_TOOLS,
                Capability.VISIT_WEBSITE,
                Capability.CODE_INTERPRETER,
            ],
            "role": "SMART_REASONING",
            "description": "Massive scale reasoning (Supports Built-In Tools)",
            "rpm": 30,
            "rpd": 1000,
            "tpm": 8000,
            "tpd": 200000,
            "context_window": 131072,
        },
        "llama-3.1-8b-instant": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_OBJECT,
                Capability.TOOLS,
                Capability.PARALLEL_TOOLS,
            ],
            "role": "FAST_CHAT",
            "description": "Ultra-fast chat with high daily request limit (14.4K)",
            "rpm": 30,
            "rpd": 14400,
            "tpm": 6000,
            "tpd": 500000,
            "context_window": 131072,
        },
        "qwen/qwen3-32b": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_SCHEMA,
                Capability.REASONING,
                Capability.TOOLS,
                Capability.PARALLEL_TOOLS,
            ],
            "role": "SMART_REASONING",
            "description": "Advanced reasoning (Qwen 3) Thinking Mode",
            "rpm": 60,
            "rpd": 1000,
            "tpm": 6000,
            "tpd": 500000,
            "context_window": 131072,
        },
        "openai/gpt-oss-safeguard-20b": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_SCHEMA,
                Capability.CACHE,
                Capability.REASONING,
                Capability.TOOLS,
            ],
            "role": "SAFETY_GUARD",
            "description": "Policy-following safety analysis (Best-effort Structured Output)",
            "rpm": 30,
            "rpd": 1000,
            "tpm": 8000,
            "tpd": 200000,
            "context_window": 131072,
        },
        "meta-llama/llama-guard-4-12b": {
            "provider": "groq",
            "capabilities": [Capability.TOOLS, Capability.PARALLEL_TOOLS],
            "role": "SAFETY_GUARD",
            "description": "Security guardrail (14.4K RPD Workhorse)",
            "rpm": 30,
            "rpd": 14400,
            "tpm": 15000,
            "tpd": 500000,
            "context_window": 131072,
        },
        # Ultra-light safety model for final safety fallback
        "meta-llama/llama-prompt-guard-2-86m": {
            "provider": "groq",
            "capabilities": [],  # Minimal - prompt safety only
            "role": "SAFETY_GUARD",
            "description": "Ultra-light Prompt Guard (Preview, 86M params)",
            "rpm": 30,
            "rpd": 30000,
            "tpm": 15000,
            "tpd": 1000000,
            "context_window": 4096,
        },
        "moonshotai/kimi-k2-instruct-0905": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_OBJECT,
                Capability.CACHE,
                Capability.REASONING,
                Capability.PARALLEL_TOOLS,
                Capability.TOOLS,
            ],
            "role": "NATURAL_CHAT",
            "description": "Agentic reasoning (Thinking Mode - 262k Window)",
            "rpm": 60,
            "rpd": 1000,
            "tpm": 10000,
            "tpd": 300000,
            "context_window": 262144,
        },
        "meta-llama/llama-4-maverick-17b-128e-instruct": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_SCHEMA,
                Capability.PARALLEL_TOOLS,
                Capability.TOOLS,
                Capability.JSON_OBJECT,
            ],
            "role": "STRUCTURED_OUTPUT",
            "description": "Llama 4 Maverick (500k TPD Generalist)",
            "rpm": 30,
            "rpd": 1000,
            "tpm": 6000,
            "tpd": 500000,
            "context_window": 512000,
        },
        "meta-llama/llama-4-scout-17b-16e-instruct": {
            "provider": "groq",
            "capabilities": [
                Capability.JSON_SCHEMA,
                Capability.PARALLEL_TOOLS,
                Capability.TOOLS,
            ],
            "role": "FAST_CHAT",
            "description": "Llama 4 Scout (30K TPM - 10M Context)",
            "rpm": 30,
            "rpd": 1000,
            "tpm": 30000,
            "tpd": 500000,
            "context_window": 10000000,
        },
        # NOTE: the two compound entries carry no "tpd" key.
        "groq/compound": {
            "provider": "groq",
            "capabilities": [
                Capability.BUILT_IN_TOOLS,
                Capability.VISIT_WEBSITE,
                Capability.BROWSER_AUTOMATION,
                Capability.CODE_INTERPRETER,
                Capability.WOLFRAM_ALPHA,
                Capability.JSON_OBJECT,
            ],
            "role": "FORENSIC_SEARCH",
            "description": "Groq Compound (Multi-Tool Server-side)",
            "rpm": 30,
            "rpd": 250,
            "tpm": 70000,
            "context_window": 131072,
        },
        "groq/compound-mini": {
            "provider": "groq",
            "capabilities": [
                Capability.BUILT_IN_TOOLS,
                Capability.VISIT_WEBSITE,
                Capability.CODE_INTERPRETER,
                Capability.JSON_OBJECT,
            ],
            "role": "FAST_CHAT",
            "description": "Groq Compound Mini (Single-Tool, 3x Lower Latency)",
            "rpm": 30,
            "rpd": 250,
            "tpm": 70000,
            "context_window": 131072,
        },
        # --- OPENAI INFRASTRUCTURE ---
        "gpt-4o": {
            "provider": "openai",
            "capabilities": [
                Capability.STRICT_MODE,
                Capability.JSON_SCHEMA,
                Capability.TOOLS,
            ],
            "role": "SMART_REASONING",
        },
        "gpt-4o-mini": {
            "provider": "openai",
            "capabilities": [
                Capability.STRICT_MODE,
                Capability.JSON_SCHEMA,
                Capability.TOOLS,
            ],
            "role": "FAST_CHAT",
        },
        # --- ANTHROPIC INFRASTRUCTURE ---
        "claude-3-5-sonnet-latest": {
            "provider": "anthropic",
            "capabilities": [Capability.TOOLS, Capability.JSON_OBJECT],
            "role": "SMART_REASONING",
        },
    }

    # Suffix callers may append to a role name (e.g. "FAST_CHAT_MODEL").
    _ROLE_SUFFIX = "_MODEL"

    # Default model when nothing more specific can be resolved.
    _DEFAULT_MODEL = "llama-3.3-70b-versatile"

    # Chain used for providers that have no entries in MODELS at all.
    _GENERIC_CHAIN: List[str] = ["gpt-4o", "gpt-4o-mini"]

    # Specialized routing for GROQ (Free Tier Throughput Optimization).
    _GROQ_PREFERRED: Dict[str, str] = {
        # Scout has 30K TPM, massive context, and 500k TPD.
        "FAST_CHAT": "meta-llama/llama-4-scout-17b-16e-instruct",
        # Prefer GPT-OSS 20B for precision/strict mode normally.
        "STRUCTURED_OUTPUT": "openai/gpt-oss-20b",
        # Use Llama Guard 4 as primary because it has 14.4K RPD limit.
        "SAFETY_GUARD": "meta-llama/llama-guard-4-12b",
        # Llama 3.3 70B is the standard for smart reasoning without 400 errors.
        "SMART_REASONING": "llama-3.3-70b-versatile",
        # Kimi is best for natural chat and personas due to context/cache.
        "NATURAL_CHAT": "moonshotai/kimi-k2-instruct-0905",
        "FORENSIC_SEARCH": "groq/compound",
    }

    # ----------------------------------------------------------------
    # CAPABILITY-AWARE FALLBACK CHAINS (Production Grade)
    # Rule: Same category -> Same capability -> Higher benchmark -> Local
    # NOTE(review): NATURAL_CHAT has a pinned model above but no chain
    # here, so its chain is the single-model default. Confirm intended.
    # ----------------------------------------------------------------
    _GROQ_CHAINS: Dict[str, List[str]] = {
        # SMART REASONING: Generalist -> Native reasoning -> Strongest reasoning
        "SMART_REASONING": [
            "llama-3.3-70b-versatile",  # Tier 1: Generalist (12K TPM), Reliable JSON Schema
            "qwen/qwen3-32b",  # Tier 2: Native Reasoning, High RPM (60)
            "openai/gpt-oss-120b",  # Tier 3: Strongest Reasoning, CACHE
        ],
        # FAST CHAT / PERSONA: Throughput first, cache-capable persona model as Tier 3
        "FAST_CHAT": [
            "meta-llama/llama-4-scout-17b-16e-instruct",  # Tier 1: 30K TPM speedster, 10M Context
            "llama-3.1-8b-instant",  # Tier 2: Ultra-fast, High RPD (14.4K)
            "moonshotai/kimi-k2-instruct-0905",  # Tier 3: CACHE, Best persona stability
            "llama-3.3-70b-versatile",  # Tier 4: Heavy fallback
        ],
        # STRUCTURED OUTPUT: JSON Schema models first, strict-mode model as Tier 3
        "STRUCTURED_OUTPUT": [
            "qwen/qwen3-32b",  # Tier 1: Native Reasoning, Reliable JSON Schema
            "llama-3.3-70b-versatile",  # Tier 2: Generalist (may return 400s with some schemas)
            "openai/gpt-oss-20b",  # Tier 3: STRICT JSON, CACHE
            "meta-llama/llama-4-maverick-17b-128e-instruct",  # Tier 4: High TPD (500k)
        ],
        # SAFETY GUARD: Policy-driven -> Lightweight guards
        "SAFETY_GUARD": [
            "meta-llama/llama-guard-4-12b",  # Tier 1: Production, High RPD (14.4K)
            "openai/gpt-oss-safeguard-20b",  # Tier 2: CACHE, Policy-safe
            "meta-llama/llama-prompt-guard-2-86m",  # Tier 3: Ultra-light (Preview)
        ],
        # FORENSIC SEARCH: Compound tools
        "FORENSIC_SEARCH": [
            "groq/compound",  # Tier 1: Full tooling
            "groq/compound-mini",  # Tier 2: Lightweight
        ],
    }

    @classmethod
    def _normalize_role(cls, role_name: str) -> str:
        """Strip one trailing "_MODEL" suffix so the name matches registry roles."""
        suffix = cls._ROLE_SUFFIX
        if role_name.endswith(suffix):
            return role_name[: -len(suffix)]
        return role_name

    @classmethod
    def get_capabilities(cls, model_id: str) -> List[Capability]:
        """Get capabilities for a specific model.

        Returns a new list on every call, so callers may mutate the result
        without altering the shared registry. Unknown model ids yield an
        empty list.
        """
        return list(cls.MODELS.get(model_id, {}).get("capabilities", []))

    @classmethod
    def supports(cls, model_id: str, capability: Capability) -> bool:
        """Check if a model supports a specific capability.

        Unknown model ids support nothing and therefore return False.
        """
        return capability in cls.MODELS.get(model_id, {}).get("capabilities", [])

    @classmethod
    def get_models_by_provider(cls, provider: str) -> Dict[str, Any]:
        """Filter models by provider name.

        Returns a new dict in registry order; the per-model metadata dicts
        are the registry's own objects, not copies.
        """
        return {k: v for k, v in cls.MODELS.items() if v.get("provider") == provider}

    @classmethod
    def get_preferred_model(cls, provider: str, role_name: str) -> str:
        """Dynamically pick the best model for a role from a provider.

        Resolution order:
          1. Pinned Groq routing table (only when provider is "groq").
          2. First registry entry whose provider and role both match.
          3. First entry of the provider's fallback chain.

        Args:
            provider: Provider name as stored in the registry (e.g. "groq").
            role_name: Role name, with or without a trailing "_MODEL".

        Returns:
            A model id string; never empty.
        """
        # Clean role name (remove _MODEL suffix if present to match registry keys)
        clean_role = cls._normalize_role(role_name)

        # 1. Specialized Routing for GROQ (Free Tier Throughput Optimization)
        if provider == "groq":
            pinned = cls._GROQ_PREFERRED.get(clean_role)
            if pinned is not None:
                return pinned

        # 2. General Role-Based Lookup
        for model_id, spec in cls.MODELS.items():
            if spec.get("provider") == provider and spec.get("role") == clean_role:
                return model_id

        # 3. Last Resort Fallbacks (using chains)
        chain = cls.get_fallback_chain(provider, clean_role)
        return chain[0] if chain else cls._DEFAULT_MODEL

    @classmethod
    def get_fallback_chain(cls, provider: str, role_name: str) -> List[str]:
        """Return a prioritized list of models for a role to handle failover.

        For "groq" the hand-tuned chain for the role is returned, or the
        single default model when the role has no chain. For any other
        provider with registry entries, the chain holds only that
        provider's models: those whose role matches first, then the rest
        in registry order, so failover never crosses into another
        provider's model ids. Providers with no registry entries get the
        generic chain.

        Args:
            provider: Provider name as stored in the registry.
            role_name: Role name, with or without a trailing "_MODEL".

        Returns:
            A new, non-empty list of model ids (safe for callers to mutate).
        """
        clean_role = cls._normalize_role(role_name)

        # Groq-Specific Chains
        if provider == "groq":
            return list(cls._GROQ_CHAINS.get(clean_role, [cls._DEFAULT_MODEL]))

        # Other registered providers: stay within the provider's own models.
        own_models = cls.get_models_by_provider(provider)
        if own_models:
            role_matches = [
                model_id
                for model_id, spec in own_models.items()
                if spec.get("role") == clean_role
            ]
            remainder = [m for m in own_models if m not in role_matches]
            return role_matches + remainder

        # Generic chain for providers the registry knows nothing about.
        return list(cls._GENERIC_CHAIN)


model_registry = ModelRegistry()