"""Game generation module using NVIDIA Nemotron 3 Nano 4B via llama.cpp.""" import gc import json import uuid from typing import Optional from pathlib import Path # Model initialization cache (exported for reuse by story.py recap) _model_cache = {} NEMOTRON_MODEL_CACHE = _model_cache # Model configuration NEMOTRON_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF" NEMOTRON_GGUF_FILE = "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf" # ── Lazy model path resolution ────────────────────────────────────────── # The model is a 2.84 GB GGUF file. We do NOT download at import time # because that would block HF Space startup and cause OCI kill (exit 128). # Instead the download happens lazily inside the @spaces.GPU-decorated # function, where we have ample duration. hf_hub_download caches to # ~/.cache/huggingface/hub/ so subsequent calls are instant. _model_path: Optional[str] = None _model_download_attempted = False def _resolve_model_path() -> Optional[str]: """Download the GGUF model to a local cache and return its path. Uses ``hf_hub_download`` which caches to ``~/.cache/huggingface/hub/`` so subsequent calls are instant (no re-download). Call this lazily inside ``@spaces.GPU`` so the 2.84 GB download does *not* block container startup. Returns: Absolute path to the GGUF file, or ``None`` on failure. """ global _model_path, _model_download_attempted if _model_download_attempted: return _model_path _model_download_attempted = True try: # Enable hf_transfer for faster downloads (silent if unavailable) import os os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") from huggingface_hub import hf_hub_download print(f"[model] Downloading {NEMOTRON_MODEL_ID}/{NEMOTRON_GGUF_FILE} …") _model_path = hf_hub_download( repo_id=NEMOTRON_MODEL_ID, filename=NEMOTRON_GGUF_FILE, ) print(f"[model] Downloaded → {_model_path}") except Exception as e: print(f"[model] Download failed: {type(e).__name__}: {e}") return _model_path # NOTE: _resolve_model_path() is NOT called here — see docstring above. # ── GPU detection for llama.cpp ────────────────────────────────────────── def _get_n_gpu_layers() -> int: """Auto-detect GPU availability for llama.cpp inference. Returns: -1 if CUDA/GPU available (use all layers on GPU), 0 for CPU-only """ try: import torch if torch.cuda.is_available(): return -1 # All layers on GPU except ImportError: pass return 0 # CPU only def unload_nemotron() -> None: """Deload the Nemotron llama.cpp model to free GPU memory. After game generation is complete, the 2.84 GB GGUF model no longer needs to sit in VRAM. Calling this frees ~3 GB so that other models (FLUX poster, Cohere ASR) can load on the same GPU. Safe to call outside ``@spaces.GPU`` context — skips CUDA calls if GPU is not available. """ global _model_path, _model_download_attempted cleared = 0 for key in list(_model_cache.keys()): obj = _model_cache.pop(key, None) del obj cleared += 1 if cleared: print(f"[nemotron] Cleared {cleared} cached model(s) from memory") # Force a garbage collection gc.collect() # Only attempt CUDA cleanup if CUDA was actually used — # torch.cuda.is_available() can crash outside @spaces.GPU on ZeroGPU. try: import torch if torch.cuda.is_available(): torch.cuda.empty_cache() torch.cuda.synchronize() free, total = torch.cuda.mem_get_info() print(f"[nemotron] GPU memory freed — {free / 1e9:.1f} GB / {total / 1e9:.1f} GB available") except Exception: pass # CUDA not available or outside GPU context — skip silently # ── Prompt building ────────────────────────────────────────────────────── def build_generation_prompt(config: dict, retrieved_examples: list[dict]) -> str: """Build the game generation prompt with context and examples. The prompt adapts its example formatting to the requested game type: - For scavenger_hunt: shows task patterns with points, proof types, hints - For hide_and_seek: shows hiding zones, concealment ratings, seeker strategies - For tag: shows task patterns (simpler structure) Args: config: Game configuration from user retrieved_examples: Retrieved similar games for grounding Returns: Formatted prompt string """ game_type = config.get('game_type', 'scavenger_hunt') # ── Format retrieved examples (game-type-aware) ────────────────── examples_str = "" if retrieved_examples: examples_str = "\n## Retrieved Similar Examples:\n" for i, ex in enumerate(retrieved_examples[:3], 1): examples_str += f"\n### Example {i}: {ex.get('id')}\n" examples_str += f"- Type: {ex.get('game_type')}\n" examples_str += f"- City: {ex.get('city')} · Area: {ex.get('area')}\n" examples_str += f"- Difficulty: {ex.get('difficulty')} · Age: {ex.get('age_group')} · Duration: {ex.get('duration_minutes')} min\n" examples_str += f"- Theme: {ex.get('theme', 'general')} · Mobility: {ex.get('mobility', 'standard')}\n" examples_str += f"- Landscape Tags: {', '.join(ex.get('landscape_tags', []))}\n" # Rules rules = ex.get('rules_summary', []) if rules: examples_str += f"- Rules: {', '.join(rules[:2])}\n" if game_type == 'hide_and_seek' and ex.get('hiding_zones_summary'): # Hide & seek: format hiding zones + play area + seeker strategy examples_str += "- Hiding Zones:\n" for z in ex['hiding_zones_summary'][:2]: examples_str += f" • {z.get('zone_id')}: {z.get('description', '')[:80]} " examples_str += f"[concealment: {z.get('concealment_rating')}]\n" pa = ex.get('play_area_summary', {}) if pa.get('boundary_description'): examples_str += f"- Play Area: {pa['boundary_description'][:100]}...\n" examples_str += f"- Boundary Size: {pa.get('boundary_size_tier', 'medium')}\n" if ex.get('seeker_strategy'): examples_str += f"- Seeker Strategy: {ex['seeker_strategy'][:120]}...\n" elif game_type == 'tag' and ex.get('arena_summary'): # Tag: format arena, safe zones, movement features ar = ex.get('arena_summary', {}) if ar.get('boundary_description'): examples_str += f"- Arena: {ar['boundary_description'][:100]}...\n" examples_str += f"- Arena Size: {ar.get('arena_size_tier', 'medium')}\n" examples_str += f"- Variant: {ex.get('tag_variant', 'classic')} · " examples_str += f"'It' Players: {ex.get('it_count', 1)} · " examples_str += f"Rounds: {ex.get('round_count', 1)}\n" sz = ex.get('safe_zones_summary', []) if sz: examples_str += "- Safe Zones:\n" for z in sz[:2]: examples_str += f" • {z.get('zone_id')}: {z.get('description', '')[:80]}\n" cp = ex.get('chokepoints', []) if cp: examples_str += f"- Chokepoints: {'; '.join(cp[:2])}\n" oz = ex.get('open_zones', []) if oz: examples_str += f"- Open Zones: {'; '.join(oz[:2])}\n" if ex.get('tag_mechanic'): examples_str += f"- Tag Mechanic: {ex['tag_mechanic'][:100]}...\n" else: # Scavenger hunt / tag: format task patterns task_patterns = ex.get('task_patterns', []) if task_patterns: examples_str += "- Tasks:\n" for task in task_patterns[:2]: pts = task.get('points', '?') tl = task.get('time_limit', '?') tt = task.get('task_type', '') diff = task.get('difficulty', '') tags = task.get('landscape_tags_used', []) examples_str += f" • {task.get('task_id')}: {task.get('title', '')} " examples_str += f"({pts} pts, {tl} min, {diff})" if tags: examples_str += f" [{', '.join(tags)}]" examples_str += "\n" if ex.get('dataset_source') in ('scavenger_hunt',): examples_str += f"- Notes: {ex.get('notes', '')[:80]}\n" # ── Live city context via Wikipedia ───────────────────────────────── city = config.get('city', 'Paris') city_context_str = "" try: from app.services.city_context import build_city_section city_context_str = build_city_section(city) except Exception as e: print(f"[prompt] Wikipedia city context unavailable: {e}") # ── Load prompt template ────────────────────────────────────────── template_path = Path("app/prompts/game_generation.txt") if template_path.exists(): with open(template_path, 'r', encoding='utf-8') as f: template = f.read() else: template = "Generate a location-based game in strict JSON format.\n{output_schema}" # ── Load output schema ─────────────────────────────────────────── schema_path = Path("app/schemas/game_schema.json") schema_str = "" if schema_path.exists(): with open(schema_path, 'r', encoding='utf-8') as f: schema_obj = json.load(f) schema_str = json.dumps(schema_obj, indent=2) # ── Build prompt ───────────────────────────────────────────────── prompt = template.format( city=city, area=config.get('area', 'downtown'), game_type=game_type, duration_minutes=config.get('duration_minutes', 45), num_players=config.get('num_players', 4), difficulty=config.get('difficulty', 'medium'), age_group=config.get('age_group', 'adults'), location_type=config.get('location_type', 'mixed'), retrieved_examples=examples_str, city_context=city_context_str, output_schema=schema_str, ) return prompt # ── JSON extraction from model output ──────────────────────────────────── def extract_json(text: str) -> Optional[str]: """Extract JSON object from generated text. Finds the first complete JSON object by tracking brace depth. Args: text: Generated text that may contain JSON Returns: JSON string or None if not found """ start_idx = text.find('{') if start_idx == -1: return None depth = 0 for i in range(start_idx, len(text)): if text[i] == '{': depth += 1 elif text[i] == '}': depth -= 1 if depth == 0: raw = text[start_idx:i+1] # Normalize double braces from prompt echoing ({{ -> {) if raw.startswith('{{') and raw.endswith('}}'): raw = raw[1:-1] return raw return None # ── Model-based generation with llama.cpp ─────────────────────────────── def generate_game_with_model( prompt: str, model_path: Optional[str] = None, model_name: str = "nemotron", ) -> Optional[str]: """Generate game JSON using NVIDIA Nemotron 3 Nano 4B via llama.cpp. Uses llama-cpp-python for optimal performance with GGUF quantization. Important — HF Spaces Zero GPU pattern: * The 2.84 GB GGUF file is lazily downloaded inside ``@spaces.GPU`` (if not already cached on disk from a previous run). ``hf_hub_download`` uses the local Hugging Face cache so subsequent calls are instant. * ``Llama(model_path=...)`` initialisation happens here — inside the GPU context where CUDA is available. Args: prompt: Generation prompt model_path: Path to a local GGUF file (optional — auto-downloaded if omitted). model_name: Model identifier (unused, kept for API compat). Returns: Generated game JSON string or None if model unavailable """ try: from llama_cpp import Llama cache_key = f"llama_cpp_{model_path or 'module_default'}" if cache_key in _model_cache: llm = _model_cache[cache_key] else: resolved = model_path or _resolve_model_path() if not resolved: print("[nemotron] No model path available — fall back to mock") return None n_gpu_layers = _get_n_gpu_layers() gpu_info = "GPU" if n_gpu_layers < 0 else "CPU" print(f"[nemotron] Initialising llama.cpp from: {resolved} ({gpu_info})") llm = Llama( model_path=resolved, verbose=False, n_gpu_layers=n_gpu_layers, n_ctx=8192, ) _model_cache[cache_key] = llm # Use create_chat_completion — this model uses a Nemotron chat template messages = [ {"role": "system", "content": "You output only valid JSON. No other text."}, {"role": "user", "content": prompt}, ] result = llm.create_chat_completion( messages=messages, max_tokens=8192, temperature=0.3, top_p=0.9, stop=["```"], ) generated_text = result["choices"][0]["message"]["content"] generated_text = generated_text.strip() print(f"[nemotron] Generated {len(generated_text)} chars") json_str = extract_json(generated_text) if not json_str: print(f"[nemotron] JSON extraction failed on output (len={len(generated_text)})") print(f"[nemotron] Preview: {generated_text[:300]}...") return json_str except ImportError: print("[nemotron] llama-cpp-python not available. Install with: pip install llama-cpp-python") return None except Exception as e: print(f"[nemotron] llama.cpp generation failed: {type(e).__name__}: {e}") return None # ── Mock generation (fallback) ─────────────────────────────────────────── def generate_game_mock(config: dict, retrieved_examples: list[dict]) -> dict: """Generate a realistic mock game for testing without a model. Uses retrieved examples and config to create a valid game structure that passes schema validation. Args: config: Game configuration retrieved_examples: Retrieved similar games for grounding Returns: Generated game JSON matching the game schema """ game_id = f"mock-{uuid.uuid4().hex[:8]}" num_tasks = max(2, config.get('duration_minutes', 45) // 15) tasks = [] proof_types = ["photo", "observation", "text"] locations = ["main square", "city center", "park area", "landmark district", "historic district"] for i in range(min(num_tasks, 5)): task_id = f"t{i+1}" points = 15 + (i * 5) time_limit = 8 + (i * 2) proof_type = proof_types[i % len(proof_types)] location = locations[i % len(locations)] task = { "task_id": task_id, "title": f"Task {i+1}: Explore the {location}", "description": f"Find and document something interesting in the {location}", "location_hint": f"Navigate to the {location} and look for distinctive features", "points": points, "time_limit_minutes": time_limit, "proof_type": proof_type, "hint": f"Look for signs or landmarks in the {location}", "safety_note": "Stay on public paths and avoid restricted areas", } tasks.append(task) game = { "game_id": game_id, "game_type": config.get('game_type', 'scavenger_hunt'), "title": f"{config.get('game_type', 'scavenger hunt').title()} in {config.get('area', 'the city')}", "theme": f"{config.get('difficulty', 'medium').lower()} adventure", "setup": { "city": config.get('city', 'Paris'), "area": config.get('area', 'downtown'), "meeting_point": f"Main entrance of {config.get('area', 'downtown')}", "duration_minutes": config.get('duration_minutes', 45), "num_players": config.get('num_players', 4), }, "rules": [ f"Complete as many tasks as possible within {config.get('duration_minutes', 45)} minutes", "Take photos or notes as proof of completion", "Stay within the designated area at all times", "No entering private buildings or restricted areas", f"This game is suitable for {config.get('age_group', 'all ages')}", ], "tasks": tasks, "global_hints": [ "Explore systematically from the meeting point outward", "Ask locals for directions if needed", "Time management is key - don't spend too long on any single task", ], "score_rules": [ "Each task completed: full points", "Early completion: +1 bonus point per minute under limit", "Hints used: -5 points per hint", "Late arrival at meeting point: -10 points per minute", ], "tie_breaker": "Winner is the player with the most points when time expires. Ties broken by earliest completion time.", "safety": { "allowed_zone": config.get('area', 'downtown'), "forbidden_behaviors": [ "Entering buildings without permission", "Crossing busy streets recklessly", "Approaching strangers", "Leaving the designated area", ], "adult_supervision": config.get('age_group') in ['kids', 'teens'], "stop_conditions": [ "If a player feels unsafe, the game stops immediately", "If weather becomes severe, relocate to shelter", "If anyone is injured, call emergency services", ], }, "story_seed": { "tone": "playful", "motifs": ["exploration", "discovery", "teamwork"], "recap_style": "episode_recap", }, } return game # ── Main game generation (wrapper) ─────────────────────────────────────── def generate_game(config: dict, retrieved_examples: list[dict]) -> dict: """Generate a game from user config and retrieved examples. Uses NVIDIA Nemotron 3 Nano 4B via llama.cpp for optimal performance. Falls back to mock generation if model unavailable. Args: config: Game configuration (game_type, city, duration, etc.) retrieved_examples: List of similar example games for grounding Returns: Generated game JSON matching the game schema """ prompt = build_generation_prompt(config, retrieved_examples) json_str = generate_game_with_model(prompt, model_name="nemotron") if json_str: try: game = json.loads(json_str) if all(field in game for field in ["game_id", "title", "setup", "tasks", "safety"]): print(f"[gen] Generated game via Nemotron: {game.get('game_id')}") return game except json.JSONDecodeError: print("[gen] Failed to parse generated JSON, using mock") print("[gen] Using mock generation (model unavailable or generation failed)") return generate_game_mock(config, retrieved_examples)