cq-test / app /services /generator.py
BhargavMN
fix: record selected game_type on the generated game
e44426f
Raw
History Blame Contribute Delete
20.9 kB
"""Game generation module using NVIDIA Nemotron 3 Nano 4B via llama.cpp."""
import gc
import json
import uuid
from typing import Optional
from pathlib import Path
# Model initialization cache (exported for reuse by story.py recap).
# Keys are the cache-key strings built in generate_game_with_model; values are
# the loaded llama.cpp model objects.
_model_cache: dict = {}
# Public alias bound to the very same dict object (not a copy), so entries
# added or removed through either name are visible through the other.
NEMOTRON_MODEL_CACHE = _model_cache
# Model configuration: Hugging Face repo id and the GGUF file inside that repo.
NEMOTRON_MODEL_ID = "nvidia/NVIDIA-Nemotron-3-Nano-4B-GGUF"
NEMOTRON_GGUF_FILE = "NVIDIA-Nemotron3-Nano-4B-Q4_K_M.gguf"
# ── Lazy model path resolution ──────────────────────────────────────────
# The model is a 2.84 GB GGUF file. We do NOT download at import time
# because that would block HF Space startup and cause OCI kill (exit 128).
# Instead the download happens lazily inside the @spaces.GPU-decorated
# function, where we have ample duration. hf_hub_download caches to
# ~/.cache/huggingface/hub/ so subsequent calls are instant.
# Local path of the downloaded GGUF file; stays None until a download succeeds.
_model_path: Optional[str] = None
# Latch flipped by the first _resolve_model_path() call, successful or not.
_model_download_attempted: bool = False
def _resolve_model_path() -> Optional[str]:
    """Download the GGUF model to a local cache and return its path.

    Uses ``hf_hub_download``, which keeps a local Hugging Face cache, so a
    file that is already on disk is not downloaded again.

    Call this lazily inside ``@spaces.GPU`` so the 2.84 GB download does
    *not* block container startup.

    The download is attempted at most once per process: the outcome of the
    first call (a path, or ``None`` after a failure) is replayed by every
    later call.

    Returns:
        Path to the GGUF file as returned by ``hf_hub_download``, or ``None``
        on failure.
    """
    global _model_path, _model_download_attempted
    if _model_download_attempted:
        return _model_path
    _model_download_attempted = True
    try:
        import importlib.util
        import os

        # Opt in to hf_transfer only when the package is importable.
        # huggingface_hub raises if the flag is on but hf_transfer is missing,
        # which, combined with the attempt-once latch above, would disable
        # the model for the lifetime of the process.
        if importlib.util.find_spec("hf_transfer") is not None:
            os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")

        from huggingface_hub import hf_hub_download

        print(f"[model] Downloading {NEMOTRON_MODEL_ID}/{NEMOTRON_GGUF_FILE} …")
        _model_path = hf_hub_download(
            repo_id=NEMOTRON_MODEL_ID,
            filename=NEMOTRON_GGUF_FILE,
        )
        print(f"[model] Downloaded β†’ {_model_path}")
    except Exception as e:
        # Best effort: report and let the caller fall back to mock generation.
        print(f"[model] Download failed: {type(e).__name__}: {e}")
    return _model_path
# NOTE: _resolve_model_path() is NOT called here — see docstring above.
# ── GPU detection for llama.cpp ──────────────────────────────────────────
def _get_n_gpu_layers() -> int:
"""Auto-detect GPU availability for llama.cpp inference.
Returns:
-1 if CUDA/GPU available (use all layers on GPU), 0 for CPU-only
"""
try:
import torch
if torch.cuda.is_available():
return -1 # All layers on GPU
except ImportError:
pass
return 0 # CPU only
def unload_nemotron() -> None:
    """Drop every cached llama.cpp model so its memory can be reclaimed.

    Once game generation is finished the 2.84 GB GGUF model no longer has to
    stay resident; releasing it leaves room for other models (FLUX poster,
    Cohere ASR) on the same GPU.

    Safe to call outside a ``@spaces.GPU`` context: any failure during the
    CUDA clean-up step is swallowed.
    """
    evicted = len(_model_cache)
    # Empty the dict in place: NEMOTRON_MODEL_CACHE is bound to this same
    # object, so rebinding the name would leave the alias holding the models.
    _model_cache.clear()
    if evicted:
        print(f"[nemotron] Cleared {evicted} cached model(s) from memory")

    # Collect right away so the dropped models are released now, not later.
    gc.collect()

    # Per the original author's note, torch.cuda.is_available() can crash
    # outside @spaces.GPU on ZeroGPU, hence the blanket best-effort guard.
    try:
        import torch

        if not torch.cuda.is_available():
            return
        torch.cuda.empty_cache()
        torch.cuda.synchronize()
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        print(f"[nemotron] GPU memory freed β€” {free_bytes / 1e9:.1f} GB / {total_bytes / 1e9:.1f} GB available")
    except Exception:
        pass  # CUDA not available or outside GPU context: skip silently
# ── Prompt building ──────────────────────────────────────────────────────
def build_generation_prompt(config: dict, retrieved_examples: list[dict]) -> str:
    """Build the game generation prompt with context and examples.

    Up to three retrieved examples are rendered as a Markdown-style block.
    The per-example detail depends on the requested game type:
    - hide_and_seek, when the example has ``hiding_zones_summary``: hiding
      zones with concealment ratings, play area, seeker strategy
    - tag, when the example has ``arena_summary``: arena, variant, safe
      zones, chokepoints, open zones, tag mechanic
    - anything else, including the two types above when that summary is
      missing: task patterns with points, time limit, difficulty and tags

    The prompt template and the output schema are read from paths relative to
    the current working directory. A one-line built-in template is used when
    the template file is absent, and the schema placeholder is filled with an
    empty string when the schema file is absent.

    Args:
        config: Game configuration from user
        retrieved_examples: Retrieved similar games for grounding

    Returns:
        Formatted prompt string
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation had been stripped; the placement of the "Boundary Size",
    # "Arena Size" and "Notes" lines is a best guess. Confirm against the
    # original file.
    # NOTE(review): separator characters inside the f-strings below look like
    # mis-decoded UTF-8 (probably a middle dot and a bullet). They are runtime
    # text and are kept exactly as found. Confirm the intended characters.
    game_type = config.get('game_type', 'scavenger_hunt')
    # ── Format retrieved examples (game-type-aware) ──────────────────
    examples_str = ""
    if retrieved_examples:
        examples_str = "\n## Retrieved Similar Examples:\n"
        # Only the first three examples are rendered; numbering starts at 1.
        for i, ex in enumerate(retrieved_examples[:3], 1):
            examples_str += f"\n### Example {i}: {ex.get('id')}\n"
            examples_str += f"- Type: {ex.get('game_type')}\n"
            examples_str += f"- City: {ex.get('city')} Β· Area: {ex.get('area')}\n"
            examples_str += f"- Difficulty: {ex.get('difficulty')} Β· Age: {ex.get('age_group')} Β· Duration: {ex.get('duration_minutes')} min\n"
            examples_str += f"- Theme: {ex.get('theme', 'general')} Β· Mobility: {ex.get('mobility', 'standard')}\n"
            examples_str += f"- Landscape Tags: {', '.join(ex.get('landscape_tags', []))}\n"
            # Rules (first two only)
            rules = ex.get('rules_summary', [])
            if rules:
                examples_str += f"- Rules: {', '.join(rules[:2])}\n"
            if game_type == 'hide_and_seek' and ex.get('hiding_zones_summary'):
                # Hide & seek: format hiding zones + play area + seeker strategy
                examples_str += "- Hiding Zones:\n"
                for z in ex['hiding_zones_summary'][:2]:
                    examples_str += f" β€’ {z.get('zone_id')}: {z.get('description', '')[:80]} "
                    examples_str += f"[concealment: {z.get('concealment_rating')}]\n"
                pa = ex.get('play_area_summary', {})
                if pa.get('boundary_description'):
                    examples_str += f"- Play Area: {pa['boundary_description'][:100]}...\n"
                examples_str += f"- Boundary Size: {pa.get('boundary_size_tier', 'medium')}\n"
                if ex.get('seeker_strategy'):
                    examples_str += f"- Seeker Strategy: {ex['seeker_strategy'][:120]}...\n"
            elif game_type == 'tag' and ex.get('arena_summary'):
                # Tag: format arena, safe zones, movement features
                ar = ex.get('arena_summary', {})
                if ar.get('boundary_description'):
                    examples_str += f"- Arena: {ar['boundary_description'][:100]}...\n"
                examples_str += f"- Arena Size: {ar.get('arena_size_tier', 'medium')}\n"
                # Variant, 'It' count and round count share one output line.
                examples_str += f"- Variant: {ex.get('tag_variant', 'classic')} Β· "
                examples_str += f"'It' Players: {ex.get('it_count', 1)} Β· "
                examples_str += f"Rounds: {ex.get('round_count', 1)}\n"
                sz = ex.get('safe_zones_summary', [])
                if sz:
                    examples_str += "- Safe Zones:\n"
                    for z in sz[:2]:
                        examples_str += f" β€’ {z.get('zone_id')}: {z.get('description', '')[:80]}\n"
                cp = ex.get('chokepoints', [])
                if cp:
                    examples_str += f"- Chokepoints: {'; '.join(cp[:2])}\n"
                oz = ex.get('open_zones', [])
                if oz:
                    examples_str += f"- Open Zones: {'; '.join(oz[:2])}\n"
                if ex.get('tag_mechanic'):
                    examples_str += f"- Tag Mechanic: {ex['tag_mechanic'][:100]}...\n"
            else:
                # Fallback for every other case (scavenger hunt, or an example
                # lacking the type-specific summary): format task patterns
                task_patterns = ex.get('task_patterns', [])
                if task_patterns:
                    examples_str += "- Tasks:\n"
                    for task in task_patterns[:2]:
                        pts = task.get('points', '?')
                        tl = task.get('time_limit', '?')
                        # tt is read here but not used in the rendered line.
                        tt = task.get('task_type', '')
                        diff = task.get('difficulty', '')
                        tags = task.get('landscape_tags_used', [])
                        examples_str += f" β€’ {task.get('task_id')}: {task.get('title', '')} "
                        examples_str += f"({pts} pts, {tl} min, {diff})"
                        if tags:
                            examples_str += f" [{', '.join(tags)}]"
                        examples_str += "\n"
            # Notes are appended only for examples sourced from the
            # scavenger_hunt dataset, truncated to 80 characters.
            if ex.get('dataset_source') in ('scavenger_hunt',):
                examples_str += f"- Notes: {ex.get('notes', '')[:80]}\n"
    # ── Live city context via Wikipedia ─────────────────────────────────
    city = config.get('city', 'Paris')
    city_context_str = ""
    try:
        # Imported lazily; any failure (import or lookup) leaves the city
        # section empty instead of aborting prompt construction.
        from app.services.city_context import build_city_section
        city_context_str = build_city_section(city)
    except Exception as e:
        print(f"[prompt] Wikipedia city context unavailable: {e}")
    # ── Load prompt template ──────────────────────────────────────────
    template_path = Path("app/prompts/game_generation.txt")
    if template_path.exists():
        with open(template_path, 'r', encoding='utf-8') as f:
            template = f.read()
    else:
        # Minimal built-in template; it only consumes {output_schema}.
        template = "Generate a location-based game in strict JSON format.\n{output_schema}"
    # ── Load output schema ───────────────────────────────────────────
    schema_path = Path("app/schemas/game_schema.json")
    schema_str = ""
    if schema_path.exists():
        with open(schema_path, 'r', encoding='utf-8') as f:
            schema_obj = json.load(f)
        # Re-serialised so the schema is consistently indented in the prompt.
        schema_str = json.dumps(schema_obj, indent=2)
    # ── Build prompt ─────────────────────────────────────────────────
    # str.format is used, so literal braces in the template file must be
    # doubled; keyword arguments the template does not reference are ignored.
    prompt = template.format(
        city=city,
        area=config.get('area', 'downtown'),
        game_type=game_type,
        duration_minutes=config.get('duration_minutes', 45),
        num_players=config.get('num_players', 4),
        difficulty=config.get('difficulty', 'medium'),
        age_group=config.get('age_group', 'adults'),
        location_type=config.get('location_type', 'mixed'),
        retrieved_examples=examples_str,
        city_context=city_context_str,
        output_schema=schema_str,
    )
    return prompt
# ── JSON extraction from model output ────────────────────────────────────
def extract_json(text: str) -> Optional[str]:
    """Extract the first complete JSON object from generated text.

    Scans from the first ``{`` and tracks brace depth until it returns to
    zero. Braces that appear inside double-quoted JSON strings (including
    strings containing escaped quotes) are ignored, so a value such as
    ``"use } carefully"`` does not end the object early.

    Args:
        text: Generated text that may contain JSON

    Returns:
        The JSON object as a string, or None if no balanced object is found.
        An object wrapped in doubled braces (``{{ ... }}``, as produced when
        the model echoes an escaped prompt template) has one layer stripped.
    """
    start_idx = text.find('{')
    if start_idx == -1:
        return None

    depth = 0
    in_string = False
    escaped = False
    for offset, ch in enumerate(text[start_idx:]):
        if in_string:
            # Inside a string only an unescaped quote is significant.
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                raw = text[start_idx:start_idx + offset + 1]
                # Normalize double braces from prompt echoing ({{ -> {)
                if raw.startswith('{{') and raw.endswith('}}'):
                    raw = raw[1:-1]
                return raw
    return None
# ── Model-based generation with llama.cpp ───────────────────────────────
def generate_game_with_model(
    prompt: str,
    model_path: Optional[str] = None,
    model_name: str = "nemotron",
) -> Optional[str]:
    """Run the prompt through Nemotron (llama.cpp) and return the JSON text.

    HF Spaces Zero GPU pattern:
      * The GGUF file is resolved lazily here (downloaded on first use when
        no ``model_path`` is given), i.e. inside the ``@spaces.GPU`` call.
      * ``Llama(model_path=...)`` is constructed here as well and then kept
        in the module cache, keyed by the requested path.

    Args:
        prompt: Generation prompt
        model_path: Path to a local GGUF file (optional; resolved
            automatically if omitted).
        model_name: Model identifier (unused, kept for API compat).

    Returns:
        The extracted JSON string, or None when llama-cpp-python is missing,
        no model path is available, generation raises, or no JSON object can
        be extracted from the reply.
    """
    try:
        from llama_cpp import Llama

        cache_key = f"llama_cpp_{model_path or 'module_default'}"
        if cache_key not in _model_cache:
            # First use for this key: locate the weights and load them.
            gguf_path = model_path or _resolve_model_path()
            if not gguf_path:
                print("[nemotron] No model path available β€” fall back to mock")
                return None
            n_gpu_layers = _get_n_gpu_layers()
            device_label = "CPU" if n_gpu_layers >= 0 else "GPU"
            print(f"[nemotron] Initialising llama.cpp from: {gguf_path} ({device_label})")
            _model_cache[cache_key] = Llama(
                model_path=gguf_path,
                verbose=False,
                n_gpu_layers=n_gpu_layers,
                n_ctx=8192,
            )
        engine = _model_cache[cache_key]

        # Chat-completion API, so the model's own chat template is applied.
        # NOTE(review): with stop=["```"], a reply that opens with a Markdown
        # code fence would presumably stop at once with empty content. Confirm
        # the system prompt is enough to prevent fenced output.
        completion = engine.create_chat_completion(
            messages=[
                {"role": "system", "content": "You output only valid JSON. No other text."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=8192,
            temperature=0.3,
            top_p=0.9,
            stop=["```"],
        )
        generated_text = completion["choices"][0]["message"]["content"].strip()
        print(f"[nemotron] Generated {len(generated_text)} chars")

        json_str = extract_json(generated_text)
        if json_str:
            return json_str
        # Nothing extractable: log a short preview to aid debugging.
        print(f"[nemotron] JSON extraction failed on output (len={len(generated_text)})")
        print(f"[nemotron] Preview: {generated_text[:300]}...")
        return json_str
    except ImportError:
        print("[nemotron] llama-cpp-python not available. Install with: pip install llama-cpp-python")
        return None
    except Exception as err:
        print(f"[nemotron] llama.cpp generation failed: {type(err).__name__}: {err}")
        return None
# ── Mock generation (fallback) ───────────────────────────────────────────
def generate_game_mock(config: dict, retrieved_examples: list[dict]) -> dict:
    """Generate a deterministic-shape mock game for use without a model.

    The game is built from ``config`` alone. ``retrieved_examples`` is
    accepted for signature parity with the model-backed path but is not
    consulted.

    One task is created per 15 minutes of ``duration_minutes``, with a
    minimum of 2 and a maximum of 5. Points and time limits grow with the
    task index.

    The title is derived from ``game_type`` with underscores turned into
    spaces before title-casing, so ``hide_and_seek`` reads "Hide And Seek"
    rather than "Hide_And_Seek".

    Args:
        config: Game configuration
        retrieved_examples: Retrieved similar games (currently unused)

    Returns:
        Game dict with game_id, game_type, title, theme, setup, rules, tasks,
        global_hints, score_rules, tie_breaker, safety and story_seed keys.
    """
    game_id = f"mock-{uuid.uuid4().hex[:8]}"
    duration = config.get('duration_minutes', 45)
    area = config.get('area', 'downtown')

    proof_types = ["photo", "observation", "text"]
    locations = ["main square", "city center", "park area", "landmark district", "historic district"]

    # One task per quarter hour, clamped to the 2..5 range.
    task_count = min(max(2, duration // 15), 5)
    tasks = []
    for i in range(task_count):
        location = locations[i % len(locations)]
        tasks.append({
            "task_id": f"t{i+1}",
            "title": f"Task {i+1}: Explore the {location}",
            "description": f"Find and document something interesting in the {location}",
            "location_hint": f"Navigate to the {location} and look for distinctive features",
            "points": 15 + (i * 5),
            "time_limit_minutes": 8 + (i * 2),
            "proof_type": proof_types[i % len(proof_types)],
            "hint": f"Look for signs or landmarks in the {location}",
            "safety_note": "Stay on public paths and avoid restricted areas",
        })

    # Human-readable label: identifiers such as "scavenger_hunt" carry
    # underscores that must not leak into the displayed title.
    type_label = config.get('game_type', 'scavenger hunt').replace('_', ' ').title()

    game = {
        "game_id": game_id,
        "game_type": config.get('game_type', 'scavenger_hunt'),
        "title": f"{type_label} in {config.get('area', 'the city')}",
        "theme": f"{config.get('difficulty', 'medium').lower()} adventure",
        "setup": {
            "city": config.get('city', 'Paris'),
            "area": area,
            "meeting_point": f"Main entrance of {area}",
            "duration_minutes": duration,
            "num_players": config.get('num_players', 4),
        },
        "rules": [
            f"Complete as many tasks as possible within {duration} minutes",
            "Take photos or notes as proof of completion",
            "Stay within the designated area at all times",
            "No entering private buildings or restricted areas",
            f"This game is suitable for {config.get('age_group', 'all ages')}",
        ],
        "tasks": tasks,
        "global_hints": [
            "Explore systematically from the meeting point outward",
            "Ask locals for directions if needed",
            "Time management is key - don't spend too long on any single task",
        ],
        "score_rules": [
            "Each task completed: full points",
            "Early completion: +1 bonus point per minute under limit",
            "Hints used: -5 points per hint",
            "Late arrival at meeting point: -10 points per minute",
        ],
        "tie_breaker": "Winner is the player with the most points when time expires. Ties broken by earliest completion time.",
        "safety": {
            "allowed_zone": area,
            "forbidden_behaviors": [
                "Entering buildings without permission",
                "Crossing busy streets recklessly",
                "Approaching strangers",
                "Leaving the designated area",
            ],
            # Supervision is flagged only for the 'kids' and 'teens' groups.
            "adult_supervision": config.get('age_group') in ['kids', 'teens'],
            "stop_conditions": [
                "If a player feels unsafe, the game stops immediately",
                "If weather becomes severe, relocate to shelter",
                "If anyone is injured, call emergency services",
            ],
        },
        "story_seed": {
            "tone": "playful",
            "motifs": ["exploration", "discovery", "teamwork"],
            "recap_style": "episode_recap",
        },
    }
    return game
# ── Main game generation (wrapper) ───────────────────────────────────────
def generate_game(config: dict, retrieved_examples: list[dict]) -> dict:
    """Produce a game for the given config, preferring the Nemotron model.

    The prompt is built from ``config`` and ``retrieved_examples`` and sent
    to the model. The model's answer is accepted only if it parses as JSON
    and carries all of ``game_id``, ``title``, ``setup``, ``tasks`` and
    ``safety``; in every other case the mock generator supplies the result.

    Args:
        config: Game configuration (game_type, city, duration, etc.)
        retrieved_examples: List of similar example games for grounding

    Returns:
        Game dict, either model-generated or mock
    """
    required_fields = ["game_id", "title", "setup", "tasks", "safety"]

    model_output = generate_game_with_model(
        build_generation_prompt(config, retrieved_examples),
        model_name="nemotron",
    )
    if model_output:
        try:
            candidate = json.loads(model_output)
        except json.JSONDecodeError:
            print("[gen] Failed to parse generated JSON, using mock")
        else:
            # Accept the model's game only when every required key is present.
            if all(name in candidate for name in required_fields):
                print(f"[gen] Generated game via Nemotron: {candidate.get('game_id')}")
                return candidate

    print("[gen] Using mock generation (model unavailable or generation failed)")
    return generate_game_mock(config, retrieved_examples)