"""End-to-end test for the full CityQuest AI pipeline. Run with ``CITYQUEST_FAST_TEST=1`` to skip GGUF model downloads and use mock generation for LLM-dependent steps. Usage: python test_end_to_end.py CITYQUEST_FAST_TEST=1 python test_end_to_end.py """ import os import sys import json import uuid from pathlib import Path # ── Fast-test guard ────────────────────────────────────────────────────────── if os.environ.get("CITYQUEST_FAST_TEST"): print("[test] FAST TEST - skipping GGUF model download and LLM inference") os.environ["CITYQUEST_SKIP_MODEL"] = "1" # ── Test tracking ──────────────────────────────────────────────────────────── passed = 0 failed = 0 errors: list[str] = [] def check(label: str, condition: bool, detail: str = ""): global passed, failed if condition: passed += 1 print(f" ✓ PASS: {label}") else: failed += 1 msg = f"✗ FAIL: {label} — {detail}" if detail else f"✗ FAIL: {label}" errors.append(msg) print(f" {msg}") def main(): global passed, failed, errors print("\n" + "=" * 80) print("CITYQUEST AI — END-TO-END PIPELINE TEST") print("=" * 80) # ── Imports ────────────────────────────────────────────────────────── print("\n" + "-" * 80) print("MODULE IMPORTS") print("-" * 80) try: from app.services.retrieval import load_games_dataset, normalize_game_record, retrieve_examples from app.services.generator import generate_game, generate_game_mock, build_generation_prompt from app.services.validator import validate_game, repair_game from app.services.schema_validator import validate_game_schema, create_minimal_game_template from app.services.tracing import log_event, load_events, log_generation_trace from app.services.journal import create_journal_entry, save_journal_entry, summarize_journal, load_journal_entries, detect_mood, assess_story_value from app.services.scoring import compute_scores from app.services.story import build_story_packet, generate_story, _template_short_recap, _template_long_summary from app.services.minicpm import generate_recap as minicpm_recap, generate_summary as minicpm_summary from app.services.image_gen import generate_poster, generate_poster_sync check("All service modules imported", True) except Exception as e: check("All service modules imported", False, str(e)) # Don't continue if core imports fail _summary() sys.exit(1) # ── T1: Dataset loading ────────────────────────────────────────────── print("\n" + "-" * 80) print("T1: DATASET LOADING & NORMALIZATION") print("-" * 80) try: raw = load_games_dataset("app/data/games_dataset.json") check("Dataset loaded", len(raw) > 0, f"Got {len(raw)} records") check("Dataset has 70 records", len(raw) == 70, f"Got {len(raw)}") records = [normalize_game_record(r) for r in raw] check("Records normalized", len(records) == len(raw)) # Verify structure of first record r0 = records[0] check("Normalized record has id", bool(r0.get("id"))) check("Normalized record has game_type", bool(r0.get("game_type"))) check("Normalized record has tasks list", isinstance(r0.get("tasks"), list)) check("Normalized record has rules list", isinstance(r0.get("rules"), list)) # Check game type distribution game_types = set(r["game_type"] for r in records) check("All game types present", game_types == {"scavenger_hunt", "hide_and_seek", "tag"}, f"Got {game_types}") except Exception as e: check("Dataset loading", False, str(e)) # ── T2: Retrieval ──────────────────────────────────────────────────── print("\n" + "-" * 80) print("T2: RETRIEVAL GROUNDING") print("-" * 80) try: config_adults = { "game_type": "scavenger_hunt", "city": "Paris", "area": "Le Marais", "location_type": "mixed", "duration_minutes": 60, "num_players": 4, "difficulty": "medium", "age_group": "adults", "energy_level": "medium", } retrieved = retrieve_examples(config_adults, records, k=3) check("Retrieval returns k=3 results", len(retrieved) == 3) # Verify structure check("Retrieved has id", all(r.get("id") for r in retrieved)) check("Retrieved has retrieval_score", all("retrieval_score" in r for r in retrieved)) check("Retrieved has rules_summary", all("rules_summary" in r for r in retrieved)) check("Retrieved has task_patterns", all("task_patterns" in r for r in retrieved)) # Test different config types config_kids = {"game_type": "hide_and_seek", "city": "Paris", "area": "Park", "location_type": "park", "duration_minutes": 45, "num_players": 5, "difficulty": "easy", "age_group": "kids", "energy_level": "high"} r_kids = retrieve_examples(config_kids, records, k=5) check("Kids config returns k=5", len(r_kids) == 5) config_teens = {"game_type": "tag", "city": "Paris", "area": "Square", "location_type": "mixed", "duration_minutes": 30, "num_players": 8, "difficulty": "hard", "age_group": "teens", "energy_level": "high"} r_teens = retrieve_examples(config_teens, records, k=3) check("Teens config returns k=3", len(r_teens) == 3) except Exception as e: check("Retrieval", False, str(e)) # ── T3: Game Generation (mock) ──────────────────────────────────────── print("\n" + "-" * 80) print("T3: GAME GENERATION (MOCK PATH)") print("-" * 80) try: game = generate_game_mock(config_adults, retrieved) check("Game generated", game is not None) check("Game has game_id", bool(game.get("game_id"))) check("Game has title", bool(game.get("title"))) check("Game has theme", bool(game.get("theme"))) check("Game has setup with city", game.get("setup", {}).get("city") == "Paris") check("Game has tasks list", len(game.get("tasks", [])) > 0) check("Game has rules list", len(game.get("rules", [])) > 0) check("Game has safety section", bool(game.get("safety"))) check("Game has story_seed", bool(game.get("story_seed"))) # Schema validation is_valid_schema, schema_errors = validate_game_schema(game) check("Game passes JSON schema", is_valid_schema, f"Errors: {schema_errors[:3]}") # Build generation prompt prompt = build_generation_prompt(config_adults, retrieved) check("Prompt built", len(prompt) > 100, f"Got {len(prompt)} chars") except Exception as e: check("Game generation", False, str(e)) # ── T4: Validation ─────────────────────────────────────────────────── print("\n" + "-" * 80) print("T4: VALIDATION CHECKS") print("-" * 80) try: is_valid, failures = validate_game(game, config_adults) check("Generated game passes validation", is_valid, f"Failures: {failures[:3]}") # Test that invalid games are caught bad_game = { "game_id": "", "title": "", "theme": "", "setup": {}, "rules": [], "tasks": [], "global_hints": [], "score_rules": [], "tie_breaker": "", "safety": {}, "story_seed": {}, } is_bad_valid, bad_failures = validate_game(bad_game, config_adults) check("Empty game fails validation", not is_bad_valid, f"Got {len(bad_failures)} issues") check("Empty game has issues", len(bad_failures) > 5) except Exception as e: check("Validation", False, str(e)) # ── T5: Repair ─────────────────────────────────────────────────────── print("\n" + "-" * 80) print("T5: REPAIR PASS") print("-" * 80) try: bad_game = { "game_id": "", "title": "", "theme": "", "setup": {}, "rules": [], "tasks": [], "global_hints": [], "score_rules": [], "tie_breaker": "", "safety": {}, "story_seed": {}, } _, failures = validate_game(bad_game, config_adults) repaired = repair_game(bad_game, failures, config_adults) check("Repair returns dict", isinstance(repaired, dict)) check("Repaired has game_id", bool(repaired.get("game_id")), f"Got '{repaired.get('game_id')}'") check("Repaired has title", bool(repaired.get("title"))) check("Repaired has tasks", len(repaired.get("tasks", [])) > 0) check("Repaired has rules", len(repaired.get("rules", [])) > 0) check("Repaired has safety zone", bool(repaired.get("safety", {}).get("allowed_zone"))) is_repaired_valid, repaired_failures = validate_game(repaired, config_adults) check("Repaired game passes validation", is_repaired_valid, f"Remaining: {repaired_failures[:3]}") except Exception as e: check("Repair", False, str(e)) # ── T6: Event Logging ──────────────────────────────────────────────── print("\n" + "-" * 80) print("T6: EVENT LOGGING") print("-" * 80) try: session_id = f"test-{uuid.uuid4().hex[:8]}" ev1 = log_event(session_id, "task_revealed", {"task_id": "t1", "title": "Find the mural"}) check("Event logged", ev1 is not None) check("Event has event_id", bool(ev1.get("event_id"))) check("Event has timestamp", bool(ev1.get("timestamp"))) check("Event has payload", "task_id" in ev1.get("payload", {})) log_event(session_id, "task_completed", {"task_id": "t1", "summary": "Done"}, team_id="team-a") log_event(session_id, "task_completed", {"task_id": "t2", "summary": "Done"}, team_id="team-a") log_event(session_id, "hint_used", {"task_id": "t2", "summary": "Used hint"}, team_id="team-a") log_event(session_id, "task_skipped", {"task_id": "t3", "summary": "Skipped"}, team_id="team-a") log_event(session_id, "photo_uploaded", {"photo_id": "p1", "summary": "Selfie"}, team_id="team-a") log_event(session_id, "journal_recorded", {"journal_id": "j1", "mood": "excited"}, team_id="team-a") events = load_events(session_id) check("All events loaded", len(events) >= 6, f"Got {len(events)} events") event_types = [e["event_type"] for e in events] check("Has task_completed", "task_completed" in event_types) check("Has hint_used", "hint_used" in event_types) check("Has task_skipped", "task_skipped" in event_types) check("Has photo_uploaded", "photo_uploaded" in event_types) except Exception as e: check("Event logging", False, str(e)) # ── T7: Journal ────────────────────────────────────────────────────── print("\n" + "-" * 80) print("T7: JOURNAL PIPELINE") print("-" * 80) try: transcript = "We found the mural near the canal, it was incredible! The whole team cheered. This was definitely the highlight of our game - a close call that turned into our best moment." entry = create_journal_entry( transcript=transcript, session_id=session_id, team_id="team-a", task_id="t1", location_note="Canal area", ) check("Journal entry created", entry is not None) check("Journal has journal_id", bool(entry.get("journal_id"))) check("Journal has session_id", entry.get("session_id") == session_id) # Mood detection mood = detect_mood(transcript) check("Mood detected", bool(mood), f"Got '{mood}'") check("Mood is expected", mood in ("excited", "funny", "confused", "tense", "lucky", "chaotic"), f"Got '{mood}'") # Story value story_value = assess_story_value(transcript) check("Story value assessed", bool(story_value), f"Got '{story_value}'") # Summarization summary = summarize_journal(transcript, task_id="t1") check("Summary has moment_summary", bool(summary.get("moment_summary"))) check("Summary has tags", len(summary.get("tags", [])) > 0) check("Summary has story_value", bool(summary.get("story_value"))) entry.update(summary) save_journal_entry(entry) journals = load_journal_entries(session_id) check("Journal persisted", len(journals) >= 1, f"Got {len(journals)} entries") except Exception as e: check("Journal pipeline", False, str(e)) # ── T8: Scoring ────────────────────────────────────────────────────── print("\n" + "-" * 80) print("T8: DETERMINISTIC SCORING") print("-" * 80) try: events = load_events(session_id) scores = compute_scores(events, game) check("Scores returned", scores is not None) check("Has team_scores", len(scores.get("team_scores", [])) > 0) check("Has winner", bool(scores.get("winner"))) check("Has scoring_explanation", len(scores.get("scoring_explanation", [])) > 0) ts = scores["team_scores"][0] check("Team score has points", ts.get("points", 0) >= 0) check("Team score has completed_tasks", ts.get("completed_tasks", 0) >= 0) check("Team score has hints_used", ts.get("hints_used", 0) >= 0) check("Team score has total_tasks", ts.get("total_tasks", 0) > 0) # Verify hint penalty check("Hint penalty applied", ts["hints_used"] > 0) # Scoring breakdown is present has_breakdown = any(s.get("scoring_breakdown") for s in scores["team_scores"]) check("Has scoring breakdown", has_breakdown) except Exception as e: check("Scoring", False, str(e)) # ── T9: Story Recap (template) ────────────────────────────────────── print("\n" + "-" * 80) print("T9: STORY RECAP (TEMPLATE FALLBACK)") print("-" * 80) try: events = load_events(session_id) packet = build_story_packet( game=game, events=events, scores=scores, journal_entries=load_journal_entries(session_id), ) check("Story packet built", packet is not None) check("Packet has game_info", bool(packet.get("game_info"))) check("Packet has winner", bool(packet.get("winner"))) check("Packet has final_scores", len(packet.get("final_scores", [])) > 0) check("Packet has task_outcomes", len(packet.get("task_outcomes", [])) > 0) # Template recap short = _template_short_recap(packet) check("Template short recap generated", len(short) > 50, f"Got {len(short)} chars") long = _template_long_summary(packet) check("Template long summary generated", len(long) > 100, f"Got {len(long)} chars") # Full generate_story with fallback story = generate_story(packet, session_id=session_id) check("generate_story() returns dict", isinstance(story, dict)) check("Has short_recap", bool(story.get("short_recap"))) check("Has long_summary", bool(story.get("long_summary"))) check("Has poster_prompt", bool(story.get("poster_prompt"))) check("Has story_packet", bool(story.get("story_packet"))) except Exception as e: check("Story recap", False, str(e)) # ── T10: MiniCPM module ───────────────────────────────────────────── print("\n" + "-" * 80) print("T10: MINICPM MODULE (lazy-load only — no download in fast test)") print("-" * 80) try: from app.services.minicpm import MINICPM_MODEL_ID, MINICPM_GGUF_FILE, generate_recap, generate_summary check("MiniCPM constants loaded", bool(MINICPM_MODEL_ID) and bool(MINICPM_GGUF_FILE)) check("MiniCPM model ID is correct", "MiniCPM" in MINICPM_MODEL_ID) check("MiniCPM GGUF file ends with .gguf", MINICPM_GGUF_FILE.endswith(".gguf")) # In fast test, skip actual model loading if os.environ.get("CITYQUEST_FAST_TEST"): check("MiniCPM module imports (fast test — model skipped)", True) else: # Try recapping — will likely fail without model, that's OK packet = build_story_packet( game=game, events=events, scores=scores, journal_entries=load_journal_entries(session_id), ) result = generate_recap(packet) # May be None if no model, but must not raise check("MiniCPM generate_recap() runs without exception", True) except Exception as e: check("MiniCPM module", False, str(e)) # ── T11: Image Gen module ─────────────────────────────────────────── print("\n" + "-" * 80) print("T11: IMAGE GEN MODULE") print("-" * 80) try: from app.services.image_gen import generate_poster, generate_poster_sync, FLUX_MODEL_ID check("Image gen constants loaded", bool(FLUX_MODEL_ID)) # Test with skip-model env var — must return None gracefully result = generate_poster("A test poster prompt") check("generate_poster() with skip returns None (no model loaded)", result is None) path_or_none, status = generate_poster_sync("test-session", "A test poster prompt") check("generate_poster_sync() returns tuple", isinstance(path_or_none, type(None)) and isinstance(status, str)) check("Status message is informative", "unavailable" in status or "Failed" in status) except Exception as e: check("Image gen module", False, str(e)) # ── Summary ────────────────────────────────────────────────────────── _summary() def _summary(): total = passed + failed print(f"\nRESULTS: {passed}/{total} tests passed", end="") if failed > 0: print(f"{passed}/{total} tests passed - ALL CLEAR") else: print(" — ALL PASSED! 🎉") print("=" * 80) if errors: print("\nFailed tests:") for e in errors: print(f" {e}") # Exit with error code if any test failed if failed > 0: sys.exit(1) if __name__ == "__main__": main()