""" Extra investigation powers, kept out of detective.py so the core audit path stays clean. Each function degrades gracefully (returns None / a friendly message) when its backend isn't configured, so the app never crashes if an endpoint is down. - THE INSPECTOR'S VOICE -> Kokoro-82M TTS (TTS_ENDPOINT_URL) - THE RECONSTRUCTION -> FLUX.2 Klein 4B (FLUX_ENDPOINT_URL) - INTERROGATE THE INSPECTOR -> the same Qwen2.5-VL endpoint, re-prompted """ from __future__ import annotations import base64 import io import os import tempfile import requests from PIL import Image from detective import CaseFile, Evidence, _call_modal from prompts import ( BRIEF_SYSTEM_PROMPT, INTERROGATION_SYSTEM_PROMPT, PROSECUTOR_SYSTEM_PROMPT, ) _SEV_RANK = {"capital": 3, "high": 2, "medium": 1, "low": 0} # --------------------------------------------------------------------------- # THE INSPECTOR'S VOICE (Kokoro-82M) # --------------------------------------------------------------------------- def voice_script(case: CaseFile) -> str: """A short, punchy noir script for narration (kept tight for snappy TTS).""" cleared = str(case.grade).upper() in {"A", "B"} n = len(case.evidence) if cleared: body = (f"The scene came back clean. A few infractions, nothing that holds up " f"in court.") else: body = (f"I count {n} crime{'s' if n != 1 else ''} against the user.") closing = (case.closing_statement or "").strip() return ( f"Case number {case.case_number}. {case.case_title}. " f"{body} My verdict: {case.verdict}. The grade: {case.grade}. {closing}" ).strip() def synthesize_voice(text: str) -> str | None: """POST the text to the Kokoro backend; write the WAV to a temp file and return its path (gr.Audio takes a filepath). None if the voice backend isn't set/up.""" url = os.environ.get("TTS_ENDPOINT_URL", "").rstrip("/") if not url: return None try: resp = requests.post(f"{url}/speak", json={"text": text[:600]}, timeout=180) resp.raise_for_status() wav = base64.b64decode(resp.json()["audio_b64"]) if not wav: return None f = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) f.write(wav) f.close() return f.name except Exception: return None # --------------------------------------------------------------------------- # THE RECONSTRUCTION (FLUX.2 Klein) # --------------------------------------------------------------------------- def _worst_evidence(case: CaseFile) -> Evidence | None: if not case.evidence: return None def rank(ev: Evidence): x1, y1, x2, y2 = ev.bbox area = max(1, (x2 - x1)) * max(1, (y2 - y1)) return (_SEV_RANK.get(ev.severity, 1), area) return max(case.evidence, key=rank) def _crop_region(image: Image.Image, bbox, pad_frac: float = 0.35, min_side: int = 280): """Crop the bbox with padding; widen tiny boxes so FLUX has pixels to work with.""" W, H = image.size x1, y1, x2, y2 = bbox bw, bh = x2 - x1, y2 - y1 px, py = bw * pad_frac, bh * pad_frac # ensure a sane minimum crop around the centre cx, cy = (x1 + x2) / 2, (y1 + y2) / 2 half_w = max(bw / 2 + px, min_side / 2) half_h = max(bh / 2 + py, min_side / 2) nx1 = int(max(0, cx - half_w)); ny1 = int(max(0, cy - half_h)) nx2 = int(min(W, cx + half_w)); ny2 = int(min(H, cy + half_h)) return image.convert("RGB").crop((nx1, ny1, nx2, ny2)) def _visual_brief(crop_b64: str, ev: Evidence) -> str | None: """The Inspector turns his remedy into ONE concrete visual instruction — abstract fixes make FLUX just re-render the same pixels cleaner.""" try: result = _call_modal( crop_b64, system_prompt=BRIEF_SYSTEM_PROMPT.format( crime=ev.crime, fix=ev.fix or "make the flaw obviously fixed", ), user_prompt="Write the redesign brief.", max_tokens=90, temperature=0.4, ) text = (result.get("text") or "").strip().strip('"').strip() return text if 12 < len(text) < 400 else None except Exception: return None def _too_similar(a: Image.Image, b: Image.Image, threshold: float = 8.0) -> bool: """True when the 'redesign' is just the same pixels re-rendered cleaner.""" try: pa = a.convert("L").resize((48, 48)) pb = b.convert("L").resize((48, 48)) da, db = pa.getdata(), pb.getdata() diff = sum(abs(x - y) for x, y in zip(da, db)) / len(da) return diff < threshold except Exception: return False def _call_flux(url: str, image_b64: str, instruction: str) -> Image.Image | None: try: resp = requests.post( f"{url}/reconstruct", json={"image_b64": image_b64, "instruction": instruction, "steps": 8}, timeout=240, ) resp.raise_for_status() out_b64 = resp.json()["image_b64"] return Image.open(io.BytesIO(base64.b64decode(out_b64))).convert("RGB") except Exception: return None def reconstruct_evidence(image: Image.Image, ev: Evidence): """Crop ONE circled element and rebuild it FIXED: 1. the Inspector writes a concrete design brief for the fix, 2. FLUX redesigns the crop from that brief, 3. anti-clone net: if the result is just a cleaner copy, retry harder, and as a last resort sketch the ideal element from text alone. Returns (before_crop, after_image) or None.""" url = os.environ.get("FLUX_ENDPOINT_URL", "").rstrip("/") if not url: return None crop = _crop_region(image, ev.bbox) buf = io.BytesIO() crop.save(buf, format="PNG") crop_b64 = base64.b64encode(buf.getvalue()).decode("ascii") brief = _visual_brief(crop_b64, ev) or ( ev.fix or "Improve clarity, contrast and visual hierarchy of this element." ) after = _call_flux(url, crop_b64, brief) if after is not None and _too_similar(crop, after): harder = (f"RADICALLY redesign this element — the result must look clearly " f"different from the original: {brief}") retry = _call_flux(url, crop_b64, harder) if retry is not None and not _too_similar(crop, retry): after = retry else: # the Inspector's concept sketch, from the brief alone concept = _call_flux(url, "", brief) if concept is not None: after = concept if after is None: return None return crop, after def reconstruct_all(image: Image.Image, case: CaseFile, max_items: int = 4): """Rebuild every charged element (most serious first, capped). Returns a list of (evidence, before_crop, after_image) — skips any exhibit FLUX fails on.""" def rank(ev: Evidence): x1, y1, x2, y2 = ev.bbox return (_SEV_RANK.get(ev.severity, 1), max(1, (x2 - x1)) * max(1, (y2 - y1))) out = [] for ev in sorted(case.evidence, key=rank, reverse=True)[:max_items]: res = reconstruct_evidence(image, ev) if res: out.append((ev, res[0], res[1])) return out def reconstruct_worst(image: Image.Image, case: CaseFile): """Back-compat: rebuild only the most serious exhibit. Returns (before_crop, after_image, evidence) or None.""" ev = _worst_evidence(case) if ev is None: return None res = reconstruct_evidence(image, ev) if res is None: return None return res[0], res[1], ev # --------------------------------------------------------------------------- # INTERROGATE THE INSPECTOR (re-prompt the same VLM with the case in context) # --------------------------------------------------------------------------- def _case_context(case: CaseFile) -> str: lines = [ f"CASE Nº {case.case_number} — {case.case_title}", f"Scene: {case.scene_summary}", f"Verdict: {case.verdict} (grade {case.grade}).", "Charges on file:", ] for ev in case.evidence: lines.append(f" #{ev.id} [{ev.severity}] {ev.crime} — {ev.testimony}" + (f" Remedy: {ev.fix}" if ev.fix else "")) lines.append(f"Closing: {case.closing_statement}") return "\n".join(lines) def interrogate(image: Image.Image, case: CaseFile, question: str) -> str: """Answer a follow-up question in the Inspector's voice, grounded in the screenshot + the filed case. Reuses the Qwen2.5-VL endpoint.""" q = (question or "").strip() if not q: return "…You'll have to actually ask me something, friend." from detective import _image_to_b64 system = INTERROGATION_SYSTEM_PROMPT.format(case=_case_context(case)) try: result = _call_modal( _image_to_b64(image), system_prompt=system, user_prompt=q, max_tokens=320, temperature=0.6, ) text = (result.get("text") or "").strip() return text or "…The line crackled. Ask me again." except Exception: return "…The wire to the precinct went dead. Try that question again in a moment." # --------------------------------------------------------------------------- # THE PROSECUTION (a SEPARATE small model — NVIDIA Nemotron) # --------------------------------------------------------------------------- def prosecute(case: CaseFile) -> str | None: """THE PROSECUTION's opening statement, argued by a separate NVIDIA Nemotron model over the evidence the vision agent filed. Text-only (no image). Returns None if the prosecutor backend isn't configured/up or there are no charges.""" url = os.environ.get("NEMOTRON_ENDPOINT_URL", "").rstrip("/") if not url or case is None or not case.evidence: return None try: resp = requests.post( f"{url}/prosecute", json={ "system_prompt": PROSECUTOR_SYSTEM_PROMPT.format(case=_case_context(case)), "user_prompt": "Deliver the opening statement for the prosecution.", "max_tokens": 360, "temperature": 0.7, }, timeout=180, ) resp.raise_for_status() text = (resp.json().get("text") or "").strip() # End on a complete sentence — a statement cut mid-thought reads as broken. if text: cut = max(text.rfind("."), text.rfind("!"), text.rfind("?")) if cut > 80: text = text[: cut + 1] return text or None except Exception: return None