"""The eyes: MiniCPM-V reads a slide image into a structured 'slide reading'. Computed once per slide and cached — interjections reuse the reading and never re-run the (heavy) vision pass. """ from __future__ import annotations import base64 import mimetypes from functools import lru_cache from openai import OpenAI from .config import CONFIG _READING_PROMPT = ( "You are reading a single lecture slide shown as an image. " "Respond in English only. " "Produce a compact, structured reading another model will use to explain the slide aloud. " "Use exactly these plain-text sections, omitting any that don't apply:\n" "TITLE: \n" "BULLETS: \n" "EQUATIONS: \n" "DIAGRAM: \n" "CONCEPTS: \n" "Be faithful to what's actually on the slide. Do not invent content. " "Do not use XML or JSON — plain text only." ) _RETRY_PROMPT = ( "Read this lecture slide image and respond in English only. " "Use plain text with these sections (omit any that don't apply):\n" "TITLE: ...\nBULLETS: ...\nEQUATIONS: ...\nDIAGRAM: ...\nCONCEPTS: ...\n" "No XML, no JSON, no repetition." ) @lru_cache(maxsize=1) def _client() -> OpenAI: return OpenAI(base_url=CONFIG.vision.openai_base_url, api_key=CONFIG.vision.api_key) def _data_uri(image_path: str) -> str: mime = mimetypes.guess_type(image_path)[0] or "image/png" with open(image_path, "rb") as f: b64 = base64.b64encode(f.read()).decode("ascii") return f"data:{mime};base64,{b64}" def _mock_reading(text: str, question: str | None) -> str: head = next((ln.strip() for ln in text.splitlines() if ln.strip()), "Untitled slide") body = " ".join(text.split())[:400] if question: return f"[mock vision] Looking closely for: {question}\nVisible text: {body or '(none)'}" return ( f"TITLE: {head}\n" f"BULLETS:\n{text or '(no extractable text)'}\n" "CONCEPTS: (mock reading — set VISION_BASE_URL for a real MiniCPM-V pass)" ) def _is_degenerate(text: str) -> bool: """Detect infinite-loop or language-drift outputs.""" if not text: return True lines = [ln for ln in text.splitlines() if ln.strip()] if not lines: return True # Flag if >40% of non-empty lines are duplicates (repetition loop) if len(set(lines)) / len(lines) < 0.6: return True # Flag if majority of characters are non-ASCII (language drift) non_ascii = sum(1 for c in text if ord(c) > 127) if non_ascii / max(len(text), 1) > 0.3: return True return False def _call_vision(instruction: str, image_path: str) -> str: resp = _client().chat.completions.create( model=CONFIG.vision.model, messages=[ { "role": "user", "content": [ {"type": "text", "text": instruction}, {"type": "image_url", "image_url": {"url": _data_uri(image_path)}}, ], } ], temperature=0.1, max_tokens=512, ) return (resp.choices[0].message.content or "").strip() def read_slide( image_path: str, text_layer: str = "", question: str | None = None, prior_reading: str | None = None, ) -> str: """Return a structured reading of the slide image. ``question`` switches to a targeted 'look closer' read for a specific ask. ``prior_reading`` is the reading of the previous slide; passed when slides are part of an animation sequence so the model has context on what changed. Falls back to a mock reading derived from the PDF text layer if no endpoint is configured. """ if not CONFIG.vision.is_live: return _mock_reading(text_layer, question) if question: instruction = f"Look closely at this slide and answer in English: {question}" else: instruction = _READING_PROMPT if prior_reading: instruction = ( f"The previous slide reading was:\n{prior_reading}\n\n" + instruction ) result = _call_vision(instruction, image_path) if _is_degenerate(result): result = _call_vision(_RETRY_PROMPT, image_path) if _is_degenerate(result): return _mock_reading(text_layer, question) return result