Spaces:

build-small-hackathon
/

kirana-detective

Sleeping

App Files Files Community

naazimsnh02 committed 8 days ago

Commit

8dc382f

1 Parent(s): c9462a9

Model loading fix for vision model

Browse files

Files changed (4) hide show

agents/invoice_extractor.py +53 -10
app.py +23 -12
requirements.txt +2 -1
samples/modal_inference_test.ipynb +180 -0

agents/invoice_extractor.py CHANGED Viewed

@@ -92,7 +92,7 @@ def _dict_to_invoice(data: dict) -> InvoiceJSON:
 def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
-    """Call MiniCPM-V via native transformers generate() with an image."""
     import io
     import torch
     from PIL import Image as PILImage
@@ -100,15 +100,58 @@ def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
     model, processor = llm
     image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
-    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    inputs = processor(text=[text], images=[image], return_tensors="pt").to(model.device)
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
-    input_len = inputs.input_ids.shape[1]
-    return processor.decode(generated_ids[0][input_len:], skip_special_tokens=True)
 def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:

 def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
     """Send one image plus a text prompt to MiniCPM-V 4.6 and return the reply.

     Uses the processor's chat template to build the model inputs and
     ``model.generate()`` with greedy decoding to produce the answer.

     Args:
         llm: A ``(model, processor)`` pair; it is unpacked in that order.
         image_bytes: Encoded image data readable by ``PIL.Image.open``.
         prompt: Instruction text sent alongside the image.

     Returns:
         The decoded text of the newly generated tokens, with surrounding
         whitespace stripped.

     Raises:
         TypeError, ValueError: Propagated from ``model.generate()`` when the
             call fails and there is no ``downsample_mode`` argument left to
             drop for a retry.
     """
     import io
     import torch
     from PIL import Image as PILImage

     model, processor = llm
     image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
     messages = [{
         "role": "user",
         "content": [
             {"type": "image", "image": image},
             {"type": "text", "text": prompt},
         ],
     }]

     # First try the MiniCPM-V specific slicing options; processors whose
     # apply_chat_template() does not take them raise TypeError, in which
     # case we fall back to the plain call and remember not to forward
     # downsample_mode to generate().
     downsample_mode = "16x"
     try:
         inputs = processor.apply_chat_template(
             messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
             downsample_mode=downsample_mode,
             max_slice_nums=36,
         )
     except TypeError:
         inputs = processor.apply_chat_template(
             messages,
             tokenize=True,
             add_generation_prompt=True,
             return_dict=True,
             return_tensors="pt",
         )
         downsample_mode = None
     inputs = inputs.to(model.device)

     generate_kwargs = {
         **inputs,
         "max_new_tokens": 2048,
         "do_sample": False,
     }
     if downsample_mode is not None:
         generate_kwargs["downsample_mode"] = downsample_mode

     with torch.inference_mode():
         try:
             generated_ids = model.generate(**generate_kwargs)
         except (TypeError, ValueError):
             # transformers' generate() rejects unused model kwargs with
             # ValueError (a custom generate() may raise TypeError instead),
             # so both must trigger the retry without downsample_mode.
             # If downsample_mode was never passed, the failure has another
             # cause and repeating the identical call would not help.
             if "downsample_mode" not in generate_kwargs:
                 raise
             del generate_kwargs["downsample_mode"]
             generated_ids = model.generate(**generate_kwargs)

     # Drop the leading prompt-length positions so only the completion is
     # decoded (assumes generate() returns prompt + completion -- TODO confirm
     # for this model class).
     prompt_len = inputs["input_ids"].shape[-1]
     generated_ids = generated_ids[:, prompt_len:]
     decoded = processor.batch_decode(
         generated_ids,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=False,
     )
     return decoded[0].strip()
 def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:

app.py CHANGED Viewed

@@ -63,7 +63,7 @@ def load_models() -> None:
         logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
         import os
         import torch
-        from transformers import AutoModel
         from huggingface_hub import snapshot_download
         from safetensors.torch import load_file as safetensors_load
@@ -72,16 +72,25 @@ def load_models() -> None:
         _dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
         _device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Load base model code (has .chat()) + fine-tuned weights from merged repo.
-        # save_pretrained() after PEFT merge doesn't copy custom modeling files,
-        # so the merged repo lacks .chat(); loading base code fixes this.
         logger.info("Loading base model code from %s …", _BASE_REPO)
-        _vision_model = AutoModel.from_pretrained(
-            _BASE_REPO,
-            trust_remote_code=True,
-            torch_dtype=_dtype,
-            device_map=_device,
-        )
         logger.info("Loading fine-tuned weights from %s …", _MERGED_REPO)
         merged_local = snapshot_download(_MERGED_REPO, token=_HF_TOKEN or None)
@@ -99,8 +108,10 @@ def load_models() -> None:
         logger.info("Fine-tuned weights loaded (%d keys, %d missing)", len(state_dict), len(missing))
         _vision_model.eval()
-        from transformers import AutoProcessor
-        _vision_processor = AutoProcessor.from_pretrained(_MERGED_REPO, trust_remote_code=True)
         vision_llm = (_vision_model, _vision_processor)
         logger.info("Vision LLM ready (device=%s dtype=%s)", _device, _dtype)

         logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
         import os
         import torch
+        from transformers import AutoProcessor
         from huggingface_hub import snapshot_download
         from safetensors.torch import load_file as safetensors_load
         _dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
         _device = "cuda" if torch.cuda.is_available() else "cpu"
+        try:
+            from transformers import AutoModelForImageTextToText as _VisionModel
+        except ImportError:
+            from transformers import AutoModelForMultimodalLM as _VisionModel
+        # Load base model code plus fine-tuned weights from the merged repo.
+        # MiniCPM-V 4.6 uses AutoProcessor + model.generate() for inference.
         logger.info("Loading base model code from %s …", _BASE_REPO)
+        _model_kwargs = {
+            "trust_remote_code": True,
+            "torch_dtype": _dtype,
+        }
+        if _HF_TOKEN:
+            _model_kwargs["token"] = _HF_TOKEN
+        if torch.cuda.is_available():
+            _model_kwargs["device_map"] = "auto"
+        _vision_model = _VisionModel.from_pretrained(_BASE_REPO, **_model_kwargs)
+        if not torch.cuda.is_available():
+            _vision_model.to(_device)
         logger.info("Loading fine-tuned weights from %s …", _MERGED_REPO)
         merged_local = snapshot_download(_MERGED_REPO, token=_HF_TOKEN or None)
         logger.info("Fine-tuned weights loaded (%d keys, %d missing)", len(state_dict), len(missing))
         _vision_model.eval()
+        _processor_kwargs = {"trust_remote_code": True}
+        if _HF_TOKEN:
+            _processor_kwargs["token"] = _HF_TOKEN
+        _vision_processor = AutoProcessor.from_pretrained(_BASE_REPO, **_processor_kwargs)
         vision_llm = (_vision_model, _vision_processor)
         logger.info("Vision LLM ready (device=%s dtype=%s)", _device, _dtype)

requirements.txt CHANGED Viewed

@@ -7,7 +7,8 @@ PyMuPDF==1.25.5
 numpy==2.2.5
 opencv-python-headless==4.11.0.86
 huggingface_hub>=0.33.5
-transformers>=4.46.0
 accelerate>=0.26.0
 safetensors>=0.4.3
 datasets==3.6.0

 numpy==2.2.5
 opencv-python-headless==4.11.0.86
 huggingface_hub>=0.33.5
+transformers>=5.7.0
+torchvision>=0.19.0
 accelerate>=0.26.0
 safetensors>=0.4.3
 datasets==3.6.0

samples/modal_inference_test.ipynb ADDED Viewed

	@@ -0,0 +1,180 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Kirana Detective — Modal Inference Test\n",
+    "\n",
+    "Run this notebook cell-by-cell in a Modal GPU environment to verify:\n",
+    "1. All dependencies install correctly (including torchvision)\n",
+    "2. Vision model (MiniCPM-V 4.6 + fine-tuned weights) loads correctly\n",
+    "3. Processor loads correctly\n",
+    "4. End-to-end inference on a test invoice image works\n",
+    "5. Text LLM (MiniCPM5-1B gguf) loads and runs\n",
+    "6. YOLO ONNX model loads and runs\n",
+    "\n",
+    "**Before running:** Set your HF_TOKEN secret in Modal or paste it below."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 1 — Modal app definition\n",
+    "\n",
+    "The cells below install the Modal client and define the Modal app and container image with all required dependencies; the test functions themselves are defined in Cell 2."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install modal if not already installed\n",
+    "import subprocess, sys\n",
+    "subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"modal\", \"-q\"], check=True)\n",
+    "print(\"modal installed\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "import modal\n\napp = modal.App(\"kirana-inference-test\")\n\nimage = (\n    modal.Image.debian_slim(python_version=\"3.12\")\n    .pip_install(\n        \"torch==2.4.0\",\n        \"torchvision==0.19.0\",\n        \"torchaudio==2.4.0\",\n        extra_options=\"--index-url https://download.pytorch.org/whl/cu121\",\n    )\n    .pip_install(\n        \"transformers>=5.7.0\",\n        \"accelerate>=0.26.0\",\n        \"safetensors>=0.4.3\",\n        \"huggingface_hub>=0.33.5\",\n        \"Pillow>=11.0.0\",\n        \"numpy>=1.26.0\",\n        \"llama-cpp-python==0.3.28\",\n        \"onnxruntime==1.21.0\",\n        \"requests\",\n        extra_options=\"--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu\",\n    )\n)\n\nprint(\"Modal app defined\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 2 — Define the test functions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "HF_TOKEN = \"\"  # paste your token here OR leave empty if set as Modal secret\n\n_BASE_REPO   = \"openbmb/MiniCPM-V-4.6\"\n_MERGED_REPO = \"build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged\"\n_TEXT_REPO   = \"build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer\"\n_TEXT_FILE   = \"MiniCPM5-1B.Q4_K_M.gguf\"\n_YOLO_REPO   = \"build-small-hackathon/yolo26n-indian-fmcg-detection\"\n\n\ndef _make_test_invoice_image():\n    \"\"\"Create a minimal synthetic invoice image with visible text.\"\"\"\n    from PIL import Image as PILImage, ImageDraw\n\n    img = PILImage.new(\"RGB\", (640, 900), color=(255, 255, 255))\n    draw = ImageDraw.Draw(img)\n    lines = [\n        \"TAX INVOICE\",\n        \"Supplier: Hindustan Unilever Ltd\",\n        \"Invoice No: INV-2024-00123\",\n        \"Date: 2024-05-15\",\n        \"\",\n        \"Item                  Qty   Price   Total\",\n        \"Surf Excel 1kg         2    180.00  360.00\",\n        \"Lifebuoy Soap 100g     5     22.00  110.00\",\n        \"Dove Shampoo 200ml     3     95.00  285.00\",\n        \"\",\n        \"Grand Total:                        755.00\",\n    ]\n    y = 40\n    for line in lines:\n        draw.text((40, y), line, fill=(0, 0, 0))\n        y += 36\n    return img\n\n\n@app.function(\n    image=image,\n    gpu=\"T4\",\n    timeout=600,\n    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_vision_model(hf_token: str = \"\") -> dict:\n    \"\"\"\n    Load MiniCPM-V 4.6 base architecture + fine-tuned weights + processor,\n    then run inference using processor.apply_chat_template() and generate().\n    \"\"\"\n    import time, os\n    import torch\n    from transformers import AutoProcessor\n    from huggingface_hub import snapshot_download\n    from safetensors.torch import load_file as safetensors_load\n\n    try:\n        from transformers import AutoModelForImageTextToText as VisionModel\n        model_loader = 
\"AutoModelForImageTextToText\"\n    except ImportError:\n        from transformers import AutoModelForMultimodalLM as VisionModel\n        model_loader = \"AutoModelForMultimodalLM\"\n\n    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n    results = {\"model_loader\": model_loader}\n\n    results[\"torch_version\"]  = torch.__version__\n    results[\"cuda_available\"] = torch.cuda.is_available()\n    results[\"cuda_device\"]    = torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\"\n\n    try:\n        import torchvision\n        results[\"torchvision_version\"] = torchvision.__version__\n    except ImportError as e:\n        results[\"torchvision_error\"] = str(e)\n\n    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32\n    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\n    t0 = time.time()\n    try:\n        model_kwargs = {\"trust_remote_code\": True, \"torch_dtype\": dtype, \"token\": token}\n        if torch.cuda.is_available():\n            model_kwargs[\"device_map\"] = \"auto\"\n        model = VisionModel.from_pretrained(_BASE_REPO, **model_kwargs)\n        if not torch.cuda.is_available():\n            model.to(device)\n        results[\"base_model_class\"]  = type(model).__name__\n        results[\"base_model_load_s\"] = round(time.time() - t0, 1)\n        results[\"base_model_ok\"]     = True\n        results[\"has_chat\"]          = hasattr(model, \"chat\")\n        results[\"has_generate\"]      = hasattr(model, \"generate\")\n    except Exception as e:\n        results[\"base_model_ok\"]    = False\n        results[\"base_model_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        merged_local = snapshot_download(_MERGED_REPO, token=token)\n        shard_files = sorted(f for f in os.listdir(merged_local) if f.endswith(\".safetensors\"))\n        if not shard_files:\n            raise RuntimeError(f\"No .safetensors found in {_MERGED_REPO}\")\n   
     state_dict = {}\n        for sf in shard_files:\n            state_dict.update(safetensors_load(os.path.join(merged_local, sf), device=\"cpu\"))\n        missing, unexpected = model.load_state_dict(state_dict, strict=False)\n        results[\"finetuned_weights_load_s\"]  = round(time.time() - t0, 1)\n        results[\"finetuned_weights_ok\"]      = True\n        results[\"finetuned_keys\"]            = len(state_dict)\n        results[\"finetuned_missing_keys\"]    = len(missing)\n        results[\"finetuned_unexpected_keys\"] = len(unexpected)\n        model.eval()\n    except Exception as e:\n        results[\"finetuned_weights_ok\"]    = False\n        results[\"finetuned_weights_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        processor = AutoProcessor.from_pretrained(_BASE_REPO, trust_remote_code=True, token=token)\n        results[\"processor_load_s\"] = round(time.time() - t0, 1)\n        results[\"processor_ok\"] = True\n        results[\"processor_source\"] = _BASE_REPO\n        results[\"processor_class\"] = type(processor).__name__\n    except Exception as e:\n        results[\"processor_ok\"] = False\n        results[\"processor_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        test_img = _make_test_invoice_image()\n        prompt = (\n            \"You are an OCR agent. Extract data from this invoice image and return ONLY valid JSON \"\n            'matching: {\"invoice_number\": string|null, \"supplier\": string|null, \"date\": string|null, '\n            '\"items\": [{\"product_raw\": string, \"quantity\": number, \"unit_price\": number, \"line_total\": number}], '\n            '\"grand_total\": number}. 
Return ONLY the JSON, no prose.'\n        )\n        messages = [{\n            \"role\": \"user\",\n            \"content\": [\n                {\"type\": \"image\", \"image\": test_img},\n                {\"type\": \"text\", \"text\": prompt},\n            ],\n        }]\n        downsample_mode = \"16x\"\n        try:\n            inputs = processor.apply_chat_template(\n                messages,\n                tokenize=True,\n                add_generation_prompt=True,\n                return_dict=True,\n                return_tensors=\"pt\",\n                downsample_mode=downsample_mode,\n                max_slice_nums=36,\n            )\n        except TypeError:\n            inputs = processor.apply_chat_template(\n                messages,\n                tokenize=True,\n                add_generation_prompt=True,\n                return_dict=True,\n                return_tensors=\"pt\",\n            )\n            downsample_mode = None\n        inputs = inputs.to(model.device)\n        generate_kwargs = {**inputs, \"max_new_tokens\": 512, \"do_sample\": False}\n        if downsample_mode is not None:\n            generate_kwargs[\"downsample_mode\"] = downsample_mode\n        with torch.inference_mode():\n            try:\n                output_ids = model.generate(**generate_kwargs)\n            except TypeError:\n                generate_kwargs.pop(\"downsample_mode\", None)\n                output_ids = model.generate(**generate_kwargs)\n        output_ids = output_ids[:, inputs[\"input_ids\"].shape[-1]:]\n        response = processor.batch_decode(\n            output_ids,\n            skip_special_tokens=True,\n            clean_up_tokenization_spaces=False,\n        )[0].strip()\n        results[\"inference_ok\"]     = True\n        results[\"inference_s\"]      = round(time.time() - t0, 1)\n        results[\"inference_output\"] = str(response)[:800]\n    except Exception as e:\n        results[\"inference_ok\"]    = False\n        
results[\"inference_error\"] = str(e)\n\n    return results\n\n\n@app.function(\n    image=image,\n    timeout=300,\n    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_text_model(hf_token: str = \"\") -> dict:\n    \"\"\"Download and load MiniCPM5-1B GGUF, run a normalization test prompt.\"\"\"\n    import time, os\n    from huggingface_hub import hf_hub_download\n    from llama_cpp import Llama\n\n    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n    results = {}\n\n    t0 = time.time()\n    try:\n        path = hf_hub_download(repo_id=_TEXT_REPO, filename=_TEXT_FILE, token=token)\n        results[\"text_model_download_s\"] = round(time.time() - t0, 1)\n    except Exception as e:\n        results[\"text_model_download_ok\"]    = False\n        results[\"text_model_download_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        llm = Llama(model_path=path, n_ctx=8192, n_threads=4, verbose=False)\n        results[\"text_model_load_s\"]  = round(time.time() - t0, 1)\n        results[\"text_model_load_ok\"] = True\n    except Exception as e:\n        results[\"text_model_load_ok\"]    = False\n        results[\"text_model_load_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        resp = llm(\n            \"Normalize product name to standard FMCG name. Input: 'Lays Classic Salted 26g'. 
\"\n            \"Output JSON: {\\\"normalized\\\": \\\"...\\\", \\\"brand\\\": \\\"...\\\", \\\"category\\\": \\\"...\\\"}\",\n            max_tokens=64,\n            stop=[\"\\n\\n\"],\n        )\n        results[\"text_inference_ok\"]     = True\n        results[\"text_inference_s\"]      = round(time.time() - t0, 1)\n        results[\"text_inference_output\"] = resp[\"choices\"][0][\"text\"][:300]\n    except Exception as e:\n        results[\"text_inference_ok\"]    = False\n        results[\"text_inference_error\"] = str(e)\n\n    return results\n\n\n@app.function(\n    image=image,\n    timeout=120,\n    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_yolo_model(hf_token: str = \"\") -> dict:\n    \"\"\"Download YOLO ONNX and run inference on a synthetic 640x640 image.\"\"\"\n    import time, os, json\n    import numpy as np\n    from huggingface_hub import hf_hub_download\n    import onnxruntime as ort\n\n    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n    results = {}\n\n    t0 = time.time()\n    try:\n        onnx_path  = hf_hub_download(repo_id=_YOLO_REPO, filename=\"yolo26n_fmcg.onnx\", token=token)\n        class_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"class_names.json\", token=token)\n        with open(class_path) as f:\n            class_names = json.load(f)\n        results[\"yolo_download_s\"]     = round(time.time() - t0, 1)\n        results[\"yolo_num_classes\"]    = len(class_names)\n        results[\"yolo_sample_classes\"] = class_names[:5]\n    except Exception as e:\n        results[\"yolo_download_ok\"]    = False\n        results[\"yolo_download_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        session   = ort.InferenceSession(onnx_path, providers=[\"CPUExecutionProvider\"])\n        inp_name  = session.get_inputs()[0].name\n        inp_shape = session.get_inputs()[0].shape\n        results[\"yolo_load_s\"]      = 
round(time.time() - t0, 1)\n        results[\"yolo_input_name\"]  = inp_name\n        results[\"yolo_input_shape\"] = str(inp_shape)\n    except Exception as e:\n        results[\"yolo_load_ok\"]    = False\n        results[\"yolo_load_error\"] = str(e)\n        return results\n\n    t0 = time.time()\n    try:\n        dummy   = np.random.rand(1, 3, 640, 640).astype(np.float32)\n        outputs = session.run(None, {inp_name: dummy})\n        results[\"yolo_inference_ok\"]  = True\n        results[\"yolo_inference_s\"]   = round(time.time() - t0, 1)\n        results[\"yolo_output_shapes\"] = [str(o.shape) for o in outputs]\n    except Exception as e:\n        results[\"yolo_inference_ok\"]    = False\n        results[\"yolo_inference_error\"] = str(e)\n\n    return results\n\n\nprint(\"Functions defined\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 3 — Run vision model test (GPU T4, ~3-5 min)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "with app.run():\n",
+    "    vision_result = test_vision_model.remote(hf_token=HF_TOKEN)\n",
+    "\n",
+    "print(\"=== VISION MODEL TEST RESULTS ===\")\n",
+    "print(json.dumps(vision_result, indent=2))\n",
+    "\n",
+    "# Summary\n",
+    "print(\"\\n=== SUMMARY ===\")\n",
+    "print(f\"  torch:          {vision_result.get('torch_version')}\")\n",
+    "print(f\"  torchvision:    {vision_result.get('torchvision_version', '❌ MISSING')}\")\n",
+    "print(f\"  CUDA device:    {vision_result.get('cuda_device')}\")\n",
+    "print(f\"  base model:     {'✅' if vision_result.get('base_model_ok') else '❌'} ({vision_result.get('base_model_load_s', '?')}s)\")\n",
+    "print(f\"  fine-tuned wts: {'✅' if vision_result.get('finetuned_weights_ok') else '❌'} ({vision_result.get('finetuned_keys', '?')} keys, {vision_result.get('finetuned_missing_keys', '?')} missing)\")\n",
+    "print(f\"  processor:      {'✅' if vision_result.get('processor_ok') else '❌'} (from {vision_result.get('processor_source', 'N/A')})\")\n",
+    "print(f\"  inference:      {'✅' if vision_result.get('inference_ok') else '❌'} ({vision_result.get('inference_s', '?')}s)\")\n",
+    "if vision_result.get('inference_output'):\n",
+    "    print(f\"\\n  Output snippet: {vision_result['inference_output'][:200]}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 4 — Run text model test (CPU, ~2-3 min)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with app.run():\n",
+    "    text_result = test_text_model.remote(hf_token=HF_TOKEN)\n",
+    "\n",
+    "print(\"=== TEXT MODEL TEST RESULTS ===\")\n",
+    "print(json.dumps(text_result, indent=2))\n",
+    "\n",
+    "print(\"\\n=== SUMMARY ===\")\n",
+    "print(f\"  download:  {'✅' if not text_result.get('text_model_download_error') else '❌'} ({text_result.get('text_model_download_s', '?')}s)\")\n",
+    "print(f\"  load:      {'✅' if text_result.get('text_model_load_ok') else '❌'} ({text_result.get('text_model_load_s', '?')}s)\")\n",
+    "print(f\"  inference: {'✅' if text_result.get('text_inference_ok') else '❌'} ({text_result.get('text_inference_s', '?')}s)\")\n",
+    "if text_result.get('text_inference_output'):\n",
+    "    print(f\"\\n  Output: {text_result['text_inference_output']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 5 — Run YOLO model test (CPU, ~1 min)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with app.run():\n",
+    "    yolo_result = test_yolo_model.remote(hf_token=HF_TOKEN)\n",
+    "\n",
+    "print(\"=== YOLO MODEL TEST RESULTS ===\")\n",
+    "print(json.dumps(yolo_result, indent=2))\n",
+    "\n",
+    "print(\"\\n=== SUMMARY ===\")\n",
+    "print(f\"  download:  {'✅' if not yolo_result.get('yolo_download_error') else '❌'} ({yolo_result.get('yolo_download_s', '?')}s, {yolo_result.get('yolo_num_classes', '?')} classes)\")\n",
+    "print(f\"  load:      {'✅' if not yolo_result.get('yolo_load_error') else '❌'} ({yolo_result.get('yolo_load_s', '?')}s)\")\n",
+    "print(f\"  inference: {'✅' if yolo_result.get('yolo_inference_ok') else '❌'} ({yolo_result.get('yolo_inference_s', '?')}s)\")\n",
+    "if yolo_result.get('yolo_output_shapes'):\n",
+    "    print(f\"  output shapes: {yolo_result['yolo_output_shapes']}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Cell 6 — Diagnosis helper\n",
+    "\n",
+    "Run this after the tests above to get a clear fix list."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "print(\"=\"*60)\nprint(\"DIAGNOSIS REPORT\")\nprint(\"=\"*60)\n\nissues = []\nfixes  = []\n\nif \"torchvision_error\" in vision_result:\n    issues.append(\"torchvision not installed\")\n    fixes.append(\"Add torchvision to requirements.txt / Dockerfile\")\n\nif not vision_result.get(\"base_model_ok\"):\n    issues.append(f\"Vision base model failed: {vision_result.get('base_model_error', '?')}\")\n    fixes.append(\"Use transformers>=5.7.0 and AutoModelForImageTextToText/AutoModelForMultimodalLM\")\n\nif not vision_result.get(\"has_generate\"):\n    issues.append(\"Vision model does not expose generate()\")\n    fixes.append(\"Load MiniCPM-V 4.6 through the image-text/multimodal auto class, not generic AutoModel\")\n\nif vision_result.get(\"has_chat\"):\n    issues.append(\"Vision test is still seeing old chat() API\")\n    fixes.append(\"Use processor.apply_chat_template() + model.generate() in app.py and InvoiceExtractorAgent\")\n\nif not vision_result.get(\"processor_ok\"):\n    issues.append(\"Processor failed to load\")\n    fixes.append(\"Load AutoProcessor from openbmb/MiniCPM-V-4.6; the merged repo is weights-only\")\n\nif not vision_result.get(\"inference_ok\"):\n    issues.append(f\"Inference failed: {vision_result.get('inference_error', '?')}\")\n    fixes.append(\"Check the generate/apply_chat_template error above\")\n\nif not text_result.get(\"text_model_load_ok\"):\n    issues.append(f\"Text model failed: {text_result.get('text_model_load_error', '?')}\")\n    fixes.append(\"Check text model error\")\n\nif not yolo_result.get(\"yolo_inference_ok\"):\n    issues.append(f\"YOLO failed: {yolo_result.get('yolo_inference_error', '?')}\")\n    fixes.append(\"Check YOLO error\")\n\nif not issues:\n    print(\"All tests passed. Your HF Space should use the same load/inference path.\")\nelse:\n    print(\"ISSUES FOUND:\")\n    for i, issue in enumerate(issues, 1):\n        print(f\"  {i}. 
{issue}\")\n\n    print(\"\\nRECOMMENDED FIXES:\")\n    for i, fix in enumerate(fixes, 1):\n        print(f\"  {i}. {fix}\")\n\n    print(\"\\nKey finding: processor loaded from:\", vision_result.get(\"processor_source\"))\n    print(\"Model loader:\", vision_result.get(\"model_loader\"))"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.11.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}