{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kirana Detective — Modal Inference Test\n",
    "\n",
    "Run this notebook cell-by-cell in a Modal GPU environment to verify:\n",
    "1. All dependencies install correctly (including torchvision)\n",
    "2. Vision model (MiniCPM-V 4.6 + fine-tuned weights) loads correctly\n",
    "3. Processor loads correctly\n",
    "4. End-to-end inference on a test invoice image works\n",
    "5. Text LLM (MiniCPM5-1B gguf) loads and runs\n",
    "6. YOLO ONNX model loads and runs\n",
    "\n",
    "**Before running:** Create a Modal secret named `huggingface-secret` that provides `HF_TOKEN`, or export `HF_TOKEN` in the environment that runs this notebook. Do not paste the token into the notebook itself."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 1 — Modal app definition\n",
    "\n",
    "This installs the Modal client and defines the Modal app and the container image with all required dependencies. The test functions themselves are defined in Cell 2."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install modal if not already installed\n",
    "import subprocess, sys\n",
    "subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"modal\", \"-q\"], check=True)\n",
    "print(\"modal installed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All notebook-level imports live here so every later cell only depends on this one.\n",
    "import json\n",
    "import os\n",
    "\n",
    "import modal\n",
    "\n",
    "app = modal.App(\"kirana-inference-test\")\n",
    "\n",
    "image = (\n",
    "    modal.Image.debian_slim(python_version=\"3.12\")\n",
    "    .pip_install(\n",
    "        \"torch==2.4.0\",\n",
    "        \"torchvision==0.19.0\",\n",
    "        \"torchaudio==2.4.0\",\n",
    "        extra_options=\"--index-url https://download.pytorch.org/whl/cu121\",\n",
    "    )\n",
    "    .pip_install(\n",
    "        \"transformers>=5.7.0\",\n",
    "        \"accelerate>=0.26.0\",\n",
    "        \"safetensors>=0.4.3\",\n",
    "        \"huggingface_hub>=0.33.5\",\n",
    "        \"Pillow>=11.0.0\",\n",
    "        \"numpy>=1.26.0\",\n",
    "        \"llama-cpp-python==0.3.28\",\n",
    "        \"onnxruntime==1.21.0\",\n",
    "        \"requests\",\n",
    "        extra_options=\"--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu\",\n",
    "    )\n",
    ")\n",
    "\n",
    "print(\"Modal app defined\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 2 — Define the test functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the token from the local environment instead of pasting it into the\n",
    "# notebook; leave HF_TOKEN unset to use the \"huggingface-secret\" Modal secret.\n",
    "HF_TOKEN = os.environ.get(\"HF_TOKEN\", \"\").strip()\n",
    "\n",
    "_BASE_REPO = \"openbmb/MiniCPM-V-4.6\"\n",
    "_MERGED_REPO = \"build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged\"\n",
    "_TEXT_REPO = \"build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer\"\n",
    "_TEXT_FILE = \"MiniCPM5-1B.Q4_K_M.gguf\"\n",
    "_YOLO_REPO = \"build-small-hackathon/yolo26n-indian-fmcg-detection\"\n",
    "\n",
    "\n",
    "def _make_test_invoice_image():\n",
    "    \"\"\"Create a minimal synthetic invoice image with visible text.\"\"\"\n",
    "    from PIL import Image as PILImage, ImageDraw\n",
    "\n",
    "    img = PILImage.new(\"RGB\", (640, 900), color=(255, 255, 255))\n",
    "    draw = ImageDraw.Draw(img)\n",
    "    lines = [\n",
    "        \"TAX INVOICE\",\n",
    "        \"Supplier: Hindustan Unilever Ltd\",\n",
    "        \"Invoice No: INV-2024-00123\",\n",
    "        \"Date: 2024-05-15\",\n",
    "        \"\",\n",
    "        \"Item Qty Price Total\",\n",
    "        \"Surf Excel 1kg 2 180.00 360.00\",\n",
    "        \"Lifebuoy Soap 100g 5 22.00 110.00\",\n",
    "        \"Dove Shampoo 200ml 3 95.00 285.00\",\n",
    "        \"\",\n",
    "        \"Grand Total: 755.00\",\n",
    "    ]\n",
    "    y = 40\n",
    "    for line in lines:\n",
    "        draw.text((40, y), line, fill=(0, 0, 0))\n",
    "        y += 36\n",
    "    return img\n",
    "\n",
    "\n",
    "@app.function(\n",
    "    image=image,\n",
    "    gpu=\"T4\",\n",
    "    timeout=600,\n",
    "    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n",
    ")\n",
    "def test_vision_model(hf_token: str = \"\") -> dict:\n",
    "    \"\"\"\n",
    "    Load MiniCPM-V 4.6 base architecture + fine-tuned weights + processor,\n",
    "    then run inference using processor.apply_chat_template() and generate().\n",
    "\n",
    "    Args:\n",
    "        hf_token: Hugging Face token; when empty, the HF_TOKEN environment\n",
    "            variable is used instead.\n",
    "\n",
    "    Returns:\n",
    "        A flat dict of per-stage flags, timings and error strings. The function\n",
    "        returns at the first failing load stage, so keys for later stages are\n",
    "        absent in that case.\n",
    "    \"\"\"\n",
    "    import time, os\n",
    "    import torch\n",
    "    from transformers import AutoProcessor\n",
    "    from huggingface_hub import snapshot_download\n",
    "    from safetensors.torch import load_file as safetensors_load\n",
    "\n",
    "    try:\n",
    "        from transformers import AutoModelForImageTextToText as VisionModel\n",
    "        model_loader = \"AutoModelForImageTextToText\"\n",
    "    except ImportError:\n",
    "        from transformers import AutoModelForMultimodalLM as VisionModel\n",
    "        model_loader = \"AutoModelForMultimodalLM\"\n",
    "\n",
    "    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n",
    "    results = {\"model_loader\": model_loader}\n",
    "\n",
    "    results[\"torch_version\"] = torch.__version__\n",
    "    results[\"cuda_available\"] = torch.cuda.is_available()\n",
    "    results[\"cuda_device\"] = torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\"\n",
    "\n",
    "    try:\n",
    "        import torchvision\n",
    "        results[\"torchvision_version\"] = torchvision.__version__\n",
    "    except ImportError as e:\n",
    "        results[\"torchvision_error\"] = str(e)\n",
    "\n",
    "    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32\n",
    "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "    # Stage 1: base architecture.\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        model_kwargs = {\"trust_remote_code\": True, \"torch_dtype\": dtype, \"token\": token}\n",
    "        if torch.cuda.is_available():\n",
    "            model_kwargs[\"device_map\"] = \"auto\"\n",
    "        model = VisionModel.from_pretrained(_BASE_REPO, **model_kwargs)\n",
    "        if not torch.cuda.is_available():\n",
    "            model.to(device)\n",
    "        results[\"base_model_class\"] = type(model).__name__\n",
    "        results[\"base_model_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"base_model_ok\"] = True\n",
    "        results[\"has_chat\"] = hasattr(model, \"chat\")\n",
    "        results[\"has_generate\"] = hasattr(model, \"generate\")\n",
    "    except Exception as e:\n",
    "        results[\"base_model_ok\"] = False\n",
    "        results[\"base_model_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    # Stage 2: fine-tuned weights from the merged repo.\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        merged_local = snapshot_download(_MERGED_REPO, token=token)\n",
    "        shard_files = sorted(f for f in os.listdir(merged_local) if f.endswith(\".safetensors\"))\n",
    "        if not shard_files:\n",
    "            raise RuntimeError(f\"No .safetensors found in {_MERGED_REPO}\")\n",
    "        state_dict = {}\n",
    "        for sf in shard_files:\n",
    "            state_dict.update(safetensors_load(os.path.join(merged_local, sf), device=\"cpu\"))\n",
    "        missing, unexpected = model.load_state_dict(state_dict, strict=False)\n",
    "        loaded = len(state_dict) - len(unexpected)\n",
    "        if loaded == 0:\n",
    "            # strict=False ignores mismatched names without raising; with no\n",
    "            # overlap at all the model would still be running the base weights.\n",
    "            raise RuntimeError(\n",
    "                f\"None of the {len(state_dict)} checkpoint keys match the model \"\n",
    "                f\"({len(missing)} model keys missing)\"\n",
    "            )\n",
    "        results[\"finetuned_weights_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"finetuned_weights_ok\"] = True\n",
    "        results[\"finetuned_keys\"] = len(state_dict)\n",
    "        results[\"finetuned_loaded_keys\"] = loaded\n",
    "        results[\"finetuned_missing_keys\"] = len(missing)\n",
    "        results[\"finetuned_unexpected_keys\"] = len(unexpected)\n",
    "        del state_dict  # drop the CPU copy before inference\n",
    "        model.eval()\n",
    "    except Exception as e:\n",
    "        results[\"finetuned_weights_ok\"] = False\n",
    "        results[\"finetuned_weights_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    # Stage 3: processor (loaded from the base repo).\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        processor = AutoProcessor.from_pretrained(_BASE_REPO, trust_remote_code=True, token=token)\n",
    "        results[\"processor_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"processor_ok\"] = True\n",
    "        results[\"processor_source\"] = _BASE_REPO\n",
    "        results[\"processor_class\"] = type(processor).__name__\n",
    "    except Exception as e:\n",
    "        results[\"processor_ok\"] = False\n",
    "        results[\"processor_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    # Stage 4: end-to-end inference on the synthetic invoice.\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        test_img = _make_test_invoice_image()\n",
    "        prompt = (\n",
    "            \"You are an OCR agent. Extract data from this invoice image and return ONLY valid JSON \"\n",
    "            'matching: {\"invoice_number\": string|null, \"supplier\": string|null, \"date\": string|null, '\n",
    "            '\"items\": [{\"product_raw\": string, \"quantity\": number, \"unit_price\": number, \"line_total\": number}], '\n",
    "            '\"grand_total\": number}. Return ONLY the JSON, no prose.'\n",
    "        )\n",
    "        messages = [{\n",
    "            \"role\": \"user\",\n",
    "            \"content\": [\n",
    "                {\"type\": \"image\", \"image\": test_img},\n",
    "                {\"type\": \"text\", \"text\": prompt},\n",
    "            ],\n",
    "        }]\n",
    "        downsample_mode = \"16x\"\n",
    "        try:\n",
    "            inputs = processor.apply_chat_template(\n",
    "                messages,\n",
    "                tokenize=True,\n",
    "                add_generation_prompt=True,\n",
    "                return_dict=True,\n",
    "                return_tensors=\"pt\",\n",
    "                downsample_mode=downsample_mode,\n",
    "                max_slice_nums=36,\n",
    "            )\n",
    "        except TypeError:\n",
    "            # Retry without the extra keyword arguments if the processor rejects them.\n",
    "            inputs = processor.apply_chat_template(\n",
    "                messages,\n",
    "                tokenize=True,\n",
    "                add_generation_prompt=True,\n",
    "                return_dict=True,\n",
    "                return_tensors=\"pt\",\n",
    "            )\n",
    "            downsample_mode = None\n",
    "        inputs = inputs.to(model.device)\n",
    "        generate_kwargs = {**inputs, \"max_new_tokens\": 512, \"do_sample\": False}\n",
    "        if downsample_mode is not None:\n",
    "            generate_kwargs[\"downsample_mode\"] = downsample_mode\n",
    "        with torch.inference_mode():\n",
    "            try:\n",
    "                output_ids = model.generate(**generate_kwargs)\n",
    "            except TypeError:\n",
    "                generate_kwargs.pop(\"downsample_mode\", None)\n",
    "                output_ids = model.generate(**generate_kwargs)\n",
    "        # Keep only the newly generated tokens, not the echoed prompt.\n",
    "        output_ids = output_ids[:, inputs[\"input_ids\"].shape[-1]:]\n",
    "        response = processor.batch_decode(\n",
    "            output_ids,\n",
    "            skip_special_tokens=True,\n",
    "            clean_up_tokenization_spaces=False,\n",
    "        )[0].strip()\n",
    "        results[\"inference_ok\"] = True\n",
    "        results[\"inference_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"inference_output\"] = str(response)[:800]\n",
    "    except Exception as e:\n",
    "        results[\"inference_ok\"] = False\n",
    "        results[\"inference_error\"] = str(e)\n",
    "\n",
    "    return results\n",
    "\n",
    "\n",
    "@app.function(\n",
    "    image=image,\n",
    "    timeout=300,\n",
    "    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n",
    ")\n",
    "def test_text_model(hf_token: str = \"\") -> dict:\n",
    "    \"\"\"Download and load MiniCPM5-1B GGUF, run a normalization test prompt.\n",
    "\n",
    "    Returns a flat dict of per-stage flags, timings and error strings; the\n",
    "    function returns at the first failing stage.\n",
    "    \"\"\"\n",
    "    import time, os\n",
    "    from huggingface_hub import hf_hub_download\n",
    "    from llama_cpp import Llama\n",
    "\n",
    "    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n",
    "    results = {}\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        path = hf_hub_download(repo_id=_TEXT_REPO, filename=_TEXT_FILE, token=token)\n",
    "        results[\"text_model_download_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"text_model_download_ok\"] = True\n",
    "    except Exception as e:\n",
    "        results[\"text_model_download_ok\"] = False\n",
    "        results[\"text_model_download_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        llm = Llama(model_path=path, n_ctx=8192, n_threads=4, verbose=False)\n",
    "        results[\"text_model_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"text_model_load_ok\"] = True\n",
    "    except Exception as e:\n",
    "        results[\"text_model_load_ok\"] = False\n",
    "        results[\"text_model_load_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        resp = llm(\n",
    "            \"Normalize product name to standard FMCG name. Input: 'Lays Classic Salted 26g'. \"\n",
    "            'Output JSON: {\"normalized\": \"...\", \"brand\": \"...\", \"category\": \"...\"}',\n",
    "            max_tokens=64,\n",
    "            stop=[\"\\n\\n\"],\n",
    "        )\n",
    "        results[\"text_inference_ok\"] = True\n",
    "        results[\"text_inference_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"text_inference_output\"] = resp[\"choices\"][0][\"text\"][:300]\n",
    "    except Exception as e:\n",
    "        results[\"text_inference_ok\"] = False\n",
    "        results[\"text_inference_error\"] = str(e)\n",
    "\n",
    "    return results\n",
    "\n",
    "\n",
    "@app.function(\n",
    "    image=image,\n",
    "    timeout=120,\n",
    "    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n",
    ")\n",
    "def test_yolo_model(hf_token: str = \"\") -> dict:\n",
    "    \"\"\"Download YOLO ONNX and run inference on a synthetic random input.\n",
    "\n",
    "    The dummy input follows the shape declared by the model's first input;\n",
    "    dynamic dimensions fall back to the 1x3x640x640 layout.\n",
    "\n",
    "    Returns a flat dict of per-stage flags, timings and error strings; the\n",
    "    function returns at the first failing stage.\n",
    "    \"\"\"\n",
    "    import time, os, json\n",
    "    import numpy as np\n",
    "    from huggingface_hub import hf_hub_download\n",
    "    import onnxruntime as ort\n",
    "\n",
    "    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n",
    "    results = {}\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        onnx_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"yolo26n_fmcg.onnx\", token=token)\n",
    "        class_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"class_names.json\", token=token)\n",
    "        with open(class_path) as f:\n",
    "            class_names = json.load(f)\n",
    "        results[\"yolo_download_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"yolo_num_classes\"] = len(class_names)\n",
    "        results[\"yolo_sample_classes\"] = class_names[:5]\n",
    "        results[\"yolo_download_ok\"] = True\n",
    "    except Exception as e:\n",
    "        results[\"yolo_download_ok\"] = False\n",
    "        results[\"yolo_download_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        session = ort.InferenceSession(onnx_path, providers=[\"CPUExecutionProvider\"])\n",
    "        inp_name = session.get_inputs()[0].name\n",
    "        inp_shape = session.get_inputs()[0].shape\n",
    "        results[\"yolo_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"yolo_input_name\"] = inp_name\n",
    "        results[\"yolo_input_shape\"] = str(inp_shape)\n",
    "        results[\"yolo_load_ok\"] = True\n",
    "    except Exception as e:\n",
    "        results[\"yolo_load_ok\"] = False\n",
    "        results[\"yolo_load_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        # Concrete positive dims come from the model; anything else (symbolic\n",
    "        # names, None, unexpected rank) falls back to the default layout.\n",
    "        default_shape = (1, 3, 640, 640)\n",
    "        if len(inp_shape) == len(default_shape):\n",
    "            dummy_shape = tuple(\n",
    "                dim if isinstance(dim, int) and dim > 0 else fallback\n",
    "                for dim, fallback in zip(inp_shape, default_shape)\n",
    "            )\n",
    "        else:\n",
    "            dummy_shape = default_shape\n",
    "        # Seeded generator so repeated runs feed the model identical data.\n",
    "        dummy = np.random.default_rng(0).random(dummy_shape, dtype=np.float32)\n",
    "        outputs = session.run(None, {inp_name: dummy})\n",
    "        results[\"yolo_inference_ok\"] = True\n",
    "        results[\"yolo_inference_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"yolo_dummy_input_shape\"] = str(dummy_shape)\n",
    "        results[\"yolo_output_shapes\"] = [str(o.shape) for o in outputs]\n",
    "    except Exception as e:\n",
    "        results[\"yolo_inference_ok\"] = False\n",
    "        results[\"yolo_inference_error\"] = str(e)\n",
    "\n",
    "    return results\n",
    "\n",
    "\n",
    "print(\"Functions defined\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 3 — Run vision model test (GPU T4, ~3-5 min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with app.run():\n",
    "    vision_result = test_vision_model.remote(hf_token=HF_TOKEN)\n",
    "\n",
    "print(\"=== VISION MODEL TEST RESULTS ===\")\n",
    "print(json.dumps(vision_result, indent=2))\n",
    "\n",
    "# Summary\n",
    "print(\"\\n=== SUMMARY ===\")\n",
    "print(f\" torch: {vision_result.get('torch_version')}\")\n",
    "print(f\" torchvision: {vision_result.get('torchvision_version', '❌ MISSING')}\")\n",
    "print(f\" CUDA device: {vision_result.get('cuda_device')}\")\n",
    "print(f\" base model: {'✅' if vision_result.get('base_model_ok') else '❌'} ({vision_result.get('base_model_load_s', '?')}s)\")\n",
    "print(f\" fine-tuned wts: {'✅' if vision_result.get('finetuned_weights_ok') else '❌'} ({vision_result.get('finetuned_keys', '?')} keys, {vision_result.get('finetuned_missing_keys', '?')} missing)\")\n",
    "print(f\" processor: {'✅' if vision_result.get('processor_ok') else '❌'} (from {vision_result.get('processor_source', 'N/A')})\")\n",
    "print(f\" inference: {'✅' if vision_result.get('inference_ok') else '❌'} ({vision_result.get('inference_s', '?')}s)\")\n",
    "if vision_result.get('inference_output'):\n",
    "    print(f\"\\n Output snippet: {vision_result['inference_output'][:200]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 4 — Run text model test (CPU, ~2-3 min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with app.run():\n",
    "    text_result = test_text_model.remote(hf_token=HF_TOKEN)\n",
    "\n",
    "print(\"=== TEXT MODEL TEST RESULTS ===\")\n",
    "print(json.dumps(text_result, indent=2))\n",
    "\n",
    "print(\"\\n=== SUMMARY ===\")\n",
    "print(f\" download: {'✅' if not text_result.get('text_model_download_error') else '❌'} ({text_result.get('text_model_download_s', '?')}s)\")\n",
    "print(f\" load: {'✅' if text_result.get('text_model_load_ok') else '❌'} ({text_result.get('text_model_load_s', '?')}s)\")\n",
    "print(f\" inference: {'✅' if text_result.get('text_inference_ok') else '❌'} ({text_result.get('text_inference_s', '?')}s)\")\n",
    "if text_result.get('text_inference_output'):\n",
    "    print(f\"\\n Output: {text_result['text_inference_output']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 5 — Run YOLO model test (CPU, ~1 min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with app.run():\n",
    "    yolo_result = test_yolo_model.remote(hf_token=HF_TOKEN)\n",
    "\n",
    "print(\"=== YOLO MODEL TEST RESULTS ===\")\n",
    "print(json.dumps(yolo_result, indent=2))\n",
    "\n",
    "print(\"\\n=== SUMMARY ===\")\n",
    "print(f\" download: {'✅' if not yolo_result.get('yolo_download_error') else '❌'} ({yolo_result.get('yolo_download_s', '?')}s, {yolo_result.get('yolo_num_classes', '?')} classes)\")\n",
    "print(f\" load: {'✅' if not yolo_result.get('yolo_load_error') else '❌'} ({yolo_result.get('yolo_load_s', '?')}s)\")\n",
    "print(f\" inference: {'✅' if yolo_result.get('yolo_inference_ok') else '❌'} ({yolo_result.get('yolo_inference_s', '?')}s)\")\n",
    "if yolo_result.get('yolo_output_shapes'):\n",
    "    print(f\" output shapes: {yolo_result['yolo_output_shapes']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 6 — Diagnosis helper\n",
    "\n",
    "Run this after the tests above to get a clear fix list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=\"*60)\n",
    "print(\"DIAGNOSIS REPORT\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "issues = []\n",
    "fixes = []\n",
    "\n",
    "\n",
    "def _report(issue, fix):\n",
    "    \"\"\"Record one issue together with its recommended fix.\"\"\"\n",
    "    issues.append(issue)\n",
    "    fixes.append(fix)\n",
    "\n",
    "\n",
    "if \"torchvision_error\" in vision_result:\n",
    "    _report(\"torchvision not installed\", \"Add torchvision to requirements.txt / Dockerfile\")\n",
    "\n",
    "# Each test function returns at its first failing stage, so the keys of later\n",
    "# stages are simply absent. Diagnose stage by stage and report only the first\n",
    "# real failure; otherwise the missing keys show up as spurious failures.\n",
    "if not vision_result.get(\"base_model_ok\"):\n",
    "    _report(\n",
    "        f\"Vision base model failed: {vision_result.get('base_model_error', '?')}\",\n",
    "        \"Use transformers>=5.7.0 and AutoModelForImageTextToText/AutoModelForMultimodalLM\",\n",
    "    )\n",
    "else:\n",
    "    if not vision_result.get(\"has_generate\"):\n",
    "        _report(\n",
    "            \"Vision model does not expose generate()\",\n",
    "            \"Load MiniCPM-V 4.6 through the image-text/multimodal auto class, not generic AutoModel\",\n",
    "        )\n",
    "    if vision_result.get(\"has_chat\"):\n",
    "        _report(\n",
    "            \"Vision test is still seeing old chat() API\",\n",
    "            \"Use processor.apply_chat_template() + model.generate() in app.py and InvoiceExtractorAgent\",\n",
    "        )\n",
    "    if not vision_result.get(\"finetuned_weights_ok\"):\n",
    "        _report(\n",
    "            f\"Fine-tuned weights failed to load: {vision_result.get('finetuned_weights_error', '?')}\",\n",
    "            \"Check that the merged repo's .safetensors keys match the MiniCPM-V 4.6 base architecture\",\n",
    "        )\n",
    "    elif not vision_result.get(\"processor_ok\"):\n",
    "        _report(\n",
    "            f\"Processor failed to load: {vision_result.get('processor_error', '?')}\",\n",
    "            \"Load AutoProcessor from openbmb/MiniCPM-V-4.6; the merged repo is weights-only\",\n",
    "        )\n",
    "    elif not vision_result.get(\"inference_ok\"):\n",
    "        _report(\n",
    "            f\"Inference failed: {vision_result.get('inference_error', '?')}\",\n",
    "            \"Check the generate/apply_chat_template error above\",\n",
    "        )\n",
    "\n",
    "if text_result.get(\"text_model_download_error\"):\n",
    "    _report(\n",
    "        f\"Text model download failed: {text_result.get('text_model_download_error')}\",\n",
    "        \"Check HF token access to the text model repo and the GGUF filename\",\n",
    "    )\n",
    "elif not text_result.get(\"text_model_load_ok\"):\n",
    "    _report(\n",
    "        f\"Text model failed: {text_result.get('text_model_load_error', '?')}\",\n",
    "        \"Check text model error\",\n",
    "    )\n",
    "elif not text_result.get(\"text_inference_ok\"):\n",
    "    _report(\n",
    "        f\"Text inference failed: {text_result.get('text_inference_error', '?')}\",\n",
    "        \"Check text model error\",\n",
    "    )\n",
    "\n",
    "if yolo_result.get(\"yolo_download_error\"):\n",
    "    _report(\n",
    "        f\"YOLO download failed: {yolo_result.get('yolo_download_error')}\",\n",
    "        \"Check HF token access to the YOLO repo and the ONNX/class-name filenames\",\n",
    "    )\n",
    "elif yolo_result.get(\"yolo_load_error\"):\n",
    "    _report(\n",
    "        f\"YOLO session failed to load: {yolo_result.get('yolo_load_error')}\",\n",
    "        \"Check YOLO error\",\n",
    "    )\n",
    "elif not yolo_result.get(\"yolo_inference_ok\"):\n",
    "    _report(\n",
    "        f\"YOLO failed: {yolo_result.get('yolo_inference_error', '?')}\",\n",
    "        \"Check YOLO error\",\n",
    "    )\n",
    "\n",
    "if not issues:\n",
    "    print(\"All tests passed. Your HF Space should use the same load/inference path.\")\n",
    "else:\n",
    "    print(\"ISSUES FOUND:\")\n",
    "    for i, issue in enumerate(issues, 1):\n",
    "        print(f\" {i}. {issue}\")\n",
    "\n",
    "    print(\"\\nRECOMMENDED FIXES:\")\n",
    "    for i, fix in enumerate(fixes, 1):\n",
    "        print(f\" {i}. {fix}\")\n",
    "\n",
    "print(\"\\nKey finding: processor loaded from:\", vision_result.get(\"processor_source\"))\n",
    "print(\"Model loader:\", vision_result.get(\"model_loader\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}