{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kirana Detective — Modal Inference Test\n",
    "\n",
    "Run this notebook cell-by-cell in a Modal GPU environment to verify:\n",
    "1. All dependencies install correctly (including torchvision)\n",
    "2. Vision model (MiniCPM-V 4.6 + fine-tuned weights) loads correctly\n",
    "3. Processor loads correctly\n",
    "4. End-to-end inference on a test invoice image works\n",
    "5. Text LLM (MiniCPM5-1B gguf) loads and runs\n",
    "6. YOLO ONNX model loads and runs\n",
    "\n",
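    "**Before running:** Create a Modal secret named `huggingface-secret` that exposes `HF_TOKEN`, or paste your token into `HF_TOKEN` in Cell 2 (clear it again before sharing or committing the notebook)."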
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 1 — Modal app definition\n",
    "\n",
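    "The first code cell installs the Modal client; the second defines the Modal app and the container image with all required dependencies. The remote test functions themselves are defined in Cell 2."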
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the Modal client into this kernel's environment, but only when it\n",
    "# is missing, so re-running the cell does not invoke pip every time.\n",
    "import importlib.util\n",
    "import subprocess\n",
    "import sys\n",
    "\n",
    "if importlib.util.find_spec(\"modal\") is None:\n",
    "    # Use the kernel's own interpreter so the package lands in its environment.\n",
    "    subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"modal\", \"-q\"], check=True)\n",
    "    importlib.invalidate_caches()  # let this session see the fresh install\n",
    "print(\"modal installed\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import modal\n",
    "\n",
    "# The Modal app that the remote test functions are registered on.\n",
    "app = modal.App(\"kirana-inference-test\")\n",
    "\n",
    "# Pinned PyTorch stack, resolved against the cu121 wheel index.\n",
    "_TORCH_PACKAGES = (\n",
    "    \"torch==2.4.0\",\n",
    "    \"torchvision==0.19.0\",\n",
    "    \"torchaudio==2.4.0\",\n",
    ")\n",
    "_TORCH_PIP_OPTIONS = \"--index-url https://download.pytorch.org/whl/cu121\"\n",
    "\n",
    "# Remaining runtime dependencies; the extra index is the llama-cpp-python\n",
    "# CPU wheel index.\n",
    "_RUNTIME_PACKAGES = (\n",
    "    \"transformers>=5.7.0\",\n",
    "    \"accelerate>=0.26.0\",\n",
    "    \"safetensors>=0.4.3\",\n",
    "    \"huggingface_hub>=0.33.5\",\n",
    "    \"Pillow>=11.0.0\",\n",
    "    \"numpy>=1.26.0\",\n",
    "    \"llama-cpp-python==0.3.28\",\n",
    "    \"onnxruntime==1.21.0\",\n",
    "    \"requests\",\n",
    ")\n",
    "_RUNTIME_PIP_OPTIONS = \"--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu\"\n",
    "\n",
    "# Build the image layer by layer: base OS + Python, then torch, then the rest.\n",
    "_base_image = modal.Image.debian_slim(python_version=\"3.12\")\n",
    "_torch_image = _base_image.pip_install(*_TORCH_PACKAGES, extra_options=_TORCH_PIP_OPTIONS)\n",
    "image = _torch_image.pip_install(*_RUNTIME_PACKAGES, extra_options=_RUNTIME_PIP_OPTIONS)\n",
    "\n",
    "print(\"Modal app defined\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 2 — Define the test functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "HF_TOKEN = \"\"  # paste your token here OR leave empty if set as Modal secret\n",
    "\n",
    "_BASE_REPO   = \"openbmb/MiniCPM-V-4.6\"\n",
    "_MERGED_REPO = \"build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged\"\n",
    "_TEXT_REPO   = \"build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer\"\n",
    "_TEXT_FILE   = \"MiniCPM5-1B.Q4_K_M.gguf\"\n",
    "_YOLO_REPO   = \"build-small-hackathon/yolo26n-indian-fmcg-detection\"\n",
    "\n",
    "\n",
    "def _make_test_invoice_image():\n",
    "    \"\"\"Create a minimal synthetic invoice image with visible text.\n",
    "\n",
    "    Returns:\n",
    "        A 640x900 white RGB PIL image with a few invoice lines drawn in black\n",
    "        using PIL's default font.\n",
    "    \"\"\"\n",
    "    from PIL import Image as PILImage, ImageDraw\n",
    "\n",
    "    img = PILImage.new(\"RGB\", (640, 900), color=(255, 255, 255))\n",
    "    draw = ImageDraw.Draw(img)\n",
    "    lines = [\n",
    "        \"TAX INVOICE\",\n",
    "        \"Supplier: Hindustan Unilever Ltd\",\n",
    "        \"Invoice No: INV-2024-00123\",\n",
    "        \"Date: 2024-05-15\",\n",
    "        \"\",\n",
    "        \"Item                  Qty   Price   Total\",\n",
    "        \"Surf Excel 1kg         2    180.00  360.00\",\n",
    "        \"Lifebuoy Soap 100g     5     22.00  110.00\",\n",
    "        \"Dove Shampoo 200ml     3     95.00  285.00\",\n",
    "        \"\",\n",
    "        \"Grand Total:                        755.00\",\n",
    "    ]\n",
    "    y = 40\n",
    "    for line in lines:\n",
    "        draw.text((40, y), line, fill=(0, 0, 0))\n",
    "        y += 36\n",
    "    return img\n",
    "\n",
    "\n",
    "@app.function(\n",
    "    image=image,\n",
    "    gpu=\"T4\",\n",
    "    timeout=600,\n",
    "    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n",
    ")\n",
    "def test_vision_model(hf_token: str = \"\") -> dict:\n",
    "    \"\"\"\n",
    "    Load MiniCPM-V 4.6 base architecture + fine-tuned weights + processor,\n",
    "    then run inference using processor.apply_chat_template() and generate().\n",
    "\n",
    "    The stages run in order (base model, fine-tuned weights, processor,\n",
    "    inference). The first three return early on failure, so keys written by\n",
    "    later stages are absent from the result when an earlier stage failed.\n",
    "\n",
    "    Args:\n",
    "        hf_token: Hugging Face token; when empty, the HF_TOKEN environment\n",
    "            variable is used, and None is passed if that is empty too.\n",
    "\n",
    "    Returns:\n",
    "        dict of diagnostics: per-stage ``*_ok`` flags, timings in seconds,\n",
    "        error strings, and up to 800 characters of the generated output.\n",
    "    \"\"\"\n",
    "    import time, os\n",
    "    import torch\n",
    "    from transformers import AutoProcessor\n",
    "    from huggingface_hub import snapshot_download\n",
    "    from safetensors.torch import load_file as safetensors_load\n",
    "\n",
    "    # Newer transformers releases expose one of these two auto classes; record\n",
    "    # which one was importable so the report shows the loader actually used.\n",
    "    try:\n",
    "        from transformers import AutoModelForImageTextToText as VisionModel\n",
    "        model_loader = \"AutoModelForImageTextToText\"\n",
    "    except ImportError:\n",
    "        from transformers import AutoModelForMultimodalLM as VisionModel\n",
    "        model_loader = \"AutoModelForMultimodalLM\"\n",
    "\n",
    "    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n",
    "    results = {\"model_loader\": model_loader}\n",
    "\n",
    "    results[\"torch_version\"]  = torch.__version__\n",
    "    results[\"cuda_available\"] = torch.cuda.is_available()\n",
    "    results[\"cuda_device\"]    = torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\"\n",
    "\n",
    "    try:\n",
    "        import torchvision\n",
    "        results[\"torchvision_version\"] = torchvision.__version__\n",
    "    except ImportError as e:\n",
    "        results[\"torchvision_error\"] = str(e)\n",
    "\n",
    "    # NOTE(review): the function requests a T4, which presumably lacks native\n",
    "    # bfloat16 support - confirm bf16 is acceptable here or switch to float16.\n",
    "    dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32\n",
    "    device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "    # --- Stage 1: base model ---\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        model_kwargs = {\"trust_remote_code\": True, \"torch_dtype\": dtype, \"token\": token}\n",
    "        if torch.cuda.is_available():\n",
    "            model_kwargs[\"device_map\"] = \"auto\"\n",
    "        model = VisionModel.from_pretrained(_BASE_REPO, **model_kwargs)\n",
    "        if not torch.cuda.is_available():\n",
    "            model.to(device)\n",
    "        results[\"base_model_class\"]  = type(model).__name__\n",
    "        results[\"base_model_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"base_model_ok\"]     = True\n",
    "        results[\"has_chat\"]          = hasattr(model, \"chat\")\n",
    "        results[\"has_generate\"]      = hasattr(model, \"generate\")\n",
    "    except Exception as e:\n",
    "        results[\"base_model_ok\"]    = False\n",
    "        results[\"base_model_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    # --- Stage 2: fine-tuned weights ---\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        merged_local = snapshot_download(_MERGED_REPO, token=token)\n",
    "        shard_files = sorted(f for f in os.listdir(merged_local) if f.endswith(\".safetensors\"))\n",
    "        if not shard_files:\n",
    "            raise RuntimeError(f\"No .safetensors found in {_MERGED_REPO}\")\n",
    "        state_dict = {}\n",
    "        for sf in shard_files:\n",
    "            state_dict.update(safetensors_load(os.path.join(merged_local, sf), device=\"cpu\"))\n",
    "        missing, unexpected = model.load_state_dict(state_dict, strict=False)\n",
    "        # strict=False never raises on name mismatches, so verify that at least\n",
    "        # one checkpoint tensor actually landed in the model; otherwise the\n",
    "        # test would pass while still running the untouched base weights.\n",
    "        matched = len(state_dict) - len(unexpected)\n",
    "        results[\"finetuned_keys\"]            = len(state_dict)\n",
    "        results[\"finetuned_matched_keys\"]    = matched\n",
    "        results[\"finetuned_missing_keys\"]    = len(missing)\n",
    "        results[\"finetuned_unexpected_keys\"] = len(unexpected)\n",
    "        if matched == 0:\n",
    "            raise RuntimeError(\n",
    "                f\"None of the {len(state_dict)} tensors in {_MERGED_REPO} matched the base model's parameter names\"\n",
    "            )\n",
    "        results[\"finetuned_weights_load_s\"]  = round(time.time() - t0, 1)\n",
    "        results[\"finetuned_weights_ok\"]      = True\n",
    "        model.eval()\n",
    "    except Exception as e:\n",
    "        results[\"finetuned_weights_ok\"]    = False\n",
    "        results[\"finetuned_weights_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    # --- Stage 3: processor (loaded from the base repo) ---\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        processor = AutoProcessor.from_pretrained(_BASE_REPO, trust_remote_code=True, token=token)\n",
    "        results[\"processor_load_s\"] = round(time.time() - t0, 1)\n",
    "        results[\"processor_ok\"] = True\n",
    "        results[\"processor_source\"] = _BASE_REPO\n",
    "        results[\"processor_class\"] = type(processor).__name__\n",
    "    except Exception as e:\n",
    "        results[\"processor_ok\"] = False\n",
    "        results[\"processor_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    # --- Stage 4: end-to-end inference on the synthetic invoice ---\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        test_img = _make_test_invoice_image()\n",
    "        prompt = (\n",
    "            \"You are an OCR agent. Extract data from this invoice image and return ONLY valid JSON \"\n",
    "            'matching: {\"invoice_number\": string|null, \"supplier\": string|null, \"date\": string|null, '\n",
    "            '\"items\": [{\"product_raw\": string, \"quantity\": number, \"unit_price\": number, \"line_total\": number}], '\n",
    "            '\"grand_total\": number}. Return ONLY the JSON, no prose.'\n",
    "        )\n",
    "        messages = [{\n",
    "            \"role\": \"user\",\n",
    "            \"content\": [\n",
    "                {\"type\": \"image\", \"image\": test_img},\n",
    "                {\"type\": \"text\", \"text\": prompt},\n",
    "            ],\n",
    "        }]\n",
    "        # Try the slicing/downsample options first; fall back to the plain call\n",
    "        # when the processor rejects those keyword arguments.\n",
    "        downsample_mode = \"16x\"\n",
    "        try:\n",
    "            inputs = processor.apply_chat_template(\n",
    "                messages,\n",
    "                tokenize=True,\n",
    "                add_generation_prompt=True,\n",
    "                return_dict=True,\n",
    "                return_tensors=\"pt\",\n",
    "                downsample_mode=downsample_mode,\n",
    "                max_slice_nums=36,\n",
    "            )\n",
    "        except TypeError:\n",
    "            inputs = processor.apply_chat_template(\n",
    "                messages,\n",
    "                tokenize=True,\n",
    "                add_generation_prompt=True,\n",
    "                return_dict=True,\n",
    "                return_tensors=\"pt\",\n",
    "            )\n",
    "            downsample_mode = None\n",
    "        inputs = inputs.to(model.device)\n",
    "        generate_kwargs = {**inputs, \"max_new_tokens\": 512, \"do_sample\": False}\n",
    "        if downsample_mode is not None:\n",
    "            generate_kwargs[\"downsample_mode\"] = downsample_mode\n",
    "        with torch.inference_mode():\n",
    "            try:\n",
    "                output_ids = model.generate(**generate_kwargs)\n",
    "            except TypeError:\n",
    "                generate_kwargs.pop(\"downsample_mode\", None)\n",
    "                output_ids = model.generate(**generate_kwargs)\n",
    "        # Drop the prompt tokens so only the newly generated text is decoded.\n",
    "        output_ids = output_ids[:, inputs[\"input_ids\"].shape[-1]:]\n",
    "        response = processor.batch_decode(\n",
    "            output_ids,\n",
    "            skip_special_tokens=True,\n",
    "            clean_up_tokenization_spaces=False,\n",
    "        )[0].strip()\n",
    "        results[\"inference_ok\"]     = True\n",
    "        results[\"inference_s\"]      = round(time.time() - t0, 1)\n",
    "        results[\"inference_output\"] = str(response)[:800]\n",
    "    except Exception as e:\n",
    "        results[\"inference_ok\"]    = False\n",
    "        results[\"inference_error\"] = str(e)\n",
    "\n",
    "    return results\n",
    "\n",
    "\n",
    "@app.function(\n",
    "    image=image,\n",
    "    timeout=300,\n",
    "    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n",
    ")\n",
    "def test_text_model(hf_token: str = \"\") -> dict:\n",
    "    \"\"\"Download and load MiniCPM5-1B GGUF, run a normalization test prompt.\n",
    "\n",
    "    Stages run in order (download, load, inference); download and load return\n",
    "    early on failure, so later keys are absent when an earlier stage failed.\n",
    "\n",
    "    Args:\n",
    "        hf_token: Hugging Face token; when empty, the HF_TOKEN environment\n",
    "            variable is used, and None is passed if that is empty too.\n",
    "\n",
    "    Returns:\n",
    "        dict of diagnostics: per-stage ``*_ok`` flags, timings in seconds,\n",
    "        error strings, and up to 300 characters of the completion text.\n",
    "    \"\"\"\n",
    "    import time, os\n",
    "    from huggingface_hub import hf_hub_download\n",
    "    from llama_cpp import Llama\n",
    "\n",
    "    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n",
    "    results = {}\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        path = hf_hub_download(repo_id=_TEXT_REPO, filename=_TEXT_FILE, token=token)\n",
    "        results[\"text_model_download_s\"]  = round(time.time() - t0, 1)\n",
    "        results[\"text_model_download_ok\"] = True\n",
    "    except Exception as e:\n",
    "        results[\"text_model_download_ok\"]    = False\n",
    "        results[\"text_model_download_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        llm = Llama(model_path=path, n_ctx=8192, n_threads=4, verbose=False)\n",
    "        results[\"text_model_load_s\"]  = round(time.time() - t0, 1)\n",
    "        results[\"text_model_load_ok\"] = True\n",
    "    except Exception as e:\n",
    "        results[\"text_model_load_ok\"]    = False\n",
    "        results[\"text_model_load_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        resp = llm(\n",
    "            \"Normalize product name to standard FMCG name. Input: 'Lays Classic Salted 26g'. \"\n",
    "            \"Output JSON: {\\\"normalized\\\": \\\"...\\\", \\\"brand\\\": \\\"...\\\", \\\"category\\\": \\\"...\\\"}\",\n",
    "            max_tokens=64,\n",
    "            stop=[\"\\n\\n\"],\n",
    "        )\n",
    "        results[\"text_inference_ok\"]     = True\n",
    "        results[\"text_inference_s\"]      = round(time.time() - t0, 1)\n",
    "        results[\"text_inference_output\"] = resp[\"choices\"][0][\"text\"][:300]\n",
    "    except Exception as e:\n",
    "        results[\"text_inference_ok\"]    = False\n",
    "        results[\"text_inference_error\"] = str(e)\n",
    "\n",
    "    return results\n",
    "\n",
    "\n",
    "@app.function(\n",
    "    image=image,\n",
    "    timeout=120,\n",
    "    secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n",
    ")\n",
    "def test_yolo_model(hf_token: str = \"\") -> dict:\n",
    "    \"\"\"Download YOLO ONNX and run inference on a synthetic 640x640 image.\n",
    "\n",
    "    Stages run in order (download, load, inference); download and load return\n",
    "    early on failure, so later keys are absent when an earlier stage failed.\n",
    "\n",
    "    Args:\n",
    "        hf_token: Hugging Face token; when empty, the HF_TOKEN environment\n",
    "            variable is used, and None is passed if that is empty too.\n",
    "\n",
    "    Returns:\n",
    "        dict of diagnostics: per-stage ``*_ok`` flags, timings in seconds,\n",
    "        error strings, class-name sample, input metadata and output shapes.\n",
    "    \"\"\"\n",
    "    import time, os, json\n",
    "    import numpy as np\n",
    "    from huggingface_hub import hf_hub_download\n",
    "    import onnxruntime as ort\n",
    "\n",
    "    token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n",
    "    results = {}\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        onnx_path  = hf_hub_download(repo_id=_YOLO_REPO, filename=\"yolo26n_fmcg.onnx\", token=token)\n",
    "        class_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"class_names.json\", token=token)\n",
    "        with open(class_path) as f:\n",
    "            class_names = json.load(f)\n",
    "        # The JSON layout is not pinned here: accept either a list of names or\n",
    "        # an index -> name mapping (a dict cannot be sliced directly).\n",
    "        if isinstance(class_names, dict):\n",
    "            sample_classes = list(class_names.values())[:5]\n",
    "        else:\n",
    "            sample_classes = list(class_names)[:5]\n",
    "        results[\"yolo_download_s\"]     = round(time.time() - t0, 1)\n",
    "        results[\"yolo_download_ok\"]    = True\n",
    "        results[\"yolo_num_classes\"]    = len(class_names)\n",
    "        results[\"yolo_sample_classes\"] = sample_classes\n",
    "    except Exception as e:\n",
    "        results[\"yolo_download_ok\"]    = False\n",
    "        results[\"yolo_download_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        session   = ort.InferenceSession(onnx_path, providers=[\"CPUExecutionProvider\"])\n",
    "        inp_name  = session.get_inputs()[0].name\n",
    "        inp_shape = session.get_inputs()[0].shape\n",
    "        results[\"yolo_load_s\"]      = round(time.time() - t0, 1)\n",
    "        results[\"yolo_load_ok\"]     = True\n",
    "        results[\"yolo_input_name\"]  = inp_name\n",
    "        results[\"yolo_input_shape\"] = str(inp_shape)\n",
    "    except Exception as e:\n",
    "        results[\"yolo_load_ok\"]    = False\n",
    "        results[\"yolo_load_error\"] = str(e)\n",
    "        return results\n",
    "\n",
    "    t0 = time.time()\n",
    "    try:\n",
    "        # Random float32 input of shape (1, 3, 640, 640) - only checks that the\n",
    "        # session executes, not detection quality.\n",
    "        dummy   = np.random.rand(1, 3, 640, 640).astype(np.float32)\n",
    "        outputs = session.run(None, {inp_name: dummy})\n",
    "        results[\"yolo_inference_ok\"]  = True\n",
    "        results[\"yolo_inference_s\"]   = round(time.time() - t0, 1)\n",
    "        results[\"yolo_output_shapes\"] = [str(o.shape) for o in outputs]\n",
    "    except Exception as e:\n",
    "        results[\"yolo_inference_ok\"]    = False\n",
    "        results[\"yolo_inference_error\"] = str(e)\n",
    "\n",
    "    return results\n",
    "\n",
    "\n",
    "print(\"Functions defined\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 3 — Run vision model test (GPU T4, ~3-5 min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Start an ephemeral app run and execute the vision test remotely.\n",
    "with app.run():\n",
    "    vision_result = test_vision_model.remote(hf_token=HF_TOKEN)\n",
    "\n",
    "print(\"=== VISION MODEL TEST RESULTS ===\")\n",
    "print(json.dumps(vision_result, indent=2))\n",
    "\n",
    "# Summary: build every row first, then emit them in one go.\n",
    "vision_get = vision_result.get\n",
    "status_icon = {True: \"✅\", False: \"❌\"}\n",
    "\n",
    "base_icon      = status_icon[bool(vision_get(\"base_model_ok\"))]\n",
    "weights_icon   = status_icon[bool(vision_get(\"finetuned_weights_ok\"))]\n",
    "processor_icon = status_icon[bool(vision_get(\"processor_ok\"))]\n",
    "inference_icon = status_icon[bool(vision_get(\"inference_ok\"))]\n",
    "\n",
    "summary_rows = [\n",
    "    \"\\n=== SUMMARY ===\",\n",
    "    f\"  torch:          {vision_get('torch_version')}\",\n",
    "    f\"  torchvision:    {vision_get('torchvision_version', '❌ MISSING')}\",\n",
    "    f\"  CUDA device:    {vision_get('cuda_device')}\",\n",
    "    f\"  base model:     {base_icon} ({vision_get('base_model_load_s', '?')}s)\",\n",
    "    f\"  fine-tuned wts: {weights_icon} ({vision_get('finetuned_keys', '?')} keys, {vision_get('finetuned_missing_keys', '?')} missing)\",\n",
    "    f\"  processor:      {processor_icon} (from {vision_get('processor_source', 'N/A')})\",\n",
    "    f\"  inference:      {inference_icon} ({vision_get('inference_s', '?')}s)\",\n",
    "]\n",
    "print(\"\\n\".join(summary_rows))\n",
    "\n",
    "# Show the start of the model output only when inference produced some text.\n",
    "output_text = vision_get(\"inference_output\")\n",
    "if output_text:\n",
    "    print(f\"\\n  Output snippet: {output_text[:200]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 4 — Run text model test (CPU, ~2-3 min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported here as well so this cell runs even when Cell 3 was skipped.\n",
    "import json\n",
    "\n",
    "# Start an ephemeral app run and execute the text-model test remotely.\n",
    "with app.run():\n",
    "    text_result = test_text_model.remote(hf_token=HF_TOKEN)\n",
    "\n",
    "print(\"=== TEXT MODEL TEST RESULTS ===\")\n",
    "print(json.dumps(text_result, indent=2))\n",
    "\n",
    "print(\"\\n=== SUMMARY ===\")\n",
    "print(f\"  download:  {'✅' if not text_result.get('text_model_download_error') else '❌'} ({text_result.get('text_model_download_s', '?')}s)\")\n",
    "print(f\"  load:      {'✅' if text_result.get('text_model_load_ok') else '❌'} ({text_result.get('text_model_load_s', '?')}s)\")\n",
    "print(f\"  inference: {'✅' if text_result.get('text_inference_ok') else '❌'} ({text_result.get('text_inference_s', '?')}s)\")\n",
    "if text_result.get('text_inference_output'):\n",
    "    print(f\"\\n  Output: {text_result['text_inference_output']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 5 — Run YOLO model test (CPU, ~1 min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imported here as well so this cell runs even when Cell 3 was skipped.\n",
    "import json\n",
    "\n",
    "# Start an ephemeral app run and execute the YOLO test remotely.\n",
    "with app.run():\n",
    "    yolo_result = test_yolo_model.remote(hf_token=HF_TOKEN)\n",
    "\n",
    "print(\"=== YOLO MODEL TEST RESULTS ===\")\n",
    "print(json.dumps(yolo_result, indent=2))\n",
    "\n",
    "print(\"\\n=== SUMMARY ===\")\n",
    "print(f\"  download:  {'✅' if not yolo_result.get('yolo_download_error') else '❌'} ({yolo_result.get('yolo_download_s', '?')}s, {yolo_result.get('yolo_num_classes', '?')} classes)\")\n",
    "# 'yolo_load_s' is only written after the ONNX session was created, so its\n",
    "# presence (not the mere absence of an error) is what proves the load ran;\n",
    "# a failed download returns before the load stage is ever attempted.\n",
    "print(f\"  load:      {'✅' if 'yolo_load_s' in yolo_result else '❌'} ({yolo_result.get('yolo_load_s', '?')}s)\")\n",
    "print(f\"  inference: {'✅' if yolo_result.get('yolo_inference_ok') else '❌'} ({yolo_result.get('yolo_inference_s', '?')}s)\")\n",
    "if yolo_result.get('yolo_output_shapes'):\n",
    "    print(f\"  output shapes: {yolo_result['yolo_output_shapes']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cell 6 — Diagnosis helper\n",
    "\n",
    "Run this after the tests above to get a clear fix list."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=\"*60)\n",
    "print(\"DIAGNOSIS REPORT\")\n",
    "print(\"=\"*60)\n",
    "\n",
    "issues = []\n",
    "fixes  = []\n",
    "\n",
    "\n",
    "def _flag(issue, fix):\n",
    "    \"\"\"Record one issue together with its recommended fix.\"\"\"\n",
    "    issues.append(issue)\n",
    "    fixes.append(fix)\n",
    "\n",
    "\n",
    "def _first_error(result, *keys):\n",
    "    \"\"\"Return the first non-empty error message stored under keys, else '?'.\"\"\"\n",
    "    for key in keys:\n",
    "        if result.get(key):\n",
    "            return result[key]\n",
    "    return \"?\"\n",
    "\n",
    "\n",
    "if \"torchvision_error\" in vision_result:\n",
    "    _flag(\"torchvision not installed\", \"Add torchvision to requirements.txt / Dockerfile\")\n",
    "\n",
    "# The vision stages run in order and stop at the first failure, so report only\n",
    "# the stage that actually failed instead of every later stage that never ran.\n",
    "if not vision_result.get(\"base_model_ok\"):\n",
    "    _flag(\n",
    "        f\"Vision base model failed: {vision_result.get('base_model_error', '?')}\",\n",
    "        \"Use transformers>=5.7.0 and AutoModelForImageTextToText/AutoModelForMultimodalLM\",\n",
    "    )\n",
    "else:\n",
    "    # These two flags are only recorded once the base model has loaded.\n",
    "    if not vision_result.get(\"has_generate\"):\n",
    "        _flag(\n",
    "            \"Vision model does not expose generate()\",\n",
    "            \"Load MiniCPM-V 4.6 through the image-text/multimodal auto class, not generic AutoModel\",\n",
    "        )\n",
    "    if vision_result.get(\"has_chat\"):\n",
    "        _flag(\n",
    "            \"Vision test is still seeing old chat() API\",\n",
    "            \"Use processor.apply_chat_template() + model.generate() in app.py and InvoiceExtractorAgent\",\n",
    "        )\n",
    "\n",
    "    if not vision_result.get(\"finetuned_weights_ok\"):\n",
    "        _flag(\n",
    "            f\"Fine-tuned weights failed to load: {vision_result.get('finetuned_weights_error', '?')}\",\n",
    "            \"Check HF token access to the merged repo and that it contains matching .safetensors shards\",\n",
    "        )\n",
    "    elif not vision_result.get(\"processor_ok\"):\n",
    "        _flag(\n",
    "            f\"Processor failed to load: {vision_result.get('processor_error', '?')}\",\n",
    "            \"Load AutoProcessor from openbmb/MiniCPM-V-4.6; the merged repo is weights-only\",\n",
    "        )\n",
    "    elif not vision_result.get(\"inference_ok\"):\n",
    "        _flag(\n",
    "            f\"Inference failed: {vision_result.get('inference_error', '?')}\",\n",
    "            \"Check the generate/apply_chat_template error above\",\n",
    "        )\n",
    "\n",
    "# The text and YOLO tests also stop at the first failing stage: treat the run\n",
    "# as failed unless inference succeeded, and surface that stage's real error.\n",
    "if not text_result.get(\"text_inference_ok\"):\n",
    "    text_error = _first_error(\n",
    "        text_result,\n",
    "        \"text_model_download_error\",\n",
    "        \"text_model_load_error\",\n",
    "        \"text_inference_error\",\n",
    "    )\n",
    "    _flag(f\"Text model failed: {text_error}\", \"Check text model error\")\n",
    "\n",
    "if not yolo_result.get(\"yolo_inference_ok\"):\n",
    "    yolo_error = _first_error(\n",
    "        yolo_result,\n",
    "        \"yolo_download_error\",\n",
    "        \"yolo_load_error\",\n",
    "        \"yolo_inference_error\",\n",
    "    )\n",
    "    _flag(f\"YOLO failed: {yolo_error}\", \"Check YOLO error\")\n",
    "\n",
    "if not issues:\n",
    "    print(\"All tests passed. Your HF Space should use the same load/inference path.\")\n",
    "else:\n",
    "    print(\"ISSUES FOUND:\")\n",
    "    for i, issue in enumerate(issues, 1):\n",
    "        print(f\"  {i}. {issue}\")\n",
    "\n",
    "    print(\"\\nRECOMMENDED FIXES:\")\n",
    "    for i, fix in enumerate(fixes, 1):\n",
    "        print(f\"  {i}. {fix}\")\n",
    "\n",
    "    print(\"\\nKey finding: processor loaded from:\", vision_result.get(\"processor_source\"))\n",
    "    print(\"Model loader:\", vision_result.get(\"model_loader\"))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}