Commit ·
8dc382f
1
Parent(s): c9462a9
Model loading fix for vision model
Browse files- agents/invoice_extractor.py +53 -10
- app.py +23 -12
- requirements.txt +2 -1
- samples/modal_inference_test.ipynb +180 -0
agents/invoice_extractor.py
CHANGED
|
@@ -92,7 +92,7 @@ def _dict_to_invoice(data: dict) -> InvoiceJSON:
|
|
| 92 |
|
| 93 |
|
| 94 |
def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
|
| 95 |
-
"""Call MiniCPM-V
|
| 96 |
import io
|
| 97 |
import torch
|
| 98 |
from PIL import Image as PILImage
|
|
@@ -100,15 +100,58 @@ def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
|
|
| 100 |
model, processor = llm
|
| 101 |
image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 102 |
|
| 103 |
-
messages = [{
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:
|
|
|
|
| 92 |
|
| 93 |
|
| 94 |
def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
|
| 95 |
+
"""Call MiniCPM-V 4.6 through the current processor + generate API."""
|
| 96 |
import io
|
| 97 |
import torch
|
| 98 |
from PIL import Image as PILImage
|
|
|
|
| 100 |
model, processor = llm
|
| 101 |
image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 102 |
|
| 103 |
+
messages = [{
|
| 104 |
+
"role": "user",
|
| 105 |
+
"content": [
|
| 106 |
+
{"type": "image", "image": image},
|
| 107 |
+
{"type": "text", "text": prompt},
|
| 108 |
+
],
|
| 109 |
+
}]
|
| 110 |
+
downsample_mode = "16x"
|
| 111 |
+
try:
|
| 112 |
+
inputs = processor.apply_chat_template(
|
| 113 |
+
messages,
|
| 114 |
+
tokenize=True,
|
| 115 |
+
add_generation_prompt=True,
|
| 116 |
+
return_dict=True,
|
| 117 |
+
return_tensors="pt",
|
| 118 |
+
downsample_mode=downsample_mode,
|
| 119 |
+
max_slice_nums=36,
|
| 120 |
+
)
|
| 121 |
+
except TypeError:
|
| 122 |
+
inputs = processor.apply_chat_template(
|
| 123 |
+
messages,
|
| 124 |
+
tokenize=True,
|
| 125 |
+
add_generation_prompt=True,
|
| 126 |
+
return_dict=True,
|
| 127 |
+
return_tensors="pt",
|
| 128 |
+
)
|
| 129 |
+
downsample_mode = None
|
| 130 |
+
|
| 131 |
+
inputs = inputs.to(model.device)
|
| 132 |
+
generate_kwargs = {
|
| 133 |
+
**inputs,
|
| 134 |
+
"max_new_tokens": 2048,
|
| 135 |
+
"do_sample": False,
|
| 136 |
+
}
|
| 137 |
+
if downsample_mode is not None:
|
| 138 |
+
generate_kwargs["downsample_mode"] = downsample_mode
|
| 139 |
+
|
| 140 |
+
with torch.inference_mode():
|
| 141 |
+
try:
|
| 142 |
+
generated_ids = model.generate(**generate_kwargs)
|
| 143 |
+
except TypeError:
|
| 144 |
+
generate_kwargs.pop("downsample_mode", None)
|
| 145 |
+
generated_ids = model.generate(**generate_kwargs)
|
| 146 |
+
|
| 147 |
+
prompt_len = inputs["input_ids"].shape[-1]
|
| 148 |
+
generated_ids = generated_ids[:, prompt_len:]
|
| 149 |
+
decoded = processor.batch_decode(
|
| 150 |
+
generated_ids,
|
| 151 |
+
skip_special_tokens=True,
|
| 152 |
+
clean_up_tokenization_spaces=False,
|
| 153 |
+
)
|
| 154 |
+
return decoded[0].strip()
|
| 155 |
|
| 156 |
|
| 157 |
def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:
|
app.py
CHANGED
|
@@ -63,7 +63,7 @@ def load_models() -> None:
|
|
| 63 |
logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
|
| 64 |
import os
|
| 65 |
import torch
|
| 66 |
-
from transformers import
|
| 67 |
from huggingface_hub import snapshot_download
|
| 68 |
from safetensors.torch import load_file as safetensors_load
|
| 69 |
|
|
@@ -72,16 +72,25 @@ def load_models() -> None:
|
|
| 72 |
_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 73 |
_device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
logger.info("Loading base model code from %s …", _BASE_REPO)
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
|
| 86 |
logger.info("Loading fine-tuned weights from %s …", _MERGED_REPO)
|
| 87 |
merged_local = snapshot_download(_MERGED_REPO, token=_HF_TOKEN or None)
|
|
@@ -99,8 +108,10 @@ def load_models() -> None:
|
|
| 99 |
logger.info("Fine-tuned weights loaded (%d keys, %d missing)", len(state_dict), len(missing))
|
| 100 |
|
| 101 |
_vision_model.eval()
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
| 104 |
vision_llm = (_vision_model, _vision_processor)
|
| 105 |
logger.info("Vision LLM ready (device=%s dtype=%s)", _device, _dtype)
|
| 106 |
|
|
|
|
| 63 |
logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
|
| 64 |
import os
|
| 65 |
import torch
|
| 66 |
+
from transformers import AutoProcessor
|
| 67 |
from huggingface_hub import snapshot_download
|
| 68 |
from safetensors.torch import load_file as safetensors_load
|
| 69 |
|
|
|
|
| 72 |
_dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
|
| 73 |
_device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 74 |
|
| 75 |
+
try:
|
| 76 |
+
from transformers import AutoModelForImageTextToText as _VisionModel
|
| 77 |
+
except ImportError:
|
| 78 |
+
from transformers import AutoModelForMultimodalLM as _VisionModel
|
| 79 |
+
|
| 80 |
+
# Load base model code plus fine-tuned weights from the merged repo.
|
| 81 |
+
# MiniCPM-V 4.6 uses AutoProcessor + model.generate() for inference.
|
| 82 |
logger.info("Loading base model code from %s …", _BASE_REPO)
|
| 83 |
+
_model_kwargs = {
|
| 84 |
+
"trust_remote_code": True,
|
| 85 |
+
"torch_dtype": _dtype,
|
| 86 |
+
}
|
| 87 |
+
if _HF_TOKEN:
|
| 88 |
+
_model_kwargs["token"] = _HF_TOKEN
|
| 89 |
+
if torch.cuda.is_available():
|
| 90 |
+
_model_kwargs["device_map"] = "auto"
|
| 91 |
+
_vision_model = _VisionModel.from_pretrained(_BASE_REPO, **_model_kwargs)
|
| 92 |
+
if not torch.cuda.is_available():
|
| 93 |
+
_vision_model.to(_device)
|
| 94 |
|
| 95 |
logger.info("Loading fine-tuned weights from %s …", _MERGED_REPO)
|
| 96 |
merged_local = snapshot_download(_MERGED_REPO, token=_HF_TOKEN or None)
|
|
|
|
| 108 |
logger.info("Fine-tuned weights loaded (%d keys, %d missing)", len(state_dict), len(missing))
|
| 109 |
|
| 110 |
_vision_model.eval()
|
| 111 |
+
_processor_kwargs = {"trust_remote_code": True}
|
| 112 |
+
if _HF_TOKEN:
|
| 113 |
+
_processor_kwargs["token"] = _HF_TOKEN
|
| 114 |
+
_vision_processor = AutoProcessor.from_pretrained(_BASE_REPO, **_processor_kwargs)
|
| 115 |
vision_llm = (_vision_model, _vision_processor)
|
| 116 |
logger.info("Vision LLM ready (device=%s dtype=%s)", _device, _dtype)
|
| 117 |
|
requirements.txt
CHANGED
|
@@ -7,7 +7,8 @@ PyMuPDF==1.25.5
|
|
| 7 |
numpy==2.2.5
|
| 8 |
opencv-python-headless==4.11.0.86
|
| 9 |
huggingface_hub>=0.33.5
|
| 10 |
-
transformers>=
|
|
|
|
| 11 |
accelerate>=0.26.0
|
| 12 |
safetensors>=0.4.3
|
| 13 |
datasets==3.6.0
|
|
|
|
| 7 |
numpy==2.2.5
|
| 8 |
opencv-python-headless==4.11.0.86
|
| 9 |
huggingface_hub>=0.33.5
|
| 10 |
+
transformers>=5.7.0
|
| 11 |
+
torchvision>=0.19.0
|
| 12 |
accelerate>=0.26.0
|
| 13 |
safetensors>=0.4.3
|
| 14 |
datasets==3.6.0
|
samples/modal_inference_test.ipynb
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {},
|
| 6 |
+
"source": [
|
| 7 |
+
"# Kirana Detective — Modal Inference Test\n",
|
| 8 |
+
"\n",
|
| 9 |
+
"Run this notebook cell-by-cell in a Modal GPU environment to verify:\n",
|
| 10 |
+
"1. All dependencies install correctly (including torchvision)\n",
|
| 11 |
+
"2. Vision model (MiniCPM-V 4.6 + fine-tuned weights) loads correctly\n",
|
| 12 |
+
"3. Processor loads correctly\n",
|
| 13 |
+
"4. End-to-end inference on a test invoice image works\n",
|
| 14 |
+
"5. Text LLM (MiniCPM5-1B gguf) loads and runs\n",
|
| 15 |
+
"6. YOLO ONNX model loads and runs\n",
|
| 16 |
+
"\n",
|
| 17 |
+
"**Before running:** Set your HF_TOKEN secret in Modal or paste it below."
|
| 18 |
+
]
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"cell_type": "markdown",
|
| 22 |
+
"metadata": {},
|
| 23 |
+
"source": [
|
| 24 |
+
"## Cell 1 — Modal app definition\n",
|
| 25 |
+
"\n",
|
| 26 |
+
"This defines the Modal image with all required dependencies and the GPU function."
|
| 27 |
+
]
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"cell_type": "code",
|
| 31 |
+
"execution_count": null,
|
| 32 |
+
"metadata": {},
|
| 33 |
+
"outputs": [],
|
| 34 |
+
"source": [
|
| 35 |
+
"# Install modal if not already installed\n",
|
| 36 |
+
"import subprocess, sys\n",
|
| 37 |
+
"subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"modal\", \"-q\"], check=True)\n",
|
| 38 |
+
"print(\"modal installed\")"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "code",
|
| 43 |
+
"execution_count": null,
|
| 44 |
+
"metadata": {},
|
| 45 |
+
"outputs": [],
|
| 46 |
+
"source": "import modal\n\napp = modal.App(\"kirana-inference-test\")\n\nimage = (\n modal.Image.debian_slim(python_version=\"3.12\")\n .pip_install(\n \"torch==2.4.0\",\n \"torchvision==0.19.0\",\n \"torchaudio==2.4.0\",\n extra_options=\"--index-url https://download.pytorch.org/whl/cu121\",\n )\n .pip_install(\n \"transformers>=5.7.0\",\n \"accelerate>=0.26.0\",\n \"safetensors>=0.4.3\",\n \"huggingface_hub>=0.33.5\",\n \"Pillow>=11.0.0\",\n \"numpy>=1.26.0\",\n \"llama-cpp-python==0.3.28\",\n \"onnxruntime==1.21.0\",\n \"requests\",\n extra_options=\"--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu\",\n )\n)\n\nprint(\"Modal app defined\")"
|
| 47 |
+
},
|
| 48 |
+
{
|
| 49 |
+
"cell_type": "markdown",
|
| 50 |
+
"metadata": {},
|
| 51 |
+
"source": [
|
| 52 |
+
"## Cell 2 — Define the test functions"
|
| 53 |
+
]
|
| 54 |
+
},
|
| 55 |
+
{
|
| 56 |
+
"cell_type": "code",
|
| 57 |
+
"execution_count": null,
|
| 58 |
+
"metadata": {},
|
| 59 |
+
"outputs": [],
|
| 60 |
+
"source": "HF_TOKEN = \"\" # paste your token here OR leave empty if set as Modal secret\n\n_BASE_REPO = \"openbmb/MiniCPM-V-4.6\"\n_MERGED_REPO = \"build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged\"\n_TEXT_REPO = \"build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer\"\n_TEXT_FILE = \"MiniCPM5-1B.Q4_K_M.gguf\"\n_YOLO_REPO = \"build-small-hackathon/yolo26n-indian-fmcg-detection\"\n\n\ndef _make_test_invoice_image():\n \"\"\"Create a minimal synthetic invoice image with visible text.\"\"\"\n from PIL import Image as PILImage, ImageDraw\n\n img = PILImage.new(\"RGB\", (640, 900), color=(255, 255, 255))\n draw = ImageDraw.Draw(img)\n lines = [\n \"TAX INVOICE\",\n \"Supplier: Hindustan Unilever Ltd\",\n \"Invoice No: INV-2024-00123\",\n \"Date: 2024-05-15\",\n \"\",\n \"Item Qty Price Total\",\n \"Surf Excel 1kg 2 180.00 360.00\",\n \"Lifebuoy Soap 100g 5 22.00 110.00\",\n \"Dove Shampoo 200ml 3 95.00 285.00\",\n \"\",\n \"Grand Total: 755.00\",\n ]\n y = 40\n for line in lines:\n draw.text((40, y), line, fill=(0, 0, 0))\n y += 36\n return img\n\n\n@app.function(\n image=image,\n gpu=\"T4\",\n timeout=600,\n secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_vision_model(hf_token: str = \"\") -> dict:\n \"\"\"\n Load MiniCPM-V 4.6 base architecture + fine-tuned weights + processor,\n then run inference using processor.apply_chat_template() and generate().\n \"\"\"\n import time, os\n import torch\n from transformers import AutoProcessor\n from huggingface_hub import snapshot_download\n from safetensors.torch import load_file as safetensors_load\n\n try:\n from transformers import AutoModelForImageTextToText as VisionModel\n model_loader = \"AutoModelForImageTextToText\"\n except ImportError:\n from transformers import AutoModelForMultimodalLM as VisionModel\n model_loader = \"AutoModelForMultimodalLM\"\n\n token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n results = {\"model_loader\": 
model_loader}\n\n results[\"torch_version\"] = torch.__version__\n results[\"cuda_available\"] = torch.cuda.is_available()\n results[\"cuda_device\"] = torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\"\n\n try:\n import torchvision\n results[\"torchvision_version\"] = torchvision.__version__\n except ImportError as e:\n results[\"torchvision_error\"] = str(e)\n\n dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32\n device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\n t0 = time.time()\n try:\n model_kwargs = {\"trust_remote_code\": True, \"torch_dtype\": dtype, \"token\": token}\n if torch.cuda.is_available():\n model_kwargs[\"device_map\"] = \"auto\"\n model = VisionModel.from_pretrained(_BASE_REPO, **model_kwargs)\n if not torch.cuda.is_available():\n model.to(device)\n results[\"base_model_class\"] = type(model).__name__\n results[\"base_model_load_s\"] = round(time.time() - t0, 1)\n results[\"base_model_ok\"] = True\n results[\"has_chat\"] = hasattr(model, \"chat\")\n results[\"has_generate\"] = hasattr(model, \"generate\")\n except Exception as e:\n results[\"base_model_ok\"] = False\n results[\"base_model_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n merged_local = snapshot_download(_MERGED_REPO, token=token)\n shard_files = sorted(f for f in os.listdir(merged_local) if f.endswith(\".safetensors\"))\n if not shard_files:\n raise RuntimeError(f\"No .safetensors found in {_MERGED_REPO}\")\n state_dict = {}\n for sf in shard_files:\n state_dict.update(safetensors_load(os.path.join(merged_local, sf), device=\"cpu\"))\n missing, unexpected = model.load_state_dict(state_dict, strict=False)\n results[\"finetuned_weights_load_s\"] = round(time.time() - t0, 1)\n results[\"finetuned_weights_ok\"] = True\n results[\"finetuned_keys\"] = len(state_dict)\n results[\"finetuned_missing_keys\"] = len(missing)\n results[\"finetuned_unexpected_keys\"] = len(unexpected)\n model.eval()\n except Exception as 
e:\n results[\"finetuned_weights_ok\"] = False\n results[\"finetuned_weights_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n processor = AutoProcessor.from_pretrained(_BASE_REPO, trust_remote_code=True, token=token)\n results[\"processor_load_s\"] = round(time.time() - t0, 1)\n results[\"processor_ok\"] = True\n results[\"processor_source\"] = _BASE_REPO\n results[\"processor_class\"] = type(processor).__name__\n except Exception as e:\n results[\"processor_ok\"] = False\n results[\"processor_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n test_img = _make_test_invoice_image()\n prompt = (\n \"You are an OCR agent. Extract data from this invoice image and return ONLY valid JSON \"\n 'matching: {\"invoice_number\": string|null, \"supplier\": string|null, \"date\": string|null, '\n '\"items\": [{\"product_raw\": string, \"quantity\": number, \"unit_price\": number, \"line_total\": number}], '\n '\"grand_total\": number}. Return ONLY the JSON, no prose.'\n )\n messages = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image\", \"image\": test_img},\n {\"type\": \"text\", \"text\": prompt},\n ],\n }]\n downsample_mode = \"16x\"\n try:\n inputs = processor.apply_chat_template(\n messages,\n tokenize=True,\n add_generation_prompt=True,\n return_dict=True,\n return_tensors=\"pt\",\n downsample_mode=downsample_mode,\n max_slice_nums=36,\n )\n except TypeError:\n inputs = processor.apply_chat_template(\n messages,\n tokenize=True,\n add_generation_prompt=True,\n return_dict=True,\n return_tensors=\"pt\",\n )\n downsample_mode = None\n inputs = inputs.to(model.device)\n generate_kwargs = {**inputs, \"max_new_tokens\": 512, \"do_sample\": False}\n if downsample_mode is not None:\n generate_kwargs[\"downsample_mode\"] = downsample_mode\n with torch.inference_mode():\n try:\n output_ids = model.generate(**generate_kwargs)\n except TypeError:\n generate_kwargs.pop(\"downsample_mode\", None)\n output_ids = 
model.generate(**generate_kwargs)\n output_ids = output_ids[:, inputs[\"input_ids\"].shape[-1]:]\n response = processor.batch_decode(\n output_ids,\n skip_special_tokens=True,\n clean_up_tokenization_spaces=False,\n )[0].strip()\n results[\"inference_ok\"] = True\n results[\"inference_s\"] = round(time.time() - t0, 1)\n results[\"inference_output\"] = str(response)[:800]\n except Exception as e:\n results[\"inference_ok\"] = False\n results[\"inference_error\"] = str(e)\n\n return results\n\n\n@app.function(\n image=image,\n timeout=300,\n secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_text_model(hf_token: str = \"\") -> dict:\n \"\"\"Download and load MiniCPM5-1B GGUF, run a normalization test prompt.\"\"\"\n import time, os\n from huggingface_hub import hf_hub_download\n from llama_cpp import Llama\n\n token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n results = {}\n\n t0 = time.time()\n try:\n path = hf_hub_download(repo_id=_TEXT_REPO, filename=_TEXT_FILE, token=token)\n results[\"text_model_download_s\"] = round(time.time() - t0, 1)\n except Exception as e:\n results[\"text_model_download_ok\"] = False\n results[\"text_model_download_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n llm = Llama(model_path=path, n_ctx=8192, n_threads=4, verbose=False)\n results[\"text_model_load_s\"] = round(time.time() - t0, 1)\n results[\"text_model_load_ok\"] = True\n except Exception as e:\n results[\"text_model_load_ok\"] = False\n results[\"text_model_load_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n resp = llm(\n \"Normalize product name to standard FMCG name. Input: 'Lays Classic Salted 26g'. 
\"\n \"Output JSON: {\\\"normalized\\\": \\\"...\\\", \\\"brand\\\": \\\"...\\\", \\\"category\\\": \\\"...\\\"}\",\n max_tokens=64,\n stop=[\"\\n\\n\"],\n )\n results[\"text_inference_ok\"] = True\n results[\"text_inference_s\"] = round(time.time() - t0, 1)\n results[\"text_inference_output\"] = resp[\"choices\"][0][\"text\"][:300]\n except Exception as e:\n results[\"text_inference_ok\"] = False\n results[\"text_inference_error\"] = str(e)\n\n return results\n\n\n@app.function(\n image=image,\n timeout=120,\n secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_yolo_model(hf_token: str = \"\") -> dict:\n \"\"\"Download YOLO ONNX and run inference on a synthetic 640x640 image.\"\"\"\n import time, os, json\n import numpy as np\n from huggingface_hub import hf_hub_download\n import onnxruntime as ort\n\n token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n results = {}\n\n t0 = time.time()\n try:\n onnx_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"yolo26n_fmcg.onnx\", token=token)\n class_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"class_names.json\", token=token)\n with open(class_path) as f:\n class_names = json.load(f)\n results[\"yolo_download_s\"] = round(time.time() - t0, 1)\n results[\"yolo_num_classes\"] = len(class_names)\n results[\"yolo_sample_classes\"] = class_names[:5]\n except Exception as e:\n results[\"yolo_download_ok\"] = False\n results[\"yolo_download_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n session = ort.InferenceSession(onnx_path, providers=[\"CPUExecutionProvider\"])\n inp_name = session.get_inputs()[0].name\n inp_shape = session.get_inputs()[0].shape\n results[\"yolo_load_s\"] = round(time.time() - t0, 1)\n results[\"yolo_input_name\"] = inp_name\n results[\"yolo_input_shape\"] = str(inp_shape)\n except Exception as e:\n results[\"yolo_load_ok\"] = False\n results[\"yolo_load_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n dummy 
= np.random.rand(1, 3, 640, 640).astype(np.float32)\n outputs = session.run(None, {inp_name: dummy})\n results[\"yolo_inference_ok\"] = True\n results[\"yolo_inference_s\"] = round(time.time() - t0, 1)\n results[\"yolo_output_shapes\"] = [str(o.shape) for o in outputs]\n except Exception as e:\n results[\"yolo_inference_ok\"] = False\n results[\"yolo_inference_error\"] = str(e)\n\n return results\n\n\nprint(\"Functions defined\")"
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "markdown",
|
| 64 |
+
"metadata": {},
|
| 65 |
+
"source": [
|
| 66 |
+
"## Cell 3 — Run vision model test (GPU T4, ~3-5 min)"
|
| 67 |
+
]
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
"cell_type": "code",
|
| 71 |
+
"execution_count": null,
|
| 72 |
+
"metadata": {},
|
| 73 |
+
"outputs": [],
|
| 74 |
+
"source": [
|
| 75 |
+
"import json\n",
|
| 76 |
+
"\n",
|
| 77 |
+
"with app.run():\n",
|
| 78 |
+
" vision_result = test_vision_model.remote(hf_token=HF_TOKEN)\n",
|
| 79 |
+
"\n",
|
| 80 |
+
"print(\"=== VISION MODEL TEST RESULTS ===\")\n",
|
| 81 |
+
"print(json.dumps(vision_result, indent=2))\n",
|
| 82 |
+
"\n",
|
| 83 |
+
"# Summary\n",
|
| 84 |
+
"print(\"\\n=== SUMMARY ===\")\n",
|
| 85 |
+
"print(f\" torch: {vision_result.get('torch_version')}\")\n",
|
| 86 |
+
"print(f\" torchvision: {vision_result.get('torchvision_version', '❌ MISSING')}\")\n",
|
| 87 |
+
"print(f\" CUDA device: {vision_result.get('cuda_device')}\")\n",
|
| 88 |
+
"print(f\" base model: {'✅' if vision_result.get('base_model_ok') else '❌'} ({vision_result.get('base_model_load_s', '?')}s)\")\n",
|
| 89 |
+
"print(f\" fine-tuned wts: {'✅' if vision_result.get('finetuned_weights_ok') else '❌'} ({vision_result.get('finetuned_keys', '?')} keys, {vision_result.get('finetuned_missing_keys', '?')} missing)\")\n",
|
| 90 |
+
"print(f\" processor: {'✅' if vision_result.get('processor_ok') else '❌'} (from {vision_result.get('processor_source', 'N/A')})\")\n",
|
| 91 |
+
"print(f\" inference: {'✅' if vision_result.get('inference_ok') else '❌'} ({vision_result.get('inference_s', '?')}s)\")\n",
|
| 92 |
+
"if vision_result.get('inference_output'):\n",
|
| 93 |
+
" print(f\"\\n Output snippet: {vision_result['inference_output'][:200]}\")"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"cell_type": "markdown",
|
| 98 |
+
"metadata": {},
|
| 99 |
+
"source": [
|
| 100 |
+
"## Cell 4 — Run text model test (CPU, ~2-3 min)"
|
| 101 |
+
]
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"cell_type": "code",
|
| 105 |
+
"execution_count": null,
|
| 106 |
+
"metadata": {},
|
| 107 |
+
"outputs": [],
|
| 108 |
+
"source": [
|
| 109 |
+
"with app.run():\n",
|
| 110 |
+
" text_result = test_text_model.remote(hf_token=HF_TOKEN)\n",
|
| 111 |
+
"\n",
|
| 112 |
+
"print(\"=== TEXT MODEL TEST RESULTS ===\")\n",
|
| 113 |
+
"print(json.dumps(text_result, indent=2))\n",
|
| 114 |
+
"\n",
|
| 115 |
+
"print(\"\\n=== SUMMARY ===\")\n",
|
| 116 |
+
"print(f\" download: {'✅' if not text_result.get('text_model_download_error') else '❌'} ({text_result.get('text_model_download_s', '?')}s)\")\n",
|
| 117 |
+
"print(f\" load: {'✅' if text_result.get('text_model_load_ok') else '❌'} ({text_result.get('text_model_load_s', '?')}s)\")\n",
|
| 118 |
+
"print(f\" inference: {'✅' if text_result.get('text_inference_ok') else '❌'} ({text_result.get('text_inference_s', '?')}s)\")\n",
|
| 119 |
+
"if text_result.get('text_inference_output'):\n",
|
| 120 |
+
" print(f\"\\n Output: {text_result['text_inference_output']}\")"
|
| 121 |
+
]
|
| 122 |
+
},
|
| 123 |
+
{
|
| 124 |
+
"cell_type": "markdown",
|
| 125 |
+
"metadata": {},
|
| 126 |
+
"source": [
|
| 127 |
+
"## Cell 5 — Run YOLO model test (CPU, ~1 min)"
|
| 128 |
+
]
|
| 129 |
+
},
|
| 130 |
+
{
|
| 131 |
+
"cell_type": "code",
|
| 132 |
+
"execution_count": null,
|
| 133 |
+
"metadata": {},
|
| 134 |
+
"outputs": [],
|
| 135 |
+
"source": [
|
| 136 |
+
"with app.run():\n",
|
| 137 |
+
" yolo_result = test_yolo_model.remote(hf_token=HF_TOKEN)\n",
|
| 138 |
+
"\n",
|
| 139 |
+
"print(\"=== YOLO MODEL TEST RESULTS ===\")\n",
|
| 140 |
+
"print(json.dumps(yolo_result, indent=2))\n",
|
| 141 |
+
"\n",
|
| 142 |
+
"print(\"\\n=== SUMMARY ===\")\n",
|
| 143 |
+
"print(f\" download: {'✅' if not yolo_result.get('yolo_download_error') else '❌'} ({yolo_result.get('yolo_download_s', '?')}s, {yolo_result.get('yolo_num_classes', '?')} classes)\")\n",
|
| 144 |
+
"print(f\" load: {'✅' if not yolo_result.get('yolo_load_error') else '❌'} ({yolo_result.get('yolo_load_s', '?')}s)\")\n",
|
| 145 |
+
"print(f\" inference: {'✅' if yolo_result.get('yolo_inference_ok') else '❌'} ({yolo_result.get('yolo_inference_s', '?')}s)\")\n",
|
| 146 |
+
"if yolo_result.get('yolo_output_shapes'):\n",
|
| 147 |
+
" print(f\" output shapes: {yolo_result['yolo_output_shapes']}\")"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"cell_type": "markdown",
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"source": [
|
| 154 |
+
"## Cell 6 — Diagnosis helper\n",
|
| 155 |
+
"\n",
|
| 156 |
+
"Run this after the tests above to get a clear fix list."
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"cell_type": "code",
|
| 161 |
+
"execution_count": null,
|
| 162 |
+
"metadata": {},
|
| 163 |
+
"outputs": [],
|
| 164 |
+
"source": "print(\"=\"*60)\nprint(\"DIAGNOSIS REPORT\")\nprint(\"=\"*60)\n\nissues = []\nfixes = []\n\nif \"torchvision_error\" in vision_result:\n issues.append(\"torchvision not installed\")\n fixes.append(\"Add torchvision to requirements.txt / Dockerfile\")\n\nif not vision_result.get(\"base_model_ok\"):\n issues.append(f\"Vision base model failed: {vision_result.get('base_model_error', '?')}\")\n fixes.append(\"Use transformers>=5.7.0 and AutoModelForImageTextToText/AutoModelForMultimodalLM\")\n\nif not vision_result.get(\"has_generate\"):\n issues.append(\"Vision model does not expose generate()\")\n fixes.append(\"Load MiniCPM-V 4.6 through the image-text/multimodal auto class, not generic AutoModel\")\n\nif vision_result.get(\"has_chat\"):\n issues.append(\"Vision test is still seeing old chat() API\")\n fixes.append(\"Use processor.apply_chat_template() + model.generate() in app.py and InvoiceExtractorAgent\")\n\nif not vision_result.get(\"processor_ok\"):\n issues.append(\"Processor failed to load\")\n fixes.append(\"Load AutoProcessor from openbmb/MiniCPM-V-4.6; the merged repo is weights-only\")\n\nif not vision_result.get(\"inference_ok\"):\n issues.append(f\"Inference failed: {vision_result.get('inference_error', '?')}\")\n fixes.append(\"Check the generate/apply_chat_template error above\")\n\nif not text_result.get(\"text_model_load_ok\"):\n issues.append(f\"Text model failed: {text_result.get('text_model_load_error', '?')}\")\n fixes.append(\"Check text model error\")\n\nif not yolo_result.get(\"yolo_inference_ok\"):\n issues.append(f\"YOLO failed: {yolo_result.get('yolo_inference_error', '?')}\")\n fixes.append(\"Check YOLO error\")\n\nif not issues:\n print(\"All tests passed. Your HF Space should use the same load/inference path.\")\nelse:\n print(\"ISSUES FOUND:\")\n for i, issue in enumerate(issues, 1):\n print(f\" {i}. {issue}\")\n\n print(\"\\nRECOMMENDED FIXES:\")\n for i, fix in enumerate(fixes, 1):\n print(f\" {i}. 
{fix}\")\n\n print(\"\\nKey finding: processor loaded from:\", vision_result.get(\"processor_source\"))\n print(\"Model loader:\", vision_result.get(\"model_loader\"))"
|
| 165 |
+
}
|
| 166 |
+
],
|
| 167 |
+
"metadata": {
|
| 168 |
+
"kernelspec": {
|
| 169 |
+
"display_name": "Python 3",
|
| 170 |
+
"language": "python",
|
| 171 |
+
"name": "python3"
|
| 172 |
+
},
|
| 173 |
+
"language_info": {
|
| 174 |
+
"name": "python",
|
| 175 |
+
"version": "3.11.0"
|
| 176 |
+
}
|
| 177 |
+
},
|
| 178 |
+
"nbformat": 4,
|
| 179 |
+
"nbformat_minor": 4
|
| 180 |
+
}
|