naazimsnh02 committed on
Commit
8dc382f
·
1 Parent(s): c9462a9

Model loading fix for vision model

Browse files
agents/invoice_extractor.py CHANGED
@@ -92,7 +92,7 @@ def _dict_to_invoice(data: dict) -> InvoiceJSON:
92
 
93
 
94
  def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
95
- """Call MiniCPM-V via native transformers generate() with an image."""
96
  import io
97
  import torch
98
  from PIL import Image as PILImage
@@ -100,15 +100,58 @@ def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
100
  model, processor = llm
101
  image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
102
 
103
- messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
104
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
105
- inputs = processor(text=[text], images=[image], return_tensors="pt").to(model.device)
106
-
107
- with torch.no_grad():
108
- generated_ids = model.generate(**inputs, max_new_tokens=2048, do_sample=False)
109
-
110
- input_len = inputs.input_ids.shape[1]
111
- return processor.decode(generated_ids[0][input_len:], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
 
114
  def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:
 
92
 
93
 
94
  def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
95
+ """Call MiniCPM-V 4.6 through the current processor + generate API."""
96
  import io
97
  import torch
98
  from PIL import Image as PILImage
 
100
  model, processor = llm
101
  image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
102
 
103
+ messages = [{
104
+ "role": "user",
105
+ "content": [
106
+ {"type": "image", "image": image},
107
+ {"type": "text", "text": prompt},
108
+ ],
109
+ }]
110
+ downsample_mode = "16x"
111
+ try:
112
+ inputs = processor.apply_chat_template(
113
+ messages,
114
+ tokenize=True,
115
+ add_generation_prompt=True,
116
+ return_dict=True,
117
+ return_tensors="pt",
118
+ downsample_mode=downsample_mode,
119
+ max_slice_nums=36,
120
+ )
121
+ except TypeError:
122
+ inputs = processor.apply_chat_template(
123
+ messages,
124
+ tokenize=True,
125
+ add_generation_prompt=True,
126
+ return_dict=True,
127
+ return_tensors="pt",
128
+ )
129
+ downsample_mode = None
130
+
131
+ inputs = inputs.to(model.device)
132
+ generate_kwargs = {
133
+ **inputs,
134
+ "max_new_tokens": 2048,
135
+ "do_sample": False,
136
+ }
137
+ if downsample_mode is not None:
138
+ generate_kwargs["downsample_mode"] = downsample_mode
139
+
140
+ with torch.inference_mode():
141
+ try:
142
+ generated_ids = model.generate(**generate_kwargs)
143
+ except TypeError:
144
+ generate_kwargs.pop("downsample_mode", None)
145
+ generated_ids = model.generate(**generate_kwargs)
146
+
147
+ prompt_len = inputs["input_ids"].shape[-1]
148
+ generated_ids = generated_ids[:, prompt_len:]
149
+ decoded = processor.batch_decode(
150
+ generated_ids,
151
+ skip_special_tokens=True,
152
+ clean_up_tokenization_spaces=False,
153
+ )
154
+ return decoded[0].strip()
155
 
156
 
157
  def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:
app.py CHANGED
@@ -63,7 +63,7 @@ def load_models() -> None:
63
  logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
64
  import os
65
  import torch
66
- from transformers import AutoModel
67
  from huggingface_hub import snapshot_download
68
  from safetensors.torch import load_file as safetensors_load
69
 
@@ -72,16 +72,25 @@ def load_models() -> None:
72
  _dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
73
  _device = "cuda" if torch.cuda.is_available() else "cpu"
74
 
75
- # Load base model code (has .chat()) + fine-tuned weights from merged repo.
76
- # save_pretrained() after PEFT merge doesn't copy custom modeling files,
77
- # so the merged repo lacks .chat(); loading base code fixes this.
 
 
 
 
78
  logger.info("Loading base model code from %s …", _BASE_REPO)
79
- _vision_model = AutoModel.from_pretrained(
80
- _BASE_REPO,
81
- trust_remote_code=True,
82
- torch_dtype=_dtype,
83
- device_map=_device,
84
- )
 
 
 
 
 
85
 
86
  logger.info("Loading fine-tuned weights from %s …", _MERGED_REPO)
87
  merged_local = snapshot_download(_MERGED_REPO, token=_HF_TOKEN or None)
@@ -99,8 +108,10 @@ def load_models() -> None:
99
  logger.info("Fine-tuned weights loaded (%d keys, %d missing)", len(state_dict), len(missing))
100
 
101
  _vision_model.eval()
102
- from transformers import AutoProcessor
103
- _vision_processor = AutoProcessor.from_pretrained(_MERGED_REPO, trust_remote_code=True)
 
 
104
  vision_llm = (_vision_model, _vision_processor)
105
  logger.info("Vision LLM ready (device=%s dtype=%s)", _device, _dtype)
106
 
 
63
  logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
64
  import os
65
  import torch
66
+ from transformers import AutoProcessor
67
  from huggingface_hub import snapshot_download
68
  from safetensors.torch import load_file as safetensors_load
69
 
 
72
  _dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
73
  _device = "cuda" if torch.cuda.is_available() else "cpu"
74
 
75
+ try:
76
+ from transformers import AutoModelForImageTextToText as _VisionModel
77
+ except ImportError:
78
+ from transformers import AutoModelForMultimodalLM as _VisionModel
79
+
80
+ # Load base model code plus fine-tuned weights from the merged repo.
81
+ # MiniCPM-V 4.6 uses AutoProcessor + model.generate() for inference.
82
  logger.info("Loading base model code from %s …", _BASE_REPO)
83
+ _model_kwargs = {
84
+ "trust_remote_code": True,
85
+ "torch_dtype": _dtype,
86
+ }
87
+ if _HF_TOKEN:
88
+ _model_kwargs["token"] = _HF_TOKEN
89
+ if torch.cuda.is_available():
90
+ _model_kwargs["device_map"] = "auto"
91
+ _vision_model = _VisionModel.from_pretrained(_BASE_REPO, **_model_kwargs)
92
+ if not torch.cuda.is_available():
93
+ _vision_model.to(_device)
94
 
95
  logger.info("Loading fine-tuned weights from %s …", _MERGED_REPO)
96
  merged_local = snapshot_download(_MERGED_REPO, token=_HF_TOKEN or None)
 
108
  logger.info("Fine-tuned weights loaded (%d keys, %d missing)", len(state_dict), len(missing))
109
 
110
  _vision_model.eval()
111
+ _processor_kwargs = {"trust_remote_code": True}
112
+ if _HF_TOKEN:
113
+ _processor_kwargs["token"] = _HF_TOKEN
114
+ _vision_processor = AutoProcessor.from_pretrained(_BASE_REPO, **_processor_kwargs)
115
  vision_llm = (_vision_model, _vision_processor)
116
  logger.info("Vision LLM ready (device=%s dtype=%s)", _device, _dtype)
117
 
requirements.txt CHANGED
@@ -7,7 +7,8 @@ PyMuPDF==1.25.5
7
  numpy==2.2.5
8
  opencv-python-headless==4.11.0.86
9
  huggingface_hub>=0.33.5
10
- transformers>=4.46.0
 
11
  accelerate>=0.26.0
12
  safetensors>=0.4.3
13
  datasets==3.6.0
 
7
  numpy==2.2.5
8
  opencv-python-headless==4.11.0.86
9
  huggingface_hub>=0.33.5
10
+ transformers>=5.7.0
11
+ torchvision>=0.19.0
12
  accelerate>=0.26.0
13
  safetensors>=0.4.3
14
  datasets==3.6.0
samples/modal_inference_test.ipynb ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Kirana Detective — Modal Inference Test\n",
8
+ "\n",
9
+ "Run this notebook cell-by-cell in a Modal GPU environment to verify:\n",
10
+ "1. All dependencies install correctly (including torchvision)\n",
11
+ "2. Vision model (MiniCPM-V 4.6 + fine-tuned weights) loads correctly\n",
12
+ "3. Processor loads correctly\n",
13
+ "4. End-to-end inference on a test invoice image works\n",
14
+ "5. Text LLM (MiniCPM5-1B gguf) loads and runs\n",
15
+ "6. YOLO ONNX model loads and runs\n",
16
+ "\n",
17
+ "**Before running:** Set your HF_TOKEN secret in Modal or paste it below."
18
+ ]
19
+ },
20
+ {
21
+ "cell_type": "markdown",
22
+ "metadata": {},
23
+ "source": [
24
+ "## Cell 1 — Modal app definition\n",
25
+ "\n",
26
+ "This defines the Modal image with all required dependencies and the GPU function."
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": null,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "# Install modal if not already installed\n",
36
+ "import subprocess, sys\n",
37
+ "subprocess.run([sys.executable, \"-m\", \"pip\", \"install\", \"modal\", \"-q\"], check=True)\n",
38
+ "print(\"modal installed\")"
39
+ ]
40
+ },
41
+ {
42
+ "cell_type": "code",
43
+ "execution_count": null,
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": "import modal\n\napp = modal.App(\"kirana-inference-test\")\n\nimage = (\n modal.Image.debian_slim(python_version=\"3.12\")\n .pip_install(\n \"torch==2.4.0\",\n \"torchvision==0.19.0\",\n \"torchaudio==2.4.0\",\n extra_options=\"--index-url https://download.pytorch.org/whl/cu121\",\n )\n .pip_install(\n \"transformers>=5.7.0\",\n \"accelerate>=0.26.0\",\n \"safetensors>=0.4.3\",\n \"huggingface_hub>=0.33.5\",\n \"Pillow>=11.0.0\",\n \"numpy>=1.26.0\",\n \"llama-cpp-python==0.3.28\",\n \"onnxruntime==1.21.0\",\n \"requests\",\n extra_options=\"--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu\",\n )\n)\n\nprint(\"Modal app defined\")"
47
+ },
48
+ {
49
+ "cell_type": "markdown",
50
+ "metadata": {},
51
+ "source": [
52
+ "## Cell 2 — Define the test functions"
53
+ ]
54
+ },
55
+ {
56
+ "cell_type": "code",
57
+ "execution_count": null,
58
+ "metadata": {},
59
+ "outputs": [],
60
+ "source": "HF_TOKEN = \"\" # paste your token here OR leave empty if set as Modal secret\n\n_BASE_REPO = \"openbmb/MiniCPM-V-4.6\"\n_MERGED_REPO = \"build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged\"\n_TEXT_REPO = \"build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer\"\n_TEXT_FILE = \"MiniCPM5-1B.Q4_K_M.gguf\"\n_YOLO_REPO = \"build-small-hackathon/yolo26n-indian-fmcg-detection\"\n\n\ndef _make_test_invoice_image():\n \"\"\"Create a minimal synthetic invoice image with visible text.\"\"\"\n from PIL import Image as PILImage, ImageDraw\n\n img = PILImage.new(\"RGB\", (640, 900), color=(255, 255, 255))\n draw = ImageDraw.Draw(img)\n lines = [\n \"TAX INVOICE\",\n \"Supplier: Hindustan Unilever Ltd\",\n \"Invoice No: INV-2024-00123\",\n \"Date: 2024-05-15\",\n \"\",\n \"Item Qty Price Total\",\n \"Surf Excel 1kg 2 180.00 360.00\",\n \"Lifebuoy Soap 100g 5 22.00 110.00\",\n \"Dove Shampoo 200ml 3 95.00 285.00\",\n \"\",\n \"Grand Total: 755.00\",\n ]\n y = 40\n for line in lines:\n draw.text((40, y), line, fill=(0, 0, 0))\n y += 36\n return img\n\n\n@app.function(\n image=image,\n gpu=\"T4\",\n timeout=600,\n secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_vision_model(hf_token: str = \"\") -> dict:\n \"\"\"\n Load MiniCPM-V 4.6 base architecture + fine-tuned weights + processor,\n then run inference using processor.apply_chat_template() and generate().\n \"\"\"\n import time, os\n import torch\n from transformers import AutoProcessor\n from huggingface_hub import snapshot_download\n from safetensors.torch import load_file as safetensors_load\n\n try:\n from transformers import AutoModelForImageTextToText as VisionModel\n model_loader = \"AutoModelForImageTextToText\"\n except ImportError:\n from transformers import AutoModelForMultimodalLM as VisionModel\n model_loader = \"AutoModelForMultimodalLM\"\n\n token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n results = {\"model_loader\": 
model_loader}\n\n results[\"torch_version\"] = torch.__version__\n results[\"cuda_available\"] = torch.cuda.is_available()\n results[\"cuda_device\"] = torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\"\n\n try:\n import torchvision\n results[\"torchvision_version\"] = torchvision.__version__\n except ImportError as e:\n results[\"torchvision_error\"] = str(e)\n\n dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32\n device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n\n t0 = time.time()\n try:\n model_kwargs = {\"trust_remote_code\": True, \"torch_dtype\": dtype, \"token\": token}\n if torch.cuda.is_available():\n model_kwargs[\"device_map\"] = \"auto\"\n model = VisionModel.from_pretrained(_BASE_REPO, **model_kwargs)\n if not torch.cuda.is_available():\n model.to(device)\n results[\"base_model_class\"] = type(model).__name__\n results[\"base_model_load_s\"] = round(time.time() - t0, 1)\n results[\"base_model_ok\"] = True\n results[\"has_chat\"] = hasattr(model, \"chat\")\n results[\"has_generate\"] = hasattr(model, \"generate\")\n except Exception as e:\n results[\"base_model_ok\"] = False\n results[\"base_model_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n merged_local = snapshot_download(_MERGED_REPO, token=token)\n shard_files = sorted(f for f in os.listdir(merged_local) if f.endswith(\".safetensors\"))\n if not shard_files:\n raise RuntimeError(f\"No .safetensors found in {_MERGED_REPO}\")\n state_dict = {}\n for sf in shard_files:\n state_dict.update(safetensors_load(os.path.join(merged_local, sf), device=\"cpu\"))\n missing, unexpected = model.load_state_dict(state_dict, strict=False)\n results[\"finetuned_weights_load_s\"] = round(time.time() - t0, 1)\n results[\"finetuned_weights_ok\"] = True\n results[\"finetuned_keys\"] = len(state_dict)\n results[\"finetuned_missing_keys\"] = len(missing)\n results[\"finetuned_unexpected_keys\"] = len(unexpected)\n model.eval()\n except Exception as 
e:\n results[\"finetuned_weights_ok\"] = False\n results[\"finetuned_weights_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n processor = AutoProcessor.from_pretrained(_BASE_REPO, trust_remote_code=True, token=token)\n results[\"processor_load_s\"] = round(time.time() - t0, 1)\n results[\"processor_ok\"] = True\n results[\"processor_source\"] = _BASE_REPO\n results[\"processor_class\"] = type(processor).__name__\n except Exception as e:\n results[\"processor_ok\"] = False\n results[\"processor_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n test_img = _make_test_invoice_image()\n prompt = (\n \"You are an OCR agent. Extract data from this invoice image and return ONLY valid JSON \"\n 'matching: {\"invoice_number\": string|null, \"supplier\": string|null, \"date\": string|null, '\n '\"items\": [{\"product_raw\": string, \"quantity\": number, \"unit_price\": number, \"line_total\": number}], '\n '\"grand_total\": number}. Return ONLY the JSON, no prose.'\n )\n messages = [{\n \"role\": \"user\",\n \"content\": [\n {\"type\": \"image\", \"image\": test_img},\n {\"type\": \"text\", \"text\": prompt},\n ],\n }]\n downsample_mode = \"16x\"\n try:\n inputs = processor.apply_chat_template(\n messages,\n tokenize=True,\n add_generation_prompt=True,\n return_dict=True,\n return_tensors=\"pt\",\n downsample_mode=downsample_mode,\n max_slice_nums=36,\n )\n except TypeError:\n inputs = processor.apply_chat_template(\n messages,\n tokenize=True,\n add_generation_prompt=True,\n return_dict=True,\n return_tensors=\"pt\",\n )\n downsample_mode = None\n inputs = inputs.to(model.device)\n generate_kwargs = {**inputs, \"max_new_tokens\": 512, \"do_sample\": False}\n if downsample_mode is not None:\n generate_kwargs[\"downsample_mode\"] = downsample_mode\n with torch.inference_mode():\n try:\n output_ids = model.generate(**generate_kwargs)\n except TypeError:\n generate_kwargs.pop(\"downsample_mode\", None)\n output_ids = 
model.generate(**generate_kwargs)\n output_ids = output_ids[:, inputs[\"input_ids\"].shape[-1]:]\n response = processor.batch_decode(\n output_ids,\n skip_special_tokens=True,\n clean_up_tokenization_spaces=False,\n )[0].strip()\n results[\"inference_ok\"] = True\n results[\"inference_s\"] = round(time.time() - t0, 1)\n results[\"inference_output\"] = str(response)[:800]\n except Exception as e:\n results[\"inference_ok\"] = False\n results[\"inference_error\"] = str(e)\n\n return results\n\n\n@app.function(\n image=image,\n timeout=300,\n secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_text_model(hf_token: str = \"\") -> dict:\n \"\"\"Download and load MiniCPM5-1B GGUF, run a normalization test prompt.\"\"\"\n import time, os\n from huggingface_hub import hf_hub_download\n from llama_cpp import Llama\n\n token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n results = {}\n\n t0 = time.time()\n try:\n path = hf_hub_download(repo_id=_TEXT_REPO, filename=_TEXT_FILE, token=token)\n results[\"text_model_download_s\"] = round(time.time() - t0, 1)\n except Exception as e:\n results[\"text_model_download_ok\"] = False\n results[\"text_model_download_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n llm = Llama(model_path=path, n_ctx=8192, n_threads=4, verbose=False)\n results[\"text_model_load_s\"] = round(time.time() - t0, 1)\n results[\"text_model_load_ok\"] = True\n except Exception as e:\n results[\"text_model_load_ok\"] = False\n results[\"text_model_load_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n resp = llm(\n \"Normalize product name to standard FMCG name. Input: 'Lays Classic Salted 26g'. 
\"\n \"Output JSON: {\\\"normalized\\\": \\\"...\\\", \\\"brand\\\": \\\"...\\\", \\\"category\\\": \\\"...\\\"}\",\n max_tokens=64,\n stop=[\"\\n\\n\"],\n )\n results[\"text_inference_ok\"] = True\n results[\"text_inference_s\"] = round(time.time() - t0, 1)\n results[\"text_inference_output\"] = resp[\"choices\"][0][\"text\"][:300]\n except Exception as e:\n results[\"text_inference_ok\"] = False\n results[\"text_inference_error\"] = str(e)\n\n return results\n\n\n@app.function(\n image=image,\n timeout=120,\n secrets=[modal.Secret.from_name(\"huggingface-secret\")] if not HF_TOKEN else [],\n)\ndef test_yolo_model(hf_token: str = \"\") -> dict:\n \"\"\"Download YOLO ONNX and run inference on a synthetic 640x640 image.\"\"\"\n import time, os, json\n import numpy as np\n from huggingface_hub import hf_hub_download\n import onnxruntime as ort\n\n token = hf_token or os.environ.get(\"HF_TOKEN\", \"\") or None\n results = {}\n\n t0 = time.time()\n try:\n onnx_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"yolo26n_fmcg.onnx\", token=token)\n class_path = hf_hub_download(repo_id=_YOLO_REPO, filename=\"class_names.json\", token=token)\n with open(class_path) as f:\n class_names = json.load(f)\n results[\"yolo_download_s\"] = round(time.time() - t0, 1)\n results[\"yolo_num_classes\"] = len(class_names)\n results[\"yolo_sample_classes\"] = class_names[:5]\n except Exception as e:\n results[\"yolo_download_ok\"] = False\n results[\"yolo_download_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n session = ort.InferenceSession(onnx_path, providers=[\"CPUExecutionProvider\"])\n inp_name = session.get_inputs()[0].name\n inp_shape = session.get_inputs()[0].shape\n results[\"yolo_load_s\"] = round(time.time() - t0, 1)\n results[\"yolo_input_name\"] = inp_name\n results[\"yolo_input_shape\"] = str(inp_shape)\n except Exception as e:\n results[\"yolo_load_ok\"] = False\n results[\"yolo_load_error\"] = str(e)\n return results\n\n t0 = time.time()\n try:\n dummy 
= np.random.rand(1, 3, 640, 640).astype(np.float32)\n outputs = session.run(None, {inp_name: dummy})\n results[\"yolo_inference_ok\"] = True\n results[\"yolo_inference_s\"] = round(time.time() - t0, 1)\n results[\"yolo_output_shapes\"] = [str(o.shape) for o in outputs]\n except Exception as e:\n results[\"yolo_inference_ok\"] = False\n results[\"yolo_inference_error\"] = str(e)\n\n return results\n\n\nprint(\"Functions defined\")"
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "metadata": {},
65
+ "source": [
66
+ "## Cell 3 — Run vision model test (GPU T4, ~3-5 min)"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "import json\n",
76
+ "\n",
77
+ "with app.run():\n",
78
+ " vision_result = test_vision_model.remote(hf_token=HF_TOKEN)\n",
79
+ "\n",
80
+ "print(\"=== VISION MODEL TEST RESULTS ===\")\n",
81
+ "print(json.dumps(vision_result, indent=2))\n",
82
+ "\n",
83
+ "# Summary\n",
84
+ "print(\"\\n=== SUMMARY ===\")\n",
85
+ "print(f\" torch: {vision_result.get('torch_version')}\")\n",
86
+ "print(f\" torchvision: {vision_result.get('torchvision_version', '❌ MISSING')}\")\n",
87
+ "print(f\" CUDA device: {vision_result.get('cuda_device')}\")\n",
88
+ "print(f\" base model: {'βœ…' if vision_result.get('base_model_ok') else '❌'} ({vision_result.get('base_model_load_s', '?')}s)\")\n",
89
+ "print(f\" fine-tuned wts: {'βœ…' if vision_result.get('finetuned_weights_ok') else '❌'} ({vision_result.get('finetuned_keys', '?')} keys, {vision_result.get('finetuned_missing_keys', '?')} missing)\")\n",
90
+ "print(f\" processor: {'βœ…' if vision_result.get('processor_ok') else '❌'} (from {vision_result.get('processor_source', 'N/A')})\")\n",
91
+ "print(f\" inference: {'βœ…' if vision_result.get('inference_ok') else '❌'} ({vision_result.get('inference_s', '?')}s)\")\n",
92
+ "if vision_result.get('inference_output'):\n",
93
+ " print(f\"\\n Output snippet: {vision_result['inference_output'][:200]}\")"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "markdown",
98
+ "metadata": {},
99
+ "source": [
100
+ "## Cell 4 — Run text model test (CPU, ~2-3 min)"
101
+ ]
102
+ },
103
+ {
104
+ "cell_type": "code",
105
+ "execution_count": null,
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "with app.run():\n",
110
+ " text_result = test_text_model.remote(hf_token=HF_TOKEN)\n",
111
+ "\n",
112
+ "print(\"=== TEXT MODEL TEST RESULTS ===\")\n",
113
+ "print(json.dumps(text_result, indent=2))\n",
114
+ "\n",
115
+ "print(\"\\n=== SUMMARY ===\")\n",
116
+ "print(f\" download: {'βœ…' if not text_result.get('text_model_download_error') else '❌'} ({text_result.get('text_model_download_s', '?')}s)\")\n",
117
+ "print(f\" load: {'βœ…' if text_result.get('text_model_load_ok') else '❌'} ({text_result.get('text_model_load_s', '?')}s)\")\n",
118
+ "print(f\" inference: {'βœ…' if text_result.get('text_inference_ok') else '❌'} ({text_result.get('text_inference_s', '?')}s)\")\n",
119
+ "if text_result.get('text_inference_output'):\n",
120
+ " print(f\"\\n Output: {text_result['text_inference_output']}\")"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "markdown",
125
+ "metadata": {},
126
+ "source": [
127
+ "## Cell 5 — Run YOLO model test (CPU, ~1 min)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "metadata": {},
134
+ "outputs": [],
135
+ "source": [
136
+ "with app.run():\n",
137
+ " yolo_result = test_yolo_model.remote(hf_token=HF_TOKEN)\n",
138
+ "\n",
139
+ "print(\"=== YOLO MODEL TEST RESULTS ===\")\n",
140
+ "print(json.dumps(yolo_result, indent=2))\n",
141
+ "\n",
142
+ "print(\"\\n=== SUMMARY ===\")\n",
143
+ "print(f\" download: {'βœ…' if not yolo_result.get('yolo_download_error') else '❌'} ({yolo_result.get('yolo_download_s', '?')}s, {yolo_result.get('yolo_num_classes', '?')} classes)\")\n",
144
+ "print(f\" load: {'βœ…' if not yolo_result.get('yolo_load_error') else '❌'} ({yolo_result.get('yolo_load_s', '?')}s)\")\n",
145
+ "print(f\" inference: {'βœ…' if yolo_result.get('yolo_inference_ok') else '❌'} ({yolo_result.get('yolo_inference_s', '?')}s)\")\n",
146
+ "if yolo_result.get('yolo_output_shapes'):\n",
147
+ " print(f\" output shapes: {yolo_result['yolo_output_shapes']}\")"
148
+ ]
149
+ },
150
+ {
151
+ "cell_type": "markdown",
152
+ "metadata": {},
153
+ "source": [
154
+ "## Cell 6 — Diagnosis helper\n",
155
+ "\n",
156
+ "Run this after the tests above to get a clear fix list."
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "metadata": {},
163
+ "outputs": [],
164
+ "source": "print(\"=\"*60)\nprint(\"DIAGNOSIS REPORT\")\nprint(\"=\"*60)\n\nissues = []\nfixes = []\n\nif \"torchvision_error\" in vision_result:\n issues.append(\"torchvision not installed\")\n fixes.append(\"Add torchvision to requirements.txt / Dockerfile\")\n\nif not vision_result.get(\"base_model_ok\"):\n issues.append(f\"Vision base model failed: {vision_result.get('base_model_error', '?')}\")\n fixes.append(\"Use transformers>=5.7.0 and AutoModelForImageTextToText/AutoModelForMultimodalLM\")\n\nif not vision_result.get(\"has_generate\"):\n issues.append(\"Vision model does not expose generate()\")\n fixes.append(\"Load MiniCPM-V 4.6 through the image-text/multimodal auto class, not generic AutoModel\")\n\nif vision_result.get(\"has_chat\"):\n issues.append(\"Vision test is still seeing old chat() API\")\n fixes.append(\"Use processor.apply_chat_template() + model.generate() in app.py and InvoiceExtractorAgent\")\n\nif not vision_result.get(\"processor_ok\"):\n issues.append(\"Processor failed to load\")\n fixes.append(\"Load AutoProcessor from openbmb/MiniCPM-V-4.6; the merged repo is weights-only\")\n\nif not vision_result.get(\"inference_ok\"):\n issues.append(f\"Inference failed: {vision_result.get('inference_error', '?')}\")\n fixes.append(\"Check the generate/apply_chat_template error above\")\n\nif not text_result.get(\"text_model_load_ok\"):\n issues.append(f\"Text model failed: {text_result.get('text_model_load_error', '?')}\")\n fixes.append(\"Check text model error\")\n\nif not yolo_result.get(\"yolo_inference_ok\"):\n issues.append(f\"YOLO failed: {yolo_result.get('yolo_inference_error', '?')}\")\n fixes.append(\"Check YOLO error\")\n\nif not issues:\n print(\"All tests passed. Your HF Space should use the same load/inference path.\")\nelse:\n print(\"ISSUES FOUND:\")\n for i, issue in enumerate(issues, 1):\n print(f\" {i}. {issue}\")\n\n print(\"\\nRECOMMENDED FIXES:\")\n for i, fix in enumerate(fixes, 1):\n print(f\" {i}. 
{fix}\")\n\n print(\"\\nKey finding: processor loaded from:\", vision_result.get(\"processor_source\"))\n print(\"Model loader:\", vision_result.get(\"model_loader\"))"
165
+ }
166
+ ],
167
+ "metadata": {
168
+ "kernelspec": {
169
+ "display_name": "Python 3",
170
+ "language": "python",
171
+ "name": "python3"
172
+ },
173
+ "language_info": {
174
+ "name": "python",
175
+ "version": "3.11.0"
176
+ }
177
+ },
178
+ "nbformat": 4,
179
+ "nbformat_minor": 4
180
+ }