| """ |
Task 0.2 (Part 2) — Fine-tune MiniCPM-V 4.6 on synthetic Indian invoices.

Two-step workflow:
  Step 1 — Upload dataset to HF Hub (run once, after generate_invoices.py):
    modal run finetune/train_minicpm_v.py::main_push

  Step 2 — Train (the file defines several local entrypoints, so name one):
    modal run finetune/train_minicpm_v.py::main

  Optional — single-sample forward-pass check before a full training run:
    modal run finetune/train_minicpm_v.py::main_dryrun

Publishes:
  build-small-hackathon/kirana-invoice-train-data (HF dataset, reusable)
  naazimsnh02/minicpm-v-4-6-indian-invoice-extraction (LoRA adapter, tokenizer
  and model card; a GGUF export is not produced by this script)

Training approach:
  QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
  (unsloth is incompatible with MiniCPM-V-4.6 which requires transformers>=5.7.0)
  System prompt: SYSTEM_PROMPT (invoice-extraction instructions, JSON-only output)
  User turn: <image> + "Extract all invoice fields as JSON" + JSON_SCHEMA
  Assistant turn: <annotation JSON>
| """ |
|
|
| import json |
| import os |
| from pathlib import Path |
|
|
| import modal |
|
|
# Modal application that owns every remote function declared in this file.
app = modal.App("kirana-minicpm-v-finetune")

# System libraries installed with apt into the container image.
_APT_PACKAGES = ("libsm6", "libxext6", "git")

# Python requirements installed with pip into the container image.
_PIP_REQUIREMENTS = (
    "huggingface_hub>=0.30.0",
    "datasets>=3.0.0",
    "torch>=2.3.0",
    "torchvision>=0.18.0",
    "transformers[torch]>=5.7.0",
    "peft>=0.14.0",
    "bitsandbytes>=0.43.0",
    "accelerate>=0.28.0",
    "trl>=0.9.0",
    "pillow>=10.0.0",
    "sentencepiece>=0.2.0",
    "timm>=0.9.0",
)

# Container image shared by all remote functions (dataset push, training, dry run).
_BASE_IMAGE = modal.Image.debian_slim(python_version="3.11")
IMAGE = _BASE_IMAGE.apt_install(*_APT_PACKAGES).pip_install(*_PIP_REQUIREMENTS)

# Modal secret; the remote functions read HF_TOKEN from the environment it provides.
HF_SECRET = modal.Secret.from_name("hf-secret")
|
|
# --- Hugging Face Hub coordinates ------------------------------------------
BASE_MODEL: str = "openbmb/MiniCPM-V-4.6"  # checkpoint that gets fine-tuned
HF_REPO: str = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"  # adapter destination
HF_DATASET_REPO: str = "build-small-hackathon/kirana-invoice-train-data"  # dataset repo

# --- Input location on the mounted Modal volume ----------------------------
ANNOTATIONS_PATH: str = "/data/synthetic_invoices/annotations.jsonl"

# --- Training hyperparameters ----------------------------------------------
LORA_RANK: int = 16
MAX_SEQ_LENGTH: int = 2048  # token budget per sample (prompt + response)
EPOCHS: int = 3
BATCH_SIZE: int = 1
GRAD_ACCUM: int = 16  # gradient accumulation steps
LEARNING_RATE: float = 1e-4

# System turn placed in front of every conversation built by this script.
SYSTEM_PROMPT: str = " ".join(
    (
        "You are an invoice extraction assistant.",
        "Given an invoice image, extract all fields as valid JSON.",
        "Return ONLY the JSON object, no explanation.",
    )
)
|
|
# Schema text appended to the user instruction so the model knows the target
# JSON shape. `number` is a placeholder, so this is a template rather than
# parseable JSON.
# NOTE(review): the interior indentation of this literal was reconstructed with
# 2-space nesting — confirm it matches the prompt used elsewhere in the project.
JSON_SCHEMA = """{
  "invoice_number": "string",
  "supplier": "string",
  "date": "YYYY-MM-DD",
  "items": [
    {
      "product_raw": "string",
      "quantity": number,
      "unit_price": number,
      "gst_rate": number,
      "line_total": number
    }
  ],
  "grand_total": number,
  "extraction_warnings": []
}"""
|
|
|
|
| |
|
|
@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={"/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True)},
    memory=8192,
)
def push_dataset():
    """Build a HF dataset from the Modal volume and push it to the Hub.

    Reads ``ANNOTATIONS_PATH`` (one JSON object per line, each with an
    ``image`` path relative to ``/data`` and an ``invoice`` payload), keeps the
    records whose image file exists and can be decoded, and pushes a 90/10
    train/test split (seed 42) to ``HF_DATASET_REPO``.

    Raises:
        ValueError: if a non-blank line of the annotations file is not valid JSON.
        RuntimeError: if fewer than two usable samples remain, since a
            train/test split cannot be built from them.
        KeyError: if ``HF_TOKEN`` is missing from the environment.
    """
    from PIL import Image as PILImage
    from datasets import Dataset, Features, Value
    from datasets import Image as HFImage

    # Pass 1: keep only annotations whose image file is present on the volume.
    annotations = []
    with open(ANNOTATIONS_PATH, encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not records.
                continue
            try:
                ann = json.loads(line)
            except json.JSONDecodeError as err:
                raise ValueError(
                    f"{ANNOTATIONS_PATH}:{line_no}: invalid JSON ({err})"
                ) from err
            if os.path.exists(os.path.join("/data", ann["image"])):
                annotations.append(ann)

    print(f"Found {len(annotations)} images — loading...")

    # Pass 2: decode images. Loading is best-effort per record, so one bad
    # file is reported and skipped instead of aborting the whole push.
    images, responses = [], []
    for ann in annotations:
        try:
            # The context manager closes the file handle; convert() returns an
            # independent in-memory copy that outlives it.
            with PILImage.open(os.path.join("/data", ann["image"])) as raw:
                img = raw.convert("RGB")
            response = json.dumps(ann["invoice"], ensure_ascii=False)
        except Exception as e:
            print(f" Skipping {ann['image']}: {e}")
            continue
        # Append both columns together so they can never get out of step.
        images.append(img)
        responses.append(response)

    if len(images) < 2:
        raise RuntimeError(
            f"Only {len(images)} usable sample(s) found via {ANNOTATIONS_PATH}; "
            "need at least 2 to build a train/test split"
        )

    print(f"Building dataset from {len(images)} images...")
    features = Features({"image": HFImage(), "response": Value("string")})
    ds = Dataset.from_dict({"image": images, "response": responses}, features=features)

    split = ds.train_test_split(test_size=0.1, seed=42)
    split.push_to_hub(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(
        f"Pushed: {len(split['train'])} train + {len(split['test'])} test"
        f" → {HF_DATASET_REPO}"
    )
|
|
|
|
| |
|
|
@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=14400,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
    },
    memory=32768,
)
def train():
    """Fine-tune ``BASE_MODEL`` with QLoRA on the pushed dataset and publish the adapter.

    Steps, in order:
      1. Load the dataset from ``HF_DATASET_REPO``.
      2. Load the base model in 4-bit NF4 together with its tokenizer and processor.
      3. Pre-process every sample up front. Prompt tokens are masked with -100
         so only the JSON response contributes to the loss.
      4. Wrap the model with LoRA adapters and train with the HF ``Trainer``.
      5. Save adapter + tokenizer to ``/output/minicpm-v-lora`` and upload those
         files plus a generated README to ``HF_REPO``.

    Raises:
        RuntimeError: if the processor cannot be loaded, or if pre-processing
            leaves the train or eval split empty.
        KeyError: if ``HF_TOKEN`` is missing from the environment.
    """
    import torch
    import torch.nn.functional as F
    from datasets import load_dataset
    from huggingface_hub import HfApi
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from torch.utils.data import Dataset as TorchDataset
    from transformers import (
        AutoModelForMultimodalLM,
        AutoProcessor,
        AutoTokenizer,
        BitsAndBytesConfig,
        Trainer,
        TrainingArguments,
    )

    # ---- Dataset ---------------------------------------------------------
    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")

    # ---- Base model (4-bit NF4, bf16 compute) ----------------------------
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    raw_model = AutoModelForMultimodalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        trust_remote_code=True,
        dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    print(f"Loaded model class: {raw_model.__class__.__name__}")

    # Pre-processing calls the processor unconditionally, so a missing
    # processor must stop the run here. Otherwise every sample would fail
    # and be skipped, and training would start on an empty dataset.
    try:
        processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
    except Exception as e:
        raise RuntimeError(f"AutoProcessor not available: {e}") from e
    print(f"Loaded processor: {processor.__class__.__name__}")

    # ---- Diagnostics: which image placeholder does the tokenizer know? ----
    # Only printed for the logs; the chat template inserts the placeholder.
    vocab = tokenizer.get_vocab()
    image_token = next(
        (c for c in ("(<image>./</image>)", "<image>", "<IMAGE>", "[IMAGE]") if c in vocab),
        None,
    )
    if image_token is None:
        fallback_tokens = (
            list(tokenizer.special_tokens_map.values())
            + list(tokenizer.added_tokens_encoder.keys())
        )
        image_token = next(
            (t for t in fallback_tokens if isinstance(t, str) and "image" in t.lower()),
            None,
        )
    print(f"Image placeholder token: {image_token!r}")
    print(f"Special tokens: {tokenizer.special_tokens_map}")

    def describe(key, value, preview):
        """Format one batch/processor entry for the debug dumps (first `preview` list items)."""
        if isinstance(value, torch.Tensor):
            return f" {key}: shape={list(value.shape)}, dtype={value.dtype}"
        if isinstance(value, list):
            item_info = [
                f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x)
                for x in value[:preview]
            ]
            return f" {key}: list[{len(value)}] = {item_info}"
        return f" {key}: {type(value).__name__} = {value!r}"

    # ---- Pre-processing --------------------------------------------------
    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"

    def preprocess_one(image, response, debug=False):
        """Turn one (image, response JSON string) pair into model-ready tensors.

        Returns a dict with ``input_ids``, ``attention_mask``, ``labels`` and
        every extra field the processor produced for the image.

        Raises:
            ValueError: if the prompt alone reaches ``MAX_SEQ_LENGTH``.
        """
        image = image.convert("RGB")

        msgs = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ]},
        ]
        text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

        proc_out = processor(
            text=text,
            images=[image],
            return_tensors="pt",
            max_slice_nums=6,
        )

        if debug:
            print("=== PROCESSOR OUTPUT SHAPES (first sample) ===")
            for k, v in proc_out.items():
                print(describe(k, v, 3))

        prompt_ids = proc_out["input_ids"][0]

        # Truncating a prompt that already fills the window would drop the
        # whole response (every label -100, so no training signal) and could
        # cut image placeholder tokens. Reject the sample so the caller skips it.
        if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
            raise ValueError(
                f"prompt is {prompt_ids.size(0)} tokens, leaving no room for the "
                f"response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
            )

        # Carry every non-text processor output along, dropping a leading
        # batch dimension of size 1 so the collator can re-batch.
        # NOTE(review): pixel_values always loses its leading dim, even when
        # that dim is larger than 1 — confirm this matches the processor's layout.
        vision_fields = {}
        for k, v in proc_out.items():
            if k in ("input_ids", "attention_mask"):
                continue
            if isinstance(v, torch.Tensor):
                if k == "pixel_values" or v.shape[0] == 1:
                    vision_fields[k] = v[0]
                else:
                    vision_fields[k] = v
            elif isinstance(v, list) and len(v) == 1:
                vision_fields[k] = v[0]
            else:
                vision_fields[k] = v

        if debug:
            print("=== VISION FIELDS AFTER PROCESSING ===")
            for k, v in vision_fields.items():
                if isinstance(v, torch.Tensor):
                    print(f" {k}: shape={list(v.shape)}")
                else:
                    print(f" {k}: {type(v).__name__} = {v!r}")

        response_ids = torch.tensor(
            tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False),
            dtype=torch.long,
        )

        # Supervise only the response: prompt positions get the ignore index.
        full_ids = torch.cat([prompt_ids, response_ids])[:MAX_SEQ_LENGTH]
        labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:MAX_SEQ_LENGTH]

        return {
            "input_ids": full_ids,
            "attention_mask": torch.ones_like(full_ids),
            "labels": labels,
            **vision_fields,
        }

    def preprocess_split(split_name, label, debug_first=False):
        """Pre-process one dataset split, reporting and skipping samples that fail."""
        samples = []
        for i, s in enumerate(hf_ds[split_name]):
            try:
                samples.append(
                    preprocess_one(s["image"], s["response"], debug=(debug_first and i == 0))
                )
            except Exception as e:
                # Best-effort per sample; emptiness is checked after the loop.
                print(f" Skipping {label}[{i}]: {e}")
        return samples

    print("Pre-processing training data...")
    train_data = preprocess_split("train", "train", debug_first=True)
    eval_data = preprocess_split("test", "eval")
    print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")

    # Fail before spending GPU time: an empty train set cannot train, and an
    # empty eval set gives load_best_model_at_end nothing to compare.
    if not train_data or not eval_data:
        raise RuntimeError(
            f"Pre-processing left {len(train_data)} train / {len(eval_data)} eval samples; "
            "see the 'Skipping' messages above for the cause"
        )

    # ---- LoRA ------------------------------------------------------------
    model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)

    lora_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_RANK * 2,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=None,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    class PreprocessedDataset(TorchDataset):
        """Thin map-style dataset over the list of pre-processed sample dicts."""

        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]

    train_dataset = PreprocessedDataset(train_data)
    eval_dataset = PreprocessedDataset(eval_data)

    # Fall back to id 0 only when the tokenizer defines no pad token at all.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def collate_fn(batch):
        """Right-pad the text tensors to the batch maximum and gather the extra fields."""
        max_len = max(b["input_ids"].size(0) for b in batch)
        result = {}
        for b in batch:
            pad = max_len - b["input_ids"].size(0)
            result.setdefault("input_ids", []).append(F.pad(b["input_ids"], (0, pad), value=pad_id))
            result.setdefault("attention_mask", []).append(F.pad(b["attention_mask"], (0, pad), value=0))
            result.setdefault("labels", []).append(F.pad(b["labels"], (0, pad), value=-100))
        result = {k: torch.stack(v) for k, v in result.items()}

        # Remaining (vision) fields: target_sizes is concatenated, everything
        # else is stacked when possible and otherwise passed through as a list.
        extra_keys = [k for k in batch[0] if k not in result]
        for k in extra_keys:
            vals = [b[k] for b in batch]
            if k == "target_sizes":
                result[k] = torch.cat(vals, dim=0)
            else:
                try:
                    result[k] = torch.stack(vals)
                except (RuntimeError, TypeError):
                    result[k] = vals
        return result

    # One-element list so the nested class can flip the flag without `nonlocal`.
    _debug_step_done = [False]

    class DebugTrainer(Trainer):
        """Trainer that dumps the input shapes of the very first batch, then behaves normally."""

        def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
            if not _debug_step_done[0]:
                _debug_step_done[0] = True
                print("=== MODEL INPUT SHAPES (first batch) ===")
                for k, v in inputs.items():
                    print(describe(k, v, 2))
            return super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch, **kwargs)

    # ---- Training --------------------------------------------------------
    training_args = TrainingArguments(
        output_dir="/output/minicpm-v-sft",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        warmup_steps=10,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        bf16=True,
        fp16=False,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
        # The collator needs the vision fields, which Trainer would otherwise drop.
        remove_unused_columns=False,
        dataloader_num_workers=0,
    )

    trainer = DebugTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collate_fn,
    )
    trainer.train()
    print("Training complete")

    # ---- Save adapter ----------------------------------------------------
    model.save_pretrained("/output/minicpm-v-lora")
    tokenizer.save_pretrained("/output/minicpm-v-lora")
    print("LoRA adapter saved to /output/minicpm-v-lora")

    # ---- Publish ---------------------------------------------------------
    api = HfApi(token=os.environ["HF_TOKEN"])
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True)

    adapter_dir = Path("/output/minicpm-v-lora")
    for f in adapter_dir.iterdir():
        if f.is_file():
            print(f"Uploading {f.name}...")
            api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)

    # Report the size of the dataset actually loaded, not a fixed number.
    n_invoices = len(hf_ds["train"]) + len(hf_ds["test"])

    model_card = f"""---
license: apache-2.0
base_model: {BASE_MODEL}
datasets:
- {HF_DATASET_REPO}
tags:
- invoice-extraction
- indian-fmcg
- minicpm-v
- ocr
- qlora
- peft
---

# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)

Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
handwritten bills).

## Usage

```python
from transformers import AutoModelForMultimodalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
model = PeftModel.from_pretrained(base, "{HF_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
```

## Training

- Base model: {BASE_MODEL}
- Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — {n_invoices} synthetic Indian invoices
- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
"""
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=HF_REPO,
    )
    print(f"Published to {HF_REPO}")
|
|
|
|
| |
| |
| |
|
|
@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=600,
    secrets=[HF_SECRET],
    memory=32768,
)
def dryrun():
    """Run one labeled forward pass on the first training sample as a smoke test.

    Builds the same prompt/label layout used for training (prompt masked with
    -100, response supervised), feeds it through the un-quantized base model
    and prints the resulting loss.

    Raises:
        RuntimeError: if the prompt alone fills ``MAX_SEQ_LENGTH``, if the
            forward pass returns no loss, or if the loss is not finite.
        KeyError: if ``HF_TOKEN`` is missing from the environment.
    """
    import torch
    from datasets import load_dataset
    from transformers import AutoModelForMultimodalLM, AutoTokenizer, AutoProcessor

    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    raw_model = AutoModelForMultimodalLM.from_pretrained(BASE_MODEL, trust_remote_code=True, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

    sample = hf_ds["train"][0]
    image = sample["image"].convert("RGB")
    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]},
    ]
    text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    proc_out = processor(text=text, images=[image], return_tensors="pt", max_slice_nums=6)

    print("=== DRY-RUN: processor output ===")
    image_pad_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
    for k, v in proc_out.items():
        if not isinstance(v, torch.Tensor):
            continue
        pad_info = ""
        # Skip the count when the tokenizer has no such token: comparing a
        # tensor with None does not yield a tensor, so .sum() would fail.
        if k == "input_ids" and image_pad_id is not None:
            count = (v == image_pad_id).sum().item()
            pad_info = f" (<|image_pad|> count={count})"
        print(f" {k}: shape={list(v.shape)}{pad_info}")

    response_ids = torch.tensor(
        tokenizer.encode(sample["response"] + tokenizer.eos_token, add_special_tokens=False),
        dtype=torch.long,
    )
    prompt_ids = proc_out["input_ids"][0]

    # If the prompt already fills the window, truncation removes the whole
    # response and every label is -100, so the pass would validate nothing.
    if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
        raise RuntimeError(
            f"Dry-run prompt is {prompt_ids.size(0)} tokens, leaving no room for the "
            f"response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
        )

    full_ids = torch.cat([prompt_ids, response_ids])[:MAX_SEQ_LENGTH]
    labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:MAX_SEQ_LENGTH]

    model_inputs = {
        "input_ids": full_ids.unsqueeze(0),
        "attention_mask": torch.ones_like(full_ids).unsqueeze(0),
        "labels": labels.unsqueeze(0),
    }
    # Pass the processor's remaining (vision) outputs through untouched.
    for k, v in proc_out.items():
        if k not in ("input_ids", "attention_mask"):
            model_inputs[k] = v

    device = next(raw_model.parameters()).device
    model_inputs = {
        k: v.to(device) if isinstance(v, torch.Tensor) else v
        for k, v in model_inputs.items()
    }
    raw_model.eval()
    with torch.no_grad():
        outputs = raw_model(**model_inputs)

    loss = getattr(outputs, "loss", None)
    if loss is None:
        raise RuntimeError(
            f"Dry-run forward did not return loss. Output keys: {list(outputs.keys())}"
        )
    # A NaN/inf loss means the labeled pass is not usable for training.
    if not torch.isfinite(loss):
        raise RuntimeError(f"Dry-run forward returned a non-finite loss: {loss.item()}")
    print(f"Dry-run forward loss: {loss.detach().float().item():.4f}")
    print("Dry-run complete - shapes and labeled forward pass look correct")
|
|
|
|
| |
|
|
@app.local_entrypoint()
def main() -> None:
    """Local entrypoint that invokes the ``train`` function remotely."""
    train.remote()
|
|
|
|
@app.local_entrypoint()
def main_push() -> None:
    """Local entrypoint that invokes the ``push_dataset`` function remotely."""
    push_dataset.remote()
|
|
|
|
@app.local_entrypoint()
def main_dryrun() -> None:
    """Local entrypoint that invokes the ``dryrun`` function remotely."""
    dryrun.remote()
|
|