kirana-detective / finetune /train_minicpm_v.py
naazimsnh02's picture
All models training uploaded
9d75c8c
Raw
History Blame
21.9 kB
"""
Task 0.2 (Part 2) — Fine-tune MiniCPM-V 4.6 on synthetic Indian invoices.
Two-step workflow:
Step 1 β€” Upload dataset to HF Hub (run once, after generate_invoices.py):
modal run finetune/train_minicpm_v.py::main_push
Step 2 β€” Train:
modal run finetune/train_minicpm_v.py
Publishes:
build-small-hackathon/kirana-invoice-train-data (HF dataset, reusable)
build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction (model adapter + GGUF)
Training approach:
QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
(unsloth is incompatible with MiniCPM-V-4.6 which requires transformers>=5.7.0)
System prompt: "Extract invoice JSON"
User turn: <image> + "Extract all invoice fields as JSON"
Assistant turn: <annotation JSON>
"""
import json
import os
from pathlib import Path
import modal
# Modal application object; every remote function and local entrypoint in
# this file is registered on it.
app = modal.App("kirana-minicpm-v-finetune")
# Container image shared by all remote functions: Debian slim with
# Python 3.11, a few system libraries, and the training stack from pip.
IMAGE = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("libsm6", "libxext6", "git")
    .pip_install(
        "huggingface_hub>=0.30.0",
        "datasets>=3.0.0",
        "torch>=2.3.0",
        "torchvision>=0.18.0",
        # The module docstring states MiniCPM-V-4.6 requires transformers>=5.7.0.
        "transformers[torch]>=5.7.0",
        "peft>=0.14.0",
        "bitsandbytes>=0.43.0",
        "accelerate>=0.28.0",
        "trl>=0.9.0",
        "pillow>=10.0.0",
        "sentencepiece>=0.2.0",
        "timm>=0.9.0",
    )
)
# Modal secret attached to every remote function. The functions read
# os.environ["HF_TOKEN"]; presumably this secret provides it -- confirm.
HF_SECRET = modal.Secret.from_name("hf-secret")
# Base checkpoint that is quantized and fine-tuned.
BASE_MODEL = "openbmb/MiniCPM-V-4.6"
# Hub model repo that receives the LoRA adapter and model card.
# NOTE(review): the module docstring lists the published model under the
# "build-small-hackathon" namespace, but this points at "naazimsnh02" -- confirm.
HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
# Hub dataset repo written by push_dataset() and read by train()/dryrun().
HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"
# JSONL annotation file on the "/data" volume, read by push_dataset().
ANNOTATIONS_PATH = "/data/synthetic_invoices/annotations.jsonl"
# LoRA rank; train() sets lora_alpha to twice this value.
LORA_RANK = 16
# Token cap applied to prompt + response after they are concatenated.
MAX_SEQ_LENGTH = 2048
EPOCHS = 3
# Per-device batch size; with GRAD_ACCUM this gives 1 * 16 = 16 samples
# per optimizer step per device.
BATCH_SIZE = 1
GRAD_ACCUM = 16
LEARNING_RATE = 1e-4
# System turn used for every training sample and for the dry run.
SYSTEM_PROMPT = (
    "You are an invoice extraction assistant. "
    "Given an invoice image, extract all fields as valid JSON. "
    "Return ONLY the JSON object, no explanation."
)
# Schema text appended to the user instruction. It is prompt text, not
# parseable JSON (it uses bare `number` placeholders), and is never loaded
# with json in this file.
JSON_SCHEMA = """{
"invoice_number": "string",
"supplier": "string",
"date": "YYYY-MM-DD",
"items": [
{
"product_raw": "string",
"quantity": number,
"unit_price": number,
"gst_rate": number,
"line_total": number
}
],
"grand_total": number,
"extraction_warnings": []
}"""
# ─── Step 1: upload dataset ────────────────────────────────────────────────────
@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={"/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True)},
    memory=8192,
)
def push_dataset():
    """Build a HF dataset from the Modal volume and push it to the Hub.

    Reads ``ANNOTATIONS_PATH`` (one JSON object per line, each with an
    ``image`` path relative to ``/data`` and an ``invoice`` payload), loads
    every image that exists on the volume, and pushes a 90/10 train/test
    split (seed 42) to ``HF_DATASET_REPO`` with two columns: ``image`` and
    ``response`` (the invoice payload serialized as a JSON string).

    Raises:
        RuntimeError: if no image could be loaded, so there is nothing to push.
    """
    from PIL import Image as PILImage
    from datasets import Dataset, Features, Value
    from datasets import Image as HFImage

    annotations = []
    # Explicit UTF-8: responses are re-serialized with ensure_ascii=False, so
    # the annotations may contain non-ASCII text.
    with open(ANNOTATIONS_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank / trailing lines instead of crashing in json.loads.
                continue
            ann = json.loads(line)
            if os.path.exists(os.path.join("/data", ann["image"])):
                annotations.append(ann)
    print(f"Found {len(annotations)} images — loading...")

    images, responses = [], []
    for ann in annotations:
        try:
            # convert() returns an independent in-memory copy, so the file
            # handle can be released immediately instead of leaking one
            # descriptor per image until garbage collection.
            with PILImage.open(os.path.join("/data", ann["image"])) as raw:
                img = raw.convert("RGB")
            response = json.dumps(ann["invoice"], ensure_ascii=False)
        except Exception as e:
            # Best effort: one unreadable sample must not abort the upload.
            print(f" Skipping {ann['image']}: {e}")
            continue
        # Append both columns together so they can never get out of step.
        images.append(img)
        responses.append(response)

    if not images:
        raise RuntimeError(
            f"No usable images found via {ANNOTATIONS_PATH}; nothing to push"
        )

    print(f"Building dataset from {len(images)} images...")
    features = Features({"image": HFImage(), "response": Value("string")})
    ds = Dataset.from_dict({"image": images, "response": responses}, features=features)
    split = ds.train_test_split(test_size=0.1, seed=42)
    split.push_to_hub(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(
        f"Pushed: {len(split['train'])} train + {len(split['test'])} test"
        f" → {HF_DATASET_REPO}"
    )
# ─── Step 2: train ────────────────────────────────────────────────────────────
@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=14400,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
    },
    memory=32768,
)
def train():
    """Fine-tune ``BASE_MODEL`` with QLoRA and publish the LoRA adapter.

    Steps, in order:
      1. Load the train/test splits from ``HF_DATASET_REPO``.
      2. Load the base model in 4-bit NF4 together with tokenizer and processor.
      3. Pre-process every sample into tensors (prompt tokens are masked with
         -100 in ``labels`` so only the response contributes to the loss).
      4. Wrap the model with LoRA adapters and train with ``Trainer``.
      5. Save the adapter to ``/output/minicpm-v-lora`` and upload it plus a
         model card to ``HF_REPO``.

    Raises:
        RuntimeError: if the processor cannot be loaded, or if no training
            sample survives pre-processing.
    """
    import torch
    import torch.nn.functional as F
    from datasets import load_dataset
    from huggingface_hub import HfApi
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from torch.utils.data import Dataset as TorchDataset
    from transformers import (
        AutoModelForMultimodalLM,
        AutoProcessor,
        AutoTokenizer,
        BitsAndBytesConfig,
        Trainer,
        TrainingArguments,
    )

    # ── Load dataset ──────────────────────────────────────────────────────────
    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")

    # ── Load model with 4-bit QLoRA ───────────────────────────────────────────
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    raw_model = AutoModelForMultimodalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        trust_remote_code=True,
        dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    print(f"Loaded model class: {raw_model.__class__.__name__}")

    # ── Load the processor ────────────────────────────────────────────────────
    # Pre-processing below cannot work without the processor, so fail fast.
    # Continuing with processor=None would make every sample raise inside the
    # skip-on-error loops and surface much later as an empty dataset.
    try:
        processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
    except Exception as e:
        raise RuntimeError(
            f"AutoProcessor could not be loaded for {BASE_MODEL}; "
            "it is required to expand image placeholders"
        ) from e
    print(f"Loaded processor: {processor.__class__.__name__}")

    # Diagnostics only: report which image placeholder token the tokenizer
    # knows (MiniCPM-V uses (<image>./</image>) or <image>).
    image_token = None
    vocab = tokenizer.get_vocab()
    for candidate in ["(<image>./</image>)", "<image>", "<IMAGE>", "[IMAGE]"]:
        if candidate in vocab:
            image_token = candidate
            break
    # Fallback: scan tokenizer's special/added tokens
    if image_token is None:
        for token in list(tokenizer.special_tokens_map.values()) + list(tokenizer.added_tokens_encoder.keys()):
            if isinstance(token, str) and "image" in token.lower():
                image_token = token
                break
    print(f"Image placeholder token: {image_token!r}")
    print(f"Special tokens: {tokenizer.special_tokens_map}")

    # ── Pre-process ALL data before PEFT wrapping ─────────────────────────────
    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"

    def preprocess_one(image, response, debug=False):
        """Turn one (image, response) pair into model-ready tensors.

        Returns a dict with ``input_ids``, ``attention_mask``, ``labels`` and
        every vision field the processor produced.

        Raises:
            ValueError: if the prompt alone fills ``MAX_SEQ_LENGTH``.
        """
        image = image.convert("RGB")
        # MUST use processor.apply_chat_template with {"type": "image"} --
        # NOT tokenizer.apply_chat_template with a "<image>" string.
        # Only the processor knows to expand {"type":"image"} into the correct
        # number of <|image_pad|> tokens; the tokenizer leaves a bare <image>
        # placeholder and the model then finds tokens:0, features:N mismatch.
        msgs = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ]},
        ]
        text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        # max_slice_nums=6: balance context vs OOM for portrait invoices
        proc_out = processor(
            text=text,
            images=[image],
            return_tensors="pt",
            max_slice_nums=6,
        )
        if debug:
            print("=== PROCESSOR OUTPUT SHAPES (first sample) ===")
            for k, v in proc_out.items():
                if isinstance(v, torch.Tensor):
                    print(f" {k}: shape={list(v.shape)}, dtype={v.dtype}")
                elif isinstance(v, list):
                    item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:3]]
                    print(f" {k}: list[{len(v)}] = {item_info}")
                else:
                    print(f" {k}: {type(v).__name__} = {v!r}")

        prompt_ids = proc_out["input_ids"][0]
        # Truncating below would otherwise cut image placeholder tokens and
        # leave every label masked (-100), which yields a placeholder/feature
        # mismatch or a NaN loss. Reject the sample; the caller skips it.
        if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
            raise ValueError(
                f"prompt is {prompt_ids.size(0)} tokens and leaves no room for "
                f"the response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
            )

        # pixel_values: processor returns (1, 3, 14, W) WITH batch dim.
        # target_sizes: returned as (N_tiles, 2) with NO batch dim.
        # Strip batch dim only where it exists (shape[0]==1).
        vision_fields = {}
        for k, v in proc_out.items():
            if k in ("input_ids", "attention_mask"):
                continue
            if isinstance(v, torch.Tensor):
                if k == "pixel_values":
                    vision_fields[k] = v[0]  # (1,3,14,W) -> (3,14,W); collator stacks to (B,3,14,W)
                elif v.shape[0] == 1:
                    vision_fields[k] = v[0]  # strip batch-1 wrapper from metadata scalars
                else:
                    vision_fields[k] = v  # e.g. target_sizes (N_tiles,2) -- no batch dim
            elif isinstance(v, list) and len(v) == 1:
                vision_fields[k] = v[0]
            else:
                vision_fields[k] = v
        if debug:
            print("=== VISION FIELDS AFTER PROCESSING ===")
            for k, v in vision_fields.items():
                if isinstance(v, torch.Tensor):
                    print(f" {k}: shape={list(v.shape)}")
                else:
                    print(f" {k}: {type(v).__name__} = {v!r}")

        response_ids = torch.tensor(
            tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False),
            dtype=torch.long,
        )
        full_ids = torch.cat([prompt_ids, response_ids])
        # Mask the prompt so only response tokens contribute to the loss.
        labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])
        full_ids = full_ids[:MAX_SEQ_LENGTH]
        labels = labels[:MAX_SEQ_LENGTH]
        return {
            "input_ids": full_ids,
            "attention_mask": torch.ones_like(full_ids),
            "labels": labels,
            **vision_fields,  # pixel_values, image_sizes, etc.
        }

    print("Pre-processing training data...")
    train_data, eval_data = [], []
    for i, s in enumerate(hf_ds["train"]):
        try:
            train_data.append(preprocess_one(s["image"], s["response"], debug=(i == 0)))
        except Exception as e:
            print(f" Skipping train[{i}]: {e}")
    for i, s in enumerate(hf_ds["test"]):
        try:
            eval_data.append(preprocess_one(s["image"], s["response"]))
        except Exception as e:
            print(f" Skipping eval[{i}]: {e}")
    print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")
    if not train_data:
        # Surface the real problem here rather than as an opaque Trainer error.
        raise RuntimeError(
            "No training samples survived pre-processing; see the 'Skipping' messages above"
        )

    # ── PEFT wrapping ─────────────────────────────────────────────────────────
    model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)
    # task_type=None -> base PeftModel; avoids requiring prepare_inputs_for_generation
    lora_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_RANK * 2,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=None,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # ── Dataset (just wraps pre-processed list) ───────────────────────────────
    class PreprocessedDataset(TorchDataset):
        """Map-style dataset over a list of already pre-processed samples."""

        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]

    train_dataset = PreprocessedDataset(train_data)
    eval_dataset = PreprocessedDataset(eval_data)

    # ── Collator: pad to batch max length ─────────────────────────────────────
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def collate_fn(batch):
        """Right-pad the text tensors to the batch maximum and merge vision fields."""
        max_len = max(b["input_ids"].size(0) for b in batch)
        result = {}
        for b in batch:
            pad = max_len - b["input_ids"].size(0)
            result.setdefault("input_ids", []).append(F.pad(b["input_ids"], (0, pad), value=pad_id))
            result.setdefault("attention_mask", []).append(F.pad(b["attention_mask"], (0, pad), value=0))
            result.setdefault("labels", []).append(F.pad(b["labels"], (0, pad), value=-100))
        result = {k: torch.stack(v) for k, v in result.items()}
        # Pass through every vision field.
        # pixel_values (3,14,W) per sample -> stack -> (B,3,14,W) [4D for conv2d]
        # target_sizes (N_tiles,2) per sample -> cat -> (total_tiles,2) [no extra batch dim]
        extra_keys = [k for k in batch[0] if k not in result]
        for k in extra_keys:
            vals = [b[k] for b in batch]
            if k == "target_sizes":
                result[k] = torch.cat(vals, dim=0)  # (total_tiles, 2)
            else:
                try:
                    result[k] = torch.stack(vals)
                except (RuntimeError, TypeError):
                    # Non-tensor or ragged values are handed over as a plain list.
                    result[k] = vals
        return result

    # ── Debug Trainer: print input shapes on first batch ─────────────────────
    # One-element list so the nested method can flip the flag without `nonlocal`.
    _debug_step_done = [False]

    class DebugTrainer(Trainer):
        """Trainer that prints the input shapes of the first batch only."""

        def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
            if not _debug_step_done[0]:
                _debug_step_done[0] = True
                print("=== MODEL INPUT SHAPES (first batch) ===")
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        print(f" {k}: shape={list(v.shape)}, dtype={v.dtype}")
                    elif isinstance(v, list):
                        item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:2]]
                        print(f" {k}: list[{len(v)}] = {item_info}")
                    else:
                        print(f" {k}: {type(v).__name__} = {v!r}")
            return super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch, **kwargs)

    # ── Training ──────────────────────────────────────────────────────────────
    training_args = TrainingArguments(
        output_dir="/output/minicpm-v-sft",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        warmup_steps=10,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        bf16=True,
        fp16=False,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
        # Keep the vision fields: they are not named in the model signature
        # the Trainer would otherwise filter columns against.
        remove_unused_columns=False,
        dataloader_num_workers=0,  # data already pre-processed; no workers needed
    )
    trainer = DebugTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collate_fn,
    )
    trainer.train()
    print("Training complete")

    # ── Save LoRA adapter ─────────────────────────────────────────────────────
    model.save_pretrained("/output/minicpm-v-lora")
    tokenizer.save_pretrained("/output/minicpm-v-lora")
    print("LoRA adapter saved to /output/minicpm-v-lora")

    # ── Publish adapter to HF Hub ─────────────────────────────────────────────
    api = HfApi(token=os.environ["HF_TOKEN"])
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
    adapter_dir = Path("/output/minicpm-v-lora")
    for f in adapter_dir.iterdir():
        if f.is_file():
            print(f"Uploading {f.name}...")
            api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)

    model_card = f"""---
license: apache-2.0
base_model: {BASE_MODEL}
datasets:
- {HF_DATASET_REPO}
tags:
- invoice-extraction
- indian-fmcg
- minicpm-v
- ocr
- qlora
- peft
---
# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)
Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
handwritten bills).
## Usage
```python
from transformers import AutoModelForMultimodalLM, AutoTokenizer
from peft import PeftModel
base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
model = PeftModel.from_pretrained(base, "{HF_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
```
## Training
- Base model: {BASE_MODEL}
- Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — 500 synthetic Indian invoices
- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
"""
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=HF_REPO,
    )
    print(f"Published to {HF_REPO}")
# ─── Dry-run: verify preprocessing + first batch shapes without training ──────
# Usage: modal run finetune/train_minicpm_v.py::main_dryrun
# Completes in ~2 min; confirms shapes are correct before a full training run.
@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=600,
    secrets=[HF_SECRET],
    memory=32768,
)
def dryrun():
    """Run one labeled forward pass on the first training sample, without training.

    Builds the prompt exactly as training does, prints the processor output
    shapes (including the ``<|image_pad|>`` count in ``input_ids``), then feeds
    prompt + response through the un-adapted base model and checks the loss.

    Raises:
        RuntimeError: if the prompt alone fills ``MAX_SEQ_LENGTH``, if the
            forward pass returns no loss, or if the loss is not finite.
    """
    import torch
    from datasets import load_dataset
    from transformers import AutoModelForMultimodalLM, AutoTokenizer, AutoProcessor

    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    raw_model = AutoModelForMultimodalLM.from_pretrained(BASE_MODEL, trust_remote_code=True, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

    sample = hf_ds["train"][0]
    image = sample["image"].convert("RGB")
    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
    msgs = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]},
    ]
    text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
    proc_out = processor(text=text, images=[image], return_tensors="pt", max_slice_nums=6)

    print("=== DRY-RUN: processor output ===")
    image_pad_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
    for k, v in proc_out.items():
        if not isinstance(v, torch.Tensor):
            continue
        pad_info = ""
        # Only count when the token id is known; `tensor == None` is not a tensor.
        if k == "input_ids" and image_pad_id is not None:
            count = (v == image_pad_id).sum().item()
            pad_info = f" (<|image_pad|> count={count})"
        print(f" {k}: shape={list(v.shape)}{pad_info}")

    response_ids = torch.tensor(
        tokenizer.encode(sample["response"] + tokenizer.eos_token, add_special_tokens=False),
        dtype=torch.long,
    )
    prompt_ids = proc_out["input_ids"][0]
    # Truncation below would cut image placeholders and mask every label;
    # report that directly instead of letting it show up as a NaN loss.
    if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
        raise RuntimeError(
            f"Prompt is {prompt_ids.size(0)} tokens and leaves no room for the "
            f"response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
        )
    full_ids = torch.cat([prompt_ids, response_ids])[:MAX_SEQ_LENGTH]
    # Prompt positions are masked with -100 so only the response is scored.
    labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:MAX_SEQ_LENGTH]

    model_inputs = {
        "input_ids": full_ids.unsqueeze(0),
        "attention_mask": torch.ones_like(full_ids).unsqueeze(0),
        "labels": labels.unsqueeze(0),
    }
    # Forward every remaining processor field (the vision inputs) unchanged.
    for k, v in proc_out.items():
        if k not in ("input_ids", "attention_mask"):
            model_inputs[k] = v
    device = next(raw_model.parameters()).device
    model_inputs = {
        k: v.to(device) if isinstance(v, torch.Tensor) else v
        for k, v in model_inputs.items()
    }

    raw_model.eval()
    with torch.no_grad():
        outputs = raw_model(**model_inputs)
    if getattr(outputs, "loss", None) is None:
        raise RuntimeError(
            f"Dry-run forward did not return loss. Output keys: {list(outputs.keys())}"
        )
    loss = outputs.loss.detach().float()
    # A NaN/inf loss means the labeled forward pass is NOT correct, so the
    # dry run must not report success.
    if not torch.isfinite(loss).all():
        raise RuntimeError(f"Dry-run forward produced a non-finite loss: {loss.item()}")
    print(f"Dry-run forward loss: {loss.item():.4f}")
    print("Dry-run complete - shapes and labeled forward pass look correct")
# ─── Local entrypoints ─────────────────────────────────────────────────────────
@app.local_entrypoint()
def main():
    """Default local entrypoint: invoke the `train` function remotely."""
    train.remote()
@app.local_entrypoint()
def main_push():
    """Local entrypoint for step 1: invoke `push_dataset` remotely."""
    push_dataset.remote()
@app.local_entrypoint()
def main_dryrun():
    """Local entrypoint for the shape check: invoke `dryrun` remotely."""
    dryrun.remote()