Spaces:

build-small-hackathon
/

kirana-detective

Sleeping

App Files Files Community

kirana-detective / finetune /train_minicpm_v.py

naazimsnh02

All models training uploaded

9d75c8c 8 days ago

Raw

History Blame

21.9 kB

	"""
	Task 0.2 (Part 2) — Fine-tune MiniCPM-V 4.6 on synthetic Indian invoices.

	Two-step workflow:
	Step 1 — Upload dataset to HF Hub (run once, after generate_invoices.py):
	modal run finetune/train_minicpm_v.py::main_push

	Step 2 — Train:
	modal run finetune/train_minicpm_v.py::main

	Publishes:
	build-small-hackathon/kirana-invoice-train-data (HF dataset, reusable)
	naazimsnh02/minicpm-v-4-6-indian-invoice-extraction (LoRA adapter + tokenizer + model card; the target is HF_REPO — this script does not export a GGUF)

	Training approach:
	QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
	(unsloth is incompatible with MiniCPM-V-4.6 which requires transformers>=5.7.0)
	System prompt: SYSTEM_PROMPT (invoice extraction assistant; reply with the JSON object only)
	User turn: <image> + "Extract all invoice fields as JSON. Use this schema:" followed by JSON_SCHEMA
	Assistant turn: <annotation JSON>
	"""

	import json
	import os
	from pathlib import Path

	import modal

	# Modal application object; every remote function and local entrypoint in
	# this file is registered on it.
	app = modal.App("kirana-minicpm-v-finetune")

	# Python packages installed into the container image. transformers is held
	# at >=5.7.0 because MiniCPM-V-4.6 requires it (see the module docstring).
	_PIP_PACKAGES = (
	    "huggingface_hub>=0.30.0",
	    "datasets>=3.0.0",
	    "torch>=2.3.0",
	    "torchvision>=0.18.0",
	    "transformers[torch]>=5.7.0",
	    "peft>=0.14.0",
	    "bitsandbytes>=0.43.0",
	    "accelerate>=0.28.0",
	    "trl>=0.9.0",
	    "pillow>=10.0.0",
	    "sentencepiece>=0.2.0",
	    "timm>=0.9.0",
	)

	# Container image shared by the upload, training and dry-run functions.
	IMAGE = (
	    modal.Image.debian_slim(python_version="3.11")
	    .apt_install("libsm6", "libxext6", "git")
	    .pip_install(*_PIP_PACKAGES)
	)

	# Modal secret; the remote functions read HF_TOKEN from the environment.
	HF_SECRET = modal.Secret.from_name("hf-secret")

	# Hugging Face identifiers: base checkpoint, adapter target repo, dataset repo.
	BASE_MODEL = "openbmb/MiniCPM-V-4.6"
	HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
	HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"

	# JSONL annotation file on the Modal volume mounted at /data.
	ANNOTATIONS_PATH = "/data/synthetic_invoices/annotations.jsonl"

	# Fine-tuning hyperparameters.
	LORA_RANK = 16
	MAX_SEQ_LENGTH = 2048
	EPOCHS = 3
	BATCH_SIZE = 1
	GRAD_ACCUM = 16
	LEARNING_RATE = 1e-4

	# System message placed in front of every training / dry-run prompt.
	SYSTEM_PROMPT = " ".join(
	    [
	        "You are an invoice extraction assistant.",
	        "Given an invoice image, extract all fields as valid JSON.",
	        "Return ONLY the JSON object, no explanation.",
	    ]
	)

	# Schema text appended to the user instruction (a prompt hint, not parsed here).
	JSON_SCHEMA = """{
	"invoice_number": "string",
	"supplier": "string",
	"date": "YYYY-MM-DD",
	"items": [
	{
	"product_raw": "string",
	"quantity": number,
	"unit_price": number,
	"gst_rate": number,
	"line_total": number
	}
	],
	"grand_total": number,
	"extraction_warnings": []
	}"""


	# ─── Step 1: upload dataset ────────────────────────────────────────────────────

	@app.function(
	    image=IMAGE,
	    timeout=3600,
	    secrets=[HF_SECRET],
	    volumes={"/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True)},
	    memory=8192,
	)
	def push_dataset():
	    """Build a HF dataset from the Modal volume and push it to the Hub.

	    Reads ANNOTATIONS_PATH (one JSON object per line, each with an "image"
	    path relative to /data and an "invoice" payload), loads every image that
	    exists on the volume, and pushes a 90/10 train/test split (seed 42) to
	    HF_DATASET_REPO using the HF_TOKEN environment variable.

	    Raises:
	        RuntimeError: if fewer than two usable samples were found, because a
	            train/test split cannot be built from them.
	    """
	    from PIL import Image as PILImage
	    from datasets import Dataset, Features, Value
	    from datasets import Image as HFImage

	    annotations = []
	    # Explicit UTF-8: the invoice payloads are re-serialized below with
	    # ensure_ascii=False, so non-ASCII text must not depend on the locale.
	    with open(ANNOTATIONS_PATH, encoding="utf-8") as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                # Blank / trailing lines are common in JSONL and are not records.
	                continue
	            ann = json.loads(line)
	            if os.path.exists(os.path.join("/data", ann["image"])):
	                annotations.append(ann)

	    print(f"Found {len(annotations)} images — loading...")
	    images, responses = [], []
	    for ann in annotations:
	        try:
	            # The context manager releases the file handle; convert() returns
	            # an independent, fully loaded copy that outlives it.
	            with PILImage.open(os.path.join("/data", ann["image"])) as raw:
	                img = raw.convert("RGB")
	            response = json.dumps(ann["invoice"], ensure_ascii=False)
	        except Exception as e:
	            # Best effort: one unreadable image must not abort the upload.
	            print(f" Skipping {ann['image']}: {e}")
	            continue
	        # Append both together so images and responses stay aligned.
	        images.append(img)
	        responses.append(response)

	    if len(images) < 2:
	        raise RuntimeError(
	            f"Only {len(images)} usable sample(s) found via {ANNOTATIONS_PATH}; "
	            "need at least 2 to build a train/test split."
	        )

	    print(f"Building dataset from {len(images)} images...")
	    features = Features({"image": HFImage(), "response": Value("string")})
	    ds = Dataset.from_dict({"image": images, "response": responses}, features=features)

	    split = ds.train_test_split(test_size=0.1, seed=42)
	    split.push_to_hub(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
	    print(
	        f"Pushed: {len(split['train'])} train + {len(split['test'])} test"
	        f" → {HF_DATASET_REPO}"
	    )


	# ─── Step 2: train ────────────────────────────────────────────────────────────

	@app.function(
	    image=IMAGE,
	    gpu="A10G",
	    timeout=14400,
	    secrets=[HF_SECRET],
	    volumes={
	        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
	    },
	    memory=32768,
	)
	def train():
	    """QLoRA fine-tune BASE_MODEL on HF_DATASET_REPO and publish the adapter.

	    Pipeline: load the dataset, load the base model in 4-bit NF4, pre-process
	    every sample into tensors (prompt tokens masked with -100 in the labels),
	    wrap the model with LoRA, train with the HF Trainer, save the adapter to
	    /output/minicpm-v-lora and upload it plus a model card to HF_REPO.

	    Raises:
	        RuntimeError: if the processor cannot be loaded, or if no train or no
	            eval samples survive pre-processing. Both are raised before any
	            training or publishing happens.
	    """
	    import torch
	    import torch.nn.functional as F
	    from datasets import load_dataset
	    from huggingface_hub import HfApi
	    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
	    from torch.utils.data import Dataset as TorchDataset
	    from transformers import (
	        AutoModelForMultimodalLM,
	        AutoTokenizer,
	        BitsAndBytesConfig,
	        Trainer,
	        TrainingArguments,
	    )

	    def _describe(value, *, with_dtype=True, list_preview=None):
	        """Return a one-line summary of a tensor, list or other value for debug logs."""
	        if isinstance(value, torch.Tensor):
	            dtype_part = f", dtype={value.dtype}" if with_dtype else ""
	            return f"shape={list(value.shape)}{dtype_part}"
	        if list_preview is not None and isinstance(value, list):
	            item_info = [
	                f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x)
	                for x in value[:list_preview]
	            ]
	            return f"list[{len(value)}] = {item_info}"
	        return f"{type(value).__name__} = {value!r}"

	    # ── Load dataset ──────────────────────────────────────────────────────────
	    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
	    print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")

	    # ── Load model with 4-bit QLoRA ───────────────────────────────────────────
	    bnb_config = BitsAndBytesConfig(
	        load_in_4bit=True,
	        bnb_4bit_quant_type="nf4",
	        bnb_4bit_compute_dtype=torch.bfloat16,
	        bnb_4bit_use_double_quant=True,
	    )

	    raw_model = AutoModelForMultimodalLM.from_pretrained(
	        BASE_MODEL,
	        quantization_config=bnb_config,
	        trust_remote_code=True,
	        dtype=torch.bfloat16,
	        device_map="auto",
	    )
	    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
	    print(f"Loaded model class: {raw_model.__class__.__name__}")

	    # ── Load the processor (required) ─────────────────────────────────────────
	    # Pre-processing below cannot work without it. Fail here instead of letting
	    # every sample be skipped later and continuing with an empty dataset.
	    try:
	        from transformers import AutoProcessor
	        processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
	    except Exception as e:
	        raise RuntimeError(
	            f"AutoProcessor could not be loaded for {BASE_MODEL}; "
	            "image pre-processing is impossible without it."
	        ) from e
	    print(f"Loaded processor: {processor.__class__.__name__}")

	    # Diagnostic only: report which image placeholder token the tokenizer knows
	    # (MiniCPM-V uses (<image>./</image>) or <image>). Not used for training.
	    image_token = None
	    vocab = tokenizer.get_vocab()
	    for candidate in ["(<image>./</image>)", "<image>", "<IMAGE>", "[IMAGE]"]:
	        if candidate in vocab:
	            image_token = candidate
	            break
	    # Fallback: scan tokenizer's special/added tokens
	    if image_token is None:
	        for token in list(tokenizer.special_tokens_map.values()) + list(tokenizer.added_tokens_encoder.keys()):
	            if isinstance(token, str) and "image" in token.lower():
	                image_token = token
	                break
	    print(f"Image placeholder token: {image_token!r}")
	    print(f"Special tokens: {tokenizer.special_tokens_map}")

	    # ── Pre-process ALL data before PEFT wrapping ─────────────────────────────
	    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"

	    def preprocess_one(image, response, debug=False):
	        """Turn one (image, response JSON string) pair into model-ready tensors.

	        Returns a dict with input_ids, attention_mask, labels (prompt positions
	        set to -100) and every vision field the processor produced.

	        Raises:
	            ValueError: if the prompt alone reaches MAX_SEQ_LENGTH, so that
	                truncation would leave no supervised tokens.
	        """
	        image = image.convert("RGB")

	        # MUST use processor.apply_chat_template with {"type": "image"} —
	        # NOT tokenizer.apply_chat_template with a "<image>" string.
	        # Only the processor knows to expand {"type":"image"} into the correct
	        # number of <|image_pad|> tokens; the tokenizer leaves a bare <image>
	        # placeholder and the model then finds tokens:0, features:N mismatch.
	        msgs = [
	            {"role": "system", "content": SYSTEM_PROMPT},
	            {"role": "user", "content": [
	                {"type": "image"},
	                {"type": "text", "text": instruction},
	            ]},
	        ]
	        text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

	        # max_slice_nums=6: balance context vs OOM for portrait invoices
	        proc_out = processor(
	            text=text,
	            images=[image],
	            return_tensors="pt",
	            max_slice_nums=6,
	        )

	        if debug:
	            print("=== PROCESSOR OUTPUT SHAPES (first sample) ===")
	            for k, v in proc_out.items():
	                print(f" {k}: {_describe(v, list_preview=3)}")

	        prompt_ids = proc_out["input_ids"][0]

	        # The truncation below keeps only the first MAX_SEQ_LENGTH tokens. If
	        # the prompt alone fills that budget, the sample would have no
	        # supervised tokens and image placeholders could be cut while their
	        # vision features remain. Reject the sample instead.
	        if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
	            raise ValueError(
	                f"prompt is {prompt_ids.size(0)} tokens, which leaves no room for "
	                f"the response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
	            )

	        # pixel_values: processor returns (1, 3, 14, W) WITH batch dim.
	        # target_sizes: returned as (N_tiles, 2) with NO batch dim.
	        # Strip batch dim only where it exists (shape[0]==1).
	        vision_fields = {}
	        for k, v in proc_out.items():
	            if k in ("input_ids", "attention_mask"):
	                continue
	            if isinstance(v, torch.Tensor):
	                if k == "pixel_values" or v.shape[0] == 1:
	                    # pixel_values (1,3,14,W) → (3,14,W); collator stacks to (B,3,14,W).
	                    # Other batch-1 tensors lose their wrapper dim the same way.
	                    vision_fields[k] = v[0]
	                else:
	                    vision_fields[k] = v  # e.g. target_sizes (N_tiles,2) — no batch dim
	            elif isinstance(v, list) and len(v) == 1:
	                vision_fields[k] = v[0]
	            else:
	                vision_fields[k] = v

	        if debug:
	            print("=== VISION FIELDS AFTER PROCESSING ===")
	            for k, v in vision_fields.items():
	                print(f" {k}: {_describe(v, with_dtype=False)}")

	        response_ids = torch.tensor(
	            tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False),
	            dtype=torch.long,
	        )

	        # Supervise only the response: prompt positions get the ignore index.
	        full_ids = torch.cat([prompt_ids, response_ids])[:MAX_SEQ_LENGTH]
	        labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:MAX_SEQ_LENGTH]

	        return {
	            "input_ids": full_ids,
	            "attention_mask": torch.ones_like(full_ids),
	            "labels": labels,
	            **vision_fields,  # pixel_values, image_sizes, etc.
	        }

	    print("Pre-processing training data...")
	    train_data, eval_data = [], []
	    for i, s in enumerate(hf_ds["train"]):
	        try:
	            train_data.append(preprocess_one(s["image"], s["response"], debug=(i == 0)))
	        except Exception as e:
	            # Best effort per sample; the emptiness check below catches
	            # systematic failures.
	            print(f" Skipping train[{i}]: {e}")
	    for i, s in enumerate(hf_ds["test"]):
	        try:
	            eval_data.append(preprocess_one(s["image"], s["response"]))
	        except Exception as e:
	            print(f" Skipping eval[{i}]: {e}")
	    print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")

	    # Never train on nothing or publish an untrained adapter. The eval set must
	    # be non-empty as well, because load_best_model_at_end needs eval metrics.
	    if not train_data or not eval_data:
	        raise RuntimeError(
	            f"Pre-processing left {len(train_data)} train and {len(eval_data)} eval "
	            "samples; both must be non-empty. See the 'Skipping' messages above."
	        )

	    # ── PEFT wrapping ─────────────────────────────────────────────────────────
	    model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)

	    # task_type=None → base PeftModel; avoids requiring prepare_inputs_for_generation
	    lora_config = LoraConfig(
	        r=LORA_RANK,
	        lora_alpha=LORA_RANK * 2,
	        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
	        lora_dropout=0.05,
	        bias="none",
	        task_type=None,
	    )
	    model = get_peft_model(model, lora_config)
	    model.print_trainable_parameters()

	    # ── Dataset (just wraps pre-processed list) ───────────────────────────────
	    class PreprocessedDataset(TorchDataset):
	        """Map-style dataset over an in-memory list of pre-processed samples."""

	        def __init__(self, data):
	            self.data = data

	        def __len__(self):
	            return len(self.data)

	        def __getitem__(self, idx):
	            return self.data[idx]

	    train_dataset = PreprocessedDataset(train_data)
	    eval_dataset = PreprocessedDataset(eval_data)

	    # ── Collator: pad to batch max length ─────────────────────────────────────
	    # Falls back to id 0 when the tokenizer defines no pad token; padded
	    # positions are masked out by attention_mask and labels either way.
	    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

	    def collate_fn(batch):
	        """Right-pad the text tensors to the batch maximum and merge vision fields."""
	        max_len = max(b["input_ids"].size(0) for b in batch)
	        padded = {"input_ids": [], "attention_mask": [], "labels": []}
	        for b in batch:
	            pad = max_len - b["input_ids"].size(0)
	            padded["input_ids"].append(F.pad(b["input_ids"], (0, pad), value=pad_id))
	            padded["attention_mask"].append(F.pad(b["attention_mask"], (0, pad), value=0))
	            padded["labels"].append(F.pad(b["labels"], (0, pad), value=-100))
	        result = {k: torch.stack(v) for k, v in padded.items()}

	        # Pass through every vision field.
	        # pixel_values (3,14,W) per sample → stack → (B,3,14,W) [4D for conv2d]
	        # target_sizes (N_tiles,2) per sample → cat → (total_tiles,2) [no extra batch dim]
	        extra_keys = [k for k in batch[0] if k not in result]
	        for k in extra_keys:
	            vals = [b[k] for b in batch]
	            if k == "target_sizes":
	                result[k] = torch.cat(vals, dim=0)  # (total_tiles, 2)
	            else:
	                try:
	                    result[k] = torch.stack(vals)
	                except (RuntimeError, TypeError):
	                    # Ragged or non-tensor values are handed over as a plain list.
	                    result[k] = vals
	        return result

	    # ── Debug Trainer: print input shapes on first batch ─────────────────────
	    class DebugTrainer(Trainer):
	        """Trainer that logs the input shapes of the first batch it sees."""

	        _shapes_printed = False

	        def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
	            if not self._shapes_printed:
	                self._shapes_printed = True
	                print("=== MODEL INPUT SHAPES (first batch) ===")
	                for k, v in inputs.items():
	                    print(f" {k}: {_describe(v, list_preview=2)}")
	            return super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch, **kwargs)

	    # ── Training ──────────────────────────────────────────────────────────────
	    training_args = TrainingArguments(
	        output_dir="/output/minicpm-v-sft",
	        per_device_train_batch_size=BATCH_SIZE,
	        per_device_eval_batch_size=BATCH_SIZE,
	        gradient_accumulation_steps=GRAD_ACCUM,
	        warmup_steps=10,
	        num_train_epochs=EPOCHS,
	        learning_rate=LEARNING_RATE,
	        bf16=True,
	        fp16=False,
	        logging_steps=10,
	        eval_strategy="epoch",
	        save_strategy="epoch",
	        load_best_model_at_end=True,
	        report_to="none",
	        remove_unused_columns=False,  # keep the vision fields the collator needs
	        dataloader_num_workers=0,  # data already pre-processed; no workers needed
	    )

	    trainer = DebugTrainer(
	        model=model,
	        args=training_args,
	        train_dataset=train_dataset,
	        eval_dataset=eval_dataset,
	        data_collator=collate_fn,
	    )
	    trainer.train()
	    print("Training complete")

	    # ── Save LoRA adapter ─────────────────────────────────────────────────────
	    model.save_pretrained("/output/minicpm-v-lora")
	    tokenizer.save_pretrained("/output/minicpm-v-lora")
	    print("LoRA adapter saved to /output/minicpm-v-lora")

	    # ── Publish adapter to HF Hub ─────────────────────────────────────────────
	    api = HfApi(token=os.environ["HF_TOKEN"])
	    api.create_repo(HF_REPO, repo_type="model", exist_ok=True)

	    adapter_dir = Path("/output/minicpm-v-lora")
	    for f in adapter_dir.iterdir():
	        if f.is_file():
	            print(f"Uploading {f.name}...")
	            api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)

	    model_card = f"""---
	license: apache-2.0
	base_model: {BASE_MODEL}
	datasets:
	- {HF_DATASET_REPO}
	tags:
	- invoice-extraction
	- indian-fmcg
	- minicpm-v
	- ocr
	- qlora
	- peft
	---

	# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)

	Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
	from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
	handwritten bills).

	## Usage

	```python
	from transformers import AutoModelForMultimodalLM, AutoTokenizer
	from peft import PeftModel

	base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
	model = PeftModel.from_pretrained(base, "{HF_REPO}")
	tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
	```

	## Training

	- Base model: {BASE_MODEL}
	- Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
	- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — 500 synthetic Indian invoices
	- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
	"""
	    api.upload_file(
	        path_or_fileobj=model_card.encode(),
	        path_in_repo="README.md",
	        repo_id=HF_REPO,
	    )
	    print(f"Published to {HF_REPO}")


	# ─── Dry-run: verify preprocessing + first batch shapes without training ──────
	# Usage: modal run finetune/train_minicpm_v.py::main_dryrun
	# Completes in ~2 min; confirms shapes are correct before a full training run.

	@app.function(
	    image=IMAGE,
	    gpu="A10G",
	    timeout=600,
	    secrets=[HF_SECRET],
	    memory=32768,
	)
	def dryrun():
	    """Run one labeled forward pass on the first training sample, without training.

	    Builds the same prompt as the training function, prints the processor
	    output shapes (plus the number of image placeholder tokens when the
	    tokenizer knows that token), and checks that the model returns a loss.

	    Raises:
	        RuntimeError: if the prompt alone reaches MAX_SEQ_LENGTH, or if the
	            forward pass does not return a loss.
	    """
	    import torch
	    from datasets import load_dataset
	    from transformers import AutoModelForMultimodalLM, AutoProcessor, AutoTokenizer

	    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
	    raw_model = AutoModelForMultimodalLM.from_pretrained(BASE_MODEL, trust_remote_code=True, device_map="auto")
	    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
	    processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

	    sample = hf_ds["train"][0]
	    image = sample["image"].convert("RGB")
	    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
	    msgs = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]},
	    ]
	    text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
	    proc_out = processor(text=text, images=[image], return_tensors="pt", max_slice_nums=6)

	    print("=== DRY-RUN: processor output ===")
	    # The token is spelled with plain pipes. The previous "\|" spelling kept the
	    # backslashes in the string and therefore never matched a vocabulary entry.
	    image_pad_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
	    # An unknown token resolves to None or the unk id; counting that would mislead.
	    pad_known = image_pad_id is not None and image_pad_id != tokenizer.unk_token_id
	    for k, v in proc_out.items():
	        if not isinstance(v, torch.Tensor):
	            continue
	        pad_info = ""
	        if k == "input_ids" and pad_known:
	            count = (v == image_pad_id).sum().item()
	            pad_info = f" (<|image_pad|> count={count})"
	        print(f" {k}: shape={list(v.shape)}{pad_info}")

	    response_ids = torch.tensor(
	        tokenizer.encode(sample["response"] + tokenizer.eos_token, add_special_tokens=False),
	        dtype=torch.long,
	    )
	    prompt_ids = proc_out["input_ids"][0]
	    # The truncation below would otherwise silently remove every supervised
	    # token (and possibly image placeholders); surface that here.
	    if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
	        raise RuntimeError(
	            f"Prompt is {prompt_ids.size(0)} tokens, which leaves no room for the "
	            f"response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
	        )
	    full_ids = torch.cat([prompt_ids, response_ids])[:MAX_SEQ_LENGTH]
	    labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:MAX_SEQ_LENGTH]

	    model_inputs = {
	        "input_ids": full_ids.unsqueeze(0),
	        "attention_mask": torch.ones_like(full_ids).unsqueeze(0),
	        "labels": labels.unsqueeze(0),
	    }
	    # Vision fields are forwarded exactly as the processor returned them.
	    for k, v in proc_out.items():
	        if k not in ("input_ids", "attention_mask"):
	            model_inputs[k] = v

	    device = next(raw_model.parameters()).device
	    model_inputs = {
	        k: v.to(device) if isinstance(v, torch.Tensor) else v
	        for k, v in model_inputs.items()
	    }
	    raw_model.eval()
	    with torch.no_grad():
	        outputs = raw_model(**model_inputs)
	    if getattr(outputs, "loss", None) is None:
	        raise RuntimeError(
	            f"Dry-run forward did not return loss. Output keys: {list(outputs.keys())}"
	        )
	    print(f"Dry-run forward loss: {outputs.loss.detach().float().item():.4f}")
	    print("Dry-run complete - shapes and labeled forward pass look correct")


	# ─── Local entrypoints ─────────────────────────────────────────────────────────

	@app.local_entrypoint()
	def main():
	    """Local entrypoint: invoke the remote training function."""
	    train.remote()


	@app.local_entrypoint()
	def main_push():
	    """Local entrypoint: invoke the remote dataset upload function."""
	    push_dataset.remote()


	@app.local_entrypoint()
	def main_dryrun():
	    """Local entrypoint: invoke the remote dry-run function."""
	    dryrun.remote()