"""
Task 0.2 (Part 2) — Fine-tune MiniCPM-V 4.6 on synthetic Indian invoices.

Two-step workflow:
  Step 1 — Upload dataset to HF Hub (run once, after generate_invoices.py):
      modal run finetune/train_minicpm_v.py::main_push
  Step 2 — Train:
      modal run finetune/train_minicpm_v.py::main
  Optional — verify preprocessing and one labeled forward pass:
      modal run finetune/train_minicpm_v.py::main_dryrun

Publishes:
  HF_DATASET_REPO  (build-small-hackathon/kirana-invoice-train-data; HF dataset, reusable)
  HF_REPO          (naazimsnh02/minicpm-v-4-6-indian-invoice-extraction; LoRA adapter
                    + tokenizer files + README — this script does not build a GGUF)

Training approach:
  QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
  (unsloth is incompatible with MiniCPM-V-4.6 which requires transformers>=5.7.0)

Conversation layout used for every sample:
  System prompt:  SYSTEM_PROMPT
  User turn:      invoice image + INSTRUCTION (which embeds JSON_SCHEMA)
  Assistant turn: the ground-truth invoice JSON, followed by EOS
"""

import json
import os
from pathlib import Path

import modal

app = modal.App("kirana-minicpm-v-finetune")

IMAGE = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("libsm6", "libxext6", "git")
    .pip_install(
        "huggingface_hub>=0.30.0",
        "datasets>=3.0.0",
        "torch>=2.3.0",
        "torchvision>=0.18.0",
        "transformers[torch]>=5.7.0",
        "peft>=0.14.0",
        "bitsandbytes>=0.43.0",
        "accelerate>=0.28.0",
        "trl>=0.9.0",
        "pillow>=10.0.0",
        "sentencepiece>=0.2.0",
        "timm>=0.9.0",
    )
)

HF_SECRET = modal.Secret.from_name("hf-secret")

BASE_MODEL = "openbmb/MiniCPM-V-4.6"
HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"
ANNOTATIONS_PATH = "/data/synthetic_invoices/annotations.jsonl"

LORA_RANK = 16
MAX_SEQ_LENGTH = 2048
EPOCHS = 3
BATCH_SIZE = 1
GRAD_ACCUM = 16
LEARNING_RATE = 1e-4

# Upper bound on image slices passed to the processor:
# balance context vs OOM for portrait invoices.
MAX_SLICE_NUMS = 6

SYSTEM_PROMPT = (
    "You are an invoice extraction assistant. "
    "Given an invoice image, extract all fields as valid JSON. "
    "Return ONLY the JSON object, no explanation."
)

JSON_SCHEMA = """{
  "invoice_number": "string",
  "supplier": "string",
  "date": "YYYY-MM-DD",
  "items": [
    {
      "product_raw": "string",
      "quantity": number,
      "unit_price": number,
      "gst_rate": number,
      "line_total": number
    }
  ],
  "grand_total": number,
  "extraction_warnings": []
}"""

# User-turn text shared by training and the dry-run so both see the same prompt.
INSTRUCTION = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"


# ─── Shared helpers (only called inside the remote functions) ─────────────────

def _build_messages():
    """Return the system + user chat messages for one invoice image.

    The user turn uses a ``{"type": "image"}`` content part so that
    ``processor.apply_chat_template`` can expand it into image placeholder
    tokens.
    """
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": INSTRUCTION},
        ]},
    ]


def _encode_response(tokenizer, response):
    """Tokenize ``response`` followed by the EOS token into a 1-D long tensor."""
    import torch  # only installed in the remote image

    token_ids = tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False)
    return torch.tensor(token_ids, dtype=torch.long)


def _build_labeled_sequence(prompt_ids, response_ids, max_length=MAX_SEQ_LENGTH):
    """Concatenate prompt and response ids and build the matching loss labels.

    Prompt positions are masked with -100 so only the response is supervised.
    Both tensors are truncated to ``max_length``.

    Returns:
        ``(full_ids, labels)`` — two 1-D tensors of equal length.

    Raises:
        ValueError: if the prompt alone fills ``max_length``. Truncating there
            would leave no supervised token (all labels -100) and could cut
            image placeholder tokens out of the prompt.
    """
    import torch  # only installed in the remote image

    prompt_len = prompt_ids.size(0)
    if prompt_len >= max_length:
        raise ValueError(
            f"prompt is {prompt_len} tokens, which leaves no room for the "
            f"response within MAX_SEQ_LENGTH={max_length}"
        )
    full_ids = torch.cat([prompt_ids, response_ids])[:max_length]
    labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:max_length]
    return full_ids, labels


def _extract_vision_fields(proc_out):
    """Return the non-text processor outputs with the batch-of-one wrapper removed.

    ``input_ids`` and ``attention_mask`` are skipped; the caller rebuilds them.
    ``target_sizes`` is always passed through untouched: the collator
    concatenates it along dim 0 and needs every sample to stay 2-D, including
    the case where an image produced a single tile (shape (1, 2)).
    """
    import torch  # only installed in the remote image

    vision_fields = {}
    for key, value in proc_out.items():
        if key in ("input_ids", "attention_mask"):
            continue
        if isinstance(value, torch.Tensor):
            if key == "target_sizes" or value.dim() == 0:
                vision_fields[key] = value
            elif key == "pixel_values" or value.shape[0] == 1:
                # Strip the leading batch dim; the collator stacks it back.
                vision_fields[key] = value[0]
            else:
                vision_fields[key] = value
        elif isinstance(value, list) and len(value) == 1:
            vision_fields[key] = value[0]
        else:
            vision_fields[key] = value
    return vision_fields


def _print_shapes(title, fields, *, show_dtype=True, list_preview=None):
    """Print ``title`` and then one diagnostic line per entry of ``fields``.

    Args:
        title: header line printed first.
        fields: mapping of name → value (tensors, lists, or anything else).
        show_dtype: append the dtype to tensor lines.
        list_preview: when set, lists are summarised by their first
            ``list_preview`` items; when ``None`` lists are printed with repr.
    """
    import torch  # only installed in the remote image

    print(title)
    for key, value in fields.items():
        if isinstance(value, torch.Tensor):
            dtype_info = f", dtype={value.dtype}" if show_dtype else ""
            print(f" {key}: shape={list(value.shape)}{dtype_info}")
        elif list_preview is not None and isinstance(value, list):
            item_info = [
                f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x)
                for x in value[:list_preview]
            ]
            print(f" {key}: list[{len(value)}] = {item_info}")
        else:
            print(f" {key}: {type(value).__name__} = {value!r}")


# ─── Step 1: upload dataset ────────────────────────────────────────────────────

@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={"/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True)},
    memory=8192,
)
def push_dataset():
    """Build a HF dataset from the Modal volume and push to Hub."""
    from PIL import Image as PILImage
    from datasets import Dataset, Features, Value
    from datasets import Image as HFImage

    annotations = []
    with open(ANNOTATIONS_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank / trailing lines in the JSONL file
            ann = json.loads(line)
            if os.path.exists(os.path.join("/data", ann["image"])):
                annotations.append(ann)

    print(f"Found {len(annotations)} images — loading...")
    images, responses = [], []
    for ann in annotations:
        # Best-effort: one unreadable image must not abort the whole upload.
        try:
            # convert() returns a fully loaded copy, so the file handle can be
            # released immediately instead of lingering until GC.
            with PILImage.open(os.path.join("/data", ann["image"])) as raw:
                img = raw.convert("RGB")
            response = json.dumps(ann["invoice"], ensure_ascii=False)
        except Exception as e:
            print(f" Skipping {ann['image']}: {e}")
            continue
        # Append both together so images and responses stay aligned.
        images.append(img)
        responses.append(response)

    print(f"Building dataset from {len(images)} images...")
    features = Features({"image": HFImage(), "response": Value("string")})
    ds = Dataset.from_dict({"image": images, "response": responses}, features=features)
    split = ds.train_test_split(test_size=0.1, seed=42)
    split.push_to_hub(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(
        f"Pushed: {len(split['train'])} train + {len(split['test'])} test"
        f" → {HF_DATASET_REPO}"
    )


# ─── Step 2: train ────────────────────────────────────────────────────────────

@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=14400,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
    },
    memory=32768,
)
def train():
    """QLoRA fine-tune BASE_MODEL on HF_DATASET_REPO and publish the adapter.

    Loads the dataset, pre-processes every sample up front, wraps the 4-bit
    model with LoRA, trains with the HF Trainer, saves the adapter to
    /output/minicpm-v-lora and uploads it (plus a README) to HF_REPO.

    Raises:
        RuntimeError: if no train or no eval sample survives pre-processing.
    """
    import torch
    import torch.nn.functional as F
    from datasets import load_dataset
    from huggingface_hub import HfApi
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from torch.utils.data import Dataset as TorchDataset
    from transformers import (
        AutoModelForMultimodalLM,
        AutoProcessor,
        AutoTokenizer,
        BitsAndBytesConfig,
        Trainer,
        TrainingArguments,
    )

    # ── Load dataset ──────────────────────────────────────────────────────────
    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")

    # ── Load model with 4-bit QLoRA ───────────────────────────────────────────
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )
    raw_model = AutoModelForMultimodalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        trust_remote_code=True,
        dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    print(f"Loaded model class: {raw_model.__class__.__name__}")

    # ── Load the processor ────────────────────────────────────────────────────
    # Pre-processing below cannot work without it, so a load failure must
    # abort here rather than surface later as every sample being skipped.
    processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
    print(f"Loaded processor: {processor.__class__.__name__}")

    # ── Diagnostics: report the image placeholder token ──────────────────────
    # The result is only printed; pre-processing relies on the processor.
    # NOTE(review): the two empty-string candidates look like placeholders
    # whose text was lost — confirm the intended tokens.
    image_token = None
    vocab = tokenizer.get_vocab()
    for candidate in ["(./)", "", "", "[IMAGE]"]:
        if candidate in vocab:
            image_token = candidate
            break
    # Fallback: scan tokenizer's special/added tokens
    if image_token is None:
        known_tokens = (
            list(tokenizer.special_tokens_map.values())
            + list(tokenizer.added_tokens_encoder.keys())
        )
        for token in known_tokens:
            if isinstance(token, str) and "image" in token.lower():
                image_token = token
                break
    print(f"Image placeholder token: {image_token!r}")
    print(f"Special tokens: {tokenizer.special_tokens_map}")

    # ── Pre-process ALL data before PEFT wrapping ─────────────────────────────
    def preprocess_one(image, response, debug=False):
        """Turn one (image, JSON response) pair into model-ready tensors."""
        image = image.convert("RGB")
        # MUST use processor.apply_chat_template with {"type": "image"} rather
        # than the tokenizer's template: only the processor expands the image
        # part into the correct number of <|image_pad|> tokens. Otherwise the
        # model finds a tokens:0, features:N mismatch.
        text = processor.apply_chat_template(
            _build_messages(), tokenize=False, add_generation_prompt=True
        )
        proc_out = processor(
            text=text,
            images=[image],
            return_tensors="pt",
            max_slice_nums=MAX_SLICE_NUMS,
        )
        if debug:
            _print_shapes(
                "=== PROCESSOR OUTPUT SHAPES (first sample) ===",
                proc_out,
                list_preview=3,
            )

        prompt_ids = proc_out["input_ids"][0]
        vision_fields = _extract_vision_fields(proc_out)
        if debug:
            _print_shapes(
                "=== VISION FIELDS AFTER PROCESSING ===",
                vision_fields,
                show_dtype=False,
            )

        response_ids = _encode_response(tokenizer, response)
        full_ids, labels = _build_labeled_sequence(prompt_ids, response_ids)
        return {
            "input_ids": full_ids,
            "attention_mask": torch.ones_like(full_ids),
            "labels": labels,
            **vision_fields,  # pixel_values, image_sizes, etc.
        }

    print("Pre-processing training data...")
    train_data, eval_data = [], []
    # Best-effort per sample: a bad image or an over-long prompt is skipped.
    for i, s in enumerate(hf_ds["train"]):
        try:
            train_data.append(preprocess_one(s["image"], s["response"], debug=(i == 0)))
        except Exception as e:
            print(f" Skipping train[{i}]: {e}")
    for i, s in enumerate(hf_ds["test"]):
        try:
            eval_data.append(preprocess_one(s["image"], s["response"]))
        except Exception as e:
            print(f" Skipping eval[{i}]: {e}")
    print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")
    if not train_data or not eval_data:
        # eval_strategy="epoch" + load_best_model_at_end need both splits.
        raise RuntimeError(
            f"Pre-processing left {len(train_data)} train and {len(eval_data)} "
            "eval samples; see the 'Skipping' messages above for the cause"
        )

    # ── PEFT wrapping ─────────────────────────────────────────────────────────
    model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)
    # task_type=None → base PeftModel; avoids requiring prepare_inputs_for_generation
    lora_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_RANK * 2,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_dropout=0.05,
        bias="none",
        task_type=None,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # ── Dataset (just wraps pre-processed list) ───────────────────────────────
    class PreprocessedDataset(TorchDataset):
        """Map-style dataset over an in-memory list of pre-processed samples."""

        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]

    train_dataset = PreprocessedDataset(train_data)
    eval_dataset = PreprocessedDataset(eval_data)

    # ── Collator: pad to batch max length ─────────────────────────────────────
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def collate_fn(batch):
        """Right-pad the text tensors and merge the vision fields of a batch."""
        max_len = max(b["input_ids"].size(0) for b in batch)
        result = {}
        for b in batch:
            pad = max_len - b["input_ids"].size(0)
            result.setdefault("input_ids", []).append(
                F.pad(b["input_ids"], (0, pad), value=pad_id)
            )
            result.setdefault("attention_mask", []).append(
                F.pad(b["attention_mask"], (0, pad), value=0)
            )
            result.setdefault("labels", []).append(
                F.pad(b["labels"], (0, pad), value=-100)
            )
        result = {k: torch.stack(v) for k, v in result.items()}

        # Pass through every vision field.
        #   target_sizes (N_tiles,2) per sample → cat → (total_tiles,2)
        #   everything else → stack when shapes agree, else keep as a list
        extra_keys = [k for k in batch[0] if k not in result]
        for k in extra_keys:
            vals = [b[k] for b in batch]
            if k == "target_sizes":
                result[k] = torch.cat(vals, dim=0)
            else:
                try:
                    result[k] = torch.stack(vals)
                except (RuntimeError, TypeError):
                    result[k] = vals
        return result

    # ── Debug Trainer: print input shapes on first batch ─────────────────────
    class DebugTrainer(Trainer):
        """Trainer that logs the model input shapes once, on the first batch."""

        _shapes_logged = False

        def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
            if not self._shapes_logged:
                self._shapes_logged = True
                _print_shapes(
                    "=== MODEL INPUT SHAPES (first batch) ===",
                    inputs,
                    list_preview=2,
                )
            return super().compute_loss(
                model, inputs, num_items_in_batch=num_items_in_batch, **kwargs
            )

    # ── Training ──────────────────────────────────────────────────────────────
    training_args = TrainingArguments(
        output_dir="/output/minicpm-v-sft",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        warmup_steps=10,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        bf16=True,
        fp16=False,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
        remove_unused_columns=False,
        dataloader_num_workers=0,  # data already pre-processed; no workers needed
    )
    trainer = DebugTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collate_fn,
    )
    trainer.train()
    print("Training complete")

    # ── Save LoRA adapter ─────────────────────────────────────────────────────
    model.save_pretrained("/output/minicpm-v-lora")
    tokenizer.save_pretrained("/output/minicpm-v-lora")
    print("LoRA adapter saved to /output/minicpm-v-lora")

    # ── Publish adapter to HF Hub ─────────────────────────────────────────────
    api = HfApi(token=os.environ["HF_TOKEN"])
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
    adapter_dir = Path("/output/minicpm-v-lora")
    for f in adapter_dir.iterdir():
        if f.is_file():
            print(f"Uploading {f.name}...")
            api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)

    # Report the real dataset size instead of a hard-coded figure.
    n_examples = len(hf_ds["train"]) + len(hf_ds["test"])
    model_card = f"""---
license: apache-2.0
base_model: {BASE_MODEL}
datasets:
- {HF_DATASET_REPO}
tags:
- invoice-extraction
- indian-fmcg
- minicpm-v
- ocr
- qlora
- peft
---

# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)

Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots, handwritten bills).

## Usage

```python
from transformers import AutoModelForMultimodalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
model = PeftModel.from_pretrained(base, "{HF_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
```

## Training

- Base model: {BASE_MODEL}
- Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — {n_examples} synthetic Indian invoices
- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
"""
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=HF_REPO,
    )
    print(f"Published to {HF_REPO}")


# ─── Dry-run: verify preprocessing + first batch shapes without training ──────
# Usage: modal run finetune/train_minicpm_v.py::main_dryrun
# Confirms shapes are correct before a full training run.

@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=600,
    secrets=[HF_SECRET],
    memory=32768,
)
def dryrun():
    """Run one labeled forward pass on the first training sample.

    Uses the same prompt, response encoding and label masking as ``train`` but
    on the un-quantized base model, without PEFT and without any optimisation.

    Raises:
        ValueError: if the prompt alone fills MAX_SEQ_LENGTH.
        RuntimeError: if the forward pass returns no loss.
    """
    import torch
    from datasets import load_dataset
    from transformers import AutoModelForMultimodalLM, AutoTokenizer, AutoProcessor

    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    raw_model = AutoModelForMultimodalLM.from_pretrained(
        BASE_MODEL, trust_remote_code=True, device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

    sample = hf_ds["train"][0]
    image = sample["image"].convert("RGB")
    text = processor.apply_chat_template(
        _build_messages(), tokenize=False, add_generation_prompt=True
    )
    proc_out = processor(
        text=text, images=[image], return_tensors="pt", max_slice_nums=MAX_SLICE_NUMS
    )

    print("=== DRY-RUN: processor output ===")
    image_pad_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
    for k, v in proc_out.items():
        if not isinstance(v, torch.Tensor):
            continue
        pad_info = ""
        if k == "input_ids":
            # Number of image placeholder tokens the processor inserted.
            count = (v == image_pad_id).sum().item()
            pad_info = f" (<|image_pad|> count={count})"
        print(f" {k}: shape={list(v.shape)}{pad_info}")

    response_ids = _encode_response(tokenizer, sample["response"])
    prompt_ids = proc_out["input_ids"][0]
    full_ids, labels = _build_labeled_sequence(prompt_ids, response_ids)

    model_inputs = {
        "input_ids": full_ids.unsqueeze(0),
        "attention_mask": torch.ones_like(full_ids).unsqueeze(0),
        "labels": labels.unsqueeze(0),
    }
    # Vision fields go in exactly as the processor returned them.
    for k, v in proc_out.items():
        if k not in ("input_ids", "attention_mask"):
            model_inputs[k] = v

    device = next(raw_model.parameters()).device
    model_inputs = {
        k: v.to(device) if isinstance(v, torch.Tensor) else v
        for k, v in model_inputs.items()
    }

    raw_model.eval()
    with torch.no_grad():
        outputs = raw_model(**model_inputs)

    if getattr(outputs, "loss", None) is None:
        raise RuntimeError(
            f"Dry-run forward did not return loss. Output keys: {list(outputs.keys())}"
        )
    print(f"Dry-run forward loss: {outputs.loss.detach().float().item():.4f}")
    print("Dry-run complete - shapes and labeled forward pass look correct")


# ─── Local entrypoints ─────────────────────────────────────────────────────────

@app.local_entrypoint()
def main():
    """Step 2: run the training function remotely."""
    train.remote()


@app.local_entrypoint()
def main_push():
    """Step 1: build the dataset from the Modal volume and push it to the Hub."""
    push_dataset.remote()


@app.local_entrypoint()
def main_dryrun():
    """Run the single-sample dry-run remotely."""
    dryrun.remote()