""" Push the trained MiniCPM-V LoRA adapter from Modal volume to HuggingFace Hub. Usage: modal run finetune/push_minicpm_v_to_hf.py Reads from Modal volume: kirana-minicpm-v-output (/output/minicpm-v-lora) Pushes to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction """ import os import modal app = modal.App("kirana-push-minicpm-v") IMAGE = ( modal.Image.debian_slim(python_version="3.11") .pip_install("huggingface_hub>=0.30.0") ) HF_SECRET = modal.Secret.from_name("hf-secret") HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction" BASE_MODEL = "openbmb/MiniCPM-V-4.6" HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data" MODEL_CARD = """\ --- license: apache-2.0 base_model: openbmb/MiniCPM-V-4.6 datasets: - build-small-hackathon/kirana-invoice-train-data language: - en tags: - invoice-extraction - indian-fmcg - minicpm-v - ocr - qlora - peft - kirana - vision-language pipeline_tag: image-text-to-text --- # MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter) Fine-tuned [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) for structured JSON extraction from Indian distributor invoices. Part of the **Kirana Detective** project — an AI audit pipeline for small Indian grocery (kirana) stores. > **This is a PEFT LoRA adapter** — you need the base model + this adapter to run inference. ## Training Results | Epoch | Train Loss | Eval Loss | |-------|-----------|-----------| | 1 | ~6.08 | 0.2901 | | 2 | ~3.95 | 0.2281 | | 3 | ~3.33 | **0.212** | **Training summary** (3 epochs, 87 steps, ~52 min on A10G): - Total average train loss: 4.774 - Best eval loss: **0.212** (epoch 3, loaded as final checkpoint) - Trainable parameters: 9,486,336 / 1,309,914,352 (0.72%) - Dataset: 450 train + 50 eval synthetic invoices ## Usage ```python from peft import PeftModel, PeftConfig from transformers import AutoModel, AutoProcessor import torch # Load adapter config to get base model id config = PeftConfig.from_pretrained("naazimsnh02/minicpm-v-4-6-indian-invoice-extraction") base_model = AutoModel.from_pretrained( config.base_model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto", ) model = PeftModel.from_pretrained(base_model, "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction") processor = AutoProcessor.from_pretrained("naazimsnh02/minicpm-v-4-6-indian-invoice-extraction", trust_remote_code=True) ``` ### Inference Example ```python from PIL import Image image = Image.open("invoice.jpg") messages = [ { "role": "system", "content": "You are an invoice extraction assistant. Given an invoice image, extract all fields as valid JSON. Return ONLY the JSON object, no explanation." }, { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": "Extract all invoice fields as JSON."} ] } ] inputs = processor(messages, return_tensors="pt").to(model.device) with torch.no_grad(): output = model.generate(**inputs, max_new_tokens=512) result_json = processor.decode(output[0], skip_special_tokens=True) ``` ## Output Schema ```json { "invoice_number": "INV-2024-001", "supplier": "Hindustan Unilever Ltd.", "date": "2026-06-10", "items": [ { "product_raw": "SURF XL 1KG", "quantity": 12, "unit_price": 95.00, "gst_rate": 18, "line_total": 1140.00 } ], "grand_total": 9650.00, "extraction_warnings": [] } ``` ## Supported Invoice Formats - Printed GST invoices (Tally-style, thermal-print) - Tally PDF exports - WhatsApp screenshot invoices - Handwritten bills ## Training Details | Parameter | Value | |-----------|-------| | Base model | openbmb/MiniCPM-V-4.6 | | Model class | MiniCPMV4_6ForConditionalGeneration | | Fine-tuning method | QLoRA (4-bit + LoRA) | | LoRA rank | 16 | | Quantization | bitsandbytes 4-bit (nf4) | | Batch size | 1 (grad accum × 16 = effective 16) | | Learning rate | 1e-4 (cosine decay, warmup 10 steps) | | Epochs | 3 | | Total steps | 87 | | Hardware | NVIDIA A10G (22 GB VRAM) | | Training time | ~52 minutes | | Orchestration | Modal (serverless GPU) | | Framework | Transformers ≥ 5.7.0 + PEFT | ## Citation ```bibtex @misc{kirana-detector-minicpm-v-2026, title = {Kirana Detective: MiniCPM-V 4.6 Indian Invoice Extraction}, author = {Syed Naazim Hussain}, year = {2026}, url = {https://huggingface.co/naazimsnh02/minicpm-v-4-6-indian-invoice-extraction} } ``` ## License Apache 2.0 (same as base model openbmb/MiniCPM-V-4.6) """ @app.function( image=IMAGE, timeout=600, secrets=[HF_SECRET], volumes={ "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False) }, ) def push_to_hub(): from huggingface_hub import HfApi from pathlib import Path token = os.environ["HF_TOKEN"] api = HfApi(token=token) print(f"Creating repo: {HF_REPO}") api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False) adapter_dir = Path("/output/minicpm-v-lora") if not adapter_dir.exists(): raise FileNotFoundError( f"Adapter not found at {adapter_dir}. " "Did the training job complete successfully?" ) files = list(adapter_dir.iterdir()) print(f"Found {len(files)} files in {adapter_dir}:") for f in files: print(f" {f.name} ({f.stat().st_size / 1024:.1f} KB)") for f in files: if f.is_file(): print(f"Uploading {f.name}...") api.upload_file( path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO, repo_type="model", ) print("Uploading README.md (model card)...") api.upload_file( path_or_fileobj=MODEL_CARD.encode(), path_in_repo="README.md", repo_id=HF_REPO, repo_type="model", ) print(f"\nDone! Model published at: https://huggingface.co/{HF_REPO}") @app.local_entrypoint() def main(): push_to_hub.remote()