""" Merge MiniCPM-V 4.6 LoRA adapter into the base model and push the merged HF weights to Hugging Face. Why merge instead of converting LoRA to GGUF directly: llama.cpp's convert_lora_to_gguf.py and convert_hf_to_gguf.py both fail for MiniCPMV4_6Model (architecture not in llama.cpp's registry). The only working path is to have ggml.ai's GGUF-my-repo Space do the conversion — it uses a patched llama.cpp that supports this architecture. Two-step workflow: Step 1 (this script): - Load base model + LoRA from Modal volume - Merge LoRA weights into the full model (merge_and_unload) - Push merged HF model to MERGED_HF_REPO - Download OpenBMB's mmproj.gguf and upload it to HF_REPO for immediate use Step 2 (manual — ~15 min): - Go to https://huggingface.co/spaces/ggml-org/gguf-my-repo - Enter: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged - Select Q4_K_M quantisation - Wait for the Space to create the GGUF repo - Update app.py MODEL_REPO to point to the resulting GGUF repo Run: modal run finetune/export_minicpm_v_gguf.py Reads adapter from: /output/minicpm-v-lora in Modal volume kirana-minicpm-v-output Publishes merged HF model to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged Also uploads mmproj.gguf to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction """ from __future__ import annotations import os import modal app = modal.App("kirana-export-minicpm-v-gguf") IMAGE = ( modal.Image.debian_slim(python_version="3.11") .pip_install( "huggingface_hub>=0.30.0", "safetensors>=0.4.3", "torch>=2.3.0", "transformers>=5.7.0", "peft>=0.14.0", "accelerate>=0.34.0", ) ) HF_SECRET = modal.Secret.from_name("hf-secret") BASE_MODEL = "openbmb/MiniCPM-V-4.6" SOURCE_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf" HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction" MERGED_HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged" # Full professional model card is maintained in push_minicpm_v_merged_card.py. 
# Minimal model card pushed together with the merged weights. The full professional
# card is maintained in push_minicpm_v_merged_card.py; run that script separately to
# update the README on the HF Hub.
MODEL_CARD_MERGED = f"""\
---
license: apache-2.0
base_model: {BASE_MODEL}
datasets:
  - build-small-hackathon/kirana-invoice-train-data
language:
  - en
tags:
  - invoice-extraction
  - indian-fmcg
  - minicpm-v
  - vision-language
  - ocr
  - qlora
  - merged-weights
  - kirana
  - hackathon
pipeline_tag: image-text-to-text
---

# MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)

Fine-tuned [`{BASE_MODEL}`](https://huggingface.co/{BASE_MODEL}) for structured JSON extraction
from Indian distributor (kirana) invoices. QLoRA adapter weights are fully merged — no PEFT
dependency at inference time.

See full model card: [`{MERGED_HF_REPO}`](https://huggingface.co/{MERGED_HF_REPO})

## Quick Start

```python
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image

model = AutoModel.from_pretrained(
    "{MERGED_HF_REPO}",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("{MERGED_HF_REPO}", trust_remote_code=True)

image = Image.open("invoice.jpg").convert("RGB")
msgs = [{{"role": "user", "content": [image, "Extract all line items as JSON."]}}]
response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
```

## Training Summary

| Parameter | Value |
|---|---|
| Base model | `{BASE_MODEL}` |
| Fine-tuning | QLoRA rank 16 |
| Dataset | 450 train + 50 eval synthetic Indian invoices |
| Eval loss | 0.2120 (3 epochs) |
| Training hardware | Modal A10G, ~52 min |
| Adapter params | 9.5M / 1.3B total (0.72%) |

## License

Apache 2.0 — same as base model.
"""


def _validate_gguf_header(path: str) -> None:
    """Check that the file at *path* begins with the 4-byte ``GGUF`` magic.

    Args:
        path: Filesystem path of the downloaded file.

    Raises:
        RuntimeError: If the first four bytes are not ``b"GGUF"`` (this also
            covers files shorter than four bytes).
    """
    with open(path, "rb") as f:
        magic = f.read(4)
    if magic != b"GGUF":
        raise RuntimeError(f"Downloaded file is not a GGUF: {path}")


def _require_hf_token() -> str:
    """Return the Hugging Face token from ``HF_TOKEN`` or fail with a clear message."""
    token = os.environ.get("HF_TOKEN")
    if not token:
        raise RuntimeError(
            "HF_TOKEN is not set or empty. The Modal secret 'hf-secret' attached to "
            "this function must provide it."
        )
    return token


def _merge_lora_into_base(
    adapter_dir: os.PathLike[str],
    merged_dir: os.PathLike[str],
    token: str,
) -> None:
    """Load the base model, merge the LoRA adapter into it, and save model + tokenizer."""
    # Heavy dependencies are imported lazily: they are installed in the Modal image,
    # not necessarily on the machine that launches `modal run`.
    from pathlib import Path

    import torch
    from peft import PeftModel
    from transformers import AutoModel, AutoTokenizer

    target_dir = Path(merged_dir)

    print(f"Loading base model {BASE_MODEL} ...")
    base_model = AutoModel.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        token=token,
    )

    print(f"Loading LoRA adapter from {adapter_dir} ...")
    model = PeftModel.from_pretrained(base_model, str(adapter_dir))

    print("Merging LoRA weights into base model ...")
    merged_model = model.merge_and_unload()

    print(f"Saving merged model to {target_dir} ...")
    target_dir.mkdir(parents=True, exist_ok=True)
    merged_model.save_pretrained(str(target_dir))

    # The tokenizer comes from the base model and is saved next to the merged weights.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=token)
    tokenizer.save_pretrained(str(target_dir))
    print("Merge complete.")


def _pick_mmproj_file(repo_files: list[str]) -> str:
    """Return the first ``*.gguf`` file whose name contains "mmproj".

    Raises:
        RuntimeError: If no file in *repo_files* matches.
    """
    mmproj_files = [f for f in repo_files if "mmproj" in f.lower() and f.endswith(".gguf")]
    if not mmproj_files:
        raise RuntimeError(f"No mmproj GGUF found in {SOURCE_GGUF_REPO}. Files: {repo_files}")
    if len(mmproj_files) > 1:
        # Surface the ambiguity rather than picking silently.
        print(f"Multiple mmproj files found: {mmproj_files}; using {mmproj_files[0]}")
    return mmproj_files[0]


def _print_next_steps() -> None:
    """Print the manual GGUF-my-repo instructions that follow a successful run."""
    print()
    print("=" * 70)
    print("DONE. Next steps:")
    print()
    print("1. Go to: https://huggingface.co/spaces/ggml-org/gguf-my-repo")
    print(f"2. Enter model ID: {MERGED_HF_REPO}")
    print("3. Select quantisation: Q4_K_M")
    print("4. Click convert — takes ~15 min on the Space's A10G")
    print()
    print("The Space will create a new repo (usually named")
    print(f"  {MERGED_HF_REPO}-GGUF")
    print("containing model.gguf + mmproj.gguf (both for the fine-tuned model).")
    print()
    print(f"mmproj.gguf already uploaded to: https://huggingface.co/{HF_REPO}")
    print("(usable immediately — vision encoder weights are unchanged by fine-tuning)")
    print("=" * 70)


@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False),
    },
    memory=16384,  # 16 GB — 1.3B model in bfloat16 ≈ 2.6 GB; headroom for merge + save
)
def merge_and_push() -> None:
    """Merge the LoRA adapter, push the merged model, and mirror the mmproj GGUF.

    Steps:
        1. Merge the adapter at /output/minicpm-v-lora into BASE_MODEL and save the
           result to /output/minicpm-v-merged (skipped when a previous run finished
           the merge).
        2. Create MERGED_HF_REPO if needed and upload the merged folder plus README.
        3. Download the mmproj GGUF from SOURCE_GGUF_REPO, validate its header, and
           upload it to HF_REPO as ``mmproj.gguf``.

    Raises:
        RuntimeError: If HF_TOKEN is missing, the adapter directory does not exist,
            no mmproj file is found upstream, or the download is not a GGUF file.
    """
    from pathlib import Path

    from huggingface_hub import HfApi, hf_hub_download

    token = _require_hf_token()
    api = HfApi(token=token)

    adapter_dir = Path("/output/minicpm-v-lora")
    merged_dir = Path("/output/minicpm-v-merged")
    # Marker written only after model AND tokenizer were saved. config.json alone is
    # not proof of a finished merge: the tokenizer is saved after the model, so an
    # interrupted run can leave config.json behind with an incomplete directory.
    # A directory merged before this marker existed is simply re-merged once.
    merge_marker = merged_dir.with_name(merged_dir.name + ".complete")

    if not adapter_dir.exists():
        raise RuntimeError(
            f"Missing adapter directory: {adapter_dir}. "
            "Run finetune/train_minicpm_v.py first."
        )

    # ── Step 1: Merge LoRA into base model ───────────────────────────────────
    if merge_marker.exists() and (merged_dir / "config.json").exists():
        print("Merged model already exists at /output/minicpm-v-merged, skipping merge.")
    else:
        _merge_lora_into_base(adapter_dir, merged_dir, token)
        merge_marker.touch()

    # ── Step 2: Create HF repo and push merged model ─────────────────────────
    print(f"Creating / verifying HF repo {MERGED_HF_REPO} ...")
    api.create_repo(repo_id=MERGED_HF_REPO, repo_type="model", exist_ok=True, private=False)

    print(f"Uploading merged model to {MERGED_HF_REPO} ...")
    api.upload_folder(
        folder_path=str(merged_dir),
        repo_id=MERGED_HF_REPO,
        repo_type="model",
        commit_message="Add merged MiniCPM-V-4.6 invoice fine-tune",
    )

    print("Uploading README.md to merged repo ...")
    api.upload_file(
        path_or_fileobj=MODEL_CARD_MERGED.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=MERGED_HF_REPO,
        repo_type="model",
    )

    # ── Step 3: Download OpenBMB mmproj and upload to GGUF repo ──────────────
    # The LoRA only touched LLM layers — mmproj weights are unchanged, so
    # OpenBMB's mmproj.gguf is identical to what we would produce ourselves.
    print(f"Listing GGUF files in {SOURCE_GGUF_REPO} ...")
    source_files = list(api.list_repo_files(SOURCE_GGUF_REPO, repo_type="model"))
    source_mmproj = _pick_mmproj_file(source_files)

    print(f"Downloading {source_mmproj} ...")
    mmproj_path = hf_hub_download(
        repo_id=SOURCE_GGUF_REPO,
        filename=source_mmproj,
        repo_type="model",
        token=token,
        local_dir="/output/minicpm-v-gguf",
    )
    _validate_gguf_header(mmproj_path)

    print(f"Uploading mmproj.gguf to {HF_REPO} ...")
    api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True, private=False)
    api.upload_file(
        path_or_fileobj=mmproj_path,
        path_in_repo="mmproj.gguf",
        repo_id=HF_REPO,
        repo_type="model",
    )

    _print_next_steps()


@app.local_entrypoint()
def main():
    """Local entrypoint for ``modal run``: invoke ``merge_and_push`` remotely."""
    merge_and_push.remote()