"""
Merge MiniCPM-V 4.6 LoRA adapter into the base model and push the merged
HF weights to Hugging Face.

Why merge instead of converting LoRA to GGUF directly:
  llama.cpp's convert_lora_to_gguf.py and convert_hf_to_gguf.py both fail
  for MiniCPMV4_6Model (architecture not in llama.cpp's registry). The only
  working path is to have ggml.ai's GGUF-my-repo Space do the conversion —
  it uses a patched llama.cpp that supports this architecture.

Two-step workflow:
  Step 1 (this script):
    - Load base model + LoRA from Modal volume
    - Merge LoRA weights into the full model (merge_and_unload)
    - Push merged HF model to MERGED_HF_REPO
    - Download OpenBMB's mmproj.gguf and upload it to HF_REPO for immediate use

  Step 2 (manual — ~15 min):
    - Go to https://huggingface.co/spaces/ggml-org/gguf-my-repo
    - Enter: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged
    - Select Q4_K_M quantisation
    - Wait for the Space to create the GGUF repo
    - Update app.py MODEL_REPO to point to the resulting GGUF repo

Run:
    modal run finetune/export_minicpm_v_gguf.py

Reads adapter from: /output/minicpm-v-lora in Modal volume kirana-minicpm-v-output
Publishes merged HF model to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged
Also uploads mmproj.gguf to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction
"""

from __future__ import annotations

import os

import modal

# Modal application that the export function and local entry point attach to.
app = modal.App("kirana-export-minicpm-v-gguf")

# Container image for the merge job, built in two steps: a slim Debian base
# pinned to Python 3.11, then the libraries needed to load the base model,
# apply the PEFT adapter, and talk to the Hugging Face Hub.
_DEBIAN_PY311 = modal.Image.debian_slim(python_version="3.11")
IMAGE = _DEBIAN_PY311.pip_install(
    "huggingface_hub>=0.30.0",
    "safetensors>=0.4.3",
    "torch>=2.3.0",
    "transformers>=5.7.0",
    "peft>=0.14.0",
    "accelerate>=0.34.0",
)

# Named Modal secret attached to the remote function. merge_and_push reads
# HF_TOKEN from the environment — presumably injected by this secret; confirm
# the secret actually defines that key.
HF_SECRET = modal.Secret.from_name("hf-secret")

# Base checkpoint the LoRA adapter is merged into (also the tokenizer source).
BASE_MODEL = "openbmb/MiniCPM-V-4.6"
# Upstream GGUF repo from which the mmproj file is downloaded.
SOURCE_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
# Destination repo for the mirrored mmproj.gguf.
HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
# Destination repo for the merged Hugging Face weights and the README below.
MERGED_HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged"

# Minimal model card uploaded as README.md during the merge+push run.
# The full professional model card is maintained in
# push_minicpm_v_merged_card.py; run that script separately to replace this
# README on the HF Hub.
#
# This is an f-string: every repo reference is interpolated from the constants
# above so the card cannot drift from the repos actually used. Literal braces
# inside the embedded Python snippet are escaped as {{ }}.
MODEL_CARD_MERGED = f"""\
---
license: apache-2.0
base_model: {BASE_MODEL}
datasets:
  - build-small-hackathon/kirana-invoice-train-data
language:
  - en
tags:
  - invoice-extraction
  - indian-fmcg
  - minicpm-v
  - vision-language
  - ocr
  - qlora
  - merged-weights
  - kirana
  - hackathon
pipeline_tag: image-text-to-text
---

# MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)

Fine-tuned [`{BASE_MODEL}`](https://huggingface.co/{BASE_MODEL}) for structured
JSON extraction from Indian distributor (kirana) invoices. QLoRA adapter weights
are fully merged — no PEFT dependency at inference time.

See full model card: [`{MERGED_HF_REPO}`](https://huggingface.co/{MERGED_HF_REPO})

## Quick Start

```python
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image

model = AutoModel.from_pretrained(
    "{MERGED_HF_REPO}", trust_remote_code=True,
    torch_dtype=torch.bfloat16, device_map="auto",
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("{MERGED_HF_REPO}", trust_remote_code=True)

image = Image.open("invoice.jpg").convert("RGB")
msgs = [{{"role": "user", "content": [image, "Extract all line items as JSON."]}}]
response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
```

## Training Summary

| Parameter | Value |
|---|---|
| Base model | `{BASE_MODEL}` |
| Fine-tuning | QLoRA rank 16 |
| Dataset | 450 train + 50 eval synthetic Indian invoices |
| Eval loss | 0.2120 (3 epochs) |
| Training hardware | Modal A10G, ~52 min |
| Adapter params | 9.5M / 1.3B total (0.72%) |

## License

Apache 2.0 — same as base model.
"""


def _validate_gguf_header(path: str) -> None:
    """Check that the file at *path* begins with the GGUF magic bytes.

    Only the first four bytes are inspected; nothing else about the file is
    validated.

    Raises:
        RuntimeError: If the leading bytes are not ``b"GGUF"`` (this includes
            files shorter than four bytes).
        OSError: If the file cannot be opened for reading.
    """
    expected_magic = b"GGUF"
    with open(path, "rb") as handle:
        header = handle.read(len(expected_magic))
    if header == expected_magic:
        return
    raise RuntimeError(f"Downloaded file is not a GGUF: {path}")


@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False),
    },
    memory=16384,  # 16 GB — 1.3B model in bfloat16 ≈ 2.6 GB; headroom for merge + save
)
def merge_and_push():
    """Merge the LoRA adapter into the base model and publish the results.

    Runs as a Modal function with the ``kirana-minicpm-v-output`` volume
    mounted at ``/output`` and performs three steps:

    1. Merge the adapter in ``/output/minicpm-v-lora`` into ``BASE_MODEL`` and
       save the merged weights plus the base tokenizer to
       ``/output/minicpm-v-merged``. The step is skipped only when a previous
       run finished it completely (see the completion marker below).
    2. Create ``MERGED_HF_REPO`` if needed, upload the merged folder, then
       upload ``MODEL_CARD_MERGED`` as ``README.md``.
    3. Download an mmproj GGUF from ``SOURCE_GGUF_REPO``, check its magic
       bytes, and upload it to ``HF_REPO`` as ``mmproj.gguf``.

    Finally prints the manual follow-up steps for the GGUF conversion.

    Raises:
        KeyError: If ``HF_TOKEN`` is not present in the environment.
        RuntimeError: If the adapter directory is missing, if no mmproj GGUF
            is listed in ``SOURCE_GGUF_REPO``, or if the downloaded file does
            not start with the GGUF magic.
    """
    # Heavy dependencies exist only inside the container image, so they are
    # imported here rather than at module level.
    import torch
    from pathlib import Path
    from peft import PeftModel
    from transformers import AutoModel, AutoTokenizer
    from huggingface_hub import HfApi, hf_hub_download

    token = os.environ["HF_TOKEN"]
    api = HfApi(token=token)

    adapter_dir = Path("/output/minicpm-v-lora")
    merged_dir = Path("/output/minicpm-v-merged")
    # Completion marker, written only after BOTH the model and the tokenizer
    # have been saved. It lives next to merged_dir (not inside it) so that
    # upload_folder never publishes it.
    merge_marker = merged_dir.with_name(merged_dir.name + ".complete")

    if not adapter_dir.exists():
        raise RuntimeError(
            f"Missing adapter directory: {adapter_dir}. "
            "Run finetune/train_minicpm_v.py first."
        )

    # ── Step 1: Merge LoRA into base model ───────────────────────────────────

    # config.json alone does not prove the merge finished: the tokenizer is
    # saved after the model, so an interrupted run can leave config.json behind
    # with files missing. Requiring the marker prevents a later run from
    # skipping the merge and publishing an incomplete folder. A folder produced
    # before the marker existed is simply merged again once.
    if merge_marker.exists() and (merged_dir / "config.json").exists():
        print("Merged model already exists at /output/minicpm-v-merged, skipping merge.")
    else:
        # Drop any stale marker first so a failure below cannot be mistaken
        # for a completed merge on the next run.
        merge_marker.unlink(missing_ok=True)

        print(f"Loading base model {BASE_MODEL} ...")
        base_model = AutoModel.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            token=token,
        )

        print(f"Loading LoRA adapter from {adapter_dir} ...")
        model = PeftModel.from_pretrained(base_model, str(adapter_dir))

        print("Merging LoRA weights into base model ...")
        merged_model = model.merge_and_unload()

        print(f"Saving merged model to {merged_dir} ...")
        merged_dir.mkdir(parents=True, exist_ok=True)
        merged_model.save_pretrained(str(merged_dir))

        # The tokenizer comes from the base model and is stored alongside the
        # merged weights.
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=token)
        tokenizer.save_pretrained(str(merged_dir))

        # Last step of the merge: record that everything above succeeded.
        merge_marker.touch()
        print("Merge complete.")

    # ── Step 2: Create HF repo and push merged model ─────────────────────────

    print(f"Creating / verifying HF repo {MERGED_HF_REPO} ...")
    api.create_repo(repo_id=MERGED_HF_REPO, repo_type="model", exist_ok=True, private=False)

    print(f"Uploading merged model to {MERGED_HF_REPO} ...")
    api.upload_folder(
        folder_path=str(merged_dir),
        repo_id=MERGED_HF_REPO,
        repo_type="model",
        commit_message="Add merged MiniCPM-V-4.6 invoice fine-tune",
    )

    # Uploaded after the folder so the minimal card is the README in the repo.
    print("Uploading README.md to merged repo ...")
    api.upload_file(
        path_or_fileobj=MODEL_CARD_MERGED.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=MERGED_HF_REPO,
        repo_type="model",
    )

    # ── Step 3: Download OpenBMB mmproj and upload to GGUF repo ──────────────
    # The LoRA only touched LLM layers — mmproj weights are unchanged, so
    # OpenBMB's mmproj.gguf is identical to what we would produce ourselves.

    print(f"Listing GGUF files in {SOURCE_GGUF_REPO} ...")
    source_files = list(api.list_repo_files(SOURCE_GGUF_REPO, repo_type="model"))
    mmproj_files = [f for f in source_files if "mmproj" in f.lower() and f.endswith(".gguf")]
    if not mmproj_files:
        raise RuntimeError(f"No mmproj GGUF found in {SOURCE_GGUF_REPO}. Files: {source_files}")
    # If the repo lists several mmproj files, the first one listed is used.
    source_mmproj = mmproj_files[0]
    print(f"Downloading {source_mmproj} ...")

    mmproj_path = hf_hub_download(
        repo_id=SOURCE_GGUF_REPO,
        filename=source_mmproj,
        repo_type="model",
        token=token,
        local_dir="/output/minicpm-v-gguf",
    )
    # Cheap sanity check before re-publishing the file under our own repo.
    _validate_gguf_header(mmproj_path)

    print(f"Uploading mmproj.gguf to {HF_REPO} ...")
    api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True, private=False)
    api.upload_file(
        path_or_fileobj=mmproj_path,
        path_in_repo="mmproj.gguf",
        repo_id=HF_REPO,
        repo_type="model",
    )

    # Manual follow-up instructions (Step 2 of the workflow in the module
    # docstring).
    print()
    print("=" * 70)
    print("DONE. Next steps:")
    print()
    print("1. Go to: https://huggingface.co/spaces/ggml-org/gguf-my-repo")
    print(f"2. Enter model ID: {MERGED_HF_REPO}")
    print("3. Select quantisation: Q4_K_M")
    print("4. Click convert — takes ~15 min on the Space's A10G")
    print()
    print("The Space will create a new repo (usually named")
    print(f"  {MERGED_HF_REPO}-GGUF")
    print("containing model.gguf + mmproj.gguf (both for the fine-tuned model).")
    print()
    print(f"mmproj.gguf already uploaded to: https://huggingface.co/{HF_REPO}")
    print("(usable immediately — vision encoder weights are unchanged by fine-tuning)")
    print("=" * 70)


@app.local_entrypoint()
def main() -> None:
    """Local entry point for ``modal run``: invoke ``merge_and_push`` remotely."""
    merge_and_push.remote()