"""
Push the professional README / model card to
naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged on HuggingFace.

No Modal required — runs locally using the HF token from environment.

Run:
    $env:HF_TOKEN = "hf_..."          # PowerShell
    python finetune/push_minicpm_v_merged_card.py
"""

from __future__ import annotations

import os
from huggingface_hub import HfApi

# Target model repo on the Hugging Face Hub. main() passes it to create_repo()
# and upload_file(), and uses it to build the URL printed at the end.
HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged"
# Base-model and training-dataset repo ids. Neither name is referenced by the
# code visible in this file; the same ids are written out literally inside
# MODEL_CARD, so a change here does NOT propagate to the uploaded card.
BASE_MODEL = "openbmb/MiniCPM-V-4.6"
DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"

MODEL_CARD = """\
---
license: apache-2.0
base_model: openbmb/MiniCPM-V-4.6
datasets:
  - build-small-hackathon/kirana-invoice-train-data
language:
  - en
tags:
  - invoice-extraction
  - indian-fmcg
  - minicpm-v
  - vision-language
  - ocr
  - qlora
  - merged-weights
  - kirana
  - hackathon
pipeline_tag: image-text-to-text
---

# MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)

Fine-tuned [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) for
structured JSON extraction from Indian distributor (kirana) invoices.

QLoRA adapter weights are **fully merged** into the base model — no PEFT dependency at
inference time. Part of the **Kirana Detective** project: a six-agent AI pipeline that
audits invoices for pricing anomalies, missing deliveries, and GST errors.

---

## Model Details

| Attribute | Value |
|---|---|
| **Base model** | [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6) |
| **Task** | Vision-language OCR + structured JSON extraction |
| **Fine-tuning method** | QLoRA — 4-bit NF4 base, LoRA rank 16, α 32 |
| **Trainable parameters** | 9,486,336 / 1,309,914,352 **(0.72%)** |
| **Target modules** | `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj` |
| **Training epochs** | 3 |
| **Final eval loss** | **0.2120** (↓ from 0.2901 at epoch 1) |
| **Training hardware** | NVIDIA A10G 22 GB VRAM (Modal) |
| **Training duration** | ~52 minutes |
| **Output format** | Merged full weights — bfloat16 |
| **Inference runtime** | `transformers` (`AutoModel` + `model.chat()`) |

---

## Training Data

**Dataset**: [`build-small-hackathon/kirana-invoice-train-data`](https://huggingface.co/datasets/build-small-hackathon/kirana-invoice-train-data)

| Split | Examples |
|---|---|
| Train | 450 |
| Eval | 50 |

Synthetic Indian distributor invoices generated with Pillow across:

- **10 suppliers**: HUL, Nestlé, Parle, Britannia, ITC, Amul, Dabur, Marico, Emami, Godrej
- **4 invoice formats**: Printed GST bill, Tally PDF export, handwritten, WhatsApp screenshot
- **Intentional errors injected**: GST rate mismatches, duplicate line items, price spikes — to
  train the model to surface extraction warnings alongside extracted data

---

## Training Metrics

| Epoch | Train Loss | Eval Loss |
|---|---|---|
| 1 | — | 0.2901 |
| 2 | — | 0.2281 |
| 3 | — | **0.2120** |

---

## Supported Input Formats

| Format | Example |
|---|---|
| Printed GST invoice | Standard B2B tax invoice with HSN codes |
| Tally PDF export | Machine-generated tabular layout |
| Handwritten invoice | Photo of handwritten bill |
| WhatsApp screenshot | Low-resolution forwarded invoice image |

---

## Output Schema

The model returns **only** a JSON object matching this schema — no markdown, no prose:

```json
{
  "invoice_number": "INV-2024-001",
  "supplier": "Hindustan Unilever Ltd.",
  "date": "2026-06-10",
  "items": [
    {
      "product_raw": "SURF XL 1KG",
      "quantity": 12,
      "unit_price": 95.00,
      "gst_rate": 18,
      "line_total": 1140.00
    },
    {
      "product_raw": "MAGGI MASALA 70G",
      "quantity": 48,
      "unit_price": 14.00,
      "gst_rate": 5,
      "line_total": 672.00
    }
  ],
  "grand_total": 1812.00,
  "extraction_warnings": []
}
```

**Field notes**:
- `product_raw` — verbatim as printed on the invoice (abbreviations, typos preserved)
- `gst_rate` — percentage value (5, 12, 18, 28), not a decimal
- `date` — ISO 8601 (`YYYY-MM-DD`) when parseable, raw string otherwise
- `extraction_warnings` — list of issues noticed (missing fields, illegible areas, GST anomalies)
- Numeric fields default to `0` when unreadable; `invoice_number`/`supplier`/`date` default to `null`

---

## Usage

### Basic Inference

```python
import torch
from transformers import AutoModel, AutoTokenizer
from PIL import Image

model = AutoModel.from_pretrained(
    "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(
    "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged",
    trust_remote_code=True,
)

image = Image.open("invoice.jpg").convert("RGB")

prompt = (
    "You are an OCR agent for Indian kirana store invoices. "
    "Extract all information from this invoice image and return ONLY valid JSON "
    "matching this schema exactly:\\n"
    '{"invoice_number": string|null, "supplier": string|null, "date": string|null, '
    '"items": [{"product_raw": string, "quantity": number, "unit_price": number, '
    '"gst_rate": number, "line_total": number}], '
    '"grand_total": number, "extraction_warnings": [string]}\\n'
    "Return ONLY the JSON object, no markdown, no prose."
)

msgs = [{"role": "user", "content": [image, prompt]}]
response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
print(response)
```

### From a PDF (multi-page)

Reuses `model`, `tokenizer`, and `prompt` from the Basic Inference example above.

```python
import fitz  # PyMuPDF
from PIL import Image
import io, json

doc = fitz.open("invoice.pdf")
results = []
for page in doc:
    pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
    img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
    msgs = [{"role": "user", "content": [img, prompt]}]
    raw = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
    results.append(json.loads(raw))
```

---

## How It Fits in Kirana Detective

```
Invoice image
      │
      ▼
┌─────────────────────────────┐
│  Agent 1 — Invoice Extractor│  ← this model
│  MiniCPM-V 4.6 (merged)     │
└─────────────────────────────┘
      │ InvoiceJSON (raw product names)
      ▼
┌─────────────────────────────┐
│  Agent 2 — Product Matcher  │  MiniCPM5-1B normalizer
└─────────────────────────────┘
      │
      ▼
┌─────────────────────────────┐
│  Agent 3 — Pricing Check    │  catalog + price history
└─────────────────────────────┘
      │
      ▼  (+ delivery photos)
┌─────────────────────────────┐
│  Agent 4 — Visual Counter   │  YOLO26n ONNX
└─────────────────────────────┘
      │
      ▼
┌─────────────────────────────┐
│  Agent 5 — Reconciliation   │
│  Agent 6 — Savings Report   │  MiniCPM5-1B
└─────────────────────────────┘
      │
      ▼
₹ Leakage report + action items
```

Related repos:
- [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer) — product name normalizer
- [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection) — YOLO product counter

---

## Limitations

- Trained on **synthetic** invoices only — real-world performance may vary on heavily degraded,
  stamped, or non-standard layouts until production data is collected.
- Optimised for **English and numeric** invoice content; Hindi/regional-language invoices are
  not yet covered.
- Product names are extracted **verbatim** (`product_raw`) — normalization to canonical SKU
  names is handled downstream by the MiniCPM5-1B normalizer agent.
- `grand_total` extraction can fail on invoices with complex multi-page subtotal structures.

---

## Reproducibility

The LoRA adapter was trained and then merged into the base model with these two scripts:

```bash
modal run finetune/train_minicpm_v.py    # fine-tune → saves adapter to Modal volume
modal run finetune/export_minicpm_v_gguf.py  # merge LoRA → push merged weights to HF
```

Source: [GitHub — Kirana Detective](https://github.com/naazimsnh02/kirana-detective)

---

## Citation

```bibtex
@misc{kirana_detective_minicpmv_2026,
  author    = {Hussain, Syed Naazim},
  title     = {MiniCPM-V 4.6 Fine-Tuned for Indian Invoice Extraction},
  year      = {2026},
  publisher = {HuggingFace},
  howpublished = {\\url{https://huggingface.co/naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged}},
}
```

---

## License

Apache 2.0 — same license as the base [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) model.
"""


def main() -> None:
    """Upload ``MODEL_CARD`` as ``README.md`` to the ``HF_REPO`` model repo.

    The access token is read from the ``HF_TOKEN`` environment variable.
    ``create_repo`` is called with ``exist_ok=True`` and ``private=False``
    before the card is committed, and the repo URL is printed afterwards.

    Raises:
        SystemExit: If ``HF_TOKEN`` is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise SystemExit("Set HF_TOKEN environment variable before running.")

    # Both Hub calls address the same model repo, so share the identifying kwargs.
    repo_target = {"repo_id": HF_REPO, "repo_type": "model"}
    readme_bytes = MODEL_CARD.encode("utf-8")

    client = HfApi(token=hf_token)
    client.create_repo(exist_ok=True, private=False, **repo_target)
    client.upload_file(
        path_or_fileobj=readme_bytes,
        path_in_repo="README.md",
        commit_message="Update professional model card",
        **repo_target,
    )
    print(f"Model card pushed to https://huggingface.co/{HF_REPO}")


# Run the upload only when this file is executed as a script, so importing the
# module (e.g. to reuse MODEL_CARD) has no network side effects.
if __name__ == "__main__":
    main()