"""
Task 0.2 (Part 2) — Fine-tune MiniCPM-V 4.6 on synthetic Indian invoices.

Two-step workflow:
    Step 1 — Upload dataset to HF Hub (run once, after generate_invoices.py):
        modal run finetune/train_minicpm_v.py::main_push

    Step 2 — Train:
        modal run finetune/train_minicpm_v.py

Publishes:
    build-small-hackathon/kirana-invoice-train-data             (HF dataset, reusable)
    naazimsnh02/minicpm-v-4-6-indian-invoice-extraction         (LoRA adapter + model card; see HF_REPO.
                                                                 GGUF export is not performed by this script)

Training approach:
    QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
    (unsloth is incompatible with MiniCPM-V-4.6, which requires transformers>=5.7.0)
    System prompt: SYSTEM_PROMPT (invoice extraction assistant; return ONLY the JSON object)
    User turn: <image> + "Extract all invoice fields as JSON. Use this schema:" + JSON_SCHEMA
    Assistant turn: <annotation JSON> (prompt tokens are masked out of the loss)
"""

import json
import os
from pathlib import Path

import modal

# Modal app that owns every remote function and local entrypoint in this file.
app = modal.App("kirana-minicpm-v-finetune")

# Container image shared by all remote functions: Debian slim with Python 3.11,
# a few system libraries, and the Python training stack listed below.
# transformers is pinned to >=5.7.0 because the base model requires it
# (see the module docstring).
IMAGE = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("libsm6", "libxext6", "git")
    .pip_install(
        "huggingface_hub>=0.30.0",
        "datasets>=3.0.0",
        "torch>=2.3.0",
        "torchvision>=0.18.0",
        "transformers[torch]>=5.7.0",
        "peft>=0.14.0",
        "bitsandbytes>=0.43.0",
        "accelerate>=0.28.0",
        "trl>=0.9.0",
        "pillow>=10.0.0",
        "sentencepiece>=0.2.0",
        "timm>=0.9.0",
    )
)

# Named Modal secret attached to the remote functions. They read
# os.environ["HF_TOKEN"], so this secret is presumably expected to define
# that variable — TODO confirm in the Modal workspace.
HF_SECRET = modal.Secret.from_name("hf-secret")

# Base checkpoint to fine-tune, the model repo that train() uploads the
# adapter and README to, and the dataset repo written by push_dataset() and
# read by train()/dryrun().
BASE_MODEL = "openbmb/MiniCPM-V-4.6"
HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"

# JSONL annotations file on the /data volume, read by push_dataset().
ANNOTATIONS_PATH = "/data/synthetic_invoices/annotations.jsonl"
# LoRA rank; train() also sets lora_alpha to twice this value.
LORA_RANK = 16
# Cap (in tokens) applied to the concatenated prompt + response sequence.
MAX_SEQ_LENGTH = 2048
EPOCHS = 3
# Per-device batch size and gradient accumulation steps passed to
# TrainingArguments (BATCH_SIZE * GRAD_ACCUM samples per optimizer step
# on a single device).
BATCH_SIZE = 1
GRAD_ACCUM = 16
LEARNING_RATE = 1e-4

# System turn used for every training and dry-run conversation.
SYSTEM_PROMPT = (
    "You are an invoice extraction assistant. "
    "Given an invoice image, extract all fields as valid JSON. "
    "Return ONLY the JSON object, no explanation."
)

# Schema sketch appended verbatim to the user instruction. It is a template
# for the model, not parseable JSON (it uses bare `number` placeholders).
JSON_SCHEMA = """{
  "invoice_number": "string",
  "supplier": "string",
  "date": "YYYY-MM-DD",
  "items": [
    {
      "product_raw": "string",
      "quantity": number,
      "unit_price": number,
      "gst_rate": number,
      "line_total": number
    }
  ],
  "grand_total": number,
  "extraction_warnings": []
}"""


# ─── Step 1: upload dataset ────────────────────────────────────────────────────

@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={"/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True)},
    memory=8192,
)
def push_dataset():
    """Build a HF dataset from the Modal volume and push it to the Hub.

    Reads ``ANNOTATIONS_PATH`` (one JSON object per line, each with an
    ``image`` path relative to ``/data`` and an ``invoice`` payload), loads
    every image that exists as RGB, serialises the invoice payload to a JSON
    string, and pushes a 90/10 train/test split (seed 42) to
    ``HF_DATASET_REPO``.

    Raises:
        RuntimeError: if fewer than two usable image/annotation pairs were
            found, since a train/test split cannot be built from them.
    """
    from PIL import Image as PILImage
    from datasets import Dataset, Features, Value
    from datasets import Image as HFImage

    # Pass 1: keep only annotations whose image file is present on the volume.
    annotations = []
    # Explicit UTF-8: the payload may contain non-ASCII text and the platform
    # default encoding is not guaranteed.
    with open(ANNOTATIONS_PATH, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank/trailing lines are common in JSONL and must not
                # abort the whole upload via json.loads("").
                continue
            ann = json.loads(line)
            img_path = os.path.join("/data", ann["image"])
            if os.path.exists(img_path):
                annotations.append((ann, img_path))

    print(f"Found {len(annotations)} images — loading...")

    # Pass 2: decode images. This is deliberately best-effort: one unreadable
    # file or bad record is reported and skipped rather than failing the run.
    images, responses = [], []
    for ann, img_path in annotations:
        try:
            # The context manager releases the file handle; convert() returns
            # an independent in-memory copy.
            with PILImage.open(img_path) as src:
                img = src.convert("RGB")
            response = json.dumps(ann["invoice"], ensure_ascii=False)
        except Exception as e:
            print(f"  Skipping {ann['image']}: {e}")
            continue
        # Append both together so the two columns can never get out of step.
        images.append(img)
        responses.append(response)

    if len(images) < 2:
        raise RuntimeError(
            f"Only {len(images)} usable image(s) found via {ANNOTATIONS_PATH}; "
            "at least 2 are needed to build a train/test split"
        )

    print(f"Building dataset from {len(images)} images...")
    features = Features({"image": HFImage(), "response": Value("string")})
    ds = Dataset.from_dict({"image": images, "response": responses}, features=features)

    split = ds.train_test_split(test_size=0.1, seed=42)
    split.push_to_hub(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(
        f"Pushed: {len(split['train'])} train + {len(split['test'])} test"
        f"  →  {HF_DATASET_REPO}"
    )


# ─── Step 2: train ────────────────────────────────────────────────────────────

@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=14400,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
    },
    memory=32768,
)
def train():
    """Fine-tune ``BASE_MODEL`` with QLoRA on the invoice dataset and publish the adapter.

    Steps, in order:
      1. Load the train/test splits of ``HF_DATASET_REPO``.
      2. Load the base model in 4-bit NF4, plus its tokenizer and processor.
      3. Pre-process every sample into tensors; prompt positions get label
         -100 so only the response tokens are supervised.
      4. Wrap the model with LoRA adapters and train with ``Trainer``.
      5. Save the adapter to ``/output/minicpm-v-lora`` and upload it, with a
         generated model card, to ``HF_REPO``.

    Raises:
        RuntimeError: if the processor cannot be loaded, or if pre-processing
            leaves the train or eval split empty.
    """
    import torch
    import torch.nn.functional as F
    from datasets import load_dataset
    from huggingface_hub import HfApi
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    from torch.utils.data import Dataset as TorchDataset
    from transformers import (
        AutoModelForMultimodalLM,
        AutoTokenizer,
        BitsAndBytesConfig,
        Trainer,
        TrainingArguments,
    )

    # ── Load dataset ──────────────────────────────────────────────────────────
    hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")
    # Reported in the model card instead of a hard-coded figure.
    n_invoices = len(hf_ds["train"]) + len(hf_ds["test"])

    # ── Load model with 4-bit QLoRA ───────────────────────────────────────────
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    raw_model = AutoModelForMultimodalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        trust_remote_code=True,
        dtype=torch.bfloat16,
        device_map="auto",
    )
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    print(f"Loaded model class: {raw_model.__class__.__name__}")

    # ── Load the processor (required) ─────────────────────────────────────────
    # preprocess_one() cannot work without the processor. Failing here gives
    # one clear error instead of every sample being skipped later and the
    # Trainer starting on an empty dataset.
    try:
        from transformers import AutoProcessor
        processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
    except Exception as e:
        raise RuntimeError(
            f"Could not load AutoProcessor for {BASE_MODEL}; "
            "it is required for image preprocessing"
        ) from e
    print(f"Loaded processor: {processor.__class__.__name__}")

    # Diagnostic only: report which image placeholder token the tokenizer knows
    # (MiniCPM-V uses (<image>./</image>) or <image>). Not used for training.
    image_token = None
    vocab = tokenizer.get_vocab()
    for candidate in ["(<image>./</image>)", "<image>", "<IMAGE>", "[IMAGE]"]:
        if candidate in vocab:
            image_token = candidate
            break
    # Fallback: scan tokenizer's special/added tokens
    if image_token is None:
        for token in list(tokenizer.special_tokens_map.values()) + list(tokenizer.added_tokens_encoder.keys()):
            if isinstance(token, str) and "image" in token.lower():
                image_token = token
                break
    print(f"Image placeholder token: {image_token!r}")
    print(f"Special tokens: {tokenizer.special_tokens_map}")

    # ── Pre-process ALL data before PEFT wrapping ─────────────────────────────
    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"

    def preprocess_one(image, response, debug=False):
        """Turn one (image, response JSON) pair into model-ready tensors.

        Returns a dict with ``input_ids``, ``attention_mask``, ``labels`` and
        every vision field the processor produced. Raises ``ValueError`` when
        the prompt alone reaches ``MAX_SEQ_LENGTH``; callers skip such samples.
        """
        image = image.convert("RGB")

        # MUST use processor.apply_chat_template with {"type": "image"} —
        # NOT tokenizer.apply_chat_template with a "<image>" string.
        # Only the processor knows to expand {"type":"image"} into the correct
        # number of <|image_pad|> tokens; the tokenizer leaves a bare <image>
        # placeholder and the model then finds tokens:0, features:N mismatch.
        msgs = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": instruction},
            ]},
        ]
        text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

        # max_slice_nums=6: balance context vs OOM for portrait invoices
        proc_out = processor(
            text=text,
            images=[image],
            return_tensors="pt",
            max_slice_nums=6,
        )

        if debug:
            print("=== PROCESSOR OUTPUT SHAPES (first sample) ===")
            for k, v in proc_out.items():
                if isinstance(v, torch.Tensor):
                    print(f"  {k}: shape={list(v.shape)}, dtype={v.dtype}")
                elif isinstance(v, list):
                    item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:3]]
                    print(f"  {k}: list[{len(v)}] = {item_info}")
                else:
                    print(f"  {k}: {type(v).__name__} = {v!r}")

        prompt_ids = proc_out["input_ids"][0]

        # The truncation below must never cut into the prompt: that would drop
        # image placeholder tokens while the pixel data is still passed, and
        # leave no supervised response token (all labels -100).
        if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
            raise ValueError(
                f"prompt is {prompt_ids.size(0)} tokens, which leaves no room for the "
                f"response within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
            )

        # pixel_values: processor returns (1, 3, 14, W) WITH batch dim.
        # target_sizes: returned as (N_tiles, 2) with NO batch dim.
        # Strip batch dim only where it exists (shape[0]==1).
        vision_fields = {}
        for k, v in proc_out.items():
            if k in ("input_ids", "attention_mask"):
                continue
            if isinstance(v, torch.Tensor):
                if k == "pixel_values":
                    vision_fields[k] = v[0]  # (1,3,14,W) → (3,14,W); collator stacks to (B,3,14,W)
                elif v.shape[0] == 1:
                    vision_fields[k] = v[0]  # strip batch-1 wrapper from metadata scalars
                else:
                    vision_fields[k] = v     # e.g. target_sizes (N_tiles,2) — no batch dim
            elif isinstance(v, list) and len(v) == 1:
                vision_fields[k] = v[0]
            else:
                vision_fields[k] = v

        if debug:
            print("=== VISION FIELDS AFTER PROCESSING ===")
            for k, v in vision_fields.items():
                if isinstance(v, torch.Tensor):
                    print(f"  {k}: shape={list(v.shape)}")
                else:
                    print(f"  {k}: {type(v).__name__} = {v!r}")

        response_ids = torch.tensor(
            tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False),
            dtype=torch.long,
        )

        # Prompt positions are masked with -100 so the loss covers only the
        # response. Thanks to the guard above, truncation can only shorten
        # the response tail.
        full_ids = torch.cat([prompt_ids, response_ids])
        labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])
        full_ids = full_ids[:MAX_SEQ_LENGTH]
        labels = labels[:MAX_SEQ_LENGTH]

        return {
            "input_ids": full_ids,
            "attention_mask": torch.ones_like(full_ids),
            "labels": labels,
            **vision_fields,   # pixel_values, image_sizes, etc.
        }

    print("Pre-processing training data...")
    train_data, eval_data = [], []
    # Best-effort per sample: a bad record is reported and skipped.
    for i, s in enumerate(hf_ds["train"]):
        try:
            train_data.append(preprocess_one(s["image"], s["response"], debug=(i == 0)))
        except Exception as e:
            print(f"  Skipping train[{i}]: {e}")
    for i, s in enumerate(hf_ds["test"]):
        try:
            eval_data.append(preprocess_one(s["image"], s["response"]))
        except Exception as e:
            print(f"  Skipping eval[{i}]: {e}")
    print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")

    # Training needs data, and eval_strategy="epoch" with
    # load_best_model_at_end=True needs an eval loss. Stop now rather than
    # failing obscurely inside the Trainer.
    if not train_data or not eval_data:
        raise RuntimeError(
            f"Pre-processing left {len(train_data)} train and {len(eval_data)} eval samples; "
            "both splits must be non-empty (see the 'Skipping' messages above)"
        )

    # ── PEFT wrapping ─────────────────────────────────────────────────────────
    model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)

    # task_type=None → base PeftModel; avoids requiring prepare_inputs_for_generation
    lora_config = LoraConfig(
        r=LORA_RANK,
        lora_alpha=LORA_RANK * 2,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=None,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

    # ── Dataset (just wraps pre-processed list) ───────────────────────────────
    class PreprocessedDataset(TorchDataset):
        """Map-style dataset over an in-memory list of pre-processed samples."""

        def __init__(self, data):
            self.data = data

        def __len__(self):
            return len(self.data)

        def __getitem__(self, idx):
            return self.data[idx]

    train_dataset = PreprocessedDataset(train_data)
    eval_dataset = PreprocessedDataset(eval_data)

    # ── Collator: pad to batch max length ─────────────────────────────────────
    # Padded positions are masked out, so a fallback id of 0 is harmless when
    # the tokenizer defines no pad token.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

    def collate_fn(batch):
        """Right-pad the text tensors to the batch max length and merge vision fields."""
        max_len = max(b["input_ids"].size(0) for b in batch)
        result = {}
        for b in batch:
            pad = max_len - b["input_ids"].size(0)
            result.setdefault("input_ids", []).append(F.pad(b["input_ids"], (0, pad), value=pad_id))
            result.setdefault("attention_mask", []).append(F.pad(b["attention_mask"], (0, pad), value=0))
            result.setdefault("labels", []).append(F.pad(b["labels"], (0, pad), value=-100))
        result = {k: torch.stack(v) for k, v in result.items()}

        # Pass through every vision field.
        # pixel_values (3,14,W) per sample → stack → (B,3,14,W)  [4D for conv2d]
        # target_sizes (N_tiles,2) per sample → cat → (total_tiles,2) [no extra batch dim]
        extra_keys = [k for k in batch[0] if k not in result]
        for k in extra_keys:
            vals = [b[k] for b in batch]
            if k == "target_sizes":
                result[k] = torch.cat(vals, dim=0)  # (total_tiles, 2)
            else:
                try:
                    result[k] = torch.stack(vals)
                except (RuntimeError, TypeError):
                    # Non-tensor or ragged values are passed through as a list.
                    result[k] = vals
        return result

    # ── Debug Trainer: print input shapes on first batch ─────────────────────
    # One-element list so the flag can be flipped from inside the method.
    _debug_step_done = [False]

    class DebugTrainer(Trainer):
        """Trainer that logs the shapes of the first batch, then behaves normally."""

        def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
            if not _debug_step_done[0]:
                _debug_step_done[0] = True
                print("=== MODEL INPUT SHAPES (first batch) ===")
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        print(f"  {k}: shape={list(v.shape)}, dtype={v.dtype}")
                    elif isinstance(v, list):
                        item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:2]]
                        print(f"  {k}: list[{len(v)}] = {item_info}")
                    else:
                        print(f"  {k}: {type(v).__name__} = {v!r}")
            return super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch, **kwargs)

    # ── Training ──────────────────────────────────────────────────────────────
    training_args = TrainingArguments(
        output_dir="/output/minicpm-v-sft",
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=GRAD_ACCUM,
        warmup_steps=10,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        bf16=True,
        fp16=False,
        logging_steps=10,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
        # Keep the vision fields: the Trainer would otherwise drop keys it
        # does not recognise from the model signature.
        remove_unused_columns=False,
        dataloader_num_workers=0,  # data already pre-processed; no workers needed
    )

    trainer = DebugTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=collate_fn,
    )
    trainer.train()
    print("Training complete")

    # ── Save LoRA adapter ─────────────────────────────────────────────────────
    model.save_pretrained("/output/minicpm-v-lora")
    tokenizer.save_pretrained("/output/minicpm-v-lora")
    print("LoRA adapter saved to /output/minicpm-v-lora")

    # ── Publish adapter to HF Hub ─────────────────────────────────────────────
    api = HfApi(token=os.environ["HF_TOKEN"])
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True)

    # Only top-level files are uploaded; sub-directories are intentionally left out.
    adapter_dir = Path("/output/minicpm-v-lora")
    for f in adapter_dir.iterdir():
        if f.is_file():
            print(f"Uploading {f.name}...")
            api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)

    model_card = f"""---
license: apache-2.0
base_model: {BASE_MODEL}
datasets:
  - {HF_DATASET_REPO}
tags:
  - invoice-extraction
  - indian-fmcg
  - minicpm-v
  - ocr
  - qlora
  - peft
---

# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)

Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
handwritten bills).

## Usage

```python
from transformers import AutoModelForMultimodalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
model = PeftModel.from_pretrained(base, "{HF_REPO}")
tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
```

## Training

- Base model: {BASE_MODEL}
- Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — {n_invoices} synthetic Indian invoices
- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
"""
    api.upload_file(
        path_or_fileobj=model_card.encode(),
        path_in_repo="README.md",
        repo_id=HF_REPO,
    )
    print(f"Published to {HF_REPO}")


# ─── Dry-run: verify preprocessing + first batch shapes without training ──────
# Usage: modal run finetune/train_minicpm_v.py::main_dryrun
# Completes in ~2 min; confirms shapes are correct before a full training run.

@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=600,
    secrets=[HF_SECRET],
    memory=32768,
)
def dryrun():
    """Run one labelled forward pass on the first training sample.

    Loads the dataset, base model, tokenizer and processor, builds the same
    chat prompt the training uses, prints the shape of each tensor the
    processor returns (plus the ``<|image_pad|>`` count for ``input_ids``),
    then feeds prompt + response through the model without gradients.

    Raises:
        RuntimeError: if the forward pass returns no loss.
    """
    import torch
    from datasets import load_dataset
    from transformers import AutoModelForMultimodalLM, AutoTokenizer, AutoProcessor

    dataset = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
    net = AutoModelForMultimodalLM.from_pretrained(
        BASE_MODEL,
        trust_remote_code=True,
        device_map="auto",
    )
    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
    proc = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)

    # Build the chat prompt for the first training example.
    first = dataset["train"][0]
    rgb = first["image"].convert("RGB")
    user_text = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": user_text}],
    }
    chat_text = proc.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = proc(text=chat_text, images=[rgb], return_tensors="pt", max_slice_nums=6)

    # Report tensor shapes; for input_ids also count the image pad tokens.
    print("=== DRY-RUN: processor output ===")
    image_pad_id = tok.convert_tokens_to_ids("<|image_pad|>")
    for name, value in encoded.items():
        if not isinstance(value, torch.Tensor):
            continue
        if name == "input_ids":
            n_pad = (value == image_pad_id).sum().item()
            suffix = f"  (<|image_pad|> count={n_pad})"
        else:
            suffix = ""
        print(f"  {name}: shape={list(value.shape)}{suffix}")

    # Prompt tokens are masked with -100; only the response carries labels.
    answer_ids = torch.tensor(
        tok.encode(first["response"] + tok.eos_token, add_special_tokens=False),
        dtype=torch.long,
    )
    prompt_ids = encoded["input_ids"][0]
    masked_prompt = torch.full_like(prompt_ids, -100)
    full_ids = torch.cat([prompt_ids, answer_ids])[:MAX_SEQ_LENGTH]
    labels = torch.cat([masked_prompt, answer_ids])[:MAX_SEQ_LENGTH]

    target = next(net.parameters()).device

    def on_device(value):
        # Non-tensor processor outputs are forwarded untouched.
        return value.to(target) if isinstance(value, torch.Tensor) else value

    batch = {
        "input_ids": on_device(full_ids.unsqueeze(0)),
        "attention_mask": on_device(torch.ones_like(full_ids).unsqueeze(0)),
        "labels": on_device(labels.unsqueeze(0)),
    }
    # Forward every remaining processor field (vision inputs) as-is.
    for name, value in encoded.items():
        if name in ("input_ids", "attention_mask"):
            continue
        batch[name] = on_device(value)

    net.eval()
    with torch.no_grad():
        result = net(**batch)

    loss = getattr(result, "loss", None)
    if loss is None:
        raise RuntimeError(
            f"Dry-run forward did not return loss. Output keys: {list(result.keys())}"
        )
    print(f"Dry-run forward loss: {loss.detach().float().item():.4f}")
    print("Dry-run complete - shapes and labeled forward pass look correct")


# ─── Local entrypoints ─────────────────────────────────────────────────────────

@app.local_entrypoint()
def main():
    """Local entrypoint for Step 2: run ``train`` remotely on Modal."""
    train.remote()


@app.local_entrypoint()
def main_push():
    """Local entrypoint for Step 1: run ``push_dataset`` remotely on Modal."""
    push_dataset.remote()


@app.local_entrypoint()
def main_dryrun():
    """Local entrypoint for the dry-run check: run ``dryrun`` remotely on Modal."""
    dryrun.remote()