"""
Task 0.3 — Fine-tune MiniCPM5-1B on Indian FMCG product name normalisation.

Run on Modal (A10G GPU, ~1-2 hours):
    modal run finetune/train_minicpm5_1b.py

Publishes:
    naazimsnh02/minicpm5-1b-indian-fmcg-normalizer
        *.gguf  (Q4_K_M quantised; uploaded under the exporter's file name)

Dataset: 2,000 synthetic (raw_name, normalized_name) pairs generated here
covering 200 Indian FMCG SKUs with realistic abbreviations and typos.
"""

import json
import os
import random
import re
from pathlib import Path

import modal

# Modal application that owns the remote training function defined below.
app = modal.App("kirana-minicpm5-1b-finetune")

# Container image for the training job: Debian slim with Python 3.11, system
# build tooling installed through apt, and the training stack through pip.
# NOTE(review): cmake/build-essential/libcurl are presumably required by the
# GGUF export step — confirm against the exporter's requirements.
IMAGE = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install(
        "curl", "git", "cmake", "build-essential",
        "libssl-dev", "libcurl4-openssl-dev",
    )
    .pip_install(
        "unsloth>=2026.5.0",
        "huggingface_hub>=0.30.0",
        "datasets>=3.0.0",
        "torch>=2.3.0",
        "transformers>=4.43.0",
        "trl>=0.9.0",
        "peft>=0.12.0",
    )
)

# Modal secret attached to the training function; train() reads the
# HF_TOKEN environment variable when publishing.
HF_SECRET = modal.Secret.from_name("hf-secret")

# Base checkpoint to fine-tune and the Hugging Face repo that receives the result.
BASE_MODEL = "openbmb/MiniCPM5-1B"
HF_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"

# Location of the catalog relative to this file. The code visible in this
# module does not read it: train() loads /data/fmcg_catalog.json from the
# mounted Modal volume instead.
CATALOG_PATH = Path(__file__).parent.parent / "data" / "fmcg_catalog.json"

# Training hyper-parameters consumed by train().
LORA_RANK = 16  # also passed as lora_alpha
MAX_SEQ_LENGTH = 512
EPOCHS = 3
BATCH_SIZE = 8  # per-device batch size; train() uses 2 gradient-accumulation steps
LEARNING_RATE = 2e-4

# System turn placed at the start of every training conversation.
SYSTEM_PROMPT = (
    "You are an Indian FMCG product name normalizer. "
    "Given a raw product name from a distributor invoice, return ONLY the canonical product name. "
    "No explanation, no punctuation — just the canonical name."
)


# ─── Synthetic pair generation ────────────────────────────────────────────────

# Rewrite rules that turn canonical wording into invoice shorthand.
# Each entry is ``(regex, replacements)``. ``replacements`` is either a list of
# literal strings or a callable that receives the regex match and returns such
# a list (used for pack sizes, where the matched number must be carried over).
# Patterns are applied case-insensitively, and only the first
# ``_OPTIONS_PER_RULE`` options of each rule are used by ``make_raw_variants``.
ABBREVIATION_RULES = [
    # common invoice shorthand patterns
    (r"\bMaggi\b", ["MAGGI", "MAGGI NDL", "MAGGI NL", "MGI"]),
    (r"\bNoodles\b", ["NDL", "NOODLE", "NOODLES", "NDL."]),
    (r"\bMasala\b", ["MSL", "MASLA", "MASALA", "MAS"]),
    (r"\bWashing Powder\b", ["WSH PWD", "WASH PWD", "W/POWDER", "WSH POWD"]),
    (r"\bToothpaste\b", ["TPASTE", "T/PASTE", "TOOTH PASTE", "TP"]),
    (r"\bCoconut Oil\b", ["CNT OIL", "COCNT OIL", "C.OIL", "CO OIL"]),
    (r"\bHandwash\b", ["H/WASH", "HNDWSH", "HW", "HND WSH"]),
    (r"\bShampoo\b", ["SHMP", "SHP", "SHAMPOO", "SHM"]),
    (r"\bBiscuit\b", ["BISC", "BSC", "BISCUIT", "BISC."]),
    (r"\bFlour\b", ["FLR", "FLOUR", "ATTA", "ATT"]),
    (r"\bButter\b", ["BTR", "BUTTER", "BUT"]),
    (r"\bCheese\b", ["CHSE", "CHZ", "CHEESE"]),
    (r"\bSlices\b", ["SLC", "SLICES", "SLI"]),
    (r"(\d+)g\b", lambda m: [f"{m.group(1)}GM", f"{m.group(1)}G", f"{m.group(1)}GRM", f"{m.group(1)} G"]),
    (r"(\d+)ml\b", lambda m: [f"{m.group(1)}ML", f"{m.group(1)} ML", f"{m.group(1)}ml"]),
    (r"(\d+)kg\b", lambda m: [f"{m.group(1)}KG", f"{m.group(1)}KGS", f"{m.group(1)} KG"]),
    (r"(\d+)L\b", lambda m: [f"{m.group(1)}LTR", f"{m.group(1)}L", f"{m.group(1)} LTR"]),
    (r"\bStrong Teeth\b", ["STR TEETH", "STRONG TH", "STR.TH"]),
    (r"\b(Parle-G|Parle G)\b", ["PARLE G", "PARLEG", "PARLE-G", "P-G", "PG"]),
    (r"\bGood Day\b", ["GOOD DAY", "GD DAY", "GOODDAY"]),
    (r"\bMarie Light\b", ["MARIE LT", "MARIE LIGHT", "MRL", "MARIE"]),
]

# ``(correct spelling, misspelling)`` pairs. The correct spelling is matched
# case-sensitively against the canonical name.
COMMON_TYPOS = [
    ("Excel", "Exel"), ("Colgate", "Colgat"), ("Aashirvaad", "Ashirwad"),
    ("Britannia", "Britania"), ("Pepsodent", "Pepsodant"), ("Parachute", "Parachut"),
    ("Lifebuoy", "Lifebouy"), ("Amul", "AMUL"), ("Maggi", "Magi"),
    ("Frooti", "Fruti"), ("Sprite", "Sprit"),
]

# How many replacement options are taken from each abbreviation rule.
_OPTIONS_PER_RULE = 2


def _apply_abbreviation_rule(pattern: str, replacements, text: str) -> list:
    """Return the rewritten forms of *text* produced by one abbreviation rule.

    Returns an empty list when the pattern does not occur in *text*.
    """
    first_match = re.search(pattern, text, re.IGNORECASE)
    if first_match is None:
        return []

    if callable(replacements):
        # Size rules: build each option per match so that a name containing
        # several sizes keeps every number ("100g 50g" -> "100GM 50GM").
        option_count = min(_OPTIONS_PER_RULE, len(replacements(first_match)))
        return [
            re.sub(pattern, lambda m, i=index: replacements(m)[i], text, flags=re.IGNORECASE)
            for index in range(option_count)
        ]

    # Literal rules: a function replacement inserts the text verbatim, so the
    # shorthand is never interpreted as a regex replacement template.
    return [
        re.sub(pattern, lambda _m, rep=option: rep, text, flags=re.IGNORECASE)
        for option in replacements[:_OPTIONS_PER_RULE]
    ]


def make_raw_variants(canonical: str, aliases: list, n: int = 8) -> list:
    """Generate realistic raw invoice strings from a canonical product name.

    Variants come from the first five known aliases, an upper-cased copy,
    the abbreviation rules, the common-typo table and short upper-cased
    prefixes of the name.

    Args:
        canonical: Canonical product name to derive variants from.
        aliases: Known raw spellings for the product; ``None`` is treated
            as an empty list.
        n: Maximum number of variants to return.

    Returns:
        Up to ``n`` distinct, non-empty strings, none equal to ``canonical``.
        The order is a shuffle driven by the module-level ``random`` state,
        so seeding ``random`` makes the result reproducible.
    """
    variants = set((aliases or [])[:5])  # start with known aliases

    # Uppercase variant
    variants.add(canonical.upper())

    # Abbreviation rules
    for pattern, replacements in ABBREVIATION_RULES:
        variants.update(_apply_abbreviation_rule(pattern, replacements, canonical))

    # Common typos, in the original casing and in upper case
    for correct, typo in COMMON_TYPOS:
        if correct in canonical:
            variants.add(canonical.replace(correct, typo))
            variants.add(canonical.upper().replace(correct.upper(), typo.upper()))

    # Truncation to the leading two / three words
    words = canonical.split()
    if len(words) >= 3:
        variants.add(" ".join(words[:2]).upper())
        variants.add(" ".join(words[:3]).upper())

    # Set iteration order for strings changes between interpreter runs (hash
    # randomisation); sorting first makes the seeded shuffle reproducible.
    variant_list = sorted(v for v in variants if v and v != canonical)
    random.shuffle(variant_list)
    return variant_list[:n]


def generate_normalisation_dataset(catalog: list, target_pairs: int = 2000) -> list:
    """Build shuffled ``{"raw", "canonical"}`` training pairs from the catalog.

    For every catalog entry the first four aliases are always included, then
    up to ``target_pairs // len(catalog)`` (at least one) generated variants
    are added. Each ``(raw, canonical)`` combination appears at most once, so
    an identical pair cannot land on both sides of a later train/eval split.

    Args:
        catalog: Entries with a ``"canonical_name"`` key and an optional
            ``"common_aliases"`` list.
        target_pairs: Upper bound on the number of pairs returned.

    Returns:
        At most ``target_pairs`` pairs in shuffled order; an empty list when
        the catalog is empty.

    Note:
        Seeds the module-level ``random`` generator with 42. The variant
        generator shuffles with that same generator.
    """
    random.seed(42)

    if not catalog:
        # Nothing to generate; also avoids dividing by zero below.
        return []

    pairs_per_sku = max(1, target_pairs // len(catalog))
    pairs = []
    seen = set()

    def add_pair(raw, canonical) -> bool:
        """Append the pair unless it is empty, trivial or already present."""
        cleaned = raw.strip() if isinstance(raw, str) else ""
        key = (cleaned, canonical)
        if not cleaned or cleaned == canonical or key in seen:
            return False
        seen.add(key)
        pairs.append({"raw": cleaned, "canonical": canonical})
        return True

    for entry in catalog:
        canonical = entry["canonical_name"]
        aliases = entry.get("common_aliases") or []
        # Ask for a few extra variants so that ones rejected below (aliases
        # already added, blanks) do not shrink this SKU's share.
        candidates = make_raw_variants(canonical, aliases, n=pairs_per_sku + 4)

        # Ensure aliases are always included
        for alias in aliases[:4]:
            add_pair(alias, canonical)

        # Add generated variants up to the per-SKU quota
        accepted = 0
        for raw in candidates:
            if accepted == pairs_per_sku:
                break
            if add_pair(raw, canonical):
                accepted += 1

    random.shuffle(pairs)
    return pairs[:max(target_pairs, 0)]


def build_training_sample(pair: dict) -> dict:
    """Wrap one raw/canonical pair as a system-user-assistant conversation.

    Args:
        pair: Mapping with ``"raw"`` (invoice text) and ``"canonical"``
            (target product name) entries.

    Returns:
        A dict whose ``"messages"`` list holds the system prompt, the quoted
        raw name as the user turn, and the canonical name as the assistant turn.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": f'Invoice product name: "{pair["raw"]}"'}
    assistant_turn = {"role": "assistant", "content": pair["canonical"]}
    return {"messages": [system_turn, user_turn, assistant_turn]}


# ─── Modal function ───────────────────────────────────────────────────────────

@app.function(
    image=IMAGE,
    gpu="A10G",
    timeout=7200,
    secrets=[HF_SECRET],
    volumes={
        "/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True),
        "/output": modal.Volume.from_name("kirana-minicpm5-1b-output", create_if_missing=True),
    },
    memory=16384,
)
def train():
    """Fine-tune the base model, export it to GGUF and publish it.

    Steps:
        1. Read the catalog from ``/data/fmcg_catalog.json`` and generate the
           synthetic normalisation pairs (90 % train / 10 % eval).
        2. Load ``BASE_MODEL`` in 4-bit, attach LoRA adapters and run SFT.
        3. Export a ``q4_k_m`` GGUF under ``/output``.
        4. Upload every exported ``*.gguf`` file and a model card to ``HF_REPO``.

    Raises:
        RuntimeError: If ``HF_TOKEN`` is missing, the catalog file is absent,
            no training pairs could be generated, or no GGUF file is found
            after the export.
    """
    from datasets import Dataset
    from huggingface_hub import HfApi
    from unsloth import FastLanguageModel
    from trl import SFTTrainer, SFTConfig

    # Fail before any GPU time is spent rather than after training finishes.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError("HF_TOKEN is not set; check the 'hf-secret' Modal secret before training.")

    # The catalog must already be present on the mounted Modal volume.
    catalog_modal = Path("/data/fmcg_catalog.json")
    if not catalog_modal.exists():
        raise RuntimeError("Upload data/fmcg_catalog.json to Modal volume first (run Task 3.1 locally).")

    catalog = json.loads(catalog_modal.read_text())
    pairs = generate_normalisation_dataset(catalog, target_pairs=2000)
    print(f"Generated {len(pairs)} normalisation pairs")
    if not pairs:
        raise RuntimeError("No normalisation pairs were generated from /data/fmcg_catalog.json.")

    samples = [build_training_sample(p) for p in pairs]
    split_idx = int(len(samples) * 0.9)
    train_dataset = Dataset.from_list(samples[:split_idx])
    eval_dataset = Dataset.from_list(samples[split_idx:])

    # --- Load model ---
    model, tokenizer = FastLanguageModel.from_pretrained(
        BASE_MODEL,
        max_seq_length=MAX_SEQ_LENGTH,
        load_in_4bit=True,
        dtype=None,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_RANK,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=LORA_RANK,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )

    def to_text(example):
        # Render the whole conversation (including the assistant answer) as
        # one training string using the tokenizer's chat template.
        example["text"] = tokenizer.apply_chat_template(
            example["messages"], tokenize=False, add_generation_prompt=False
        )
        return example

    train_dataset = train_dataset.map(to_text)
    eval_dataset = eval_dataset.map(to_text)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            output_dir="/output/minicpm5-1b-sft",
            per_device_train_batch_size=BATCH_SIZE,
            gradient_accumulation_steps=2,
            warmup_steps=20,
            num_train_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            fp16=False,
            bf16=True,
            logging_steps=20,
            eval_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            report_to="none",
            max_seq_length=MAX_SEQ_LENGTH,
        ),
    )
    trainer.train()
    print("Training complete")

    # --- Export GGUF ---
    gguf_target = "/output/minicpm5-1b-gguf"
    model.save_pretrained_gguf(
        gguf_target,
        tokenizer,
        quantization_method="q4_k_m",
    )

    # --- Publish ---
    api = HfApi(token=hf_token)
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True)

    # Look in the "<target>_gguf" directory first (the location this script
    # has always read from), then in the target directory itself, so that a
    # change in the exporter's layout cannot produce a silent empty publish.
    gguf_files = []
    for candidate in (Path(gguf_target + "_gguf"), Path(gguf_target)):
        if candidate.is_dir():
            gguf_files = sorted(candidate.glob("*.gguf"))
            if gguf_files:
                break
    if not gguf_files:
        raise RuntimeError(
            f"GGUF export produced no .gguf files under {gguf_target}_gguf or {gguf_target}."
        )

    for f in gguf_files:
        print(f"Uploading {f.name}...")
        api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)

    # The dataset line reports the counts actually used for this run.
    model_card = f"""---
license: apache-2.0
base_model: {BASE_MODEL}
tags:
  - product-normalization
  - indian-fmcg
  - gguf
  - qlora
---

# MiniCPM5-1B — Indian FMCG Product Name Normalizer

Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) to normalize
inconsistent Indian FMCG product names from distributor invoices to canonical SKU names.

## Example

| Raw Invoice Name | Normalized |
|---|---|
| MAGGI NDL 70GM | Nestle Maggi Masala Noodles 70g |
| SURF XL 1K | Surf Excel Washing Powder 1kg |
| PARLEG 80 | Parle-G Biscuit 80g |
| COLGAT 100G | Colgate Strong Teeth Toothpaste 100g |

## Training

- Base model: {BASE_MODEL}
- Method: QLoRA (rank {LORA_RANK}) via Unsloth on Modal A10G
- Dataset: {len(pairs):,} synthetic (raw, canonical) pairs covering {len(catalog)} Indian FMCG SKUs
- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
"""
    api.upload_file(path_or_fileobj=model_card.encode(), path_in_repo="README.md", repo_id=HF_REPO)
    print(f"Published to {HF_REPO}")


@app.local_entrypoint()
def main():
    """Local entry point: invoke the ``train`` function remotely."""
    train.remote()