""" Task 0.3 — Fine-tune MiniCPM5-1B on Indian FMCG product name normalisation. Run on Modal (A10G GPU, ~1-2 hours): modal run finetune/train_minicpm5_1b.py Publishes: naazimsnh02/minicpm5-1b-indian-fmcg-normalizer model.gguf (Q4_K_M quantised) Dataset: 2,000 synthetic (raw_name, normalized_name) pairs generated here covering 200 Indian FMCG SKUs with realistic abbreviations and typos. """ import json import os import random import re from pathlib import Path import modal app = modal.App("kirana-minicpm5-1b-finetune") IMAGE = ( modal.Image.debian_slim(python_version="3.11") .apt_install( "curl", "git", "cmake", "build-essential", "libssl-dev", "libcurl4-openssl-dev", ) .pip_install( "unsloth>=2026.5.0", "huggingface_hub>=0.30.0", "datasets>=3.0.0", "torch>=2.3.0", "transformers>=4.43.0", "trl>=0.9.0", "peft>=0.12.0", ) ) HF_SECRET = modal.Secret.from_name("hf-secret") BASE_MODEL = "openbmb/MiniCPM5-1B" HF_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer" CATALOG_PATH = Path(__file__).parent.parent / "data" / "fmcg_catalog.json" LORA_RANK = 16 MAX_SEQ_LENGTH = 512 EPOCHS = 3 BATCH_SIZE = 8 LEARNING_RATE = 2e-4 SYSTEM_PROMPT = ( "You are an Indian FMCG product name normalizer. " "Given a raw product name from a distributor invoice, return ONLY the canonical product name. " "No explanation, no punctuation — just the canonical name." ) # ─── Synthetic pair generation ──────────────────────────────────────────────── ABBREVIATION_RULES = [ # common invoice shorthand patterns (r"\bMaggi\b", ["MAGGI", "MAGGI NDL", "MAGGI NL", "MGI"]), (r"\bNoodles\b", ["NDL", "NOODLE", "NOODLES", "NDL."]), (r"\bMasala\b", ["MSL", "MASLA", "MASALA", "MAS"]), (r"\bWashing Powder\b", ["WSH PWD", "WASH PWD", "W/POWDER", "WSH POWD"]), (r"\bToothpaste\b", ["TPASTE", "T/PASTE", "TOOTH PASTE", "TP"]), (r"\bCoconut Oil\b", ["CNT OIL", "COCNT OIL", "C.OIL", "CO OIL"]), (r"\bHandwash\b", ["H/WASH", "HNDWSH", "HW", "HND WSH"]), (r"\bShampoo\b", ["SHMP", "SHP", "SHAMPOO", "SHM"]), (r"\bBiscuit\b", ["BISC", "BSC", "BISCUIT", "BISC."]), (r"\bFlour\b", ["FLR", "FLOUR", "ATTA", "ATT"]), (r"\bButter\b", ["BTR", "BUTTER", "BUT"]), (r"\bCheese\b", ["CHSE", "CHZ", "CHEESE"]), (r"\bSlices\b", ["SLC", "SLICES", "SLI"]), (r"(\d+)g\b", lambda m: [f"{m.group(1)}GM", f"{m.group(1)}G", f"{m.group(1)}GRM", f"{m.group(1)} G"]), (r"(\d+)ml\b", lambda m: [f"{m.group(1)}ML", f"{m.group(1)} ML", f"{m.group(1)}ml"]), (r"(\d+)kg\b", lambda m: [f"{m.group(1)}KG", f"{m.group(1)}KGS", f"{m.group(1)} KG"]), (r"(\d+)L\b", lambda m: [f"{m.group(1)}LTR", f"{m.group(1)}L", f"{m.group(1)} LTR"]), (r"\bStrong Teeth\b", ["STR TEETH", "STRONG TH", "STR.TH"]), (r"\b(Parle-G|Parle G)\b", ["PARLE G", "PARLEG", "PARLE-G", "P-G", "PG"]), (r"\bGood Day\b", ["GOOD DAY", "GD DAY", "GOODDAY"]), (r"\bMarie Light\b", ["MARIE LT", "MARIE LIGHT", "MRL", "MARIE"]), ] COMMON_TYPOS = [ ("Excel", "Exel"), ("Colgate", "Colgat"), ("Aashirvaad", "Ashirwad"), ("Britannia", "Britania"), ("Pepsodent", "Pepsodant"), ("Parachute", "Parachut"), ("Lifebuoy", "Lifebouy"), ("Amul", "AMUL"), ("Maggi", "Magi"), ("Frooti", "Fruti"), ("Sprite", "Sprit"), ] def make_raw_variants(canonical: str, aliases: list, n: int = 8) -> list: """Generate realistic raw invoice strings from a canonical product name.""" variants = set(aliases[:5]) # start with known aliases # Uppercase variant variants.add(canonical.upper()) # Apply abbreviation rules for pattern, replacements in ABBREVIATION_RULES: if callable(replacements): for m in re.finditer(pattern, canonical, re.IGNORECASE): for rep in replacements(m)[:2]: variants.add(re.sub(pattern, rep, canonical, flags=re.IGNORECASE)) else: for rep in replacements[:2]: variants.add(re.sub(pattern, rep, canonical, flags=re.IGNORECASE)) # Apply common typos for correct, typo in COMMON_TYPOS: if correct in canonical: variants.add(canonical.replace(correct, typo)) variants.add(canonical.upper().replace(correct.upper(), typo.upper())) # Random truncation words = canonical.split() if len(words) >= 3: variants.add(" ".join(words[:2]).upper()) variants.add(" ".join(words[:3]).upper()) # Shuffle and take n variant_list = [v for v in variants if v != canonical] random.shuffle(variant_list) return variant_list[:n] def generate_normalisation_dataset(catalog: list, target_pairs: int = 2000) -> list: pairs = [] random.seed(42) pairs_per_sku = max(1, target_pairs // len(catalog)) for entry in catalog: canonical = entry["canonical_name"] aliases = entry.get("common_aliases", []) raw_variants = make_raw_variants(canonical, aliases, n=pairs_per_sku + 4) # Ensure aliases are always included for alias in aliases[:4]: if alias != canonical: pairs.append({"raw": alias, "canonical": canonical}) # Add generated variants for raw in raw_variants[:pairs_per_sku]: if raw and raw != canonical: pairs.append({"raw": raw.strip(), "canonical": canonical}) random.shuffle(pairs) return pairs[:target_pairs] def build_training_sample(pair: dict) -> dict: return { "messages": [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": f'Invoice product name: "{pair["raw"]}"'}, {"role": "assistant", "content": pair["canonical"]}, ] } # ─── Modal function ─────────────────────────────────────────────────────────── @app.function( image=IMAGE, gpu="A10G", timeout=7200, secrets=[HF_SECRET], volumes={ "/data": modal.Volume.from_name("kirana-synth-data", create_if_missing=True), "/output": modal.Volume.from_name("kirana-minicpm5-1b-output", create_if_missing=True), }, memory=16384, ) def train(): from datasets import Dataset from huggingface_hub import HfApi from unsloth import FastLanguageModel from trl import SFTTrainer, SFTConfig # Upload catalog to modal volume if not already there catalog_modal = Path("/data/fmcg_catalog.json") if not catalog_modal.exists(): raise RuntimeError("Upload data/fmcg_catalog.json to Modal volume first (run Task 3.1 locally).") catalog = json.loads(catalog_modal.read_text()) pairs = generate_normalisation_dataset(catalog, target_pairs=2000) print(f"Generated {len(pairs)} normalisation pairs") samples = [build_training_sample(p) for p in pairs] split_idx = int(len(samples) * 0.9) train_dataset = Dataset.from_list(samples[:split_idx]) eval_dataset = Dataset.from_list(samples[split_idx:]) # --- Load model --- model, tokenizer = FastLanguageModel.from_pretrained( BASE_MODEL, max_seq_length=MAX_SEQ_LENGTH, load_in_4bit=True, dtype=None, ) model = FastLanguageModel.get_peft_model( model, r=LORA_RANK, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha=LORA_RANK, lora_dropout=0, bias="none", use_gradient_checkpointing="unsloth", random_state=42, ) def to_text(example): example["text"] = tokenizer.apply_chat_template( example["messages"], tokenize=False, add_generation_prompt=False ) return example train_dataset = train_dataset.map(to_text) eval_dataset = eval_dataset.map(to_text) trainer = SFTTrainer( model=model, tokenizer=tokenizer, train_dataset=train_dataset, eval_dataset=eval_dataset, args=SFTConfig( dataset_text_field="text", output_dir="/output/minicpm5-1b-sft", per_device_train_batch_size=BATCH_SIZE, gradient_accumulation_steps=2, warmup_steps=20, num_train_epochs=EPOCHS, learning_rate=LEARNING_RATE, fp16=False, bf16=True, logging_steps=20, eval_strategy="epoch", save_strategy="epoch", load_best_model_at_end=True, report_to="none", max_seq_length=MAX_SEQ_LENGTH, ), ) trainer.train() print("Training complete") # --- Export GGUF --- model.save_pretrained_gguf( "/output/minicpm5-1b-gguf", tokenizer, quantization_method="q4_k_m", ) # --- Publish --- api = HfApi(token=os.environ["HF_TOKEN"]) api.create_repo(HF_REPO, repo_type="model", exist_ok=True) gguf_dir = Path("/output/minicpm5-1b-gguf_gguf") for f in gguf_dir.glob("*.gguf"): print(f"Uploading {f.name}...") api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO) model_card = f"""--- license: apache-2.0 base_model: {BASE_MODEL} tags: - product-normalization - indian-fmcg - gguf - qlora --- # MiniCPM5-1B — Indian FMCG Product Name Normalizer Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) to normalize inconsistent Indian FMCG product names from distributor invoices to canonical SKU names. ## Example | Raw Invoice Name | Normalized | |---|---| | MAGGI NDL 70GM | Nestle Maggi Masala Noodles 70g | | SURF XL 1K | Surf Excel Washing Powder 1kg | | PARLEG 80 | Parle-G Biscuit 80g | | COLGAT 100G | Colgate Strong Teeth Toothpaste 100g | ## Training - Base model: {BASE_MODEL} - Method: QLoRA (rank {LORA_RANK}) via Unsloth on Modal A10G - Dataset: 2,000 synthetic (raw, canonical) pairs covering 200 Indian FMCG SKUs - Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026) """ api.upload_file(path_or_fileobj=model_card.encode(), path_in_repo="README.md", repo_id=HF_REPO) print(f"Published to {HF_REPO}") @app.local_entrypoint() def main(): train.remote()