""" Task 0.2 (Part 1) — Generate 500 synthetic Indian invoice images for MiniCPM-V 4.6 fine-tuning. Renders using pure Pillow — no WeasyPrint, no native system library dependencies. Output: data/synthetic_invoices/{printed_gst,tally_pdf,handwritten,whatsapp}/NNNN.png data/synthetic_invoices/annotations.jsonl Run: python finetune/generate_invoices.py """ import json import math import os import random from datetime import date, timedelta from pathlib import Path from PIL import Image, ImageDraw, ImageFont, ImageFilter import numpy as np CATALOG_PATH = Path(__file__).parent.parent / "data" / "fmcg_catalog.json" OUTPUT_DIR = Path(__file__).parent.parent / "data" / "synthetic_invoices" # A4 at 96 DPI W, H = 794, 1123 SUPPLIERS = [ {"name": "Hindustan Unilever Ltd", "gstin": "27AAACH0192P1Z1", "city": "Mumbai", "state": "Maharashtra"}, {"name": "Nestle India Ltd", "gstin": "07AAACN0032R1ZX", "city": "Delhi", "state": "Delhi"}, {"name": "Parle Products Pvt Ltd", "gstin": "27AABCP1309J1ZN", "city": "Mumbai", "state": "Maharashtra"}, {"name": "Britannia Industries Ltd","gstin": "29AAACB1783L1Z4", "city": "Bengaluru","state": "Karnataka"}, {"name": "ITC Limited", "gstin": "33AAACI5950H1Z6", "city": "Chennai", "state": "Tamil Nadu"}, {"name": "Amul (GCMMF)", "gstin": "24AABCG0788K1ZZ", "city": "Anand", "state": "Gujarat"}, {"name": "Dabur India Ltd", "gstin": "09AABCD0012F1Z2", "city": "Ghaziabad","state": "Uttar Pradesh"}, {"name": "Marico Limited", "gstin": "27AABCM0357L1ZN", "city": "Mumbai", "state": "Maharashtra"}, {"name": "Emami Limited", "gstin": "19AABCE4767H1Z3", "city": "Kolkata", "state": "West Bengal"}, {"name": "Godrej Consumer Products","gstin": "27AABCG0498H1ZC", "city": "Mumbai", "state": "Maharashtra"}, ] BUYERS = [ {"name": "Ravi Provision Store", "gstin": "33AABCR5678K1ZQ", "city": "Chennai"}, {"name": "Sri Lakshmi Kirana", "gstin": "33AACFR4321J1ZP", "city": "Chennai"}, {"name": "Sharma Brothers Traders", "gstin": "27AACHS9012L1ZM", "city": "Mumbai"}, {"name": "Patel Kirana Mart", "gstin": "24AABCK6543M1ZN", "city": "Ahmedabad"}, {"name": "Murugan General Stores", "gstin": "33AADFS8765N1ZR", "city": "Coimbatore"}, {"name": "New India Kirana", "gstin": "07AABCB3456P1ZT", "city": "Delhi"}, {"name": "Ganesh Traders", "gstin": "29AABCG9988L1Z5", "city": "Bengaluru"}, {"name": "Krishna General Store", "gstin": "36AABCK2211N1ZV", "city": "Hyderabad"}, ] # ─── Font loading ───────────────────────────────────────────────────────────── def _try_load(names: list, size: int) -> ImageFont.FreeTypeFont | ImageFont.ImageFont: for name in names: try: return ImageFont.truetype(name, size) except Exception: pass return ImageFont.load_default() def fonts(size: int = 11, bold: bool = False, mono: bool = False): if mono: return _try_load(["cour.ttf", "CourierNew.ttf", "DejaVuSansMono.ttf", "LiberationMono-Regular.ttf", "Courier New.ttf"], size) if bold: return _try_load(["arialbd.ttf", "Arial Bold.ttf", "DejaVuSans-Bold.ttf", "LiberationSans-Bold.ttf", "arial.ttf"], size) return _try_load(["arial.ttf", "Arial.ttf", "DejaVuSans.ttf", "LiberationSans-Regular.ttf", "Helvetica.ttf"], size) # ─── Data generation ────────────────────────────────────────────────────────── def load_catalog() -> list: with open(CATALOG_PATH, encoding="utf-8") as f: return json.load(f) def random_date() -> str: base = date(2024, 4, 1) return (base + timedelta(days=random.randint(0, 365))).isoformat() def invoice_number() -> str: return f"{random.choice(['INV','TAX','GST','BILL'])}/{random.choice(['2024-25','2025-26'])}/{random.randint(1000,99999):05d}" def make_invoice(catalog: list) -> dict: n = random.randint(5, 14) products = random.sample(catalog, min(n, len(catalog))) items = [] for p in products: qty = random.randint(1, 50) price = round(random.uniform(8, 480), 2) if random.random() < 0.15: # price anomaly for training variety price = round(price * random.uniform(1.10, 1.25), 2) gst = p["gst_rate"] items.append({ "product_raw": random.choice([p["canonical_name"]] + p["common_aliases"][:3]), "product_id": p["product_id"], "quantity": qty, "unit_price": price, "gst_rate": gst, "hsn_code": p["hsn_code"], "line_total": round(qty * price * (1 + gst / 100), 2), }) # ~12% chance of a duplicate line if random.random() < 0.12 and len(items) > 1: items.append(items[0].copy()) subtotal = round(sum(it["quantity"] * it["unit_price"] for it in items), 2) gst_total = round(sum(it["quantity"] * it["unit_price"] * it["gst_rate"] / 100 for it in items), 2) grand_total = round(subtotal + gst_total, 2) return { "invoice_number": invoice_number(), "supplier": random.choice(SUPPLIERS), "buyer": random.choice(BUYERS), "date": random_date(), "items": items, "subtotal": subtotal, "gst_total": gst_total, "grand_total": grand_total, } # ─── Shared drawing helpers ─────────────────────────────────────────────────── def text_w(draw, txt, font) -> int: bbox = draw.textbbox((0, 0), txt, font=font) return bbox[2] - bbox[0] def draw_hline(draw, x0, y, x1, color=(0, 0, 0), width=1): draw.line([(x0, y), (x1, y)], fill=color, width=width) def draw_vline(draw, x, y0, y1, color=(0, 0, 0), width=1): draw.line([(x, y0), (x, y1)], fill=color, width=width) COL_WIDTHS = [30, 230, 50, 50, 60, 40, 75] # #, Product, HSN, Qty, Price, GST%, Amount COL_LABELS = ["#", "Product Description", "HSN", "Qty", "Unit Price", "GST%", "Amount"] MARGIN_L, MARGIN_R = 35, 35 TABLE_W = W - MARGIN_L - MARGIN_R # 724 px def table_col_x(col_idx: int) -> int: """Left x of column inside the table.""" x = MARGIN_L for i in range(col_idx): x += COL_WIDTHS[i] return x def draw_table_header(draw, y: int, row_h: int = 18) -> int: fh = fonts(9, bold=True) fill_hdr = (30, 30, 60) draw.rectangle([MARGIN_L, y, W - MARGIN_R, y + row_h], fill=fill_hdr) x = MARGIN_L for i, (label, cw) in enumerate(zip(COL_LABELS, COL_WIDTHS)): tw = text_w(draw, label, fh) draw.text((x + max(0, (cw - tw) // 2), y + 4), label, font=fh, fill=(255, 255, 255)) x += cw return y + row_h def draw_table_rows(draw, items: list, y_start: int, row_h: int = 15) -> int: fn = fonts(8) y = y_start for i, it in enumerate(items): bg = (250, 250, 255) if i % 2 == 0 else (255, 255, 255) draw.rectangle([MARGIN_L, y, W - MARGIN_R, y + row_h], fill=bg) cells = [ str(i + 1), it["product_raw"][:28], it["hsn_code"], str(it["quantity"]), f"Rs.{it['unit_price']:.2f}", f"{it['gst_rate']}%", f"Rs.{it['line_total']:.2f}", ] x = MARGIN_L for cell, cw in zip(cells, COL_WIDTHS): tw = text_w(draw, cell, fn) # right-align numbers if cell.startswith("Rs.") or cell[0].isdigit(): draw.text((x + cw - tw - 3, y + 3), cell, font=fn, fill=(0, 0, 0)) else: draw.text((x + 3, y + 3), cell, font=fn, fill=(0, 0, 0)) x += cw draw_hline(draw, MARGIN_L, y + row_h, W - MARGIN_R, color=(200, 200, 200)) y += row_h return y def draw_table_border(draw, y_top: int, y_bottom: int): draw.rectangle([MARGIN_L, y_top, W - MARGIN_R, y_bottom], outline=(0, 0, 0), width=1) x = MARGIN_L for cw in COL_WIDTHS[:-1]: x += cw draw_vline(draw, x, y_top, y_bottom, color=(160, 160, 160)) # ─── Format 1: Printed GST Bill ─────────────────────────────────────────────── def render_printed_gst(inv: dict) -> Image.Image: img = Image.new("RGB", (W, H), (255, 255, 255)) draw = ImageDraw.Draw(img) f_title = fonts(16, bold=True) f_sub = fonts(10, bold=True) f_norm = fonts(9) f_small = fonts(8) y = 20 # Title bar draw.rectangle([0, y, W, y + 30], fill=(20, 20, 50)) title = "TAX INVOICE" tw = text_w(draw, title, f_title) draw.text(((W - tw) // 2, y + 6), title, font=f_title, fill=(255, 255, 255)) y += 35 sub = "(Original for Buyer)" tw = text_w(draw, sub, f_small) draw.text(((W - tw) // 2, y), sub, font=f_small, fill=(100, 100, 100)) y += 18 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, color=(0, 0, 0), width=2) y += 8 # Supplier / Buyer columns sup, buy = inv["supplier"], inv["buyer"] col_w = (W - MARGIN_L - MARGIN_R) // 2 cx = MARGIN_L + col_w + 10 y0 = y draw.text((MARGIN_L, y), "SOLD BY:", font=f_sub, fill=(50, 50, 50)) draw.text((cx, y), "BILL TO:", font=f_sub, fill=(50, 50, 50)) y += 14 draw.text((MARGIN_L, y), sup["name"], font=fonts(9, bold=True), fill=(0, 0, 0)) draw.text((cx, y), buy["name"], font=fonts(9, bold=True), fill=(0, 0, 0)) y += 13 draw.text((MARGIN_L, y), f"GSTIN: {sup['gstin']}", font=f_norm, fill=(60, 60, 60)) draw.text((cx, y), f"GSTIN: {buy['gstin']}", font=f_norm, fill=(60, 60, 60)) y += 13 draw.text((MARGIN_L, y), f"{sup['city']}, {sup['state']}", font=f_norm, fill=(80, 80, 80)) draw.text((cx, y), f"{buy['city']}", font=f_norm, fill=(80, 80, 80)) y += 18 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, color=(0, 0, 80), width=1) y += 6 # Invoice meta draw.text((MARGIN_L, y), f"Invoice No: {inv['invoice_number']}", font=f_sub, fill=(0, 0, 0)) date_txt = f"Date: {inv['date']}" draw.text((W - MARGIN_R - text_w(draw, date_txt, f_sub), y), date_txt, font=f_sub, fill=(0, 0, 0)) y += 20 # Table y_tbl = y y = draw_table_header(draw, y) y = draw_table_rows(draw, inv["items"], y) draw_table_border(draw, y_tbl, y) y += 10 # Totals fb = fonts(9, bold=True) totals = [ ("Subtotal (before GST)", f"Rs.{inv['subtotal']:.2f}"), ("Total GST", f"Rs.{inv['gst_total']:.2f}"), ("GRAND TOTAL", f"Rs.{inv['grand_total']:.2f}"), ] for label, val in totals: lw = text_w(draw, label, fb) vw = text_w(draw, val, fb) draw.text((W - MARGIN_R - 200, y), label, font=fb, fill=(0, 0, 0)) draw.text((W - MARGIN_R - vw, y), val, font=fb, fill=(180, 0, 0) if label == "GRAND TOTAL" else (0, 0, 0)) y += 15 draw_hline(draw, MARGIN_L, y + 4, W - MARGIN_R, color=(0, 0, 0), width=1) y += 18 footer = "This is a computer-generated invoice. No signature required." fw = text_w(draw, footer, f_small) draw.text(((W - fw) // 2, y), footer, font=f_small, fill=(150, 150, 150)) return img # ─── Format 2: Tally PDF ───────────────────────────────────────────────────── def render_tally(inv: dict) -> Image.Image: img = Image.new("RGB", (W, H), (255, 255, 255)) draw = ImageDraw.Draw(img) fm = fonts(10, mono=True) fm_b = fonts(11, bold=True) fm_s = fonts(9, mono=True) y = 20 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, width=2) y += 6 title = inv["supplier"]["name"] tw = text_w(draw, title, fm_b) draw.text(((W - tw) // 2, y), title, font=fm_b, fill=(0, 0, 0)) y += 16 sub = "PURCHASE BILL / TAX INVOICE" tw2 = text_w(draw, sub, fm) draw.text(((W - tw2) // 2, y), sub, font=fm, fill=(0, 0, 0)) y += 8 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, width=2) y += 10 def kv(key, val, y): draw.text((MARGIN_L, y), f"{key}:", font=fm_s, fill=(80, 80, 80)) draw.text((MARGIN_L + 130, y), val, font=fm_s, fill=(0, 0, 0)) return y + 14 y = kv("Invoice No", inv["invoice_number"], y) y = kv("Date", inv["date"], y) y = kv("Supplier GSTIN", inv["supplier"]["gstin"], y) y = kv("Buyer", inv["buyer"]["name"], y) y = kv("Buyer GSTIN", inv["buyer"]["gstin"], y) y += 6 draw_hline(draw, MARGIN_L, y, W - MARGIN_R) y += 4 # Tally-style table: S.N | Item | Qty | Rate | Tax% | Amount TCOLS = [30, 240, 45, 65, 45, 80] TLBLS = ["S.N", "Item Name", "Qty", "Rate", "Tax%", "Amount"] tx = MARGIN_L draw.rectangle([MARGIN_L, y, W - MARGIN_R, y + 16], fill=(230, 230, 230)) for lbl, cw in zip(TLBLS, TCOLS): draw.text((tx + 2, y + 3), lbl, font=fm_s, fill=(0, 0, 0)) tx += cw y += 16 draw_hline(draw, MARGIN_L, y, W - MARGIN_R) for i, it in enumerate(inv["items"]): cells = [ str(i + 1), it["product_raw"][:30], str(it["quantity"]), f"{it['unit_price']:.2f}", f"{it['gst_rate']}%", f"{it['line_total']:.2f}", ] tx = MARGIN_L for cell, cw in zip(cells, TCOLS): draw.text((tx + 2, y + 2), cell, font=fm_s, fill=(0, 0, 0)) tx += cw y += 14 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, color=(200, 200, 200)) draw_hline(draw, MARGIN_L, y, W - MARGIN_R) y += 6 # Totals for label, val in [("Taxable Amount", f"{inv['subtotal']:.2f}"), ("GST Amount", f"{inv['gst_total']:.2f}"), ("Grand Total", f"{inv['grand_total']:.2f}")]: lw = text_w(draw, label, fm) vw = text_w(draw, val, fm_b) draw.text((MARGIN_L, y), label, font=fm, fill=(0, 0, 0)) draw.text((W - MARGIN_R - vw, y), val, font=fm_b, fill=(0, 0, 0)) y += 16 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, width=2) y += 30 sig = f"For {inv['supplier']['name']}" draw.text((W - MARGIN_R - text_w(draw, sig, fm_s) - 10, y), sig, font=fm_s, fill=(80, 80, 80)) y += 35 auth = "Authorised Signatory" draw.text((W - MARGIN_R - text_w(draw, auth, fm_s) - 10, y), auth, font=fm_s, fill=(80, 80, 80)) return img # ─── Format 3: WhatsApp screenshot ─────────────────────────────────────────── def render_whatsapp(inv: dict) -> Image.Image: # Render a base invoice then add WhatsApp chrome base = render_printed_gst(inv) base_w, base_h = base.size # Frame dimensions HEADER_H = 56 FOOTER_H = 30 SIDE_PAD = 12 BOTTOM_PAD = 10 frame_w = base_w + 2 * SIDE_PAD + 4 frame_h = base_h + HEADER_H + FOOTER_H + BOTTOM_PAD frame = Image.new("RGB", (frame_w, frame_h), (229, 221, 213)) # WA background draw = ImageDraw.Draw(frame) # WA header bar draw.rectangle([0, 0, frame_w, HEADER_H], fill=(18, 140, 126)) # Back arrow area draw.rectangle([0, 0, 50, HEADER_H], fill=(18, 140, 126)) fb = fonts(11, bold=True) fn = fonts(9) draw.text((14, 10), "<", font=fonts(16, bold=True), fill=(255, 255, 255)) # Avatar circle draw.ellipse([56, 8, 92, 44], fill=(37, 211, 102)) draw.text((68, 18), "D", font=fb, fill=(255, 255, 255)) # Supplier name sup_short = inv["supplier"]["name"][:24] draw.text((100, 10), sup_short, font=fb, fill=(255, 255, 255)) draw.text((100, 28), "Distributor", font=fn, fill=(200, 255, 230)) # Chat bubble background bubble_x = SIDE_PAD bubble_y = HEADER_H + 8 bubble_rect = [bubble_x, bubble_y, bubble_x + base_w + 4, bubble_y + base_h + 4] draw.rectangle(bubble_rect, fill=(255, 255, 255), outline=(220, 220, 220)) # "Forwarded" badge fwd_font = fonts(8) draw.text((bubble_x + 6, bubble_y + 4), "Forwarded", font=fwd_font, fill=(102, 102, 102)) # Paste base invoice frame.paste(base, (bubble_x + 2, bubble_y + 16)) # Timestamp + ticks ts = f"{inv['date']} 10:3{random.randint(0,9)} AM ✓✓" tw = text_w(draw, ts, fwd_font) ts_y = bubble_y + base_h + 18 draw.text((bubble_x + base_w + 2 - tw - 4, ts_y), ts, font=fwd_font, fill=(102, 102, 102)) return frame # ─── Format 4: Handwritten (simulated) ─────────────────────────────────────── def render_handwritten(inv: dict) -> Image.Image: # Cream paper background with slight noise bg_color = (252, 248, 238) img = Image.new("RGB", (W, H), bg_color) arr = np.array(img, dtype=np.int16) noise = np.random.randint(-6, 7, arr.shape, dtype=np.int16) arr = np.clip(arr + noise, 0, 255).astype(np.uint8) img = Image.fromarray(arr) draw = ImageDraw.Draw(img) # Use fonts but with slightly irregular line spacing to simulate handwriting fh = fonts(12, bold=True) fn = fonts(11) fs = fonts(9) def jitter(): return random.randint(-2, 2) ink = (20, 20, 80) # dark blue ink pencil = (60, 60, 60) # grey for secondary text red_ink = (160, 20, 20) # red totals y = 28 + jitter() # Title — handwritten heading title = "PURCHASE BILL" tw = text_w(draw, title, fh) draw.text(((W - tw) // 2 + jitter(), y), title, font=fh, fill=ink) y += 22 # Underline draw_hline(draw, MARGIN_L + 40, y, W - MARGIN_R - 40, color=ink, width=2) y += 10 draw.text((MARGIN_L + jitter(), y), f"To: {inv['buyer']['name']}", font=fn, fill=ink) y += 16 draw.text((MARGIN_L + jitter(), y), f"From: {inv['supplier']['name']}", font=fn, fill=pencil) y += 16 draw.text((MARGIN_L + jitter(), y), f"Invoice No: {inv['invoice_number']}", font=fs, fill=pencil) date_txt = f"Date: {inv['date']}" draw.text((W - MARGIN_R - text_w(draw, date_txt, fs) + jitter(), y), date_txt, font=fs, fill=pencil) y += 18 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, color=ink) y += 6 # Column headers H_COLS = [("S.N", 28), ("Item", 250), ("Qty", 40), ("Rate", 65), ("GST", 40), ("Total", 75)] tx = MARGIN_L for lbl, cw in H_COLS: draw.text((tx + jitter(), y + jitter()), lbl, font=fonts(10, bold=True), fill=ink) tx += cw y += 16 draw_hline(draw, MARGIN_L, y, W - MARGIN_R, color=ink) y += 4 for i, it in enumerate(inv["items"]): cells = [str(i + 1), it["product_raw"][:30], str(it["quantity"]), f"{it['unit_price']:.0f}", f"{it['gst_rate']}%", f"{it['line_total']:.0f}"] tx = MARGIN_L for cell, (_, cw) in zip(cells, H_COLS): draw.text((tx + jitter(), y + jitter()), cell, font=fn, fill=ink) tx += cw y += 16 if i % 3 == 2: draw_hline(draw, MARGIN_L, y, W - MARGIN_R, color=(180, 180, 140)) draw_hline(draw, MARGIN_L, y + 2, W - MARGIN_R, color=ink, width=2) y += 16 for label, val, color in [ ("Subtotal", f"Rs.{inv['subtotal']:.2f}", ink), ("GST", f"Rs.{inv['gst_total']:.2f}", pencil), ("TOTAL", f"Rs.{inv['grand_total']:.2f}", red_ink), ]: lw = text_w(draw, label, fn) vw = text_w(draw, val, fonts(11, bold=True)) draw.text((W - MARGIN_R - 180 + jitter(), y), label, font=fn, fill=color) draw.text((W - MARGIN_R - vw + jitter(), y), val, font=fonts(11, bold=True), fill=color) y += 16 draw_hline(draw, MARGIN_L, y + 4, W - MARGIN_R, color=ink) # Slight rotation to simulate non-flat scan angle = random.uniform(-0.8, 0.8) if abs(angle) > 0.2: img = img.rotate(angle, resample=Image.BICUBIC, expand=False, fillcolor=bg_color) return img # ─── Augraphy degradation (optional) ───────────────────────────────────────── def apply_degradation(img: Image.Image) -> Image.Image: try: import augraphy import numpy as np arr = np.array(img.convert("RGB")) pipeline = augraphy.AugmentationPipeline([ augraphy.GaussNoise(gaussian_noise_mean=3, gaussian_noise_std=5, p=0.5), augraphy.Jpeg(quality_range=(72, 95), p=0.4), augraphy.LowInkPeriodicLines(count_range=(2, 4), noise_probability=0.08, p=0.25), ]) arr = pipeline(arr) return Image.fromarray(arr) except Exception: return img # ─── Annotation builder ─────────────────────────────────────────────────────── def make_annotation(inv: dict, image_rel_path: str) -> dict: return { "image": image_rel_path, "invoice": { "invoice_number": inv["invoice_number"], "supplier": inv["supplier"]["name"], "date": inv["date"], "items": [ { "product_raw": it["product_raw"], "quantity": it["quantity"], "unit_price": it["unit_price"], "gst_rate": it["gst_rate"], "line_total": it["line_total"], } for it in inv["items"] ], "grand_total": inv["grand_total"], "extraction_warnings": [], }, } # ─── Main ───────────────────────────────────────────────────────────────────── RENDERERS = { "printed_gst": render_printed_gst, "tally_pdf": render_tally, "whatsapp": render_whatsapp, "handwritten": render_handwritten, } def generate_all(samples_per_format: int = 125, seed: int = 42) -> None: random.seed(seed) np.random.seed(seed) OUTPUT_DIR.mkdir(parents=True, exist_ok=True) catalog = load_catalog() annotations = [] for fmt, renderer in RENDERERS.items(): fmt_dir = OUTPUT_DIR / fmt fmt_dir.mkdir(exist_ok=True) print(f"Generating {samples_per_format} x {fmt}...") for i in range(samples_per_format): inv = make_invoice(catalog) try: img = renderer(inv) img = apply_degradation(img) fname = f"{i:04d}.png" img.save(fmt_dir / fname, "PNG", optimize=True) rel = f"synthetic_invoices/{fmt}/{fname}" annotations.append(make_annotation(inv, rel)) except Exception as e: print(f" [{fmt}/{i:04d}] render error: {e}") if (i + 1) % 25 == 0: print(f" {i + 1}/{samples_per_format}") ann_path = OUTPUT_DIR / "annotations.jsonl" with open(ann_path, "w", encoding="utf-8") as f: for ann in annotations: f.write(json.dumps(ann, ensure_ascii=False) + "\n") print(f"\nDone: {len(annotations)} images -> {ann_path}") if __name__ == "__main__": generate_all()