"""Bridge YOLO class names → catalog product_ids by adding them as catalog aliases. The visual counter maps a detected YOLO class to a catalog SKU via `catalog.lookup_alias(class_name)`. The YOLO vocabulary ("ParleG_Biscuit_Parle") and catalog ids ("parle_g_250g") don't overlap, so delivery↔invoice reconciliation never matched. This adds each recognizable YOLO class as an alias on its catalog SKU so reconciliation works. Each YOLO class is mapped one-to-one to a single catalog SKU (demo items are pinned to exact sizes; the rest go to the best token-overlap match above THRESHOLD). Adding the exact class string as an alias is robust because catalog `_normalise_key` only lowercases/collapses whitespace (keeps underscores), matching how the visual counter passes the raw class name. Usage: python finetune/add_yolo_aliases.py # dry run — prints the mapping python finetune/add_yolo_aliases.py --apply # writes data/fmcg_catalog.json """ from __future__ import annotations import json import re import sys from pathlib import Path from huggingface_hub import hf_hub_download CATALOG = Path(__file__).parent.parent / "data" / "fmcg_catalog.json" THRESHOLD = 0.34 # min token-Jaccard to auto-bridge a non-demo class # Demo items pinned to exact catalog sizes so the sample invoice reconciles. # Includes the generic class names the model actually emits in practice (verified # from real detections) alongside the fine-grained variants — both point to one SKU. DEMO_MAP = { "ParleG_Biscuit_Parle": "parle_g_250g", "ParleG": "parle_g_250g", "Amul Butter": "amul_butter_500g", "Parachute coconut Oil": "parachute_oil_500ml", "Soap_Original_Dettol": "dettol_soap_125g", "Nescafe_Classic_Coffee": "nescafe_classic_100g", "Nescafe": "nescafe_classic_100g", # generic class the model emits "SoapNo1_Godrej": "godrej_no1_soap_100g", # generic class the model emits "Biscuit_Bourbon_Britannia": "bourbon_cream_150g", } def toks(s: str) -> set: return set(re.sub(r"[^a-z0-9 ]+", " ", s.replace("_", " ").lower()).split()) def build_mapping(data: list) -> dict: by_id = {e["product_id"]: e for e in data} cat_toks = {} for e in data: t = toks(e["canonical_name"]) for a in e.get("common_aliases", []): t |= toks(a) cat_toks[e["product_id"]] = t yolo = json.load(open( hf_hub_download("build-small-hackathon/yolo26n-indian-fmcg-detection", "class_names.json"), encoding="utf-8", )) mapping: dict[str, str] = {} for c in yolo: if c in DEMO_MAP: mapping[c] = DEMO_MAP[c] continue ct = toks(c) best_pid, best = None, 0.0 for pid, t in cat_toks.items(): u = ct | t j = len(ct & t) / len(u) if u else 0.0 if j > best: best, best_pid = j, pid if best >= THRESHOLD: mapping[c] = best_pid mapping.update(DEMO_MAP) # demo wins on any tie return mapping, by_id def main() -> None: apply = "--apply" in sys.argv data = json.load(open(CATALOG, encoding="utf-8")) by_id = {e["product_id"]: e for e in data} mapping, by_id = build_mapping(data) print(f"{'CLASS (YOLO)':40s} -> {'product_id':22s} | catalog name") added = 0 for c in sorted(mapping, key=lambda x: mapping[x]): pid = mapping[c] e = by_id[pid] already = c in e.get("common_aliases", []) flag = "" if already else " [+]" print(f" {c:38s} -> {pid:22s} | {e['canonical_name']}{flag}") if not already: added += 1 if apply: e.setdefault("common_aliases", []).append(c) print(f"\n{len(mapping)} classes mapped to {len(set(mapping.values()))} SKUs; {added} new aliases.") if apply: json.dump(data, open(CATALOG, "w", encoding="utf-8"), ensure_ascii=False, indent=2) print(f"WROTE {CATALOG}") else: print("Dry run — re-run with --apply to write the catalog.") if __name__ == "__main__": main()