Finetuning completed for yolo26n-indian-fmcg-detection and minicpm5-1b-indian-fmcg-normalizer
7b5611f | from __future__ import annotations | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import Dict, Optional, Set | |
| from models import CatalogEntry | |
# Default catalog file: data/fmcg_catalog.json next to this module.
_CATALOG_PATH = Path(__file__).parent / "data" / "fmcg_catalog.json"

# HSN prefix -> GST rate mappings (prefix-match, longest wins).
# Rates are stored as percentages. Keys are 4-digit HSN heading prefixes,
# listed in ascending order.
GST_LOOKUP: Dict[str, float] = {
    "0401": 0.0,   # liquid milk
    "0402": 12.0,  # milk powder / flavoured milk
    "0403": 5.0,   # curd, buttermilk
    "0405": 12.0,  # butter, ghee, fats
    "0406": 12.0,  # cheese (block/slices); paneer is 5% but shares prefix
    "0901": 5.0,   # coffee beans / roasted
    "0902": 5.0,   # tea
    "1101": 5.0,   # wheat flour / atta
    "1512": 5.0,   # sunflower, safflower oil
    "1513": 5.0,   # coconut oil, palm oil
    "1901": 18.0,  # malt-based health drinks
    "1902": 18.0,  # pasta, noodles
    "1905": 18.0,  # biscuits, bread, rusks
    "2009": 12.0,  # fruit juices (100%)
    "2101": 5.0,   # instant coffee / tea extracts
    "2201": 18.0,  # packaged drinking water
    # NOTE(review): the original comment promised a fruit-drink override for
    # this heading, but no longer/more specific prefix exists in this table.
    "2202": 28.0,  # aerated drinks (cola, soda)
    "2501": 0.0,   # salt
    "3304": 18.0,  # skin-care preparations
    "3305": 18.0,  # hair products (shampoo, oil)
    "3306": 18.0,  # oral care (toothpaste)
    "3307": 18.0,  # deodorants, room fresheners
    "3401": 18.0,  # soap
    "3402": 18.0,  # detergents / dishwash
    "3808": 18.0,  # insecticides / toilet cleaners
}
def _normalise_key(text: str) -> str:
    """Return the lookup key for *text*.

    The text is lower-cased, leading/trailing whitespace is removed and every
    internal run of whitespace is collapsed to a single space, so
    ``"  Parle-G   Gold "`` and ``"parle-g gold"`` produce the same key.
    """
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, which is the same result as strip() + collapsing "\s+".
    return " ".join(text.lower().split())
class FMCGCatalog:
    """In-memory index over FMCG catalog entries.

    Entries are indexed by ``product_id`` and by normalised name (canonical
    name plus every common alias), and GST rates can be resolved from an HSN
    code via the module-level ``GST_LOOKUP`` table.
    """

    def __init__(self) -> None:
        """Create an empty catalog; call :meth:`load` to populate it."""
        self._by_id: Dict[str, CatalogEntry] = {}
        self._by_alias: Dict[str, str] = {}  # normalised alias -> product_id
        self._all_ids: Set[str] = set()

    def load(self, path: Path = _CATALOG_PATH) -> "FMCGCatalog":
        """Read catalog entries from a JSON file and index them.

        The file must contain an iterable of objects whose keys are accepted
        as keyword arguments by ``CatalogEntry``. Errors from reading the
        file, parsing the JSON or constructing an entry propagate to the
        caller. Calling ``load`` again adds to the existing indexes rather
        than replacing them.

        Args:
            path: JSON file to read (UTF-8). Defaults to the bundled catalog.

        Returns:
            This catalog, so the call can be chained.
        """
        raw = json.loads(path.read_text(encoding="utf-8"))
        for entry in raw:
            e = CatalogEntry(**entry)
            self._by_id[e.product_id] = e
            self._all_ids.add(e.product_id)

            # Canonical names are written unconditionally, so they always take
            # priority over an alias with the same normalised key, whichever
            # entry was read first.
            self._by_alias[_normalise_key(e.canonical_name)] = e.product_id

            # Aliases never displace an existing key: first writer wins.
            for alias in e.common_aliases:
                self._by_alias.setdefault(_normalise_key(alias), e.product_id)
        return self

    # -- Public API ----------------------------------------------------------

    def get_by_id(self, product_id: str) -> Optional[CatalogEntry]:
        """Return the entry for *product_id*, or ``None`` if it is unknown."""
        return self._by_id.get(product_id)

    def lookup_alias(self, text: str) -> Optional[str]:
        """Return product_id for an exact alias match (case-insensitive, whitespace-normalised)."""
        return self._by_alias.get(_normalise_key(text))

    def all_product_ids(self) -> Set[str]:
        """Return a copy of the set of loaded product ids."""
        return set(self._all_ids)

    def get_gst_rate(self, hsn_code: str) -> Optional[float]:
        """Prefix-match hsn_code against GST_LOOKUP; longest prefix wins.

        Surrounding whitespace in *hsn_code* is ignored.

        Returns:
            The GST rate for the longest matching prefix, or ``None`` when
            *hsn_code* is empty or no prefix matches.
        """
        if not hsn_code:
            return None
        code = hsn_code.strip()
        matching = [prefix for prefix in GST_LOOKUP if code.startswith(prefix)]
        if not matching:
            return None
        # max() returns the first of several equally long prefixes, i.e. the
        # earliest one in GST_LOOKUP's insertion order.
        return GST_LOOKUP[max(matching, key=len)]

    def __len__(self) -> int:
        """Return the number of catalog entries indexed by product id."""
        return len(self._by_id)
# -- Module-level singleton --------------------------------------------------

# Shared catalog instance; stays None until get_catalog() is first called.
_catalog_instance: Optional[FMCGCatalog] = None


def get_catalog() -> FMCGCatalog:
    """Return the shared catalog, loading it from the default path on first use.

    Later calls return the same instance without reading the file again.
    """
    global _catalog_instance
    if _catalog_instance is None:
        _catalog_instance = FMCGCatalog().load()
    return _catalog_instance