# Scraped page header (Hugging Face Spaces): build-small-hackathon / kirana-detective
# Space status: Sleeping | File size: 3,983 Bytes | Ref: 7b5611f

from __future__ import annotations

import json
import re
from pathlib import Path
from typing import Dict, Optional, Set

from models import CatalogEntry

# Default catalog file: data/fmcg_catalog.json in the directory containing this module.
_CATALOG_PATH = Path(__file__).parent / "data" / "fmcg_catalog.json"

# HSN prefix → GST rate mappings (rates in percent, per the inline notes).
# 25 entries, all keyed by 4-character prefixes. FMCGCatalog.get_gst_rate
# prefix-matches an HSN code against these keys; the longest matching key wins.
GST_LOOKUP: Dict[str, float] = {
    "0401": 0.0,    # liquid milk
    "0402": 12.0,   # milk powder / flavoured milk
    "0403": 5.0,    # curd, buttermilk
    "0405": 12.0,   # butter, ghee, fats
    "0406": 12.0,   # cheese (block/slices); paneer is 5% but shares prefix
    "0901": 5.0,    # coffee beans / roasted
    "0902": 5.0,    # tea
    "1101": 5.0,    # wheat flour / atta
    "1512": 5.0,    # sunflower, safflower oil
    "1513": 5.0,    # coconut oil, palm oil
    "1901": 18.0,   # malt-based health drinks
    "1902": 18.0,   # pasta, noodles
    "1905": 18.0,   # biscuits, bread, rusks
    "2009": 12.0,   # fruit juices (100%)
    "2101": 5.0,    # instant coffee / tea extracts
    "2201": 18.0,   # packaged drinking water
    # NOTE(review): the original note on the next entry mentions a fruit-drink
    # override "below", but no longer "2202…" key exists in this table — confirm
    # whether the override lives elsewhere or is still to be added.
    "2202": 28.0,   # aerated drinks (cola, soda)
    "3305": 18.0,   # hair products (shampoo, oil)
    "3306": 18.0,   # oral care (toothpaste)
    "3401": 18.0,   # soap
    "3402": 18.0,   # detergents / dishwash
    # The entries below are out of numeric order; key order does not affect
    # the longest-prefix match performed by get_gst_rate.
    "2501": 0.0,    # salt
    "3304": 18.0,   # skin-care preparations
    "3307": 18.0,   # deodorants, room fresheners
    "3808": 18.0,   # insecticides / toilet cleaners
}


def _normalise_key(text: str) -> str:
    return re.sub(r"\s+", " ", text.lower().strip())


class FMCGCatalog:
    """In-memory index of catalog entries, searchable by product id or alias.

    Instances start empty; call ``load`` to populate them from a JSON file.
    """

    def __init__(self) -> None:
        # product_id → full catalog entry
        self._by_id: Dict[str, CatalogEntry] = {}
        # normalised canonical name / alias → product_id
        self._by_alias: Dict[str, str] = {}
        # every product_id seen by load()
        self._all_ids: Set[str] = set()

    def load(self, path: Path = _CATALOG_PATH) -> "FMCGCatalog":
        """Read the JSON catalog at *path* and add its entries to the indexes.

        The file is read as UTF-8 and its top-level value is iterated; each
        element is unpacked as keyword arguments into ``CatalogEntry``.
        Existing index contents are not cleared, so repeated calls accumulate.

        Returns:
            This instance, so the call can be chained after construction.
        """
        records = json.loads(path.read_text(encoding="utf-8"))
        for record in records:
            item = CatalogEntry(**record)
            pid = item.product_id
            self._by_id[pid] = item
            self._all_ids.add(pid)

            # A canonical name always claims its key, replacing any earlier owner.
            self._by_alias[_normalise_key(item.canonical_name)] = pid

            # Aliases only fill keys that are still free (first writer wins).
            for alias in item.common_aliases:
                self._by_alias.setdefault(_normalise_key(alias), pid)

        return self

    # ── Public API ────────────────────────────────────────────────────────────

    def get_by_id(self, product_id: str) -> Optional[CatalogEntry]:
        """Return the entry stored under *product_id*, or ``None`` if unknown."""
        return self._by_id.get(product_id)

    def lookup_alias(self, text: str) -> Optional[str]:
        """Return the product_id whose normalised name or alias equals *text*.

        *text* is lower-cased and whitespace-normalised before the exact-match
        lookup; ``None`` is returned when nothing matches.
        """
        key = _normalise_key(text)
        return self._by_alias.get(key)

    def all_product_ids(self) -> Set[str]:
        """Return a new set holding every loaded product id."""
        return self._all_ids.copy()

    def get_gst_rate(self, hsn_code: str) -> Optional[float]:
        """Return the GST rate for *hsn_code* using the longest matching prefix.

        Each key of ``GST_LOOKUP`` is tested as a prefix of *hsn_code*.
        Returns ``None`` for an empty code or when no key matches.
        """
        if not hsn_code:
            return None
        # Empty keys are skipped: a zero-length prefix never counts as a match.
        longest = max(
            (prefix for prefix in GST_LOOKUP if prefix and hsn_code.startswith(prefix)),
            key=len,
            default=None,
        )
        return None if longest is None else GST_LOOKUP[longest]

    def __len__(self) -> int:
        """Return the number of entries indexed by product id."""
        return len(self._by_id)


# ── Module-level singleton ────────────────────────────────────────────────────

# Lazily created shared catalog; stays None until get_catalog() first succeeds.
_catalog_instance: Optional[FMCGCatalog] = None


def get_catalog() -> FMCGCatalog:
    """Return the shared ``FMCGCatalog``, loading it on first use.

    The first call builds a catalog with ``FMCGCatalog().load()`` (default
    path) and stores it in the module-level ``_catalog_instance``; later calls
    return that same object.
    """
    global _catalog_instance
    if _catalog_instance is not None:
        return _catalog_instance
    _catalog_instance = FMCGCatalog().load()
    return _catalog_instance