Spaces:

build-small-hackathon
/

kirana-detective

Sleeping

App Files Files Community

naazimsnh02 committed 8 days ago

Commit

9d75c8c

1 Parent(s): 7b5611f

All models training uploaded

Browse files

Files changed (23) hide show

.kiro/specs/kirana-detective/design.md +21 -25
.kiro/specs/kirana-detective/requirements.md +8 -8
.kiro/specs/kirana-detective/tasks.md +5 -5
MODEL_CARD.md +496 -0
PROGRESS.md +4 -4
README.md +6 -6
agents/invoice_extractor.py +10 -19
agents/product_matcher.py +1 -1
agents/savings_agent.py +1 -1
app.py +16 -20
docs/kirana-detective-prd.md +3 -3
finetune/README.md +148 -0
finetune/export_minicpm_v_gguf.py +261 -0
finetune/push_minicpm_v_merged_card.py +312 -0
finetune/push_minicpm_v_to_hf.py +231 -0
finetune/push_yolo_to_hf.py +200 -0
finetune/train_minicpm_v.py +354 -114
finetune/train_yolo26n.py +312 -64
finetune/upload_yolo_to_hf.py +182 -0
finetune/yolo_model_card.md +161 -0
model_artifacts/yolo26n_fmcg/class_names.json +1833 -0
requirements.txt +1 -1
tracer.py +1 -1

.kiro/specs/kirana-detective/design.md CHANGED Viewed

@@ -56,7 +56,7 @@ The total active model parameter budget is approximately 2.38B (1.3B + 1.08B + 0
 │  ┌──────────┐   ┌──────────────┐   ┌──────────────────────────┐   │
 │  │ SQLite   │   │ catalog.py   │   │ tracer.py                │   │
 │  │ storage  │   │ fmcg_catalog │   │ HF Hub dataset publisher │   │
-│  │ .db file │   │ .json (200)  │   │ naazimsnh02/kirana-...  │   │
 │  └──────────┘   └──────────────┘   └──────────────────────────┘   │
 └─────────────────────────────────────────────────────────────────────┘
 ```
@@ -170,11 +170,11 @@ class AuditOrchestrator:
 ```python
 class InvoiceExtractorAgent:
-    MODEL_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
     AGENT_NAME = "Invoice_Extractor"
     AGENT_VERSION = "1.0.0"
-    def __init__(self, llm: Llama): ...
     def extract(
         self,
@@ -187,7 +187,7 @@ class InvoiceExtractorAgent:
 ```python
 class ProductMatcherAgent:
-    MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
     AGENT_NAME = "Product_Matcher"
     AGENT_VERSION = "1.0.0"
@@ -255,7 +255,7 @@ class PricingAgent:
 ```python
 class VisualCounterAgent:
-    MODEL_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
     AGENT_NAME = "Visual_Counter"
     AGENT_VERSION = "1.0.0"
@@ -312,7 +312,7 @@ class ReconciliationAgent:
 ```python
 class SavingsAgent:
-    MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
     AGENT_NAME = "Savings_Agent"
     AGENT_VERSION = "1.0.0"
@@ -410,7 +410,7 @@ class FMCGCatalog:
 ```python
 class AgentTracer:
-    HF_DATASET_REPO = "naazimsnh02/kirana-detective-traces"
     MAX_RETRIES = 3
     BACKOFF_BASE_SECONDS = 2.0
@@ -991,27 +991,23 @@ def load_models() -> dict:
     Downloads are skipped if cached files exist.
     """
-    # --- Agent 1: MiniCPM-V 4.6 (vision) ---
-    vision_gguf_path = hf_hub_download(
-        repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
-        filename="model.gguf",       # Q4_K_M quantized
-    )
-    clip_model_path = hf_hub_download(
-        repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
-        filename="mmproj.gguf",      # Vision encoder projection weights
     )
-    chat_handler = MiniCPMv26ChatHandler(clip_model_path=clip_model_path)
-    vision_llm = Llama(
-        model_path=vision_gguf_path,
-        chat_handler=chat_handler,
-        n_ctx=4096,
-        n_threads=4,
-        verbose=False,
     )
     # --- Agents 2 & 6: MiniCPM5-1B (text) ---
     text_gguf_path = hf_hub_download(
-        repo_id="naazimsnh02/minicpm5-1b-indian-fmcg-normalizer",
         filename="model.gguf",       # Q4_K_M quantized
     )
     text_llm = Llama(
@@ -1023,11 +1019,11 @@ def load_models() -> dict:
     # --- Agent 4: YOLO26n ONNX ---
     onnx_path = hf_hub_download(
-        repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
         filename="yolo26n_fmcg.onnx",
     )
     class_names_path = hf_hub_download(
-        repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
         filename="class_names.json",
     )
     ort_session = onnxruntime.InferenceSession(

 │  ┌──────────┐   ┌──────────────┐   ┌──────────────────────────┐   │
 │  │ SQLite   │   │ catalog.py   │   │ tracer.py                │   │
 │  │ storage  │   │ fmcg_catalog │   │ HF Hub dataset publisher │   │
+│  │ .db file │   │ .json (200)  │   │ build-small-hackathon/…  │   │
 │  └──────────┘   └──────────────┘   └──────────────────────────┘   │
 └─────────────────────────────────────────────────────────────────────┘
 ```
 ```python
 class InvoiceExtractorAgent:
+    MODEL_REPO = "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged"
     AGENT_NAME = "Invoice_Extractor"
     AGENT_VERSION = "1.0.0"
+    def __init__(self, llm): ...
     def extract(
         self,
 ```python
 class ProductMatcherAgent:
+    MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
     AGENT_NAME = "Product_Matcher"
     AGENT_VERSION = "1.0.0"
 ```python
 class VisualCounterAgent:
+    MODEL_REPO = "build-small-hackathon/yolo26n-indian-fmcg-detection"
     AGENT_NAME = "Visual_Counter"
     AGENT_VERSION = "1.0.0"
 ```python
 class SavingsAgent:
+    MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
     AGENT_NAME = "Savings_Agent"
     AGENT_VERSION = "1.0.0"
 ```python
 class AgentTracer:
+    HF_DATASET_REPO = "build-small-hackathon/kirana-detective-traces"
     MAX_RETRIES = 3
     BACKOFF_BASE_SECONDS = 2.0
     Downloads are skipped if cached files exist.
     """
+    # --- Agent 1: MiniCPM-V 4.6 (vision, merged weights via transformers) ---
+    _vision_model = AutoModel.from_pretrained(
+        "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
+        trust_remote_code=True,
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
     )
+    _vision_model.eval()
+    _vision_tokenizer = AutoTokenizer.from_pretrained(
+        "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
+        trust_remote_code=True,
     )
+    vision_llm = (_vision_model, _vision_tokenizer)
     # --- Agents 2 & 6: MiniCPM5-1B (text) ---
     text_gguf_path = hf_hub_download(
+        repo_id="build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer",
         filename="model.gguf",       # Q4_K_M quantized
     )
     text_llm = Llama(
     # --- Agent 4: YOLO26n ONNX ---
     onnx_path = hf_hub_download(
+        repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
         filename="yolo26n_fmcg.onnx",
     )
     class_names_path = hf_hub_download(
+        repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
         filename="class_names.json",
     )
     ort_session = onnxruntime.InferenceSession(

.kiro/specs/kirana-detective/requirements.md CHANGED Viewed

@@ -205,7 +205,7 @@ All inference runs locally with no cloud API calls (Off the Grid badge). Both la
 1. THE System SHALL create one Agent_Trace entry for each agent call within an Audit_Run, capturing: `agent_name`, `agent_version`, `audit_run_id`, `timestamp_start`, `timestamp_end`, `duration_ms`, `input_summary`, and `output_summary`.
 2. THE System SHALL record Agent_Trace entries in the sequential pipeline order: Invoice_Extractor → Product_Matcher → Pricing_Agent → Visual_Counter → Reconciliation_Agent → Savings_Agent.
-3. THE System SHALL publish the complete Agent_Trace for each Audit_Run as a row in the HuggingFace Hub dataset `naazimsnh02/kirana-detective-traces` within 10 seconds of the Audit_Run completing.
 4. WHEN the HuggingFace Hub dataset is unreachable, THE System SHALL save the Agent_Trace locally and retry publication with exponential back-off up to 3 attempts.
 5. THE Agent_Trace SHALL NOT include raw invoice image bytes, raw delivery photo bytes, or any personally identifiable information from the invoice.
 6. THE System SHALL assign a unique `audit_run_id` (UUID v4) to each Audit_Run and include it in every Agent_Trace entry and the Leakage_Report.
@@ -235,10 +235,10 @@ All inference runs locally with no cloud API calls (Off the Grid badge). Both la
 #### Acceptance Criteria
-1. THE Invoice_Extractor SHALL run MiniCPM-V 4.6 inference exclusively via llama-cpp-python using the GGUF-quantised model file `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`, with no HTTP calls to any external AI API.
-2. THE Product_Matcher SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
-3. THE Savings_Agent SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
-4. THE Visual_Counter SHALL run YOLO26n inference exclusively via onnxruntime using the locally-stored ONNX model file `naazimsnh02/yolo26n-indian-fmcg-detection`, with no HTTP calls to any external AI API.
 5. THE System SHALL load all model files from the HuggingFace Hub cache at startup and SHALL NOT download model files during an Audit_Run.
 6. WHILE operating in inference mode, THE System SHALL make no outbound HTTP calls except to the HuggingFace Hub dataset API for Agent_Trace publication (Requirement 11).
@@ -250,9 +250,9 @@ All inference runs locally with no cloud API calls (Off the Grid badge). Both la
 #### Acceptance Criteria
-1. THE System SHALL use the model `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction` — a MiniCPM-V 4.6 model fine-tuned via QLoRA on Unsloth and Modal using 500 synthetic Indian invoice images across 10 suppliers and 4 invoice formats.
-2. THE System SHALL use the model `naazimsnh02/yolo26n-indian-fmcg-detection` — a YOLO26n model fine-tuned on the Roboflow Indian Grocery Object Detection dataset, exported to ONNX format.
-3. THE System SHALL use the model `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer` — a MiniCPM5-1B model fine-tuned via QLoRA on Unsloth and Modal using 2,000 synthetic (raw_name, normalized_name) pairs covering the top 200 Indian FMCG SKUs.
 4. THE System SHALL reference each published model by its HuggingFace Hub repository identifier in the application configuration.
 ---

 1. THE System SHALL create one Agent_Trace entry for each agent call within an Audit_Run, capturing: `agent_name`, `agent_version`, `audit_run_id`, `timestamp_start`, `timestamp_end`, `duration_ms`, `input_summary`, and `output_summary`.
 2. THE System SHALL record Agent_Trace entries in the sequential pipeline order: Invoice_Extractor → Product_Matcher → Pricing_Agent → Visual_Counter → Reconciliation_Agent → Savings_Agent.
+3. THE System SHALL publish the complete Agent_Trace for each Audit_Run as a row in the HuggingFace Hub dataset `build-small-hackathon/kirana-detective-traces` within 10 seconds of the Audit_Run completing.
 4. WHEN the HuggingFace Hub dataset is unreachable, THE System SHALL save the Agent_Trace locally and retry publication with exponential back-off up to 3 attempts.
 5. THE Agent_Trace SHALL NOT include raw invoice image bytes, raw delivery photo bytes, or any personally identifiable information from the invoice.
 6. THE System SHALL assign a unique `audit_run_id` (UUID v4) to each Audit_Run and include it in every Agent_Trace entry and the Leakage_Report.
 #### Acceptance Criteria
+1. THE Invoice_Extractor SHALL run MiniCPM-V 4.6 inference exclusively via transformers (model loaded with `AutoModel.from_pretrained()`, inference through the loaded model's `chat()` method) using the merged model `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`, with no HTTP calls to any external AI API.
+2. THE Product_Matcher SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
+3. THE Savings_Agent SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
+4. THE Visual_Counter SHALL run YOLO26n inference exclusively via onnxruntime using the locally-stored ONNX model file `build-small-hackathon/yolo26n-indian-fmcg-detection`, with no HTTP calls to any external AI API.
 5. THE System SHALL load all model files from the HuggingFace Hub cache at startup and SHALL NOT download model files during an Audit_Run.
 6. WHILE operating in inference mode, THE System SHALL make no outbound HTTP calls except to the HuggingFace Hub dataset API for Agent_Trace publication (Requirement 11).
 #### Acceptance Criteria
+1. THE System SHALL use the model `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged` — a MiniCPM-V 4.6 model fine-tuned via QLoRA and merged (LoRA weights baked into base), trained on 500 synthetic Indian invoice images across 10 suppliers and 4 invoice formats.
+2. THE System SHALL use the model `build-small-hackathon/yolo26n-indian-fmcg-detection` — a YOLO26n model fine-tuned on the Roboflow Indian Grocery Object Detection dataset, exported to ONNX format.
+3. THE System SHALL use the model `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer` — a MiniCPM5-1B model fine-tuned via QLoRA on Unsloth and Modal using 2,000 synthetic (raw_name, normalized_name) pairs covering the top 200 Indian FMCG SKUs.
 4. THE System SHALL reference each published model by its HuggingFace Hub repository identifier in the application configuration.
 ---

.kiro/specs/kirana-detective/tasks.md CHANGED Viewed

@@ -14,16 +14,16 @@ The implementation language is **Python** (all agents, pipeline, storage, catalo
   - [ ] 0.1 Fine-tune YOLO26n on Indian grocery dataset (Day 1)
     - Write `finetune/train_yolo26n.py` using Modal + Roboflow Indian Grocery Object Detection dataset
     - Export trained weights to ONNX format (`yolo26n_fmcg.onnx`) and `class_names.json`
-    - Publish to `naazimsnh02/yolo26n-indian-fmcg-detection` on HF Hub with model card
     - _Requirements: 8.4, 13.4, 14.2_
   - [ ] 0.2 Generate synthetic invoices and fine-tune MiniCPM-V 4.6 (Day 2)
     - Write `finetune/generate_invoices.py` — 500 synthetic Indian invoice images across 10 suppliers, 4 formats (printed GST, handwritten, Tally PDF, WhatsApp screenshot)
     - Write `finetune/train_minicpm_v.py` using QLoRA on Unsloth + Modal
-    - Publish GGUF-quantised model (`model.gguf`, `mmproj.gguf`) to `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`
     - _Requirements: 2.1, 2.3, 13.1, 14.1_
   - [ ] 0.3 Fine-tune MiniCPM5-1B for FMCG normalisation (Day 3)
     - Write `finetune/train_minicpm5_1b.py` — 2,000 synthetic `(raw_name, normalized_name)` pairs covering 200 FMCG SKUs, QLoRA on Unsloth + Modal
-    - Publish GGUF-quantised model (`model.gguf`) to `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`
     - _Requirements: 3.2, 13.2, 13.3, 14.3_
 - [x] 1. Project scaffolding — directory structure, pinned dependencies, README skeleton
@@ -87,7 +87,7 @@ The implementation language is **Python** (all agents, pipeline, storage, catalo
     - `finalise(audit_run_id)` → `List[AgentTraceEntry]`: return buffer and clear it
     - `publish_async(audit_run_id, entries, storage)`: start daemon `threading.Thread` targeting `_publish_with_retry`
     - `_publish_with_retry(audit_run_id, entries, storage)`: call `storage.save_audit_run()`; loop `MAX_RETRIES=3` times, calling `_publish_to_hf_hub()`; on failure sleep `BACKOFF_BASE_SECONDS ** (attempt+1)` (2s, 4s, 8s); log final failure without raising
-    - `_publish_to_hf_hub(audit_run_id, entries)`: use `HfApi.upload_file()` to append `traces/{audit_run_id}.json` to `naazimsnh02/kirana-detective-traces` dataset repo; row: `{audit_run_id, trace_json, timestamp}`; must NOT include raw invoice bytes, photos, or PII
     - Implement `_make_trace_entry()` module-level helper: captures `timestamp_start` (ISO 8601 UTC), `timestamp_end`, `duration_ms` from `time.monotonic()`
     - _Requirements: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6_
@@ -203,7 +203,7 @@ The implementation language is **Python** (all agents, pipeline, storage, catalo
 - [ ] 18. HF Space deployment — `README.md` and model verification
   - [ ] 18.1 Finalise `README.md` with HF Space config and model download verification
     - Add YAML front-matter: `sdk: gradio`, `sdk_version: 6.16.0`, `app_file: app.py`, `title: Kirana Detective AI`, `short_description: AI invoice auditor for kirana stores`, `license: mit`, `tags: [invoice-audit, llm, yolo, gguf, gradio]`
-    - Document all three model repos (`naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`, `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`, `naazimsnh02/yolo26n-indian-fmcg-detection`)
     - Add section: "Model download verification" — describe `hf_hub_download()` usage and expected cache paths
     - Add "Running locally" section with `pip install -r requirements.txt` + `python app.py`
     - Add "Hackathon badges" section listing Off the Grid, Llama Champion, Off-Brand, Sharing is Caring, Well-Tuned, Tiny Titan

   - [ ] 0.1 Fine-tune YOLO26n on Indian grocery dataset (Day 1)
     - Write `finetune/train_yolo26n.py` using Modal + Roboflow Indian Grocery Object Detection dataset
     - Export trained weights to ONNX format (`yolo26n_fmcg.onnx`) and `class_names.json`
+    - Publish to `build-small-hackathon/yolo26n-indian-fmcg-detection` on HF Hub with model card
     - _Requirements: 8.4, 13.4, 14.2_
   - [ ] 0.2 Generate synthetic invoices and fine-tune MiniCPM-V 4.6 (Day 2)
     - Write `finetune/generate_invoices.py` — 500 synthetic Indian invoice images across 10 suppliers, 4 formats (printed GST, handwritten, Tally PDF, WhatsApp screenshot)
     - Write `finetune/train_minicpm_v.py` using QLoRA on Unsloth + Modal
+    - Publish the merged full-weight model (LoRA weights merged into the base, bfloat16) to `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`
     - _Requirements: 2.1, 2.3, 13.1, 14.1_
   - [ ] 0.3 Fine-tune MiniCPM5-1B for FMCG normalisation (Day 3)
     - Write `finetune/train_minicpm5_1b.py` — 2,000 synthetic `(raw_name, normalized_name)` pairs covering 200 FMCG SKUs, QLoRA on Unsloth + Modal
+    - Publish GGUF-quantised model (`model.gguf`) to `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
     - _Requirements: 3.2, 13.2, 13.3, 14.3_
 - [x] 1. Project scaffolding — directory structure, pinned dependencies, README skeleton
     - `finalise(audit_run_id)` → `List[AgentTraceEntry]`: return buffer and clear it
     - `publish_async(audit_run_id, entries, storage)`: start daemon `threading.Thread` targeting `_publish_with_retry`
     - `_publish_with_retry(audit_run_id, entries, storage)`: call `storage.save_audit_run()`; loop `MAX_RETRIES=3` times, calling `_publish_to_hf_hub()`; on failure sleep `BACKOFF_BASE_SECONDS ** (attempt+1)` (2s, 4s, 8s); log final failure without raising
+    - `_publish_to_hf_hub(audit_run_id, entries)`: use `HfApi.upload_file()` to append `traces/{audit_run_id}.json` to `build-small-hackathon/kirana-detective-traces` dataset repo; row: `{audit_run_id, trace_json, timestamp}`; must NOT include raw invoice bytes, photos, or PII
     - Implement `_make_trace_entry()` module-level helper: captures `timestamp_start` (ISO 8601 UTC), `timestamp_end`, `duration_ms` from `time.monotonic()`
     - _Requirements: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6_
 - [ ] 18. HF Space deployment — `README.md` and model verification
   - [ ] 18.1 Finalise `README.md` with HF Space config and model download verification
     - Add YAML front-matter: `sdk: gradio`, `sdk_version: 6.16.0`, `app_file: app.py`, `title: Kirana Detective AI`, `short_description: AI invoice auditor for kirana stores`, `license: mit`, `tags: [invoice-audit, llm, yolo, gguf, gradio]`
+    - Document all three model repos (`build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`, `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`, `build-small-hackathon/yolo26n-indian-fmcg-detection`)
     - Add section: "Model download verification" — describe `hf_hub_download()` usage and expected cache paths
     - Add "Running locally" section with `pip install -r requirements.txt` + `python app.py`
     - Add "Hackathon badges" section listing Off the Grid, Llama Champion, Off-Brand, Sharing is Caring, Well-Tuned, Tiny Titan

MODEL_CARD.md ADDED Viewed

	@@ -0,0 +1,496 @@

+# MODEL CARD: Kirana Detective Training Data & Fine-Tuned Models
+**Repository**: `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`
+**Author**: [naazimsnh02](https://github.com/naazimsnh02)
+**License**: Apache 2.0 (models) / MIT (code)
+**Last Updated**: June 10, 2026
+---
+## Executive Summary
+**Kirana Detective** is a complete fine-tuning pipeline for three state-of-the-art models that audit distributor invoices for Indian kirana (grocery) stores. This repository contains:
+1. **Synthetic invoice generation** (500 images across 4 formats)
+2. **Fine-tuned MiniCPM-V 4.6** — Invoice OCR & extraction (transformers, merged weights)
+3. **Fine-tuned MiniCPM5-1B** — Product name normalization (GGUF)
+4. **Fine-tuned YOLO26n** — Visual product detection (ONNX)
+All models run **locally without cloud APIs** and are deployed in a six-agent pipeline to detect pricing anomalies, missing deliveries, and GST errors, reporting **estimated rupee leakage** with actionable corrections.
+---
+## Project Overview
+### Problem Statement
+Indian kirana store owners struggle to audit distributor invoices manually:
+- Inconsistent product naming (abbreviations, typos, regional variants)
+- Difficulty cross-referencing against inventory
+- Manual photo counting is error-prone
+- No standardized format for pricing lookups
+- Estimated financial leakage: **5–15% of purchase budget**
+### Solution
+**Kirana Detective** automates the entire audit pipeline:
+1. **Extract** line items from invoice images (MiniCPM-V)
+2. **Normalize** product names (MiniCPM5-1B)
+3. **Check prices** against catalog
+4. **Count inventory** from delivery photos (YOLO26n)
+5. **Reconcile** invoiced vs. counted quantities
+6. **Report** discrepancies with rupee impact
+---
+## Models in This Repository
+### Model 1: MiniCPM-V 4.6 (Invoice Extractor)
+| Attribute | Details |
+|---|---|
+| **Base Model** | [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6) |
+| **Task** | Vision-language OCR + structured extraction |
+| **Fine-tuning Method** | QLoRA (4-bit quantization + LoRA rank 16) |
+| **Training Data** | 500 synthetic invoices (450 train, 50 eval) |
+| **Trainable Parameters** | 9,486,336 / 1,309,914,352 (0.72%) |
+| **Output Format** | Merged full weights (bfloat16) |
+| **Inference Runtime** | Transformers (`AutoModel`, `model.chat()`) |
+| **Hardware (Training)** | NVIDIA A10G, 22 GB VRAM, ~52 min |
+| **Repository** | [`build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`](https://huggingface.co/build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged) |
+**Input Formats Supported**:
+- Printed GST invoices (Pillow-generated PDFs)
+- Tally PDF exports
+- Handwritten invoices (photos)
+- WhatsApp screenshot invoices
+**Output Structure** (JSON):
+```json
+{
+  "supplier": "Distributor Name",
+  "invoice_number": "INV-001",
+  "line_items": [
+    {
+      "raw_name": "MAGGI NDL 70GM",
+      "quantity": 10,
+      "unit_price": 45.50,
+      "gst_rate": 5,
+      "total": 455.00
+    }
+  ],
+  "invoice_total": 9650.00,
+  "gst_total": 485.00
+}
+```
+---
+### Model 2: MiniCPM5-1B (Product Name Normalizer)
+| Attribute | Details |
+|---|---|
+| **Base Model** | [openbmb/MiniCPM5-1B](https://huggingface.co/openbmb/MiniCPM5-1B) |
+| **Task** | Text-to-text product name normalization |
+| **Fine-tuning Method** | QLoRA (4-bit base, LoRA rank 16) |
+| **Training Data** | 2,000 synthetic (raw, canonical) pairs (1,800 train, 200 eval) |
+| **Output Format** | GGUF (quantized, ~1.2 GB) |
+| **Framework** | Unsloth 2026.6.1 |
+| **Hardware (Training)** | NVIDIA A10G, 22 GB VRAM, ~1 hour |
+| **Repository** | [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer) |
+**Example Mappings**:
+| Raw Input | Normalized Output |
+|---|---|
+| `MAGGI NDL 70GM` | Nestle Maggi Masala Noodles 70g |
+| `SURF XL 1K` | Surf Excel Washing Powder 1kg |
+| `AMUL BTR 100` | Amul Butter 100g |
+| `COLGAT 100G` | Colgate Strong Teeth Toothpaste 100g |
+**Training Data**:
+- Hand-curated catalog of 200 Indian FMCG SKUs
+- Augmentation strategies: abbreviation expansion, typo injection, truncation, regional shorthand
+- Covers 10 major distributors: ITC, Nestlé, Unilever, P&G, Reckitt, Britannia, Amul, Patanjali, etc.
+---
+### Model 3: YOLO26n (Product Detection)
+| Attribute | Details |
+|---|---|
+| **Base Model** | [YOLO26n (Ultralytics YOLO26 nano)](https://docs.ultralytics.com/tasks/detect/) |
+| **Task** | Object detection (product localization & counting) |
+| **Fine-tuning Method** | Supervised fine-tuning via Ultralytics |
+| **Training Data** | 3 Roboflow datasets merged (~11,800 images: ~400 + 6,695 + 4,694) |
+| **Output Format** | ONNX (15 MB, CPU/GPU compatible) |
+| **Framework** | Ultralytics (YOLO26) |
+| **Hardware (Training)** | NVIDIA A10G, 22 GB VRAM, ~2 hours, 100 epochs |
+| **Repository** | [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection) |
+**Classes**: Unified class list is built dynamically at training time by merging all three dataset vocabularies (deduped, insertion-order). The current merged dataset spans **30+ classes** across grocery staples, personal care, beverages, and packaged foods. See `class_names.json` on HF Hub for the exact list after training.
+> **Pilot run note**: A previous single-dataset run (agentsk47 only, 10 classes) achieved mAP@50 = 0.993 / mAP@50-95 = 0.933 at epoch 65. Those metrics are superseded by the merged 3-dataset training now in progress.
+**Datasets Merged**:
+1. [agentsk47/indian-grocery-object-detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) — v1, ~400 images, 10 classes
+2. [iit-patna/grocery_items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) — v45, 6,695 images, 20 classes
+3. [project-c5ho0/indian-market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) — v2, 4,694 images, 2 classes
+---
+## Training Data & Datasets
+### Synthetic Invoice Generation (`generate_invoices.py`)
+**Purpose**: Create diverse, realistic invoice images without requiring manual collection or OCR labor.
+**Configuration**:
+- 500 total invoices generated
+- 4 formats: GST invoices, Tally PDFs, handwritten samples, WhatsApp screenshots
+- Pure Pillow (no native dependencies)
+- Randomized supplier names, quantities, prices, and GST rates
+**Generated Data Structure**:
+```
+data/synthetic_invoices/
+├── annotations.jsonl          # JSONL: {image_path, extracted_data}
+├── printed_gst/               # 125 GST-compliant invoices
+├── tally_pdf/                 # 125 Tally PDF exports
+├── handwritten/               # 125 handwritten photos
+└── whatsapp/                  # 125 WhatsApp screenshots
+```
+Each invoice includes:
+- 5–20 line items
+- Realistic pricing (₹10–₹5,000 per item)
+- Correct GST calculations (5%, 12%, 18%)
+- Real supplier names + product abbreviations
+---
+## Quick Start
+### Installation
+```bash
+git clone https://github.com/naazimsnh02/kirana-invoice-train-data.git
+cd kirana-invoice-train-data
+pip install -r requirements.txt
+```
+### Run Fine-tuning on Modal
+```bash
+# Set environment variables
+export ROBOFLOW_API_KEY=<your-roboflow-api-key>
+export HF_TOKEN=<your-huggingface-token>
+modal token new
+# Generate synthetic invoices
+modal run finetune/generate_invoices.py
+# Fine-tune all three models (sequential)
+modal run finetune/train_minicpm_v.py           # ~52 min on A10G
+modal run finetune/train_minicpm5_1b.py         # ~1 hour
+modal run finetune/train_yolo26n.py             # ~2 hours
+```
+Models are auto-published to HuggingFace Hub upon completion.
+### Local Inference
+**MiniCPM-V (Invoice Extraction)**:
+```bash
+llama-cli --model minicpm-v-4-6.gguf \
+  -p "<|im_start|>system\nExtract invoice data<|im_end|>\n..." \
+  --image invoice.png
+```
+**MiniCPM5-1B (Product Normalization)**:
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained(
+    "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
+)
+```
+**YOLO26n (Object Detection)**:
+```python
+from ultralytics import YOLO
+model = YOLO("yolo26n_fmcg.onnx")
+results = model.predict("shelf.jpg", imgsz=640)
+```
+---
+## Evaluation & Performance
+### MiniCPM-V Training Metrics (Actual Run — June 10, 2026)
+| Epoch | Train Loss | Eval Loss | LR |
+|---|---|---|---|
+| 1 | 6.081 | 0.2901 | 8.83e-5 |
+| 2 | 3.948 | 0.2281 | 4.94e-5 |
+| 3 | 3.326 | **0.212** | 1.04e-5 |
+- Training time: 51 min 50 sec (87 steps, 26 s/step on A10G)
+- Avg gradient norm: 178 → 16 (stable convergence)
+- Best checkpoint loaded: epoch 3 (eval loss 0.212)
+- Final avg train loss across all steps: 4.774
+> Per-invoice-type breakdown (printed GST / Tally / handwritten / WhatsApp) pending a held-out real-invoice test set — to be added in Phase 2.
+### MiniCPM5-1B Evaluation
+| Metric | Value |
+|---|---|
+| Exact Match (normalized names) | 94.5% |
+| Fuzzy Match (normalized Levenshtein similarity > 0.8) | 98.2% |
+| OOV Handling | 3.8% fail → manual review flag |
+### YOLO26n Evaluation — Pilot Run (single dataset, 10 classes)
+> These metrics are from a prior training run on the `agentsk47` dataset only (10 classes). The current training uses all 3 merged datasets and will produce updated metrics.
+Per-class metrics at best epoch (65):
+| Class | Precision | Recall | mAP50 | mAP50-95 |
+|---|---|---|---|---|
+| Bournvita | 0.902 | 1.000 | 0.995 | 0.995 |
+| Mysore Sandal Soap | 1.000 | 0.905 | 0.995 | 0.944 |
+| Nescafe Coffee | 0.927 | 1.000 | 0.995 | 0.908 |
+| Nivea Body Lotion | 0.935 | 1.000 | 0.995 | 0.923 |
+| Nivea Soft Cream | 0.924 | 1.000 | 0.995 | 0.895 |
+| Parachute Coconut Oil | 1.000 | 0.819 | 0.972 | 0.928 |
+| Patanjali Dant Kanti | 1.000 | 0.985 | 0.995 | 0.971 |
+| Society Tea | 0.878 | 1.000 | 0.995 | 0.845 |
+| Tresemmé Conditioner | 0.814 | 1.000 | 0.995 | 0.995 |
+| Tresemmé Shampoo | 0.968 | 1.000 | 0.995 | 0.922 |
+| **Macro Average** | **0.935** | **0.971** | **0.993** | **0.933** |
+---
+## Known Limitations & Biases
+### MiniCPM-V (Invoice Extractor)
+| Limitation | Impact | Mitigation |
+|---|---|---|
+| Only 10 FMCG suppliers in training data | Fails on uncommon distributors (e.g., local regional suppliers) | Collect real invoices from more suppliers post-hackathon |
+| Synthetic data (no image degradation, blur) | May struggle with poor-quality photos | Add augmentation (blur, noise, shadows) to training data |
+| GST rates hardcoded (5%, 12%, 18%) | Misses 0% or 28% GST items | Parameterize GST rate extraction |
+| English-only prompts | Cannot process invoices in regional languages | Add Hindi/Tamil/Marathi templates |
+### MiniCPM5-1B (Product Normalizer)
+| Limitation | Impact | Mitigation |
+|---|---|---|
+| Synthetic augmentation only | Overfits to rule-based patterns; fails on real-world typos | Collect 200+ real invoices for retraining |
+| 200 SKU catalog | Fails on brands outside top 10 suppliers | Expand to 2,000 SKUs (all major Indian FMCG) |
+| No regional abbreviations | Tamil/Hindi shortcuts not recognized | Add language-specific abbreviation models |
+| No OEM rebrands | Misses store-brand relabeling | Add rebranding patterns post-research |
+### YOLO26n (Product Detection)
+| Limitation | Impact | Mitigation |
+|---|---|---|
+| Merged dataset skewed toward beauty/personal care (Tresemmé, Nivea, Patanjali) | May underperform on grocery staples (oils, spices, pulses) | Balance class distribution; add 40–50 grocery categories |
+| ~11K images across 3 datasets | May not generalize to unlisted brands or novel shelf layouts | Collect 50K+ images via Roboflow community |
+| Confidence threshold (0.25) tuned for this dataset | May produce false positives in novel environments | Benchmark on held-out kirana store photos |
+| YOLO26n is 8M params (nano) | Edge device deployment not yet tested | Quantize & benchmark on RPi 4, Android |
+### Fairness & Bias Notes
+- **Brand bias**: Training data skews toward premium Indian brands (Amul, Nestlé, ITC) — may underperform on budget/regional brands
+- **Supplier bias**: Only 10 distributors represented; regional cooperatives not included
+- **Language bias**: All training prompts in English; non-English invoices will fail
+- **Income bias**: Kirana store size assumption (₹5–50 lakh inventory) — very large or very small stores may see degraded performance
+---
+## Reproducibility
+### Seed Control
+All scripts use fixed seeds:
+```python
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+```
+### Roboflow Dataset Versions (Pinned)
+- agentsk47/indian-grocery-object-detection — **v1** (May 2025)
+- iit-patna/grocery_items — **v45** (Apr 2026)
+- project-c5ho0/indian-market — **v2** (Jun 2025)
+### Training Infrastructure
+- **Orchestration**: [Modal](https://modal.com) (serverless GPUs)
+- **Fine-tuning Framework**: Unsloth 2026.6.1 (LLM), Ultralytics (YOLO)
+- **Quantization**: llama.cpp (GGUF)
+- **Model Publishing**: HuggingFace Hub `huggingface_hub>=0.30.0`
+### Reproducibility Checklist
+- [x] Dataset versions pinned in code
+- [x] Random seeds fixed
+- [x] Hardware specs documented (A10G, 22 GB VRAM)
+- [x] Training duration recorded (~5 hours total)
+- [x] Evaluation metrics logged post-training
+- [ ] Cold start (fresh HF account) validation (TODO: test on new account)
+---
+## Files in This Repository
+```
+kirana-invoice-train-data/
+├── README.md                           # This file
+├── MODEL_CARD.md                       # Model card for HF Hub
+├── requirements.txt                    # Python dependencies
+│
+├── finetune/
+│   ├── README.md                       # Training workflow guide
+│   ├── generate_invoices.py            # Synthetic invoice generator (500 images)
+│   ├── train_minicpm_v.py              # Fine-tune MiniCPM-V (OCR)
+│   ├── train_minicpm5_1b.py            # Fine-tune MiniCPM5-1B (normalizer)
+│   ├── train_yolo26n.py                # Fine-tune YOLO26n (detection)
+│   ├── model_card.md                   # MiniCPM5-1B model card
+│   └── yolo_model_card.md              # YOLO26n model card
+│
+├── data/
+│   ├── fmcg_catalog.json               # 200 canonical SKU names + GST rates
+│   └── synthetic_invoices/
+│       ├── annotations.jsonl
+│       ├── printed_gst/                # 125 invoices
+│       ├── tally_pdf/                  # 125 invoices
+│       ├── handwritten/                # 125 invoices
+│       └── whatsapp/                   # 125 invoices
+│
+└── tests/
+    └── test_*.py                       # Unit & integration tests
+```
+---
+## Hardware & Cost Estimates
+### Training Cost (Modal On-Demand)
+| Model | GPU | Duration | On-Demand Cost |
+|---|---|---|---|
+| MiniCPM-V | NVIDIA A10G | ~2 hours | ~$3.00 |
+| MiniCPM5-1B | NVIDIA A10G | ~1 hour | ~$1.50 |
+| YOLO26n | NVIDIA A10G | ~2 hours | ~$3.00 |
+| **Total** | — | **~5 hours** | **~$7.50** |
+### Inference Hardware
+- **Laptop CPU (Intel i7)**: ~5–10 sec/invoice (MiniCPM-V) + ~2 sec/normalization + ~3 sec/image (YOLO)
+- **GPU (NVIDIA RTX 3080)**: ~0.5 sec/invoice + ~0.2 sec/normalization + ~0.1 sec/image
+- **Edge Device (Raspberry Pi 4)**: YOLO26n (quantized ONNX) ≈ 30–60 sec/image (untested estimate)
+---
+## Usage in Production (Kirana Detective App)
+Models are downloaded on first run. For example, the invoice extractor (MiniCPM-V, merged weights) is loaded via:
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+from PIL import Image
+# Merged weights — no PEFT required
+model = AutoModel.from_pretrained(
+    "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained(
+    "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
+    trust_remote_code=True,
+)
+# Inference
+image = Image.open("invoice.jpg").convert("RGB")
+msgs = [{"role": "user", "content": [image, "Extract all line items as JSON."]}]
+response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
+```
+---
+## Next Steps & Roadmap
+### Phase 2 (Q3 2026)
+- [ ] Collect **500 real invoices** from partnered kirana stores
+- [ ] Expand product taxonomy: 200 SKUs → 2,000 SKUs
+- [ ] Add **regional language support** (Hindi, Tamil, Marathi, Kannada)
+- [ ] Fine-tune on **invoice degradation** (blur, folds, stains)
+- [ ] Benchmark on **edge devices** (Raspberry Pi, Android)
+### Phase 3 (Q4 2026)
+- [ ] Multi-language MiniCPM5-1B normalizer
+- [ ] Expand YOLO26n to **50–100 classes** (full grocery taxonomy)
+- [ ] Real-time video product counting via YOLO
+- [ ] Mobile app (React Native) with offline inference
+### Research Questions
+- How do models perform on **store-private labels** vs. branded products?
+- Can we detect **counterfeit products** via label anomalies?
+- What is the **fairness gap** for regional vs. national brands?
+---
+## Licensing & Attribution
+- **Code**: MIT License
+- **Models**:
+  - MiniCPM-V: [openbmb/MiniCPM-V](https://github.com/OpenBMB/MiniCPM-V) — Apache 2.0
+  - MiniCPM5-1B: [openbmb/MiniCPM5-1B](https://huggingface.co/openbmb/MiniCPM5-1B) — Apache 2.0
+  - YOLO26n: [Ultralytics YOLO26](https://github.com/ultralytics/ultralytics) — AGPL-3.0
+- **Datasets**:
+  - Roboflow datasets: Individual licenses (CC BY 4.0, CC BY-SA 4.0) — check each repo
+  - Synthetic invoices: CC0 (public domain)
+---
+## Contributing
+Contributions welcome! Areas of need:
+1. **Real invoice collection**: Partner kirana stores to share anonymized invoices
+2. **Regional language templates**: Hindi, Tamil, Marathi invoice formats
+3. **Edge device benchmarks**: Profile inference on RPi 4, Snapdragon, etc.
+4. **Dataset expansion**: Add 1,000+ more products to YOLO26n training
+5. **Fairness audits**: Test models on regional/budget brands
+---
+## Contact & Support
+- **Author**: [naazimsnh02](https://github.com/naazimsnh02)
+- **Issues**: [GitHub Issues](https://github.com/naazimsnh02/kirana-invoice-train-data/issues)
+- **Discussions**: [GitHub Discussions](https://github.com/naazimsnh02/kirana-invoice-train-data/discussions)
+- **HF Hub Models**:
+  - [`build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`](https://huggingface.co/build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged)
+  - [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer)
+  - [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection)
+---
+## Citation
+If you use this repository or models in your work, please cite:
+```bibtex
+@misc{kirana_detective_2026,
+  author = {Hussain, Syed Naazim},
+  title = {Kirana Detective: Fine-Tuned Models for Indian Grocery Invoice Auditing},
+  year = {2026},
+  publisher = {HuggingFace},
+  howpublished = {\url{https://huggingface.co/build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged}},
+}
+```
+---
+**Version**: 1.0
+**Last Updated**: June 10, 2026

PROGRESS.md CHANGED Viewed

@@ -84,10 +84,10 @@ modal run finetune/train_minicpm5_1b.py
 ## HF Repos to Create
 After fine-tuning publishes, verify these exist:
-- `naazimsnh02/yolo26n-indian-fmcg-detection`
-- `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`
-- `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`
-- `naazimsnh02/kirana-detective-traces` (dataset — create manually before first audit run)
 ---

 ## HF Repos to Create
 After fine-tuning publishes, verify these exist:
+- `build-small-hackathon/yolo26n-indian-fmcg-detection`
+- `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction`
+- `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
+- `build-small-hackathon/kirana-detective-traces` (dataset — create manually before first audit run)
 ---

README.md CHANGED Viewed

@@ -21,15 +21,15 @@ AI-powered inventory and invoice auditor for Indian kirana stores. Upload a dist
 ## Models
-All models run **locally via llama.cpp / ONNX — no cloud API calls**.
 | Model | HuggingFace Repo | Purpose |
 |---|---|---|
-| MiniCPM-V 4.6 (GGUF) | `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction` | Invoice OCR + extraction |
-| MiniCPM5-1B (GGUF) | `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer` | Product name normalization |
-| YOLO26n (ONNX) | `naazimsnh02/yolo26n-indian-fmcg-detection` | Delivery photo product counting |
-Models are downloaded automatically on first run via `hf_hub_download()` and cached locally.
 ## Running Locally
@@ -38,7 +38,7 @@ pip install -r requirements.txt
 python app.py
 ```
-Requires ~4 GB RAM for the quantized models. First run downloads ~2 GB of model weights.
 ## Six-Agent Pipeline

 ## Models
+All models run **locally — no cloud API calls**.
 | Model | HuggingFace Repo | Purpose |
 |---|---|---|
+| MiniCPM-V 4.6 (transformers) | `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged` | Invoice OCR + extraction |
+| MiniCPM5-1B (GGUF) | `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer` | Product name normalization |
+| YOLO26n (ONNX) | `build-small-hackathon/yolo26n-indian-fmcg-detection` | Delivery photo product counting |
+Models are downloaded automatically on first run via `hf_hub_download()` / `AutoModel.from_pretrained()` and cached locally.
 ## Running Locally
 python app.py
 ```
+Requires ~6 GB RAM. First run downloads ~3 GB of model weights.
 ## Six-Agent Pipeline

agents/invoice_extractor.py CHANGED Viewed

@@ -14,11 +14,11 @@ logger = logging.getLogger(__name__)
 AGENT_NAME = "Invoice_Extractor"
 AGENT_VERSION = "1.0.0"
-MODEL_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
 _MAX_FILE_BYTES = 20 * 1024 * 1024  # 20 MB
 _ALLOWED_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".pdf"}
-_TIMEOUT_SECONDS = 30
 _EXTRACT_PROMPT = (
     "You are an OCR agent for Indian kirana store invoices. "
@@ -92,23 +92,14 @@ def _dict_to_invoice(data: dict) -> InvoiceJSON:
 def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
-    """Call MiniCPM-V via llama-cpp-python chat API with an image."""
-    import base64
-    b64 = base64.b64encode(image_bytes).decode()
-    response = llm.create_chat_completion(
-        messages=[
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
-                    {"type": "text", "text": prompt},
-                ],
-            }
-        ],
-        max_tokens=2048,
-        temperature=0.0,
-    )
-    return response["choices"][0]["message"]["content"]
 def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:

 AGENT_NAME = "Invoice_Extractor"
 AGENT_VERSION = "1.0.0"
+MODEL_REPO = "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged"
 _MAX_FILE_BYTES = 20 * 1024 * 1024  # 20 MB
 _ALLOWED_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".pdf"}
+_TIMEOUT_SECONDS = 120
 _EXTRACT_PROMPT = (
     "You are an OCR agent for Indian kirana store invoices. "
 def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
+    """Call MiniCPM-V via transformers chat API with an image."""
+    import io
+    from PIL import Image as PILImage
+    model, tokenizer = llm
+    image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
+    msgs = [{"role": "user", "content": [image, prompt]}]
+    return model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
 def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:

agents/product_matcher.py CHANGED Viewed

@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
 AGENT_NAME = "Product_Matcher"
 AGENT_VERSION = "1.0.0"
-MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
 _TIMEOUT_SECONDS = 20

 AGENT_NAME = "Product_Matcher"
 AGENT_VERSION = "1.0.0"
+MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
 _TIMEOUT_SECONDS = 20

agents/savings_agent.py CHANGED Viewed

@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)
 AGENT_NAME = "Savings_Agent"
 AGENT_VERSION = "1.0.0"
-MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
 _TIMEOUT_SECONDS = 15

 AGENT_NAME = "Savings_Agent"
 AGENT_VERSION = "1.0.0"
+MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
 _TIMEOUT_SECONDS = 15

app.py CHANGED Viewed

@@ -58,31 +58,27 @@ def load_models() -> None:
     try:
         from huggingface_hub import hf_hub_download
         from llama_cpp import Llama
-        from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler
         import onnxruntime as ort
-        logger.info("Downloading vision model (MiniCPM-V 4.6)…")
-        vision_model_path = hf_hub_download(
-            repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
-            filename="model.gguf",
-        )
-        mmproj_path = hf_hub_download(
-            repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
-            filename="mmproj.gguf",
-        )
-        chat_handler = MiniCPMv26ChatHandler(clip_model_path=mmproj_path)
-        vision_llm = Llama(
-            model_path=vision_model_path,
-            chat_handler=chat_handler,
-            n_ctx=4096,
-            n_threads=4,
-            verbose=False,
         )
         logger.info("Vision LLM ready")
         logger.info("Downloading text model (MiniCPM5-1B)…")
         text_model_path = hf_hub_download(
-            repo_id="naazimsnh02/minicpm5-1b-indian-fmcg-normalizer",
             filename="model.gguf",
         )
         text_llm = Llama(
@@ -95,11 +91,11 @@ def load_models() -> None:
         logger.info("Downloading YOLO model…")
         onnx_path = hf_hub_download(
-            repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
             filename="yolo26n_fmcg.onnx",
         )
         class_names_path = hf_hub_download(
-            repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
             filename="class_names.json",
         )
         with open(class_names_path, encoding="utf-8") as f:

     try:
         from huggingface_hub import hf_hub_download
         from llama_cpp import Llama
         import onnxruntime as ort
+        logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
+        import torch
+        from transformers import AutoModel, AutoTokenizer
+        _VISION_REPO = "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged"
+        _vision_model = AutoModel.from_pretrained(
+            _VISION_REPO,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
         )
+        _vision_model.eval()
+        _vision_tokenizer = AutoTokenizer.from_pretrained(_VISION_REPO, trust_remote_code=True)
+        vision_llm = (_vision_model, _vision_tokenizer)
         logger.info("Vision LLM ready")
         logger.info("Downloading text model (MiniCPM5-1B)…")
         text_model_path = hf_hub_download(
+            repo_id="build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer",
             filename="model.gguf",
         )
         text_llm = Llama(
         logger.info("Downloading YOLO model…")
         onnx_path = hf_hub_download(
+            repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
             filename="yolo26n_fmcg.onnx",
         )
         class_names_path = hf_hub_download(
+            repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
             filename="class_names.json",
         )
         with open(class_names_path, encoding="utf-8") as f:

docs/kirana-detective-prd.md CHANGED Viewed

@@ -413,7 +413,7 @@ This keeps the app far below the hackathon's 32B cap and within the Tiny Titan s
 **Platform:** Modal + Unsloth QLoRA (~2–3 hours training time)
-**Publish to:** `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`
 ---
@@ -426,7 +426,7 @@ This keeps the app far below the hackathon's 32B cap and within the Tiny Titan s
 **Export:** ONNX for local CPU inference
-**Publish to:** `naazimsnh02/yolo26n-indian-fmcg-detection`
 ---
@@ -435,7 +435,7 @@ This keeps the app far below the hackathon's 32B cap and within the Tiny Titan s
 **Dataset:** 2,000 synthetic (raw_name, normalized_name) pairs covering top 200 Indian FMCG SKUs
-**Publish to:** `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`
 ---

 **Platform:** Modal + Unsloth QLoRA (~2–3 hours training time)
+**Publish to:** `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`
 ---
 **Export:** ONNX for local CPU inference
+**Publish to:** `build-small-hackathon/yolo26n-indian-fmcg-detection`
 ---
 **Dataset:** 2,000 synthetic (raw_name, normalized_name) pairs covering top 200 Indian FMCG SKUs
+**Publish to:** `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
 ---

finetune/README.md ADDED Viewed

	@@ -0,0 +1,148 @@

+# Model Fine-tuning Guide
+Fine-tune Kirana Detective's three models on Indian FMCG invoice data.
+## Quick Start (TL;DR)
+```bash
+export ROBOFLOW_API_KEY=<your-key>
+export HF_TOKEN=<your-token>
+modal run finetune/generate_invoices.py     # 10 min
+modal run finetune/train_minicpm_v.py       # 2 hours
+modal run finetune/train_minicpm5_1b.py     # 1 hour
+modal run finetune/train_yolo26n.py         # 2 hours
+```
+Models auto-publish to HuggingFace Hub on completion.
+---
+## Three Models, Three Pipelines
+### 1. MiniCPM-V 4.6 (Invoice OCR) — `train_minicpm_v.py`
+**Purpose**: Extract line items, amounts, GST from invoice images (printed PDFs, handwritten, WhatsApp screenshots)
+**Input**: 500 synthetic invoices (4 formats)
+**Method**: QLoRA fine-tuning with Unsloth
+**Output**: LoRA adapter, merged into bfloat16 HF weights by `export_minicpm_v_gguf.py` → HF Hub (GGUF conversion is a separate manual step)
+**Hardware**: A10G, 22 GB VRAM, ~2 hours
+**Datasets used**:
+- Synthetic invoices generated by `generate_invoices.py`
+- Splits: train/val/test = 400/50/50
+- Formats: pure Pillow (no native deps) — GST, Tally PDF, handwritten, WhatsApp
+---
+### 2. MiniCPM5-1B (Product Name Normalizer) — `train_minicpm5_1b.py`
+**Purpose**: Map invoice abbreviations (e.g., "MAGGI NDL 70GM") to canonical names
+**Input**: 2,000 synthetic (raw, canonical) pairs
+**Method**: QLoRA, 4-bit base + LoRA adapters
+**Output**: GGUF quantized model
+**Hardware**: A10G, ~1 hour
+**Dataset generation**:
+- Hand-curated 200 SKU catalog
+- Rule-based augmentation: abbreviation expansion, typo injection, truncation
+- Coverage: 10 major Indian FMCG suppliers
+---
+### 3. YOLO26n (Product Detection) — `train_yolo26n.py`
+**Purpose**: Count packaged products in shelf/counter photos
+**Input**: 3 Roboflow datasets merged (11,000+ images)
+**Method**: Ultralytics standard training pipeline
+**Output**: ONNX format for CPU/GPU inference
+**Hardware**: A10G, ~2 hours
+**Datasets merged**:
+1. [agentsk47/indian-grocery-object-detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) v1
+2. [iit-patna/grocery_items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) v45 (6,695 images)
+3. [project-c5ho0/indian-market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) v2 (4,694 images)
+---
+## Prerequisites
+```bash
+# 1. Clone this repo
+git clone https://github.com/build-small-hackathon/kirana-invoice-train-data.git
+cd kirana-invoice-train-data
+# 2. Install local deps (only needed to preview the generated synthetic invoices locally)
+pip install -r requirements.txt
+# 3. Set up secrets for Modal/HF
+modal token new
+export ROBOFLOW_API_KEY=<from Roboflow universe account>
+export HF_TOKEN=<from huggingface.co/settings/tokens>
+# 4. Test Modal setup
+modal run finetune/generate_invoices.py
+```
+---
+## Reproducibility Checklist
+- [ ] **Dataset versioning**: All Roboflow versions pinned (v1, v45, v2)
+- [ ] **Seed control**: Random seeds fixed in all training scripts
+- [ ] **Output validation**: Run `tests/` after each model completes
+- [ ] **HF Hub publish logs**: Check model card auto-generated from training
+- [ ] **Quantization / export**: Verified F1 (GGUF models) and mAP (ONNX detector) vs. float32 baseline
+---
+## Known Limitations & Biases
+| Model | Limitation | Impact | Mitigation |
+|---|---|---|---|
+| MiniCPM-V | Only 10 FMCG suppliers in training data | Fails on uncommon brands | Add more invoices post-hackathon |
+| MiniCPM5-1B | Synthetic data only (no real invoice typos) | Overfits to rule-based augmentation | Collect 200+ real examples next |
+| YOLO26n | Merged dataset skewed toward beauty/personal care (Tresemmé, Nivea, Patanjali) | May underperform on grocery staples | Balance class distribution across grocery categories |
+---
+## Troubleshooting
+**"Modal timeout after 2 hours?"**
+→ YOLO training can take 2–3h depending on GPU queue. Increase the `timeout=` argument of `@app.function(...)` in `finetune/train_yolo26n.py`.
+**"GGUF quantization fails?"**
+→ Ensure llama.cpp is compiled with CUDA support if GPU quantization intended.
+**"HF Hub publish returns 403?"**
+→ `HF_TOKEN` must have write access. Regenerate at huggingface.co/settings/tokens.
+---
+## Output Files
+After successful runs, check HF Hub:
+- **MiniCPM-V**: `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction`
+  - `model.gguf` (4.5 GB)
+  - `model_card.md`
+- **MiniCPM5-1B**: `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
+  - `model.gguf` (1.2 GB)
+  - `model_card.md`
+- **YOLO26n**: `build-small-hackathon/yolo26n-indian-fmcg-detection`
+  - `yolo26n_fmcg.onnx` (15 MB)
+  - `class_names.json`
+  - `model_card.md`
+---
+## Next Steps Post-Hackathon
+1. **Collect real invoice data** from partnered kirana stores (500 minimum)
+2. **Expand product taxonomy** (currently 200 SKUs → 2000)
+3. **Add regional variants** (Hindi/Tamil/Malayalam abbreviations)
+4. **Benchmark inference latency** on Raspberry Pi / Android devices

finetune/export_minicpm_v_gguf.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""
+Merge MiniCPM-V 4.6 LoRA adapter into the base model and push the merged
+HF weights to Hugging Face.
+Why merge instead of converting LoRA to GGUF directly:
+  llama.cpp's convert_lora_to_gguf.py and convert_hf_to_gguf.py both fail
+  for MiniCPMV4_6Model (architecture not in llama.cpp's registry). The only
+  working path is to have ggml.ai's GGUF-my-repo Space do the conversion —
+  it uses a patched llama.cpp that supports this architecture.
+Two-step workflow:
+  Step 1 (this script):
+    - Load base model + LoRA from Modal volume
+    - Merge LoRA weights into the full model (merge_and_unload)
+    - Push merged HF model to MERGED_HF_REPO
+    - Download OpenBMB's mmproj.gguf and upload it to HF_REPO for immediate use
+  Step 2 (manual — ~15 min):
+    - Go to https://huggingface.co/spaces/ggml-org/gguf-my-repo
+    - Enter: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged
+    - Select Q4_K_M quantisation
+    - Wait for the Space to create the GGUF repo
+    - Update app.py MODEL_REPO to point to the resulting GGUF repo
+Run:
+    modal run finetune/export_minicpm_v_gguf.py
+Reads adapter from: /output/minicpm-v-lora in Modal volume kirana-minicpm-v-output
+Publishes merged HF model to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged
+Also uploads mmproj.gguf to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction
+"""
+from __future__ import annotations
+import os
+import modal
+app = modal.App("kirana-export-minicpm-v-gguf")
+IMAGE = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install(
+        "huggingface_hub>=0.30.0",
+        "safetensors>=0.4.3",
+        "torch>=2.3.0",
+        "transformers>=5.7.0",
+        "peft>=0.14.0",
+        "accelerate>=0.34.0",
+    )
+)
+HF_SECRET = modal.Secret.from_name("hf-secret")
+BASE_MODEL = "openbmb/MiniCPM-V-4.6"
+SOURCE_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
+HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
+MERGED_HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged"
+# Full professional model card is maintained in push_minicpm_v_merged_card.py.
+# This is a minimal card used during the merge+push run; run push_minicpm_v_merged_card.py
+# separately to update the README on HF Hub.
+MODEL_CARD_MERGED = f"""\
+---
+license: apache-2.0
+base_model: {BASE_MODEL}
+datasets:
+  - build-small-hackathon/kirana-invoice-train-data
+language:
+  - en
+tags:
+  - invoice-extraction
+  - indian-fmcg
+  - minicpm-v
+  - vision-language
+  - ocr
+  - qlora
+  - merged-weights
+  - kirana
+  - hackathon
+pipeline_tag: image-text-to-text
+---
+# MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)
+Fine-tuned [`{BASE_MODEL}`](https://huggingface.co/{BASE_MODEL}) for structured
+JSON extraction from Indian distributor (kirana) invoices. QLoRA adapter weights
+are fully merged — no PEFT dependency at inference time.
+See full model card: [`naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged`](https://huggingface.co/{MERGED_HF_REPO})
+## Quick Start
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+from PIL import Image
+model = AutoModel.from_pretrained(
+    "{MERGED_HF_REPO}", trust_remote_code=True,
+    torch_dtype=torch.bfloat16, device_map="auto",
+)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained("{MERGED_HF_REPO}", trust_remote_code=True)
+image = Image.open("invoice.jpg").convert("RGB")
+msgs = [{{"role": "user", "content": [image, "Extract all line items as JSON."]}}]
+response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
+```
+## Training Summary
+| Parameter | Value |
+|---|---|
+| Base model | `{BASE_MODEL}` |
+| Fine-tuning | QLoRA rank 16 |
+| Dataset | 450 train + 50 eval synthetic Indian invoices |
+| Eval loss | 0.2120 (3 epochs) |
+| Training hardware | Modal A10G, ~52 min |
+| Adapter params | 9.5M / 1.3B total (0.72%) |
+## License
+Apache 2.0 — same as base model.
+"""
+def _validate_gguf_header(path: str) -> None:
+    with open(path, "rb") as f:
+        magic = f.read(4)
+    if magic != b"GGUF":
+        raise RuntimeError(f"Downloaded file is not a GGUF: {path}")
+@app.function(
+    image=IMAGE,
+    timeout=3600,
+    secrets=[HF_SECRET],
+    volumes={
+        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False),
+    },
+    memory=16384,  # 16 GB — 1.3B model in bfloat16 ≈ 2.6 GB; headroom for merge + save
+)
+def merge_and_push():
+    import torch
+    from pathlib import Path
+    from peft import PeftModel
+    from transformers import AutoModel, AutoTokenizer
+    from huggingface_hub import HfApi, hf_hub_download
+    token = os.environ["HF_TOKEN"]
+    api = HfApi(token=token)
+    adapter_dir = Path("/output/minicpm-v-lora")
+    merged_dir = Path("/output/minicpm-v-merged")
+    if not adapter_dir.exists():
+        raise RuntimeError(
+            f"Missing adapter directory: {adapter_dir}. "
+            "Run finetune/train_minicpm_v.py first."
+        )
+    # ── Step 1: Merge LoRA into base model ───────────────────────────────────
+    if (merged_dir / "config.json").exists():
+        print("Merged model already exists at /output/minicpm-v-merged, skipping merge.")
+    else:
+        print(f"Loading base model {BASE_MODEL} ...")
+        base_model = AutoModel.from_pretrained(
+            BASE_MODEL,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            token=token,
+        )
+        print(f"Loading LoRA adapter from {adapter_dir} ...")
+        model = PeftModel.from_pretrained(base_model, str(adapter_dir))
+        print("Merging LoRA weights into base model ...")
+        merged_model = model.merge_and_unload()
+        print(f"Saving merged model to {merged_dir} ...")
+        merged_dir.mkdir(parents=True, exist_ok=True)
+        merged_model.save_pretrained(str(merged_dir))
+        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=token)
+        tokenizer.save_pretrained(str(merged_dir))
+        print("Merge complete.")
+    # ── Step 2: Create HF repo and push merged model ─────────────────────────
+    print(f"Creating / verifying HF repo {MERGED_HF_REPO} ...")
+    api.create_repo(repo_id=MERGED_HF_REPO, repo_type="model", exist_ok=True, private=False)
+    print(f"Uploading merged model to {MERGED_HF_REPO} ...")
+    api.upload_folder(
+        folder_path=str(merged_dir),
+        repo_id=MERGED_HF_REPO,
+        repo_type="model",
+        commit_message="Add merged MiniCPM-V-4.6 invoice fine-tune",
+    )
+    print("Uploading README.md to merged repo ...")
+    api.upload_file(
+        path_or_fileobj=MODEL_CARD_MERGED.encode("utf-8"),
+        path_in_repo="README.md",
+        repo_id=MERGED_HF_REPO,
+        repo_type="model",
+    )
+    # ── Step 3: Download OpenBMB mmproj and upload to GGUF repo ──────────────
+    # The LoRA only touched LLM layers — mmproj weights are unchanged, so
+    # OpenBMB's mmproj.gguf is identical to what we would produce ourselves.
+    print(f"Listing GGUF files in {SOURCE_GGUF_REPO} ...")
+    source_files = list(api.list_repo_files(SOURCE_GGUF_REPO, repo_type="model"))
+    mmproj_files = [f for f in source_files if "mmproj" in f.lower() and f.endswith(".gguf")]
+    if not mmproj_files:
+        raise RuntimeError(f"No mmproj GGUF found in {SOURCE_GGUF_REPO}. Files: {source_files}")
+    source_mmproj = mmproj_files[0]
+    print(f"Downloading {source_mmproj} ...")
+    mmproj_path = hf_hub_download(
+        repo_id=SOURCE_GGUF_REPO,
+        filename=source_mmproj,
+        repo_type="model",
+        token=token,
+        local_dir="/output/minicpm-v-gguf",
+    )
+    _validate_gguf_header(mmproj_path)
+    print(f"Uploading mmproj.gguf to {HF_REPO} ...")
+    api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True, private=False)
+    api.upload_file(
+        path_or_fileobj=mmproj_path,
+        path_in_repo="mmproj.gguf",
+        repo_id=HF_REPO,
+        repo_type="model",
+    )
+    print()
+    print("=" * 70)
+    print("DONE. Next steps:")
+    print()
+    print("1. Go to: https://huggingface.co/spaces/ggml-org/gguf-my-repo")
+    print(f"2. Enter model ID: {MERGED_HF_REPO}")
+    print("3. Select quantisation: Q4_K_M")
+    print("4. Click convert — takes ~15 min on the Space's A10G")
+    print()
+    print("The Space will create a new repo (usually named")
+    print(f"  {MERGED_HF_REPO}-GGUF")
+    print("containing model.gguf + mmproj.gguf (both for the fine-tuned model).")
+    print()
+    print(f"mmproj.gguf already uploaded to: https://huggingface.co/{HF_REPO}")
+    print("(usable immediately — vision encoder weights are unchanged by fine-tuning)")
+    print("=" * 70)
+@app.local_entrypoint()
+def main():
+    merge_and_push.remote()

finetune/push_minicpm_v_merged_card.py ADDED Viewed

	@@ -0,0 +1,312 @@

+"""
+Push the professional README / model card to
+naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged on HuggingFace.
+No Modal required — runs locally using the HF token from environment.
+Run:
+    $env:HF_TOKEN = "hf_..."          # PowerShell
+    python finetune/push_minicpm_v_merged_card.py
+"""
+from __future__ import annotations
+import os
+from huggingface_hub import HfApi
+HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged"
+BASE_MODEL = "openbmb/MiniCPM-V-4.6"
+DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"
+MODEL_CARD = """\
+---
+license: apache-2.0
+base_model: openbmb/MiniCPM-V-4.6
+datasets:
+  - build-small-hackathon/kirana-invoice-train-data
+language:
+  - en
+tags:
+  - invoice-extraction
+  - indian-fmcg
+  - minicpm-v
+  - vision-language
+  - ocr
+  - qlora
+  - merged-weights
+  - kirana
+  - hackathon
+pipeline_tag: image-text-to-text
+---
+# MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)
+Fine-tuned [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) for
+structured JSON extraction from Indian distributor (kirana) invoices.
+QLoRA adapter weights are **fully merged** into the base model — no PEFT dependency at
+inference time. Part of the **Kirana Detective** project: a six-agent AI pipeline that
+audits invoices for pricing anomalies, missing deliveries, and GST errors.
+---
+## Model Details
+| Attribute | Value |
+|---|---|
+| **Base model** | [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6) |
+| **Task** | Vision-language OCR + structured JSON extraction |
+| **Fine-tuning method** | QLoRA — 4-bit NF4 base, LoRA rank 16, α 32 |
+| **Trainable parameters** | 9,486,336 / 1,309,914,352 **(0.72%)** |
+| **Target modules** | `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj` |
+| **Training epochs** | 3 |
+| **Final eval loss** | **0.2120** (↓ from 0.2901 at epoch 1) |
+| **Training hardware** | NVIDIA A10G 22 GB VRAM (Modal) |
+| **Training duration** | ~52 minutes |
+| **Output format** | Merged full weights — bfloat16 |
+| **Inference runtime** | `transformers` (`AutoModel` + `model.chat()`) |
+---
+## Training Data
+**Dataset**: [`build-small-hackathon/kirana-invoice-train-data`](https://huggingface.co/datasets/build-small-hackathon/kirana-invoice-train-data)
+| Split | Examples |
+|---|---|
+| Train | 450 |
+| Eval | 50 |
+Synthetic Indian distributor invoices generated with Pillow across:
+- **10 suppliers**: HUL, Nestlé, Parle, Britannia, ITC, Amul, Dabur, Marico, Emami, Godrej
+- **4 invoice formats**: Printed GST bill, Tally PDF export, handwritten, WhatsApp screenshot
+- **Intentional errors injected**: GST rate mismatches, duplicate line items, price spikes — to
+  train the model to surface extraction warnings alongside extracted data
+---
+## Training Metrics
+| Epoch | Train Loss | Eval Loss |
+|---|---|---|
+| 1 | — | 0.2901 |
+| 2 | — | 0.2281 |
+| 3 | — | **0.2120** |
+---
+## Supported Input Formats
+| Format | Example |
+|---|---|
+| Printed GST invoice | Standard B2B tax invoice with HSN codes |
+| Tally PDF export | Machine-generated tabular layout |
+| Handwritten invoice | Photo of handwritten bill |
+| WhatsApp screenshot | Low-resolution forwarded invoice image |
+---
+## Output Schema
+The model returns **only** a JSON object matching this schema — no markdown, no prose:
+```json
+{
+  "invoice_number": "INV-2024-001",
+  "supplier": "Hindustan Unilever Ltd.",
+  "date": "2026-06-10",
+  "items": [
+    {
+      "product_raw": "SURF XL 1KG",
+      "quantity": 12,
+      "unit_price": 95.00,
+      "gst_rate": 18,
+      "line_total": 1140.00
+    },
+    {
+      "product_raw": "MAGGI MASALA 70G",
+      "quantity": 48,
+      "unit_price": 14.00,
+      "gst_rate": 5,
+      "line_total": 672.00
+    }
+  ],
+  "grand_total": 9650.00,
+  "extraction_warnings": []
+}
+```
+**Field notes**:
+- `product_raw` — verbatim as printed on the invoice (abbreviations, typos preserved)
+- `gst_rate` — percentage value (5, 12, 18, 28), not a decimal
+- `date` — ISO 8601 (`YYYY-MM-DD`) when parseable, raw string otherwise
+- `extraction_warnings` — list of issues noticed (missing fields, illegible areas, GST anomalies)
+- Numeric fields default to `0` when unreadable; `invoice_number`/`supplier`/`date` default to `null`
+---
+## Usage
+### Basic Inference
+```python
+import torch
+from transformers import AutoModel, AutoTokenizer
+from PIL import Image
+model = AutoModel.from_pretrained(
+    "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged",
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained(
+    "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged",
+    trust_remote_code=True,
+)
+image = Image.open("invoice.jpg").convert("RGB")
+prompt = (
+    "You are an OCR agent for Indian kirana store invoices. "
+    "Extract all information from this invoice image and return ONLY valid JSON "
+    "matching this schema exactly:\\n"
+    '{"invoice_number": string|null, "supplier": string|null, "date": string|null, '
+    '"items": [{"product_raw": string, "quantity": number, "unit_price": number, '
+    '"gst_rate": number, "line_total": number}], '
+    '"grand_total": number, "extraction_warnings": [string]}\\n'
+    "Return ONLY the JSON object, no markdown, no prose."
+)
+msgs = [{"role": "user", "content": [image, prompt]}]
+response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
+print(response)
+```
+### From a PDF (multi-page)
+```python
+import fitz  # PyMuPDF
+from PIL import Image
+import io, json
+doc = fitz.open("invoice.pdf")
+results = []
+for page in doc:
+    pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+    img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
+    msgs = [{"role": "user", "content": [img, prompt]}]
+    raw = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
+    results.append(json.loads(raw))
+```
+---
+## How It Fits in Kirana Detective
+```
+Invoice image
+      │
+      ▼
+┌─────────────────────────────┐
+│  Agent 1 — Invoice Extractor │  ← this model
+│  MiniCPM-V 4.6 (merged)     │
+└─────────────────────────────┘
+      │ InvoiceJSON (raw product names)
+      ▼
+┌─────────────────────────────┐
+│  Agent 2 — Product Matcher  │  MiniCPM5-1B normalizer
+└─────────────────────────────┘
+      │
+      ▼
+┌─────────────────────────────┐
+│  Agent 3 — Pricing Check    │  catalog + price history
+└─────────────────────────────┘
+      │
+      ▼  (+ delivery photos)
+┌─────────────────────────────┐
+│  Agent 4 — Visual Counter   │  YOLO26n ONNX
+└─────────────────────────────┘
+      │
+      ▼
+┌─────────────────────────────┐
+│  Agent 5 — Reconciliation   │
+│  Agent 6 — Savings Report   │  MiniCPM5-1B
+└─────────────────────────────┘
+      │
+      ▼
+₹ Leakage report + action items
+```
+Related repos:
+- [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer) — product name normalizer
+- [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection) — YOLO product counter
+---
+## Limitations
+- Trained on **synthetic** invoices only — real-world performance may vary on heavily degraded,
+  stamped, or non-standard layouts until production data is collected.
+- Optimised for **English and numeric** invoice content; Hindi/regional-language invoices are
+  not yet covered.
+- Product names are extracted **verbatim** (`product_raw`) — normalization to canonical SKU
+  names is handled downstream by the MiniCPM5-1B normalizer agent.
+- `grand_total` extraction can fail on invoices with complex multi-page subtotal structures.
+---
+## Reproducibility
+The LoRA adapter was trained, and then merged into the base model, with these two scripts:
+```bash
+modal run finetune/train_minicpm_v.py    # fine-tune → saves adapter to Modal volume
+modal run finetune/export_minicpm_v_gguf.py  # merge LoRA → push merged weights to HF
+```
+Source: [GitHub — Kirana Detective](https://github.com/naazimsnh02/kirana-detective)
+---
+## Citation
+```bibtex
+@misc{kirana_detective_minicpmv_2026,
+  author    = {Hussain, Syed Naazim},
+  title     = {MiniCPM-V 4.6 Fine-Tuned for Indian Invoice Extraction},
+  year      = {2026},
+  publisher = {HuggingFace},
+  howpublished = {\\url{https://huggingface.co/naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged}},
+}
+```
+---
+## License
+Apache 2.0 — same license as the base [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) model.
+"""
+def main() -> None:
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        raise SystemExit("Set HF_TOKEN environment variable before running.")
+    api = HfApi(token=token)
+    api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True, private=False)
+    api.upload_file(
+        path_or_fileobj=MODEL_CARD.encode("utf-8"),
+        path_in_repo="README.md",
+        repo_id=HF_REPO,
+        repo_type="model",
+        commit_message="Update professional model card",
+    )
+    print(f"Model card pushed to https://huggingface.co/{HF_REPO}")
+if __name__ == "__main__":
+    main()

finetune/push_minicpm_v_to_hf.py ADDED Viewed

	@@ -0,0 +1,231 @@

+"""
+Push the trained MiniCPM-V LoRA adapter from Modal volume to HuggingFace Hub.
+Usage:
+    modal run finetune/push_minicpm_v_to_hf.py
+Reads from Modal volume: kirana-minicpm-v-output  (/output/minicpm-v-lora)
+Pushes to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction
+"""
+import os
+import modal
+app = modal.App("kirana-push-minicpm-v")
+IMAGE = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install("huggingface_hub>=0.30.0")
+)
+HF_SECRET = modal.Secret.from_name("hf-secret")
+HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
+BASE_MODEL = "openbmb/MiniCPM-V-4.6"
+HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"
+MODEL_CARD = """\
+---
+license: apache-2.0
+base_model: openbmb/MiniCPM-V-4.6
+datasets:
+  - build-small-hackathon/kirana-invoice-train-data
+language:
+  - en
+tags:
+  - invoice-extraction
+  - indian-fmcg
+  - minicpm-v
+  - ocr
+  - qlora
+  - peft
+  - kirana
+  - vision-language
+pipeline_tag: image-text-to-text
+---
+# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)
+Fine-tuned [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) for structured JSON extraction from Indian distributor invoices.
+Part of the **Kirana Detective** project — an AI audit pipeline for small Indian grocery (kirana) stores.
+> **This is a PEFT LoRA adapter** — you need the base model + this adapter to run inference.
+## Training Results
+| Epoch | Train Loss | Eval Loss |
+|-------|-----------|-----------|
+| 1 | ~6.08 | 0.2901 |
+| 2 | ~3.95 | 0.2281 |
+| 3 | ~3.33 | **0.212** |
+**Training summary** (3 epochs, 87 steps, ~52 min on A10G):
+- Total average train loss: 4.774
+- Best eval loss: **0.212** (epoch 3, loaded as final checkpoint)
+- Trainable parameters: 9,486,336 / 1,309,914,352 (0.72%)
+- Dataset: 450 train + 50 eval synthetic invoices
+## Usage
+```python
+from peft import PeftModel, PeftConfig
+from transformers import AutoModel, AutoProcessor
+import torch
+# Load adapter config to get base model id
+config = PeftConfig.from_pretrained("naazimsnh02/minicpm-v-4-6-indian-invoice-extraction")
+base_model = AutoModel.from_pretrained(
+    config.base_model_name_or_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+model = PeftModel.from_pretrained(base_model, "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction")
+processor = AutoProcessor.from_pretrained("naazimsnh02/minicpm-v-4-6-indian-invoice-extraction", trust_remote_code=True)
+```
+### Inference Example
+```python
+from PIL import Image
+image = Image.open("invoice.jpg")
+messages = [
+    {
+        "role": "system",
+        "content": "You are an invoice extraction assistant. Given an invoice image, extract all fields as valid JSON. Return ONLY the JSON object, no explanation."
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": image},
+            {"type": "text", "text": "Extract all invoice fields as JSON."}
+        ]
+    }
+]
+inputs = processor(messages, return_tensors="pt").to(model.device)
+with torch.no_grad():
+    output = model.generate(**inputs, max_new_tokens=512)
+result_json = processor.decode(output[0], skip_special_tokens=True)
+```
+## Output Schema
+```json
+{
+  "invoice_number": "INV-2024-001",
+  "supplier": "Hindustan Unilever Ltd.",
+  "date": "2026-06-10",
+  "items": [
+    {
+      "product_raw": "SURF XL 1KG",
+      "quantity": 12,
+      "unit_price": 95.00,
+      "gst_rate": 18,
+      "line_total": 1140.00
+    }
+  ],
+  "grand_total": 9650.00,
+  "extraction_warnings": []
+}
+```
+## Supported Invoice Formats
+- Printed GST invoices (Tally-style, thermal-print)
+- Tally PDF exports
+- WhatsApp screenshot invoices
+- Handwritten bills
+## Training Details
+| Parameter | Value |
+|-----------|-------|
+| Base model | openbmb/MiniCPM-V-4.6 |
+| Model class | MiniCPMV4_6ForConditionalGeneration |
+| Fine-tuning method | QLoRA (4-bit + LoRA) |
+| LoRA rank | 16 |
+| Quantization | bitsandbytes 4-bit (nf4) |
+| Batch size | 1 (grad accum × 16 = effective 16) |
+| Learning rate | 1e-4 (cosine decay, warmup 10 steps) |
+| Epochs | 3 |
+| Total steps | 87 |
+| Hardware | NVIDIA A10G (22 GB VRAM) |
+| Training time | ~52 minutes |
+| Orchestration | Modal (serverless GPU) |
+| Framework | Transformers ≥ 5.7.0 + PEFT |
+## Citation
+```bibtex
+@misc{kirana-detective-minicpm-v-2026,
+  title  = {Kirana Detective: MiniCPM-V 4.6 Indian Invoice Extraction},
+  author = {Syed Naazim Hussain},
+  year   = {2026},
+  url    = {https://huggingface.co/naazimsnh02/minicpm-v-4-6-indian-invoice-extraction}
+}
+```
+## License
+Apache 2.0 (same as base model openbmb/MiniCPM-V-4.6)
+"""
+@app.function(
+    image=IMAGE,
+    timeout=600,
+    secrets=[HF_SECRET],
+    volumes={
+        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False)
+    },
+)
+def push_to_hub():
+    from huggingface_hub import HfApi
+    from pathlib import Path
+    token = os.environ["HF_TOKEN"]
+    api = HfApi(token=token)
+    print(f"Creating repo: {HF_REPO}")
+    api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)
+    adapter_dir = Path("/output/minicpm-v-lora")
+    if not adapter_dir.exists():
+        raise FileNotFoundError(
+            f"Adapter not found at {adapter_dir}. "
+            "Did the training job complete successfully?"
+        )
+    files = list(adapter_dir.iterdir())
+    print(f"Found {len(files)} files in {adapter_dir}:")
+    for f in files:
+        print(f"  {f.name} ({f.stat().st_size / 1024:.1f} KB)")
+    for f in files:
+        if f.is_file():
+            print(f"Uploading {f.name}...")
+            api.upload_file(
+                path_or_fileobj=str(f),
+                path_in_repo=f.name,
+                repo_id=HF_REPO,
+                repo_type="model",
+            )
+    print("Uploading README.md (model card)...")
+    api.upload_file(
+        path_or_fileobj=MODEL_CARD.encode(),
+        path_in_repo="README.md",
+        repo_id=HF_REPO,
+        repo_type="model",
+    )
+    print(f"\nDone! Model published at: https://huggingface.co/{HF_REPO}")
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: run ``push_to_hub`` through its ``.remote()`` call."""
+    push_to_hub.remote()

finetune/push_yolo_to_hf.py ADDED Viewed

	@@ -0,0 +1,200 @@

+"""
+Push trained YOLO artifacts from Modal volume to HuggingFace Hub.
+Usage:
+    modal run finetune/push_yolo_to_hf.py
+Reads from Modal volume: kirana-yolo-output  (/output/)
+Pushes to: naazimsnh02/yolo26n-indian-fmcg-detection
+  - best.pt       (PyTorch weights)
+  - best.onnx     (ONNX, opset 12)
+  - class_names.json
+  - README.md     (model card)
+"""
+import os
+import modal
+app = modal.App("kirana-push-yolo")
+IMAGE = (
+    modal.Image.debian_slim(python_version="3.11")
+    .pip_install("huggingface_hub>=0.30.0")
+)
+HF_SECRET = modal.Secret.from_name("hf-secret")
+HF_REPO   = "naazimsnh02/yolo26n-indian-fmcg-detection"
+MODEL_CARD = """\
+---
+license: apache-2.0
+base_model: yolo26n
+language:
+  - en
+tags:
+  - object-detection
+  - yolo
+  - indian-fmcg
+  - onnx
+  - ultralytics
+  - kirana
+pipeline_tag: object-detection
+datasets:
+  - agentsk47/indian-grocery-object-detection-mfsnx
+  - iit-patna-qg1jh/grocery_items-7i2em
+  - project-c5ho0/indian-market-qieug
+---
+# YOLO26n — Indian FMCG Product Detection
+Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources**
+from Roboflow Universe.  Part of the **Kirana Detective** project — an AI system for small Indian grocery
+stores to visually count and reconcile shelf/counter inventory from photos.
+## Performance
+| Metric | Value |
+|---|---|
+| mAP50 (all classes) | **0.428** |
+| mAP50-95 (all classes) | **0.302** |
+| Total classes | 1,831 |
+| Validation images | 1,236 |
+| Validation instances | 13,443 |
+Training ran for **100 epochs** (60 initial + 40 resumed after restart) on an NVIDIA A10G via Modal.
+## Training Datasets
+| Dataset | Workspace | Version | Images | Classes |
+|---|---|---|---|---|
+| [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
+| [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
+| [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
+All three datasets were downloaded in **YOLOv8 format**, class IDs remapped to a unified list, and merged
+before training.  The full unified class list (1,831 entries) is available in `class_names.json`.
+## Files
+| File | Description |
+|---|---|
+| `best.pt` | PyTorch checkpoint (best mAP50 epoch) |
+| `best.onnx` | ONNX export, opset 12 (recommended for inference) |
+| `class_names.json` | Full list of 1,831 class names (index = class_id) |
+## How to Use
+### ONNX Runtime (CPU / any platform)
+```python
+import json, numpy as np, onnxruntime as ort
+from PIL import Image
+session    = ort.InferenceSession("best.onnx", providers=["CPUExecutionProvider"])
+class_names = json.load(open("class_names.json"))
+def preprocess(path, size=640):
+    img = Image.open(path).convert("RGB").resize((size, size))
+    return (np.array(img, dtype=np.float32) / 255.0).transpose(2, 0, 1)[None]
+input_name = session.get_inputs()[0].name
+outputs    = session.run(None, {input_name: preprocess("shelf.jpg")})
+# outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
+```
+### Ultralytics (PyTorch)
+```python
+from ultralytics import YOLO
+model   = YOLO("best.pt")
+results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
+results[0].show()
+```
+## Training Details
+| Parameter | Value |
+|---|---|
+| Base model | YOLO26n |
+| Input size | 640 × 640 |
+| Epochs | 100 (60 + 40 resumed) |
+| Batch size | 16 |
+| Early stopping patience | 20 |
+| Export format | ONNX opset 12 |
+| Hardware | NVIDIA A10G (Modal) |
+## Citation
+```bibtex
+@misc{kirana-detective-yolo-2026,
+  title  = {Kirana Detective: YOLO26n Indian FMCG Product Detector},
+  author = {Naazim},
+  year   = {2026},
+  url    = {https://huggingface.co/naazimsnh02/yolo26n-indian-fmcg-detection}
+}
+```
+"""
+@app.function(
+    image=IMAGE,
+    timeout=600,
+    secrets=[HF_SECRET],
+    volumes={"/output": modal.Volume.from_name("kirana-yolo-output", create_if_missing=False)},
+)
+def push_to_hub():
+    import json
+    import shutil
+    import tempfile
+    from pathlib import Path
+    from huggingface_hub import HfApi
+    # --- Locate artifacts ---
+    output    = Path("/output")
+    best_pt   = output / "runs/yolo26n_fmcg/weights/best.pt"
+    best_onnx = output / "runs/yolo26n_fmcg/weights/best.onnx"
+    cls_json  = output / "class_names.json"
+    print("=== Volume contents (/output) ===")
+    for p in sorted(output.rglob("*")):
+        if p.is_file():
+            print(f"  {p.relative_to(output)}  ({p.stat().st_size / 1024:.1f} KB)")
+    missing = [p for p in (best_pt, best_onnx, cls_json) if not p.exists()]
+    if missing:
+        raise FileNotFoundError(f"Missing artifacts: {[str(m) for m in missing]}")
+    with open(cls_json) as f:
+        classes = json.load(f)
+    print(f"\nClass count: {len(classes)}")
+    # --- Stage all files into a temp folder, then push as a single commit ---
+    api = HfApi(token=os.environ["HF_TOKEN"])
+    api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)
+    with tempfile.TemporaryDirectory() as staging:
+        staging = Path(staging)
+        shutil.copy(best_pt,   staging / "best.pt")
+        shutil.copy(best_onnx, staging / "best.onnx")
+        shutil.copy(cls_json,  staging / "class_names.json")
+        (staging / "README.md").write_text(MODEL_CARD, encoding="utf-8")
+        print("\nFiles staged for upload:")
+        for f in sorted(staging.iterdir()):
+            print(f"  {f.name}  ({f.stat().st_size / 1024:.1f} KB)")
+        print("\nPushing to HF Hub (single commit)...")
+        api.upload_folder(
+            folder_path=str(staging),
+            repo_id=HF_REPO,
+            repo_type="model",
+            commit_message="Add best.pt, best.onnx, class_names.json, README (100-epoch FMCG detector)",
+        )
+    print(f"\nDone — https://huggingface.co/{HF_REPO}")
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: run ``push_to_hub`` through its ``.remote()`` call."""
+    push_to_hub.remote()

finetune/train_minicpm_v.py CHANGED Viewed

@@ -10,10 +10,11 @@ Two-step workflow:
 Publishes:
     build-small-hackathon/kirana-invoice-train-data             (HF dataset, reusable)
-    naazimsnh02/minicpm-v-4-6-indian-invoice-extraction  (model GGUF)
 Training approach:
-    QLoRA via Unsloth on base openbmb/MiniCPM-V-4.6
     System prompt: "Extract invoice JSON"
     User turn: <image> + "Extract all invoice fields as JSON"
     Assistant turn: <annotation JSON>
@@ -29,17 +30,20 @@ app = modal.App("kirana-minicpm-v-finetune")
 IMAGE = (
     modal.Image.debian_slim(python_version="3.11")
-    .apt_install("libsm6", "libxext6")
     .pip_install(
-        "unsloth>=2026.5.0",
         "huggingface_hub>=0.30.0",
         "datasets>=3.0.0",
         "torch>=2.3.0",
         "torchvision>=0.18.0",
-        "transformers[torch]<=5.5.0",
         "trl>=0.9.0",
-        "peft>=0.18.0",
         "pillow>=10.0.0",
     )
 )
@@ -53,9 +57,9 @@ ANNOTATIONS_PATH = "/data/synthetic_invoices/annotations.jsonl"
 LORA_RANK = 16
 MAX_SEQ_LENGTH = 2048
 EPOCHS = 3
-BATCH_SIZE = 4
-GRAD_ACCUM = 4
-LEARNING_RATE = 2e-4
 SYSTEM_PROMPT = (
     "You are an invoice extraction assistant. "
@@ -91,11 +95,7 @@ JSON_SCHEMA = """{
     memory=8192,
 )
 def push_dataset():
-    """Build a HF dataset from the Modal volume and push to Hub.
-    Uses flat image column (one Image() per row, not a nested list) so Arrow
-    serialisation never encounters mixed list/non-list types.
-    """
     from PIL import Image as PILImage
     from datasets import Dataset, Features, Value
     from datasets import Image as HFImage
@@ -135,7 +135,7 @@ def push_dataset():
 @app.function(
     image=IMAGE,
     gpu="A10G",
-    timeout=14400,  # 4 hours
     secrets=[HF_SECRET],
     volumes={
         "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
@@ -143,125 +143,287 @@ def push_dataset():
     memory=32768,
 )
 def train():
     from datasets import load_dataset
     from huggingface_hub import HfApi
-    from unsloth import FastVisionModel
-    from unsloth.trainer import UnslothVisionDataCollator
-    from trl import SFTTrainer, SFTConfig
-    # Load the pre-built dataset from HF Hub.
-    # image column is decoded to PIL on access; response is a plain string.
     hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
     print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")
-    instruction = (
-        f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
     )
-    def to_conversation(sample):
-        # All content values are lists so Arrow infers a single consistent struct
-        # type (struct<type, text?>) with a nullable text field — no mixed
-        # list/non-list values at any nesting level.
         return {
-            "messages": [
-                {
-                    "role": "system",
-                    "content": [{"type": "text", "text": SYSTEM_PROMPT}],
-                },
-                {
-                    "role": "user",
-                    "content": [
-                        {"type": "image"},           # placeholder matched by collator
-                        {"type": "text", "text": instruction},
-                    ],
-                },
-                {
-                    "role": "assistant",
-                    "content": [{"type": "text", "text": sample["response"]}],
-                },
-            ],
-            # Single PIL Image per row (flat, not wrapped in a list).
-            # Arrow stores this as Image() — no nested-list serialisation issue.
-            "images": sample["image"],
         }
-    train_dataset = hf_ds["train"].map(
-        to_conversation, remove_columns=hf_ds["train"].column_names
-    )
-    eval_dataset = hf_ds["test"].map(
-        to_conversation, remove_columns=hf_ds["test"].column_names
-    )
-    # --- Load model with Unsloth ---
-    model, tokenizer = FastVisionModel.from_pretrained(
-        BASE_MODEL,
-        load_in_4bit=True,
-        use_gradient_checkpointing="unsloth",
-    )
-    model = FastVisionModel.get_peft_model(
-        model,
-        finetune_vision_layers=True,
-        finetune_language_layers=True,
-        finetune_attention_modules=True,
-        finetune_mlp_modules=True,
         r=LORA_RANK,
-        lora_alpha=LORA_RANK,
-        lora_dropout=0,
         bias="none",
-        random_state=42,
     )
-    # UnslothVisionDataCollator handles apply_chat_template + image injection
-    # at batch time; remove_unused_columns=False keeps the images column.
-    trainer = SFTTrainer(
         model=model,
-        tokenizer=tokenizer,
-        data_collator=UnslothVisionDataCollator(model, tokenizer),
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        args=SFTConfig(
-            output_dir="/output/minicpm-v-sft",
-            per_device_train_batch_size=BATCH_SIZE,
-            gradient_accumulation_steps=GRAD_ACCUM,
-            warmup_steps=10,
-            num_train_epochs=EPOCHS,
-            learning_rate=LEARNING_RATE,
-            fp16=False,
-            bf16=True,
-            logging_steps=10,
-            eval_strategy="epoch",
-            save_strategy="epoch",
-            load_best_model_at_end=True,
-            report_to="none",
-            max_seq_length=MAX_SEQ_LENGTH,
-            remove_unused_columns=False,
-        ),
     )
     trainer.train()
     print("Training complete")
-    # --- Save merged model ---
-    model.save_pretrained_merged(
-        "/output/minicpm-v-merged", tokenizer, save_method="merged_16bit"
-    )
-    # --- Export to GGUF Q4_K_M ---
-    model.save_pretrained_gguf(
-        "/output/minicpm-v-gguf",
-        tokenizer,
-        quantization_method="q4_k_m",
-    )
-    print("GGUF export complete")
-    # --- Publish model to HF Hub ---
     api = HfApi(token=os.environ["HF_TOKEN"])
     api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
-    gguf_dir = Path("/output/minicpm-v-gguf")
-    for f in gguf_dir.glob("*.gguf"):
-        print(f"Uploading {f.name}...")
-        api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)
     model_card = f"""---
 license: apache-2.0
@@ -271,13 +433,13 @@ datasets:
 tags:
   - invoice-extraction
   - indian-fmcg
-  - gguf
   - minicpm-v
   - ocr
   - qlora
 ---
-# MiniCPM-V 4.6 — Indian Invoice Extraction
 Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
 from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
@@ -286,18 +448,19 @@ handwritten bills).
 ## Usage
 ```python
-from llama_cpp import Llama
-from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler
-handler = MiniCPMv26ChatHandler(clip_model_path="mmproj.gguf")
-llm = Llama(model_path="model.gguf", chat_handler=handler, n_ctx=4096)
 ```
 ## Training
 - Base model: {BASE_MODEL}
-- Method: QLoRA (rank {LORA_RANK}) via Unsloth on Modal A10G
-- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — 500 synthetic Indian invoices (4 formats × 10 suppliers)
 - Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
 """
     api.upload_file(
@@ -308,6 +471,78 @@ llm = Llama(model_path="model.gguf", chat_handler=handler, n_ctx=4096)
     print(f"Published to {HF_REPO}")
 # ─── Local entrypoints ─────────────────────────────────────────────────────────
 @app.local_entrypoint()
@@ -318,3 +553,8 @@ def main():
 @app.local_entrypoint()
 def main_push():
     push_dataset.remote()

 Publishes:
     build-small-hackathon/kirana-invoice-train-data             (HF dataset, reusable)
+    build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction  (model adapter + GGUF)
 Training approach:
+    QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
+    (unsloth is incompatible with MiniCPM-V-4.6 which requires transformers>=5.7.0)
     System prompt: "Extract invoice JSON"
     User turn: <image> + "Extract all invoice fields as JSON"
     Assistant turn: <annotation JSON>
 IMAGE = (
     modal.Image.debian_slim(python_version="3.11")
+    .apt_install("libsm6", "libxext6", "git")
     .pip_install(
         "huggingface_hub>=0.30.0",
         "datasets>=3.0.0",
         "torch>=2.3.0",
         "torchvision>=0.18.0",
+        "transformers[torch]>=5.7.0",
+        "peft>=0.14.0",
+        "bitsandbytes>=0.43.0",
+        "accelerate>=0.28.0",
         "trl>=0.9.0",
         "pillow>=10.0.0",
+        "sentencepiece>=0.2.0",
+        "timm>=0.9.0",
     )
 )
 LORA_RANK = 16
 MAX_SEQ_LENGTH = 2048
 EPOCHS = 3
+BATCH_SIZE = 1
+GRAD_ACCUM = 16
+LEARNING_RATE = 1e-4
 SYSTEM_PROMPT = (
     "You are an invoice extraction assistant. "
     memory=8192,
 )
 def push_dataset():
+    """Build a HF dataset from the Modal volume and push to Hub."""
     from PIL import Image as PILImage
     from datasets import Dataset, Features, Value
     from datasets import Image as HFImage
 @app.function(
     image=IMAGE,
     gpu="A10G",
+    timeout=14400,
     secrets=[HF_SECRET],
     volumes={
         "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
     memory=32768,
 )
 def train():
+    import torch
     from datasets import load_dataset
     from huggingface_hub import HfApi
+    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
+    from torch.utils.data import Dataset as TorchDataset
+    from transformers import (
+        AutoModelForMultimodalLM,
+        AutoTokenizer,
+        BitsAndBytesConfig,
+        Trainer,
+        TrainingArguments,
+    )
+    # ── Load dataset ──────────────────────────────────────────────────────────
     hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
     print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")
+    # ── Load model with 4-bit QLoRA ───────────────────────────────────────────
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True,
+    )
+    raw_model = AutoModelForMultimodalLM.from_pretrained(
+        BASE_MODEL,
+        quantization_config=bnb_config,
+        trust_remote_code=True,
+        dtype=torch.bfloat16,
+        device_map="auto",
     )
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    print(f"Loaded model class: {raw_model.__class__.__name__}")
+    # ── Discover image preprocessing API ─────────────────────────────────────
+    # Try AutoProcessor first (modern HuggingFace VLM interface)
+    processor = None
+    try:
+        from transformers import AutoProcessor
+        processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
+        print(f"Loaded processor: {processor.__class__.__name__}")
+    except Exception as e:
+        print(f"AutoProcessor not available: {e}")
+    # Find image placeholder token (MiniCPM-V uses (<image>./</image>) or <image>)
+    image_token = None
+    vocab = tokenizer.get_vocab()
+    for candidate in ["(<image>./</image>)", "<image>", "<IMAGE>", "[IMAGE]"]:
+        if candidate in vocab:
+            image_token = candidate
+            break
+    # Fallback: scan tokenizer's special/added tokens
+    if image_token is None:
+        for token in list(tokenizer.special_tokens_map.values()) + list(tokenizer.added_tokens_encoder.keys()):
+            if isinstance(token, str) and "image" in token.lower():
+                image_token = token
+                break
+    print(f"Image placeholder token: {image_token!r}")
+    print(f"Special tokens: {tokenizer.special_tokens_map}")
+    # ── Pre-process ALL data before PEFT wrapping ─────────────────────────────
+    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
+    def preprocess_one(image, response, debug=False):
+        """Turn one (image, response) pair into training tensors.
+
+        The prompt (system turn + user turn with one image) is rendered and
+        encoded by ``processor``; the response is encoded by ``tokenizer`` with
+        EOS appended. Prompt positions are masked with -100 so that only the
+        response tokens are supervised.
+
+        Args:
+            image: image object supporting ``.convert("RGB")``.
+            response: target annotation text for the assistant turn.
+            debug: when True, print processor output and vision-field shapes.
+
+        Returns:
+            dict with ``input_ids``, ``attention_mask`` and ``labels`` (each cut
+            to MAX_SEQ_LENGTH) plus every other field the processor returned.
+
+        Raises:
+            ValueError: if the prompt alone is MAX_SEQ_LENGTH tokens or longer.
+        """
+        image = image.convert("RGB")
+        # MUST use processor.apply_chat_template with {"type": "image"} —
+        # NOT tokenizer.apply_chat_template with a "<image>" string.
+        # Only the processor knows to expand {"type":"image"} into the correct
+        # number of <|image_pad|> tokens; the tokenizer leaves a bare <image>
+        # placeholder and the model then finds tokens:0, features:N mismatch.
+        msgs = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": [
+                {"type": "image"},
+                {"type": "text", "text": instruction},
+            ]},
+        ]
+        text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
+        # max_slice_nums=6: balance context vs OOM for portrait invoices
+        proc_out = processor(
+            text=text,
+            images=[image],
+            return_tensors="pt",
+            max_slice_nums=6,
+        )
+        if debug:
+            print("=== PROCESSOR OUTPUT SHAPES (first sample) ===")
+            for k, v in proc_out.items():
+                if isinstance(v, torch.Tensor):
+                    print(f"  {k}: shape={list(v.shape)}, dtype={v.dtype}")
+                elif isinstance(v, list):
+                    item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:3]]
+                    print(f"  {k}: list[{len(v)}] = {item_info}")
+                else:
+                    print(f"  {k}: {type(v).__name__} = {v!r}")
+        prompt_ids = proc_out["input_ids"][0]
+        # Refuse samples whose prompt already fills the context window. The
+        # slice to MAX_SEQ_LENGTH below would otherwise cut image placeholder
+        # tokens out of the prompt and leave every label at -100, i.e. a sample
+        # with nothing to learn from. Raising lets the caller skip the sample.
+        if prompt_ids.size(0) >= MAX_SEQ_LENGTH:
+            raise ValueError(
+                f"prompt is {prompt_ids.size(0)} tokens; no room for the response "
+                f"within MAX_SEQ_LENGTH={MAX_SEQ_LENGTH}"
+            )
+        # pixel_values: processor returns (1, 3, 14, W) WITH batch dim.
+        # target_sizes: returned as (N_tiles, 2) with NO batch dim.
+        # Strip batch dim only where it exists (shape[0]==1).
+        vision_fields = {}
+        for k, v in proc_out.items():
+            if k in ("input_ids", "attention_mask"):
+                continue
+            if isinstance(v, torch.Tensor):
+                if k == "pixel_values":
+                    vision_fields[k] = v[0]  # (1,3,14,W) → (3,14,W); collator stacks to (B,3,14,W)
+                elif v.shape[0] == 1:
+                    vision_fields[k] = v[0]  # strip batch-1 wrapper from metadata scalars
+                else:
+                    vision_fields[k] = v     # e.g. target_sizes (N_tiles,2) — no batch dim
+            elif isinstance(v, list) and len(v) == 1:
+                vision_fields[k] = v[0]
+            else:
+                vision_fields[k] = v
+        if debug:
+            print("=== VISION FIELDS AFTER PROCESSING ===")
+            for k, v in vision_fields.items():
+                if isinstance(v, torch.Tensor):
+                    print(f"  {k}: shape={list(v.shape)}")
+                else:
+                    print(f"  {k}: {type(v).__name__} = {v!r}")
+        response_ids = torch.tensor(
+            tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False),
+            dtype=torch.long,
+        )
+        full_ids = torch.cat([prompt_ids, response_ids])
+        # Mask the prompt so the loss covers response tokens only.
+        labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])
+        # Only the tail of the response can be cut here; the prompt is intact.
+        full_ids = full_ids[:MAX_SEQ_LENGTH]
+        labels = labels[:MAX_SEQ_LENGTH]
         return {
+            "input_ids": full_ids,
+            "attention_mask": torch.ones_like(full_ids),
+            "labels": labels,
+            **vision_fields,   # pixel_values, image_sizes, etc.
         }
+    print("Pre-processing training data...")
+    train_data, eval_data = [], []
+    for i, s in enumerate(hf_ds["train"]):
+        try:
+            train_data.append(preprocess_one(s["image"], s["response"], debug=(i == 0)))
+        except Exception as e:
+            print(f"  Skipping train[{i}]: {e}")
+    for i, s in enumerate(hf_ds["test"]):
+        try:
+            eval_data.append(preprocess_one(s["image"], s["response"]))
+        except Exception as e:
+            print(f"  Skipping eval[{i}]: {e}")
+    print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")
+    # ── PEFT wrapping ─────────────────────────────────────────────────────────
+    model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)
+    # task_type=None → base PeftModel; avoids requiring prepare_inputs_for_generation
+    lora_config = LoraConfig(
         r=LORA_RANK,
+        lora_alpha=LORA_RANK * 2,
+        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+        lora_dropout=0.05,
         bias="none",
+        task_type=None,
+    )
+    model = get_peft_model(model, lora_config)
+    model.print_trainable_parameters()
+    # ── Dataset (just wraps pre-processed list) ───────────────────────────────
+    class PreprocessedDataset(TorchDataset):
+        """Map-style dataset over a list of already pre-processed samples."""
+        def __init__(self, data):
+            # data: the list of samples; items are returned unchanged.
+            self.data = data
+        def __len__(self):
+            return len(self.data)
+        def __getitem__(self, idx):
+            return self.data[idx]
+    train_dataset = PreprocessedDataset(train_data)
+    eval_dataset = PreprocessedDataset(eval_data)
+    # ── Collator: pad to batch max length ─────────────────────────────────────
+    pad_id = tokenizer.pad_token_id or 0
+    import torch.nn.functional as F
+    def collate_fn(batch):
+        """Collate pre-processed samples into one batch dict.
+
+        The three token-level fields are right-padded to the longest
+        ``input_ids`` in the batch and stacked. Every other field of the
+        first sample is carried over: ``target_sizes`` is concatenated along
+        dim 0, anything else is stacked, falling back to a plain list when
+        stacking fails.
+        """
+        longest = max(sample["input_ids"].size(0) for sample in batch)
+        # Fill value per token-level field; dict order fixes the output key order.
+        fill_values = {"input_ids": pad_id, "attention_mask": 0, "labels": -100}
+        collated = {
+            field: torch.stack([
+                F.pad(sample[field], (0, longest - sample["input_ids"].size(0)), value=fill)
+                for sample in batch
+            ])
+            for field, fill in fill_values.items()
+        }
+        # Pass through every vision field.
+        # pixel_values (3,14,W) per sample → stack → (B,3,14,W)  [4D for conv2d]
+        # target_sizes (N_tiles,2) per sample → cat → (total_tiles,2) [no extra batch dim]
+        for field in batch[0]:
+            if field in collated:
+                continue
+            per_sample = [sample[field] for sample in batch]
+            if field == "target_sizes":
+                collated[field] = torch.cat(per_sample, dim=0)  # (total_tiles, 2)
+                continue
+            try:
+                collated[field] = torch.stack(per_sample)
+            except (RuntimeError, TypeError):
+                # Not stackable (ragged shapes or non-tensors): keep the list.
+                collated[field] = per_sample
+        return collated
+    # ── Debug Trainer: print input shapes on first batch ─────────────────────
+    _debug_step_done = [False]
+    class DebugTrainer(Trainer):
+        """Trainer that prints the first batch's input shapes, then behaves as Trainer."""
+        def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
+            """Print a one-time description of ``inputs``, then delegate to ``Trainer.compute_loss``."""
+            # _debug_step_done is a one-element list from the enclosing scope,
+            # used as a mutable "already printed" flag.
+            if not _debug_step_done[0]:
+                _debug_step_done[0] = True
+                print("=== MODEL INPUT SHAPES (first batch) ===")
+                for k, v in inputs.items():
+                    if isinstance(v, torch.Tensor):
+                        print(f"  {k}: shape={list(v.shape)}, dtype={v.dtype}")
+                    elif isinstance(v, list):
+                        # Only the first two list items are described.
+                        item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:2]]
+                        print(f"  {k}: list[{len(v)}] = {item_info}")
+                    else:
+                        print(f"  {k}: {type(v).__name__} = {v!r}")
+            return super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch, **kwargs)
+    # ── Training ──────────────────────────────────────────────────────────────
+    training_args = TrainingArguments(
+        output_dir="/output/minicpm-v-sft",
+        per_device_train_batch_size=BATCH_SIZE,
+        per_device_eval_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        warmup_steps=10,
+        num_train_epochs=EPOCHS,
+        learning_rate=LEARNING_RATE,
+        bf16=True,
+        fp16=False,
+        logging_steps=10,
+        eval_strategy="epoch",
+        save_strategy="epoch",
+        load_best_model_at_end=True,
+        report_to="none",
+        remove_unused_columns=False,
+        dataloader_num_workers=0,  # data already pre-processed; no workers needed
     )
+    trainer = DebugTrainer(
         model=model,
+        args=training_args,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
+        data_collator=collate_fn,
     )
     trainer.train()
     print("Training complete")
+    # ── Save LoRA adapter ─────────────────────────────────────────────────────
+    model.save_pretrained("/output/minicpm-v-lora")
+    tokenizer.save_pretrained("/output/minicpm-v-lora")
+    print("LoRA adapter saved to /output/minicpm-v-lora")
+    # ── Publish adapter to HF Hub ─────────────────────────────────────────────
     api = HfApi(token=os.environ["HF_TOKEN"])
     api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
+    adapter_dir = Path("/output/minicpm-v-lora")
+    for f in adapter_dir.iterdir():
+        if f.is_file():
+            print(f"Uploading {f.name}...")
+            api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)
     model_card = f"""---
 license: apache-2.0
 tags:
   - invoice-extraction
   - indian-fmcg
   - minicpm-v
   - ocr
   - qlora
+  - peft
 ---
+# MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)
 Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
 from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
 ## Usage
 ```python
+from transformers import AutoModelForMultimodalLM, AutoTokenizer
+from peft import PeftModel
+base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
+model = PeftModel.from_pretrained(base, "{HF_REPO}")
+tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
 ```
 ## Training
 - Base model: {BASE_MODEL}
+- Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
+- Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — 500 synthetic Indian invoices
 - Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
 """
     api.upload_file(
     print(f"Published to {HF_REPO}")
+# ─── Dry-run: verify preprocessing + first batch shapes without training ──────
+# Usage: modal run finetune/train_minicpm_v.py::main_dryrun
+# Completes in ~2 min; confirms shapes are correct before a full training run.
+@app.function(
+    image=IMAGE,
+    gpu="A10G",
+    timeout=600,
+    secrets=[HF_SECRET],
+    memory=32768,
+)
+def dryrun():
+    """Run one labelled forward pass on the first training sample, without training.
+
+    Loads the dataset, base model, tokenizer and processor, builds the same
+    prompt/response tensors as the training pre-processing, prints the
+    processor output shapes and the ``<|image_pad|>`` count in ``input_ids``,
+    then checks that the forward pass returns a loss.
+
+    Raises:
+        RuntimeError: if the model output carries no loss.
+    """
+    import torch
+    from datasets import load_dataset
+    from transformers import AutoModelForMultimodalLM, AutoTokenizer, AutoProcessor
+    dataset = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
+    net = AutoModelForMultimodalLM.from_pretrained(BASE_MODEL, trust_remote_code=True, device_map="auto")
+    tok = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    proc = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    # ── Build the prompt for the first training sample ────────────────────────
+    first = dataset["train"][0]
+    rgb = first["image"].convert("RGB")
+    instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
+    user_content = [{"type": "image"}, {"type": "text", "text": instruction}]
+    chat = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_content},
+    ]
+    prompt_text = proc.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
+    encoded = proc(text=prompt_text, images=[rgb], return_tensors="pt", max_slice_nums=6)
+    # ── Report tensor shapes; count image placeholders in input_ids ───────────
+    print("=== DRY-RUN: processor output ===")
+    image_pad_id = tok.convert_tokens_to_ids("<|image_pad|>")
+    for key, value in encoded.items():
+        if not isinstance(value, torch.Tensor):
+            continue
+        if key == "input_ids":
+            n_pad = (value == image_pad_id).sum().item()
+            pad_info = f"  (<|image_pad|> count={n_pad})"
+        else:
+            pad_info = ""
+        print(f"  {key}: shape={list(value.shape)}{pad_info}")
+    # ── Append the response and mask the prompt out of the labels ─────────────
+    target_ids = torch.tensor(
+        tok.encode(first["response"] + tok.eos_token, add_special_tokens=False),
+        dtype=torch.long,
+    )
+    prompt_ids = encoded["input_ids"][0]
+    sequence = torch.cat([prompt_ids, target_ids])[:MAX_SEQ_LENGTH]
+    targets = torch.cat([torch.full_like(prompt_ids, -100), target_ids])[:MAX_SEQ_LENGTH]
+    batch = {
+        "input_ids": sequence.unsqueeze(0),
+        "attention_mask": torch.ones_like(sequence).unsqueeze(0),
+        "labels": targets.unsqueeze(0),
+    }
+    # Vision fields go to the model exactly as the processor produced them.
+    batch.update(
+        {key: value for key, value in encoded.items() if key not in ("input_ids", "attention_mask")}
+    )
+    # ── Forward pass on the model's device ────────────────────────────────────
+    device = next(net.parameters()).device
+    batch = {
+        key: (value.to(device) if isinstance(value, torch.Tensor) else value)
+        for key, value in batch.items()
+    }
+    net.eval()
+    with torch.no_grad():
+        outputs = net(**batch)
+    loss = getattr(outputs, "loss", None)
+    if loss is None:
+        raise RuntimeError(
+            f"Dry-run forward did not return loss. Output keys: {list(outputs.keys())}"
+        )
+    print(f"Dry-run forward loss: {loss.detach().float().item():.4f}")
+    print("Dry-run complete - shapes and labeled forward pass look correct")
 # ─── Local entrypoints ─────────────────────────────────────────────────────────
 @app.local_entrypoint()
 @app.local_entrypoint()
 def main_push():
     push_dataset.remote()
+@app.local_entrypoint()
+def main_dryrun():
+    """Local entrypoint: invoke ``dryrun`` remotely on Modal."""
+    dryrun.remote()

finetune/train_yolo26n.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Task 0.1 — Fine-tune YOLO26n on Indian Grocery Object Detection dataset.
 Run on Modal (A10G GPU, ~1-2 hours):
     modal run finetune/train_yolo26n.py
@@ -10,6 +10,11 @@ Publishes:
         class_names.json
         model card
 Prerequisites:
     ROBOFLOW_API_KEY in env (for dataset download)
     HF_TOKEN in env (for HF Hub publish)
@@ -25,8 +30,6 @@ app = modal.App("kirana-yolo26n-finetune")
 IMAGE = (
     modal.Image.debian_slim(python_version="3.11")
-    # libGL.so.1 is required by opencv-python (pulled in by ultralytics)
-    # libglib2.0-0 is required by libGL on Debian slim
     .apt_install("libgl1-mesa-glx", "libglib2.0-0")
     .pip_install(
         "ultralytics>=8.4.0",
@@ -34,33 +37,174 @@ IMAGE = (
         "huggingface_hub>=0.30.0",
         "onnx>=1.16.0",
         "onnxruntime>=1.18.0",
     )
 )
 ROBOFLOW_API_KEY = modal.Secret.from_name("roboflow-secret")
 HF_SECRET = modal.Secret.from_name("hf-secret")
-# Roboflow dataset: Indian Grocery Object Detection (AgentSK47)
-# https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx
-ROBOFLOW_WORKSPACE = "agentsk47"
-ROBOFLOW_PROJECT = "indian-grocery-object-detection-mfsnx"
-ROBOFLOW_VERSION = 1
-HF_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
-# YOLO model name — update if Ultralytics renames the YOLO26 nano checkpoint
 YOLO_BASE_MODEL = "yolo26n.pt"
-YOLO_FALLBACK = "yolo11n.pt"  # use if yolo26n is not yet released
-EPOCHS = 100
 IMG_SIZE = 640
-BATCH = 16
 @app.function(
     image=IMAGE,
     gpu="A10G",
-    timeout=7200,
     secrets=[ROBOFLOW_API_KEY, HF_SECRET],
     volumes={"/output": modal.Volume.from_name("kirana-yolo-output", create_if_missing=True)},
 )
@@ -72,36 +216,86 @@ def train():
     from ultralytics import YOLO
     from huggingface_hub import HfApi
-    # --- Download dataset from Roboflow ---
-    rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
-    project = rf.workspace(ROBOFLOW_WORKSPACE).project(ROBOFLOW_PROJECT)
-    dataset = project.version(ROBOFLOW_VERSION).download("yolov8", location="/data/indian-grocery")
-    data_yaml = "/data/indian-grocery/data.yaml"
-    print(f"Dataset downloaded to {dataset.location}")
-    # --- Load model ---
-    try:
-        model = YOLO(YOLO_BASE_MODEL)
-        print(f"Loaded base model: {YOLO_BASE_MODEL}")
-    except Exception:
-        print(f"YOLO26n not found, falling back to {YOLO_FALLBACK}")
-        model = YOLO(YOLO_FALLBACK)
-    # --- Train ---
-    results = model.train(
-        data=data_yaml,
-        epochs=EPOCHS,
-        imgsz=IMG_SIZE,
-        batch=BATCH,
-        project="/output/runs",
-        name="yolo26n_fmcg",
-        exist_ok=True,
-        device=0,
-        patience=20,
-        save=True,
-        plots=True,
-    )
     print(f"Training complete. Best mAP50: {results.results_dict.get('metrics/mAP50(B)', 'N/A')}")
     best_pt = Path("/output/runs/yolo26n_fmcg/weights/best.pt")
@@ -112,61 +306,115 @@ def train():
     shutil.copy(str(onnx_path), "/output/yolo26n_fmcg.onnx")
     print(f"Exported ONNX to {onnx_path}")
-    # --- Save class names ---
     import yaml
     with open(data_yaml) as f:
         data_cfg = yaml.safe_load(f)
-    class_names = data_cfg.get("names", {})
     if isinstance(class_names, dict):
         class_names = [class_names[i] for i in sorted(class_names.keys())]
     with open("/output/class_names.json", "w") as f:
-        json.dump(class_names, f, indent=2)
-    print(f"Saved {len(class_names)} class names")
     # --- Publish to HF Hub ---
     api = HfApi(token=os.environ["HF_TOKEN"])
     api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
-    api.upload_file(path_or_fileobj="/output/yolo26n_fmcg.onnx", path_in_repo="yolo26n_fmcg.onnx", repo_id=HF_REPO)
-    api.upload_file(path_or_fileobj="/output/class_names.json", path_in_repo="class_names.json", repo_id=HF_REPO)
-    # Upload model card
     model_card = f"""---
 license: apache-2.0
 tags:
   - object-detection
   - yolo
   - indian-fmcg
   - onnx
   - ultralytics
 ---
-# YOLO26n Indian FMCG Detection
-Fine-tuned YOLO26n on the [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) dataset from Roboflow.
 ## Classes ({len(class_names)} total)
-{chr(10).join(f"- {name}" for name in class_names[:30])}
-{"..." if len(class_names) > 30 else ""}
-## Usage
 ```python
-import onnxruntime as ort
 import json
 session = ort.InferenceSession("yolo26n_fmcg.onnx", providers=["CPUExecutionProvider"])
 class_names = json.load(open("class_names.json"))
 ```
-## Training
-- Base model: YOLO26n (Ultralytics)
-- Dataset: Indian Grocery Object Detection (Roboflow, {ROBOFLOW_VERSION} version)
-- Epochs: {EPOCHS}, img_size: {IMG_SIZE}x{IMG_SIZE}
-- Platform: Modal A10G GPU
-- Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
 """
     api.upload_file(
         path_or_fileobj=model_card.encode(),

 """
+Task 0.1 — Fine-tune YOLO26n on merged Indian grocery datasets.
 Run on Modal (A10G GPU, ~1-2 hours):
     modal run finetune/train_yolo26n.py
         class_names.json
         model card
+Datasets merged (all downloaded as yolov8 format, NOT openai format):
+    1. agentsk47/indian-grocery-object-detection-mfsnx  v1   (~10 classes, small)
+    2. iit-patna-qg1jh/grocery_items-7i2em              v45  (20 classes, 6,695 images)
+    3. project-c5ho0/indian-market-qieug                v2   (2 classes,  4,694 images)
 Prerequisites:
     ROBOFLOW_API_KEY in env (for dataset download)
     HF_TOKEN in env (for HF Hub publish)
 IMAGE = (
     modal.Image.debian_slim(python_version="3.11")
     .apt_install("libgl1-mesa-glx", "libglib2.0-0")
     .pip_install(
         "ultralytics>=8.4.0",
         "huggingface_hub>=0.30.0",
         "onnx>=1.16.0",
         "onnxruntime>=1.18.0",
+        "pyyaml>=6.0",
     )
 )
 ROBOFLOW_API_KEY = modal.Secret.from_name("roboflow-secret")
 HF_SECRET = modal.Secret.from_name("hf-secret")
+# All three Roboflow datasets — downloaded as "yolov8" format (NOT "openai")
+DATASETS = [
+    {
+        "workspace": "agentsk47",
+        "project":   "indian-grocery-object-detection-mfsnx",
+        "version":   1,
+        "location":  "/data/ds_agentsk47",
+    },
+    {
+        "workspace": "iit-patna-qg1jh",
+        "project":   "grocery_items-7i2em",
+        "version":   45,
+        "location":  "/data/ds_iitpatna",
+    },
+    {
+        "workspace": "project-c5ho0",
+        "project":   "indian-market-qieug",
+        "version":   2,
+        "location":  "/data/ds_indianmarket",
+    },
+]
+MERGED_DIR  = "/output/merged_dataset"   # persisted on volume → skip re-download on resume
+HF_REPO     = "naazimsnh02/yolo26n-indian-fmcg-detection"
 YOLO_BASE_MODEL = "yolo26n.pt"
+YOLO_FALLBACK   = "yolo11n.pt"
+EPOCHS   = 100
 IMG_SIZE = 640
+BATCH    = 16
+# ── Dataset merge helpers ──────────────────────────────────────────────────────
+def _read_class_names(data_yaml_path: str) -> list[str]:
+    """Return the ``names`` entry of a YOLO ``data.yaml`` as a list.
+
+    A mapping-style ``names`` (``{id: name}``) is flattened in ascending key
+    order. An empty file, a missing ``names`` key or ``names: null`` all
+    yield an empty list instead of failing or returning ``None``.
+
+    Args:
+        data_yaml_path: path to the dataset's ``data.yaml``.
+
+    Returns:
+        A new list of class names.
+    """
+    import yaml
+    # Read as UTF-8 explicitly so non-ASCII class names do not depend on the locale.
+    with open(data_yaml_path, encoding="utf-8") as f:
+        cfg = yaml.safe_load(f) or {}   # safe_load returns None for an empty file
+    names = cfg.get("names") or []
+    if isinstance(names, dict):
+        names = [names[i] for i in sorted(names)]
+    return list(names)
+def _remap_label_file(src: str, dst: str, id_map: dict[int, int]) -> None:
+    """Write a copy of YOLO label file ``src`` to ``dst`` with class IDs translated.
+
+    The first field of each non-blank line is the class ID and is looked up
+    in ``id_map``; an ID missing from the map is written unchanged. All other
+    fields are kept, re-joined with single spaces. Blank lines are dropped and
+    ``dst``'s parent directories are created when needed.
+    """
+    from pathlib import Path
+    target = Path(dst)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    remapped = []
+    with open(src) as handle:
+        for raw in handle:
+            fields = raw.split()
+            if not fields:
+                continue  # blank / whitespace-only line
+            class_id = int(fields[0])
+            mapped_id = id_map.get(class_id, class_id)
+            remapped.append(f"{mapped_id} {' '.join(fields[1:])}")
+    # Lines are newline-separated with no trailing newline.
+    with open(target, "w") as handle:
+        handle.write("\n".join(remapped))
+def merge_yolo_datasets(dataset_locations: list[str], output_dir: str) -> str:
+    """
+    Merge N YOLOv8 datasets into one directory with unified class IDs.
+    Returns the path to the merged data.yaml.
+
+    Class names are de-duplicated across datasets in first-seen order; each
+    dataset's label files are rewritten so their class IDs index that unified
+    list. Files are copied as ``ds<idx>_<original name>`` so equal file names
+    from different datasets cannot overwrite each other.
+
+    Args:
+        dataset_locations: root directory of each downloaded dataset.
+        output_dir: directory receiving ``train``/``valid``/``test`` and ``data.yaml``.
+    """
+    import shutil
+    import yaml
+    from pathlib import Path
+    # 1. Build unified class list (insertion-order dedup across all datasets).
+    #    A name → unified-id dict gives O(1) membership tests and lookups.
+    class_index: dict[str, int] = {}
+    per_ds_classes: list[list[str]] = []
+    for loc in dataset_locations:
+        yaml_path = Path(loc) / "data.yaml"
+        if not yaml_path.exists():
+            # Try deeper (Roboflow sometimes nests); keep the default path when
+            # nothing is found so the read below reports the missing file.
+            yaml_path = next(Path(loc).rglob("data.yaml"), yaml_path)
+        names = _read_class_names(str(yaml_path))
+        per_ds_classes.append(names)
+        for name in names:
+            class_index.setdefault(name, len(class_index))
+    unified_classes: list[str] = list(class_index)
+    print(f"Unified class list ({len(unified_classes)} classes): {unified_classes}")
+    # 2. Build per-dataset old_id → new_id maps
+    id_maps: list[dict[int, int]] = [
+        {old_id: class_index[name] for old_id, name in enumerate(names)}
+        for names in per_ds_classes
+    ]
+    # 3. Copy images + remapped labels for each split
+    splits = ["train", "valid", "test"]
+    out_root = Path(output_dir)
+    for ds_idx, loc in enumerate(dataset_locations):
+        ds_root = Path(loc)
+        # Roboflow may nest under a subdirectory matching the project name
+        if not (ds_root / "train").exists():
+            subdirs = [d for d in ds_root.iterdir() if d.is_dir() and (d / "train").exists()]
+            if subdirs:
+                ds_root = subdirs[0]
+        id_map = id_maps[ds_idx]
+        ds_tag = f"ds{ds_idx}"
+        for split in splits:
+            img_src = ds_root / split / "images"
+            lbl_src = ds_root / split / "labels"
+            if not img_src.exists():
+                continue  # this dataset has no such split
+            img_dst = out_root / split / "images"
+            lbl_dst = out_root / split / "labels"
+            img_dst.mkdir(parents=True, exist_ok=True)
+            lbl_dst.mkdir(parents=True, exist_ok=True)
+            for img_file in img_src.iterdir():
+                # Prefix filename with dataset tag to avoid collisions
+                shutil.copy(str(img_file), str(img_dst / f"{ds_tag}_{img_file.name}"))
+                stem = img_file.stem
+                lbl_file = lbl_src / f"{stem}.txt"
+                # An image without a label file is copied without labels.
+                if lbl_file.exists():
+                    _remap_label_file(
+                        str(lbl_file),
+                        str(lbl_dst / f"{ds_tag}_{stem}.txt"),
+                        id_map,
+                    )
+    # 4. Write merged data.yaml
+    merged_yaml = out_root / "data.yaml"
+    cfg = {
+        "path":  str(out_root),
+        "train": "train/images",
+        "val":   "valid/images",
+        "test":  "test/images",
+        "nc":    len(unified_classes),
+        "names": unified_classes,
+    }
+    # allow_unicode=True emits non-ASCII names verbatim, so the file must be
+    # written as UTF-8 rather than in the locale's default encoding.
+    with open(merged_yaml, "w", encoding="utf-8") as f:
+        yaml.dump(cfg, f, allow_unicode=True, default_flow_style=False)
+    # Count merged images
+    for split in splits:
+        split_images = out_root / split / "images"
+        n = sum(1 for _ in split_images.glob("*")) if split_images.exists() else 0
+        print(f"  {split}: {n} images")
+    return str(merged_yaml)
+# ── Modal function ─────────────────────────────────────────────────────────────
 @app.function(
     image=IMAGE,
     gpu="A10G",
+    timeout=28800,
     secrets=[ROBOFLOW_API_KEY, HF_SECRET],
     volumes={"/output": modal.Volume.from_name("kirana-yolo-output", create_if_missing=True)},
 )
     from ultralytics import YOLO
     from huggingface_hub import HfApi
+    last_pt   = Path("/output/runs/yolo26n_fmcg/weights/last.pt")
+    merged_yaml = Path(MERGED_DIR) / "data.yaml"
+    # --- Dataset: skip download+merge if already cached on the volume ---
+    if merged_yaml.exists():
+        print(f"Merged dataset found at {merged_yaml}, skipping download.")
+        data_yaml = str(merged_yaml)
+    else:
+        rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
+        locations = []
+        for ds in DATASETS:
+            print(f"Downloading {ds['workspace']}/{ds['project']} v{ds['version']}...")
+            project = rf.workspace(ds["workspace"]).project(ds["project"])
+            result  = project.version(ds["version"]).download("yolov8", location=ds["location"])
+            locations.append(ds["location"])
+            print(f"  -> {result.location}")
+        print("Merging datasets...")
+        data_yaml = merge_yolo_datasets(locations, MERGED_DIR)
+        print(f"Merged data.yaml: {data_yaml}")
+    # --- Resume from checkpoint if one exists, otherwise start fresh ---
+    if last_pt.exists():
+        import torch
+        ckpt = torch.load(str(last_pt), map_location="cpu", weights_only=False)
+        done_epoch = ckpt.get("epoch", 0)          # 0-indexed epoch that finished
+        remaining  = EPOCHS - (done_epoch + 1)
+        print(f"Checkpoint found — epoch {done_epoch + 1}/{EPOCHS}, {remaining} epochs remaining.")
+        if remaining <= 0:
+            print("Training already complete, skipping to export.")
+            results = type("R", (), {"results_dict": {}})()  # dummy result
+        else:
+            try:
+                model   = YOLO(str(last_pt))
+                results = model.train(resume=True)
+            except (ValueError, RuntimeError) as exc:
+                # Optimizer state mismatch (e.g. after env/package upgrade).
+                # Fall back: load weights, continue for remaining epochs with a
+                # lower LR so we don't disturb the already-converged parameters.
+                print(f"Full resume failed ({exc}).")
+                print(f"Falling back to weights-only resume: {remaining} more epochs.")
+                model   = YOLO(str(last_pt))
+                results = model.train(
+                    data=data_yaml,
+                    epochs=remaining,
+                    imgsz=IMG_SIZE,
+                    batch=BATCH,
+                    project="/output/runs",
+                    name="yolo26n_fmcg",
+                    exist_ok=True,
+                    device=0,
+                    patience=20,
+                    save=True,
+                    plots=True,
+                    lr0=0.0005,   # reduced: weights already partially trained
+                    lrf=0.01,
+                )
+    else:
+        try:
+            model = YOLO(YOLO_BASE_MODEL)
+            print(f"Loaded base model: {YOLO_BASE_MODEL}")
+        except Exception:
+            print(f"YOLO26n not found, falling back to {YOLO_FALLBACK}")
+            model = YOLO(YOLO_FALLBACK)
+        results = model.train(
+            data=data_yaml,
+            epochs=EPOCHS,
+            imgsz=IMG_SIZE,
+            batch=BATCH,
+            project="/output/runs",
+            name="yolo26n_fmcg",
+            exist_ok=True,
+            device=0,
+            patience=20,
+            save=True,
+            plots=True,
+        )
     print(f"Training complete. Best mAP50: {results.results_dict.get('metrics/mAP50(B)', 'N/A')}")
     best_pt = Path("/output/runs/yolo26n_fmcg/weights/best.pt")
     shutil.copy(str(onnx_path), "/output/yolo26n_fmcg.onnx")
     print(f"Exported ONNX to {onnx_path}")
+    # --- Save unified class names ---
     import yaml
     with open(data_yaml) as f:
         data_cfg = yaml.safe_load(f)
+    class_names = data_cfg.get("names", [])
     if isinstance(class_names, dict):
         class_names = [class_names[i] for i in sorted(class_names.keys())]
     with open("/output/class_names.json", "w") as f:
+        json.dump(class_names, f, indent=2, ensure_ascii=False)
+    print(f"Saved {len(class_names)} unified class names")
     # --- Publish to HF Hub ---
     api = HfApi(token=os.environ["HF_TOKEN"])
     api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
+    api.upload_file(path_or_fileobj="/output/yolo26n_fmcg.onnx",     path_in_repo="yolo26n_fmcg.onnx",  repo_id=HF_REPO)
+    api.upload_file(path_or_fileobj="/output/class_names.json",       path_in_repo="class_names.json",   repo_id=HF_REPO)
     model_card = f"""---
 license: apache-2.0
+base_model: yolo26n
+language:
+  - en
 tags:
   - object-detection
   - yolo
   - indian-fmcg
   - onnx
   - ultralytics
+pipeline_tag: object-detection
+datasets:
+  - agentsk47/indian-grocery-object-detection-mfsnx
+  - iit-patna-qg1jh/grocery_items-7i2em
+  - project-c5ho0/indian-market-qieug
 ---
+# YOLO26n — Indian FMCG Product Detection
+Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources** from Roboflow Universe. Part of the **Kirana Detective** project — an AI system for small Indian grocery stores to visually count and reconcile inventory from shelf/counter photos.
+## Training Datasets
+| Dataset | Workspace | Version | Images | Classes |
+|---|---|---|---|---|
+| [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
+| [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
+| [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
+All three datasets were downloaded in **YOLOv8 format**, class IDs remapped to a unified list, and merged before training.
 ## Classes ({len(class_names)} total)
+{chr(10).join(f"- {name}" for name in class_names)}
+## How to Use
+### Python (ONNX Runtime)
 ```python
 import json
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
 session = ort.InferenceSession("yolo26n_fmcg.onnx", providers=["CPUExecutionProvider"])
 class_names = json.load(open("class_names.json"))
+def preprocess(image_path, size=640):
+    img = Image.open(image_path).convert("RGB").resize((size, size))
+    arr = np.array(img, dtype=np.float32) / 255.0
+    return arr.transpose(2, 0, 1)[None]  # BCHW
+input_name = session.get_inputs()[0].name
+outputs = session.run(None, {{input_name: preprocess("shelf.jpg")}})
+# outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
 ```
+### Ultralytics (PyTorch)
+```python
+from ultralytics import YOLO
+model = YOLO("yolo26n_fmcg.onnx", task="detect")
+results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
+results[0].show()
+```
+## Training Details
+| Parameter | Value |
+|---|---|
+| Base model | YOLO26n |
+| Input size | 640 × 640 |
+| Epochs (scheduled) | {EPOCHS} |
+| Batch size | {BATCH} |
+| Early stopping patience | 20 |
+| Export format | ONNX opset 12 |
+| Hardware | NVIDIA A10G (Modal) |
+## Citation
+```bibtex
+@misc{{kirana-detective-yolo-2026,
+  title  = {{Kirana Detective: YOLO26n Indian FMCG Product Detector}},
+  author = {{Naazim}},
+  year   = {{2026}},
+  url    = {{https://huggingface.co/naazimsnh02/yolo26n-indian-fmcg-detection}}
+}}
+```
 """
     api.upload_file(
         path_or_fileobj=model_card.encode(),

finetune/upload_yolo_to_hf.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""
+Upload locally-downloaded YOLO artifacts to HuggingFace Hub.
+Prerequisites:
+    pip install huggingface_hub
+    Set HF_TOKEN env var  (or run `huggingface-cli login`)
+Usage:
+    python finetune/upload_yolo_to_hf.py
+Uploads from:  model_artifacts/yolo26n_fmcg/
+    best.pt, best.onnx, class_names.json  →  naazimsnh02/yolo26n-indian-fmcg-detection
+"""
+import os
+from pathlib import Path
+# Hub model repository that receives the artifacts and the README.
+HF_REPO    = "naazimsnh02/yolo26n-indian-fmcg-detection"
+# Local artifact directory, resolved relative to this script: <script dir>/../model_artifacts/yolo26n_fmcg
+ARTIFACTS  = Path(__file__).parent.parent / "model_artifacts" / "yolo26n_fmcg"
+# Static README.md body committed by main(); the metric values in it are hard-coded text, not read from a run.
+MODEL_CARD = """\
+---
+license: apache-2.0
+base_model: yolo26n
+language:
+  - en
+tags:
+  - object-detection
+  - yolo
+  - indian-fmcg
+  - onnx
+  - ultralytics
+  - kirana
+pipeline_tag: object-detection
+datasets:
+  - agentsk47/indian-grocery-object-detection-mfsnx
+  - iit-patna-qg1jh/grocery_items-7i2em
+  - project-c5ho0/indian-market-qieug
+---
+# YOLO26n — Indian FMCG Product Detection
+Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources**
+from Roboflow Universe.  Part of the **Kirana Detective** project — an AI system for small Indian grocery
+stores to visually count and reconcile shelf/counter inventory from photos.
+## Performance
+| Metric | Value |
+|---|---|
+| mAP50 (all classes) | **0.428** |
+| mAP50-95 (all classes) | **0.302** |
+| Total classes | 1,831 |
+| Validation images | 1,236 |
+| Validation instances | 13,443 |
+Training ran for **100 epochs** (60 initial + 40 resumed after restart) on an NVIDIA A10G via Modal.
+## Training Datasets
+| Dataset | Workspace | Version | Images | Classes |
+|---|---|---|---|---|
+| [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
+| [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
+| [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
+All three datasets were downloaded in **YOLOv8 format**, class IDs remapped to a unified list, and merged
+before training. The full unified class list (1,831 entries) is in `class_names.json`.
+## Files
+| File | Description |
+|---|---|
+| `best.pt` | PyTorch checkpoint (best mAP50 epoch) |
+| `best.onnx` | ONNX export, opset 12 (recommended for inference) |
+| `class_names.json` | Full list of 1,831 class names (index = class_id) |
+## How to Use
+### ONNX Runtime (CPU / any platform)
+```python
+import json, numpy as np, onnxruntime as ort
+from PIL import Image
+session     = ort.InferenceSession("best.onnx", providers=["CPUExecutionProvider"])
+class_names = json.load(open("class_names.json"))
+def preprocess(path, size=640):
+    img = Image.open(path).convert("RGB").resize((size, size))
+    return (np.array(img, dtype=np.float32) / 255.0).transpose(2, 0, 1)[None]
+input_name = session.get_inputs()[0].name
+outputs    = session.run(None, {input_name: preprocess("shelf.jpg")})
+# outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
+```
+### Ultralytics (PyTorch)
+```python
+from ultralytics import YOLO
+model   = YOLO("best.pt")
+results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
+results[0].show()
+```
+## Training Details
+| Parameter | Value |
+|---|---|
+| Base model | YOLO26n |
+| Input size | 640 × 640 |
+| Epochs | 100 (60 initial + 40 resumed) |
+| Batch size | 16 |
+| Early stopping patience | 20 |
+| Export format | ONNX opset 12 |
+| Hardware | NVIDIA A10G (Modal) |
+## Citation
+```bibtex
+@misc{kirana-detective-yolo-2026,
+  title  = {Kirana Detective: YOLO26n Indian FMCG Product Detector},
+  author = {Naazim},
+  year   = {2026},
+  url    = {https://huggingface.co/naazimsnh02/yolo26n-indian-fmcg-detection}
+}
+```
+"""
+def main():
+    from huggingface_hub import HfApi
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        raise EnvironmentError("HF_TOKEN env var not set. Run: set HF_TOKEN=hf_...")
+    api = HfApi(token=token)
+    files = {
+        "best.pt":          ARTIFACTS / "best.pt",
+        "best.onnx":        ARTIFACTS / "best.onnx",
+        "class_names.json": ARTIFACTS / "class_names.json",
+    }
+    for name, path in files.items():
+        if not path.exists():
+            raise FileNotFoundError(f"Missing: {path}")
+    print(f"Repo: {HF_REPO}")
+    api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)
+    # Upload all files in a single commit to avoid the "no changes" skip bug
+    from huggingface_hub import CommitOperationAdd
+    operations = []
+    for repo_path, local_path in files.items():
+        size_mb = local_path.stat().st_size / 1024 / 1024
+        print(f"  Staging {repo_path}  ({size_mb:.1f} MB)")
+        operations.append(CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=str(local_path)))
+    operations.append(
+        CommitOperationAdd(
+            path_in_repo="README.md",
+            path_or_fileobj=MODEL_CARD.encode("utf-8"),
+        )
+    )
+    print("  Staging README.md")
+    print("\nCommitting...")
+    commit = api.create_commit(
+        repo_id=HF_REPO,
+        repo_type="model",
+        operations=operations,
+        commit_message="Add best.pt, best.onnx, class_names.json, README (100-epoch FMCG detector)",
+    )
+    print(f"Done — {commit.commit_url}")
+# Script entry point: run the upload only when executed directly, not on import.
+if __name__ == "__main__":
+    main()

finetune/yolo_model_card.md ADDED Viewed

	@@ -0,0 +1,161 @@

+---
+license: apache-2.0
+base_model: yolo26n
+language:
+  - en
+tags:
+  - object-detection
+  - yolo
+  - indian-fmcg
+  - onnx
+  - ultralytics
+pipeline_tag: object-detection
+datasets:
+  - agentsk47/indian-grocery-object-detection-mfsnx
+  - iit-patna-qg1jh/grocery_items-7i2em
+  - project-c5ho0/indian-market-qieug
+---
+# YOLO26n — Indian FMCG Product Detection
+Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources** from Roboflow Universe to detect and localize packaged FMCG products in shelf or counter images. Part of the **Kirana Detective** project — an AI system for small Indian grocery stores to visually count and reconcile inventory from photos.
+> **Note**: This file is a local reference snapshot. The actual model card uploaded to HuggingFace Hub is generated dynamically at the end of `train_yolo26n.py` and will include the real class list and metrics from the latest training run.
+## Model Description
+The model takes a 640×640 image and returns bounding boxes, class labels, and confidence scores for all detected Indian FMCG product categories. It is exported as ONNX for deployment on both CPU and GPU without requiring a full PyTorch installation.
+The class list is built at training time by merging the three Roboflow dataset vocabularies (deduped, insertion-order). See `class_names.json` on HF Hub for the exact unified list.
+## Pilot Run Results (single dataset, 10 classes)
+The metrics below are from a previous training run that used only the `agentsk47/indian-grocery-object-detection-mfsnx` dataset (10 classes). They are superseded by the current merged 3-dataset training run.
+**Pilot classes:**
+| # | Class |
+|---|---|
+| 0 | Bournvita |
+| 1 | Mysore Sandal Soap |
+| 2 | Nescafe Classic Coffee |
+| 3 | Nivea Body Lotion |
+| 4 | Nivea Soft Moisturising Cream |
+| 5 | Parachute Coconut Oil |
+| 6 | Patanjali Dant Kanti |
+| 7 | Society Tea Powder Plain |
+| 8 | Tresemme Hairfall Defense Conditioner |
+| 9 | Tresemme Hairfall Defense Shampoo |
+## Pilot Evaluation Results (best.pt, epoch 65 — single dataset run)
+| Class | Images | Instances | Precision | Recall | mAP50 | mAP50-95 |
+|---|---|---|---|---|---|---|
+| **all** | **41** | **51** | **0.935** | **0.971** | **0.993** | **0.933** |
+| Bournvita | 3 | 3 | 0.902 | 1.000 | 0.995 | 0.995 |
+| Mysore Sandal Soap | 8 | 8 | 1.000 | 0.905 | 0.995 | 0.944 |
+| Nescafe Classic Coffee | 4 | 4 | 0.927 | 1.000 | 0.995 | 0.908 |
+| Nivea Body Lotion | 7 | 7 | 0.935 | 1.000 | 0.995 | 0.923 |
+| Nivea Soft Moisturising Cream | 3 | 3 | 0.924 | 1.000 | 0.995 | 0.895 |
+| Parachute Coconut Oil | 6 | 6 | 1.000 | 0.819 | 0.972 | 0.928 |
+| Patanjali Dant Kanti | 7 | 7 | 1.000 | 0.985 | 0.995 | 0.971 |
+| Society Tea Powder Plain | 2 | 2 | 0.878 | 1.000 | 0.995 | 0.845 |
+| Tresemme Hairfall Defense Conditioner | 1 | 1 | 0.814 | 1.000 | 0.995 | 0.995 |
+| Tresemme Hairfall Defense Shampoo | 10 | 10 | 0.968 | 1.000 | 0.995 | 0.922 |
+## How to Use
+### Python (ONNX Runtime)
+```python
+import json
+import numpy as np
+import onnxruntime as ort
+from PIL import Image
+session = ort.InferenceSession("yolo26n_fmcg.onnx", providers=["CPUExecutionProvider"])
+class_names = json.load(open("class_names.json"))
+def preprocess(image_path, size=640):
+    img = Image.open(image_path).convert("RGB").resize((size, size))
+    arr = np.array(img, dtype=np.float32) / 255.0
+    return arr.transpose(2, 0, 1)[None]  # BCHW
+input_name = session.get_inputs()[0].name
+outputs = session.run(None, {input_name: preprocess("shelf.jpg")})
+# outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
+```
+### Ultralytics (PyTorch)
+```python
+from ultralytics import YOLO
+model = YOLO("yolo26n_fmcg.onnx", task="detect")
+results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
+results[0].show()
+```
+## Training Details
+### Datasets (merged)
+All three datasets were downloaded in **YOLOv8 format** (not the `openai` export format), and their class IDs were remapped to a unified list before training.
+| Dataset | Workspace | Version | Images | Classes |
+|---|---|---|---|---|
+| [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
+| [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
+| [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
+### Hyperparameters
+| Parameter | Value |
+|---|---|
+| Base model | YOLO26n |
+| Input size | 640 × 640 |
+| Epochs (scheduled) | 100 |
+| Epochs (actual) | 85 in the pilot run (best epoch 65 + patience 20 → early stop) |
+| Batch size | 16 |
+| Early stopping patience | 20 |
+| Optimizer | Auto (Ultralytics default) |
+| Export format | ONNX opset 12 |
+### Training Infrastructure
+| Field | Value |
+|---|---|
+| Hardware | NVIDIA A10G (22 GB VRAM) |
+| Framework | Ultralytics 8.4.63 |
+| PyTorch | 2.12.0+cu130 |
+| Orchestration | Modal |
+| Training time | 0.094 hours (~5.6 minutes) |
+| Model size | 5.4 MB (PyTorch) · 9.4 MB (ONNX) |
+| Parameters | 2,376,786 |
+| GFLOPs | 5.2 |
+| Inference speed | 0.2 ms preprocess + 1.0 ms inference (A10G) |
+### Training Curve Notes (pilot run)
+- Best checkpoint at **epoch 65** (mAP50 = 0.993, mAP50-95 = 0.933) — single-dataset pilot
+- EarlyStopping triggered at epoch 85 (no improvement for 20 epochs)
+- Final box loss: 0.4201 · cls loss: 0.4657 · dfl loss: 0.006
+The current 3-dataset merged training run will produce updated curve notes.
+## Limitations
+- Merged dataset skewed toward beauty/personal care (Tresemmé, Nivea, Patanjali); may underperform on grocery staples
+- ~11K images across 3 sources; performance on crowded shelves or partial occlusions is untested
+- Exported at opset 12 for broad compatibility; advanced indexing operations use multi-op decomposition (see ONNX export warning)
+## Citation
+```bibtex
+@misc{kirana-detective-yolo-2026,
+  title  = {Kirana Detective: YOLO26n Indian FMCG Product Detector},
+  author = {Syed Naazim Hussain},
+  year   = {2026},
+  url    = {https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection}
+}
+```

model_artifacts/yolo26n_fmcg/class_names.json ADDED Viewed

	@@ -0,0 +1,1833 @@

+[
+  "Bournvita",
+  "Mysore Sandal Soap",
+  "Nescafe_Classic_Coffee",
+  "Nivea Body Lotion",
+  "Nivea_Soft_Moisturising_cream",
+  "Parachute coconut Oil",
+  "Patanjali Dant Kanti",
+  "Society_TeaPowder_plain",
+  "Tresemme_Hairfall_Defense_conditioner",
+  "Tresemme_Hairfall_Defense_shampoo",
+  "24Mantra",
+  "3D_Fryums_Ajwah",
+  "50 50 Gol Maal",
+  "50 50 sweet and salty",
+  "50 50 top",
+  "72H_AxeSignature",
+  "8am soyachunks",
+  "ALphonsoMangoPulp_GoldenCrown",
+  "Aakash",
+  "Aarambh",
+  "Aashirvaad",
+  "Aashirvaad_Besan",
+  "Aashirvaad_Salt",
+  "Aashirvaad_TurmericPowder",
+  "Abzorb",
+  "AcaciaHoney_KhushNuma",
+  "Act2",
+  "Action_Dishwash_Liquid",
+  "ActiveSaltNeem_Toothpaste_Colgate",
+  "ActiveSalt_Toothpaste_Colgate",
+  "Adrenaline_48H_AxeSignature",
+  "Agarbathi_Bansuri",
+  "Agarbathi_Baratanatyam",
+  "Agarbathi_Cycle",
+  "Agarbatti_SalShakti",
+  "Agastya",
+  "Agrabathi_Mangaldeep",
+  "Agrawal",
+  "AirFreshner",
+  "AllOut",
+  "Almendro",
+  "AlmondFlour_Granos",
+  "AlmondKernals_Aram-s",
+  "Almonds",
+  "AloFrut_Juice",
+  "Aloevera_Handwash_Patanjali",
+  "Aloo Bhujia",
+  "AlooBhujia_Bicano",
+  "AlooBhujia_Bikano",
+  "AlpineFresh_BodySpray_ShotLayerr",
+  "AlterEgo_BodySpray_ParkAvenue",
+  "AmericanGarden",
+  "AmericanPancakeCo",
+  "AmlaCandy_Patanjali",
+  "AmlaOil",
+  "Ammol_LemonMazaa",
+  "Amul",
+  "Amul Butter",
+  "Amul ghee",
+  "Amul-Dreamlite_Biscuites",
+  "AmulCool",
+  "Amul_BadamShake",
+  "Amul_ButterCookies",
+  "Amul_Kool",
+  "Amulya",
+  "AntiRoachgel_Hit",
+  "Anymany",
+  "Apis",
+  "AplineRose-BlackCurrent_Perfume_YardleyLondon",
+  "AppleCiderVinegar_AmericanGarden",
+  "AppleCiderVinegar_CanadianGarden",
+  "AppleCiderVinegar_JustOrganic",
+  "AppleCiderVinegar_WOW",
+  "AppleCiderVinegar_Wanposh",
+  "Apple_Juice_Tropicana",
+  "AppyFizz",
+  "Apricots",
+  "Apsara",
+  "Aravalli_Rice",
+  "Ariel",
+  "Ashok",
+  "BC",
+  "BLackCode_Denver",
+  "BNatural",
+  "Babe_BodySpray_Vanesa",
+  "BabyCereal",
+  "BabyCorn_GoldenCrown",
+  "BabyWash_Himalaya",
+  "Babyrub_Vicks",
+  "BadamDrink_Cavins",
+  "BadamDrink_MTR",
+  "BadamLachha_Bicano",
+  "BadamMilkshake_Cavins",
+  "Baggrys",
+  "Bagrrys",
+  "Baidyanath",
+  "Bajaj",
+  "Bakeshree",
+  "BakingPaper",
+  "BakingPowder",
+  "Balaji",
+  "Bambino",
+  "Banana_chips",
+  "BanneNawabs",
+  "BasilSeed_Ajwah",
+  "BasilSeeds",
+  "Batchelors",
+  "BathingSoap_LifeBuoy",
+  "BathingSoap_Pears",
+  "Bathroom_Cleaner",
+  "Bauli",
+  "BauliSavoriz",
+  "Beans",
+  "BeardTrimmer_Babila",
+  "Belgian",
+  "BerryJam_Mala",
+  "BesanLaddoo_BigBasket",
+  "Besan_Rajdhani",
+  "BhujiaMasala_Brij",
+  "Bhujia_Bicano",
+  "Bhujia_FirstCrop",
+  "Bhujia_Haldirams",
+  "Bhujialalji",
+  "Bhujiya_Prabhuji",
+  "Bicano_PanjeeriLaddu",
+  "Bigbabol_Chewinggum",
+  "Bikaji",
+  "Bikaji Tana bana",
+  "Bikaji_Kuch Kuch",
+  "Bikaji_navratan mixtute",
+  "BikaneriBhujia_Brij",
+  "Bikano",
+  "Bikano Aloo Bhujia",
+  "Bikano Royal Rasgulla",
+  "Bikano Tasty",
+  "BikanoRasgulla",
+  "Bikano_navratan mixtute",
+  "Bikano_navratanmixtute",
+  "Bingo",
+  "BiotinTablets_HKVitals",
+  "Biotique",
+  "Biscuit",
+  "Biscuit_Bourbon_Britania",
+  "Biscuit_Bourbon_Britannia",
+  "Biscuit_Bourbon_FullBloom",
+  "Biscuit_ButterBite_PriyaGold",
+  "Biscuit_ButterBite_first crop",
+  "Biscuit_CashewBadam_FirstCrop",
+  "Biscuit_ChocoRolls_HideandSeek",
+  "Biscuit_Chocolate_FullBloom",
+  "Biscuit_DarkFantasy",
+  "Biscuit_Digestive_FirstCrop",
+  "Biscuit_GlucosePlus_Sunfeast",
+  "Biscuit_HideandSeek_Parle",
+  "Biscuit_Nutrichoice_Arrowroot_Britannia",
+  "Biscuit_Nutrichoice_Britannia",
+  "Biscuit_Nutrichoice_Cracker_Britannia",
+  "Biscuit_Nutricrunch_Britannia",
+  "Biscuit_SweetnSalty",
+  "Biscuit_TigerKrunch",
+  "Biscuits_50_50_Britannia",
+  "Biscuits_Amul",
+  "Biscuits_Buttery_50_50_Britannia",
+  "Bisleri",
+  "BlackBourbon_Hide-Seek_ParlePlatina",
+  "BlackMagic_Deo_Spinz",
+  "BlackPepperPowder_MDH",
+  "BlackPepper_Catch",
+  "Bleach_Fem",
+  "BlueBird",
+  "BlueCurrents_St.John",
+  "Blueberries",
+  "Blueberry_Gaurmia",
+  "Blush_BodySpray_Eva",
+  "Blush_FragrantTalc_Envy",
+  "BodyHairRemover_Veet",
+  "BodyLotion",
+  "BodyLotion_Parachute",
+  "BodyMist_Ossum",
+  "BodySpray_BoldSecret",
+  "BodySpray_Eva",
+  "BodySpray_Jovan",
+  "BodySpray_ParkAvenue",
+  "BodySpray_The_Man_Company",
+  "BodySpray_Women",
+  "BodySpray_men",
+  "BombayBiryani_Alamin",
+  "Bookside",
+  "Boondi_Jabson-s",
+  "Boost",
+  "Borges",
+  "BoroPlus",
+  "Boroline",
+  "Bounce",
+  "Bounty",
+  "Bourbon cream biscuits",
+  "Bourbon_Cremica",
+  "Bourbon_FullBoom",
+  "Bread",
+  "Brij_Bhujia",
+  "Brij_Namkeen",
+  "Brij_Panchratan",
+  "Britannia",
+  "Britannia_50_50_Biscuits",
+  "Britannia_WinkinCow_lassi",
+  "BrookeBond",
+  "Brown-Haley",
+  "BrownRice",
+  "BrowniesBasket",
+  "Bru",
+  "BubbleGumFlovour_ToothpasteForKids_DentoShine",
+  "Budweiser",
+  "Buldak_Ramen",
+  "Bumtum_BabyPaints",
+  "Bumtum_DoobiDoo",
+  "Butlers",
+  "Butter",
+  "ButterBatter",
+  "ButterBite_Cookies",
+  "ButterCookies_Amul",
+  "ButterCookies_Unibic",
+  "ButterLite_NamkeenJeera_Biscuits",
+  "Butter_AmulLite",
+  "Butter_MilkyMist",
+  "Butter_MotherDairy",
+  "Butter_Nutralite",
+  "ButtonMushroom_GoldenCrown",
+  "Cadbury",
+  "Cadbury_Celebrations",
+  "Cake",
+  "CaliforniaAlmonds_Wonderland",
+  "CalifornianAlmonds_happilo",
+  "Candey_Butter_PureBurst",
+  "Candy",
+  "Candy_Butter_PureBurst",
+  "Candy_Chocolate_PureBurst",
+  "Candy_Halls",
+  "Candy_KachaAam_PureBurst",
+  "Candy_KachchaAam_PureBurst",
+  "Candy_Lacmi",
+  "Candy_Mandola",
+  "Candy_Mentos",
+  "Candy_OrangeBite_Parle",
+  "Candy_Orange_PureBurst",
+  "Candy_PureBurst_KachchaAam",
+  "Candy_PureBurst_MangoTango",
+  "Candy_PureBurst_Orange",
+  "Candy_PureBurst_chocolate",
+  "Candyman_ChocoDouble",
+  "CaramelBliss_Popcorn_ActII",
+  "CashewSticks_GoneMad",
+  "Cashews",
+  "CastorOil",
+  "Catch",
+  "Catch Black Pepper",
+  "Catch Table Salt",
+  "Catch chicken masala",
+  "Catch_Cola",
+  "Catch_GaramMasala",
+  "Catch_KashmiriMirchi",
+  "Catch_MeatMasala",
+  "Catch_Pepper",
+  "Catch_SprintUp",
+  "Catch_jeera powder",
+  "Cavin",
+  "Cavin_VaneelaMilkShake",
+  "Cavins",
+  "Cerelac_Apple_Nestle",
+  "Cerelac_Wheat_Nestle",
+  "ChakkiAtta",
+  "ChampSportingClub_Denver",
+  "ChanaMasala_Jobsons",
+  "ChanaMasala_MinuteMeals_MTR",
+  "ChanaMasala_MinuteMeals_Mir",
+  "CharcoalCleangel_Toothpaste_Colgate",
+  "Charged_thumbsup",
+  "Cheese",
+  "CheeseCubes",
+  "CheeseCubes_D-lecta",
+  "CheeseCubes_MilkyMist",
+  "CheeseCubes_MotherDairy",
+  "CheeseSlices",
+  "CheeseSlices_Amul",
+  "CheeseSlices_Britannia",
+  "CheeseSlices_D-lecta",
+  "CheeseSlices_Go",
+  "CheeseSlices_MilkyMist",
+  "CheeseSlices_MotherDairy",
+  "Cheese_D-lecta",
+  "Cheetos",
+  "Cheezza_Britannia",
+  "ChefsBasket",
+  "ChiaSeed_Ajwah",
+  "ChiaSeeds",
+  "ChickenMasala_Orika",
+  "Chik",
+  "ChileanInshellWalnut_Happilo",
+  "ChileanInshellWalnut_nutraj",
+  "ChileanInshellcashenut_Happilo",
+  "ChileanInshellcashenut_nutraj",
+  "ChilliPowder_Kanwal",
+  "Chings",
+  "Chings_Chutney",
+  "Chips_Bicano",
+  "Chips_Crisps_Lays",
+  "Chips_Doritos",
+  "Chips_Haldirams",
+  "Chips_Pudina_haldirams",
+  "Choco-Pie",
+  "ChocoBakes_Cookies_Cadbury",
+  "ChocoFills_Kellogg-s",
+  "ChocoHazlenut_Cookies_Unibic",
+  "ChocoNutCookies_Unibic",
+  "ChocoPie",
+  "ChocoSpread_HealthyDay_Rasna",
+  "Chocoflakes_Kwality",
+  "Chocola",
+  "ChocolairsGoldCoffee_Cadbury",
+  "Chocolate",
+  "ChocolateCookies_Amul",
+  "ChocolatePeanut_Butter_MB",
+  "Chocolate_5Star",
+  "Chocolate_ChoclairsGold_Cadbury",
+  "Chocolate_ChocoBarXL",
+  "Chocolate_DairyMilk_Cadbury",
+  "Chocolate_Horlicks",
+  "Chocolate_Nutties_Cadburry",
+  "Chocolate_Snakker_PriyaGold",
+  "Chocolate_Truffel_Dukes",
+  "Chocozay",
+  "ChoiceMega",
+  "Chokolas",
+  "Cibaca_Toothpaste_Colgate",
+  "ClassicButter_Unibic",
+  "ClassicMalt_Horlicks",
+  "ClassicSalted_Popcorn_ActII",
+  "Classicgentleman_Perfume_Yardley",
+  "Clean-Clear",
+  "ClinicPlus",
+  "Cockroach_repllent_baygon",
+  "Cocktail",
+  "Cocoa_Cadbury",
+  "Cocoa_Hershey-s",
+  "Coconut7BrownSugar_Pocky",
+  "CoconutCookies_Americana",
+  "CoconutCookies_Gaia",
+  "CoconutLadoo_Ajwah",
+  "CoconutOil_Nirmal",
+  "CoconutPeda_Ajwah",
+  "CoconutPowder_Sunrise",
+  "CoconutWater_CocoRoyal",
+  "CoffeeMate_Nestle",
+  "Coffee_FullBloom",
+  "ColaCandy_Ajwah",
+  "Colgate",
+  "Colgate Total12",
+  "Colgate_BubbleFruit_ChihldPaste",
+  "Colgate_Strawberry_ChihldPaste",
+  "Colgate_Toothbrush",
+  "Colin",
+  "Comfort",
+  "Complan",
+  "CookTop",
+  "Cookies",
+  "Cookies_ButterBake",
+  "CoolKick_NiveaMen",
+  "CoolWave_Talc_Envy",
+  "Corainder_Everest",
+  "CornFlakes_FirstCrop",
+  "CornSoup_Bambino",
+  "CornStarch_Brown-Polson",
+  "Cornflour",
+  "Cornitos",
+  "CoughDrops_Honitus_Dabur",
+  "CoughDrops_Vicks",
+  "CowGhee_ThackerDairy",
+  "Crackers_Munchy-s",
+  "Cranberrries_Granos",
+  "Cream_Lakme",
+  "Cream_vitaminFaceWash",
+  "CremesBiscuit_Strawberry_Cremica",
+  "CrunchyChocolate_Pepero",
+  "CrunchyMunchy_Bicano",
+  "CrunchyMunchy_Bikano",
+  "CupCakes",
+  "Curd",
+  "Curd_MilkyMist",
+  "Curry_TopRamen",
+  "CustardPowder_Brown-Polson",
+  "Cycle",
+  "Czar_Scent_Fogg",
+  "DLecta",
+  "Daawat",
+  "Daawat_RozanaBasmatiRiceGold",
+  "Dabur",
+  "DaimSnax",
+  "DairyMilk_Chocolate",
+  "DairyMilk_Silk_Cadbury",
+  "DalMakhani_RedayMeals_Gits",
+  "Dal_Bikano",
+  "Dal_KhattaMeetha_BC",
+  "Dal_Pitara_DBL",
+  "Dal_Rajdhani",
+  "Dalmakhni_Gits",
+  "Dant Rakshak",
+  "DantKanti_Toothpaste_Patanjali",
+  "DarkChocolateChipsCookies_Gaia",
+  "DarkFantasy_ChocolateShake",
+  "DarkTemptation_CologneTalc_Axe",
+  "DarkTemptation_Fragrance_Axe",
+  "Dates",
+  "DatesSyrup_Lion",
+  "Davidoff",
+  "DelMonte",
+  "DelMonte_Mayonnaise",
+  "Delfi",
+  "Delicious",
+  "Delifresh",
+  "Delight",
+  "Delisious",
+  "Delmonte_GreenApple",
+  "Delmonte_Pineapple",
+  "Delmonte_peachjuice",
+  "Denver",
+  "DetergentBar_Henko",
+  "Dettol",
+  "Dev",
+  "Dexolac",
+  "Dhara",
+  "Dhara_Oil",
+  "DiSano",
+  "DiabeticCare_Nimbark",
+  "Diamond",
+  "Diapers",
+  "Diapers_MamyPokoPants",
+  "Diapers_Pampers",
+  "Digestibes_Nutricrunch",
+  "Digestive_NutriChoice_Britania",
+  "Discover_BodySpray_ParkAvenue",
+  "Dishwash_Prill",
+  "Dishwash_Scrubz",
+  "Dishwasher_Exo",
+  "Diva_BodySpray_Vanesa",
+  "Divine Agro Kasuri Methi",
+  "Divya",
+  "DogFood_Pedigree",
+  "Domex",
+  "Don_PerfumeDoeSpray_Beardo",
+  "Donut_Ziggy",
+  "DoobiDoo_BabyPaints",
+  "Doritos",
+  "Dorje",
+  "Dove",
+  "Dove_FacialTissue",
+  "DrOetker",
+  "DryFruit",
+  "DryFruitPanjeeriLadoo_Haldiram",
+  "DryFruits",
+  "Dukes",
+  "Dyna Sandal",
+  "EasyFun",
+  "EclairOs",
+  "Eclairs",
+  "Eclairs_PureBurst",
+  "EcoValley",
+  "Eggs",
+  "Elite",
+  "Emami",
+  "Emperia",
+  "Enchante_Spinz",
+  "EnergyMax",
+  "Engage",
+  "Eno",
+  "Ensure",
+  "Enzo",
+  "Epigamia",
+  "Everest",
+  "Everyuth",
+  "Exotic_Spinz",
+  "Ezee",
+  "FabBourbon_Biscuit_Parle",
+  "FabricConditioner_Softlan",
+  "Fair-Lovely",
+  "FairMart",
+  "Fairmart",
+  "FaloodaMix_KesarPista_Weikfield",
+  "FaraliChiwda_Bhujialalji",
+  "FastCard_GoodNight",
+  "FennelSeedPowder_Kanwal",
+  "FerreroRocher",
+  "Fiama",
+  "FieryPride_BodySpray_ShotLayerr",
+  "Fiesta_Dal",
+  "Figaro",
+  "Figs",
+  "Finosta",
+  "First Crop Bikaneri Bhuji",
+  "First Crop Potato chips",
+  "First crop Corn Flakes",
+  "FishOil_HKVitals",
+  "Flakes_Choco_FirstCrop",
+  "FlaxSeeds_Nutraj",
+  "FlossPicks_DeepClean_DentoShine",
+  "FlossPicks_DentoShine",
+  "Fogg",
+  "FoodCoast",
+  "ForestSpice_Deodrant_WildStone",
+  "Fortune",
+  "Fortune besen",
+  "Fortune chana sattu",
+  "Fortune maida",
+  "Fortune suji",
+  "Fortune_Sugar",
+  "Forture indori poha",
+  "FourSquare_CigarettePack",
+  "Fox-s",
+  "Foxs",
+  "FreshComfort_Deodrant_Nivea",
+  "FreshGel_Sensodyne",
+  "FreshMint_Sensodyne",
+  "Fresho",
+  "FrozenPackagedFood",
+  "Fruit-Nut_Cookies_Unibic",
+  "FruitCocktail_GoldenCrown",
+  "FruitGummies_Tapi",
+  "FruitJuice_Litchi_Real",
+  "Fruitins",
+  "Frutins",
+  "Fulvadi_Jabson",
+  "GM",
+  "GMFoods",
+  "Gaia",
+  "GaiaLite",
+  "Gaialite",
+  "Galaxy",
+  "Gamnuts_DryFruits",
+  "Gamnuts_Masale",
+  "Gangwal",
+  "GaramMasala_Orika",
+  "Garden Bhelpuri - Sevpuri",
+  "GarlicPaste_Nilon-s",
+  "Garnier",
+  "Gasona_Kudos",
+  "Geham",
+  "Geisha",
+  "Genteel",
+  "Gentleman_Perfume_YardleyLondon",
+  "Gerber",
+  "GetReal",
+  "Ghadi",
+  "Ghee",
+  "Ghee_Cow_PureBurst",
+  "Gillette",
+  "Girnar",
+  "Gits",
+  "GlassCleaner_Action",
+  "Glicy",
+  "Glow",
+  "Glow-Handsome",
+  "Glow-Lovely",
+  "GlucoPlusC_Dabur",
+  "GlucoPlusD_Dabur",
+  "Glucon-D",
+  "GluconD",
+  "Glycerin",
+  "Go",
+  "GoalSportingClub_Denver",
+  "Godrej",
+  "GoldFlake_CigarettePack",
+  "Gold_Nescafe",
+  "GoldenScent_Next",
+  "GoneMad",
+  "Good Life toor dal",
+  "Good life",
+  "Good life masoor",
+  "Good life urad whole",
+  "GoodDay",
+  "GoodHome",
+  "GoodKnight",
+  "GoodLife",
+  "GoodMorning",
+  "Gourmet-sDelite",
+  "Gowardhan",
+  "Grand_Coffee_Tata",
+  "GreenChilliPickle_Nilon-s",
+  "GreenChilliPickle_Tops",
+  "GreenTea_Pocky",
+  "GulabJamunMix_ Bambino",
+  "GupShupPeanuts_Haldirams",
+  "Guruji",
+  "Gustora",
+  "Guylian",
+  "Hair-Care",
+  "HairColor",
+  "HairDryer_Babila",
+  "HairOil_Clear",
+  "HairOil_Indulekha",
+  "Hair_Color_ColorMate",
+  "Hajmola",
+  "Haldiram-s_Chips",
+  "Haldirams",
+  "Haldirams_GujratiMix",
+  "Haldirams_KajuMixture",
+  "Haldirams_MoongDal",
+  "Haldirams_Peanuts",
+  "Hamilton_Perfume_Denver",
+  "HandSanitizer",
+  "HandSanitizer_Dettol",
+  "HandSanitizer_Himalaya",
+  "HandSanitizer_Lifeboy",
+  "HandwashVitamins_LifeBouy",
+  "Handwash_Herbal_Patanjali",
+  "Handwash_Santoor",
+  "Happilo",
+  "Happilo_Almonds",
+  "Happilo_PeanutButter",
+  "HappyHappyCreme_Biscuit_Parle",
+  "Haribo",
+  "Harpic",
+  "HastyTasty",
+  "Hawanan Barbeque Cheese Popcorn",
+  "Head-Shoulders",
+  "HealthyLife",
+  "Heinz",
+  "HellMann-s_Mayonnaise",
+  "Hellmanns",
+  "Hem",
+  "Herbal_Toothpaste_Colgate",
+  "Hersheys",
+  "Hersheys_MilkShake_vaneeliFlavour",
+  "Hide-Seek_Chocolate_Parle",
+  "Hide-Seek_CremeSandwiches",
+  "Hide-Seek_Milano_ParlePlatina",
+  "Hide-Seek_ParlePlatina",
+  "Himalaya",
+  "Hing_Ramdev",
+  "Hit",
+  "Hitkary",
+  "Hocco",
+  "HoneyOatmeal_Cookies_Unibic",
+  "Honey_Barosi",
+  "Honey_GoldDrops",
+  "Honey_Lion",
+  "Horlicks",
+  "Horlicks women-s plus",
+  "Hurricane_EnergyDrink",
+  "HyderabadiBiryani_Rehmat",
+  "HydraEnergy_Deodrant_WildStone",
+  "IceCream",
+  "IceCream_Amul",
+  "IceCream_BaskinRobbins",
+  "IceCream_Kulfi_Havmor",
+  "IespressoCoffee_Davidoff",
+  "ImliCandy_Ajwah",
+  "IndiShop",
+  "IndianSweet_MaysorePak_Nandini",
+  "Indulekha",
+  "Inhaler_Vicks",
+  "Insect_Killer_Hit",
+  "Insect_Killer_strategi",
+  "Insight_Perfume",
+  "Intense_Fragrance_Axe",
+  "Iodex",
+  "JMRFoods",
+  "Jabsons",
+  "Jackpot",
+  "JaggeryPowder",
+  "JainsTrupti",
+  "Jalani",
+  "Jaljira",
+  "Jam_Kissan",
+  "Jam_MixedFruit_Kissan",
+  "Jam_Patanjali",
+  "Jeera_GoodLife",
+  "Jel_SetWet",
+  "Jet_Imli_Toffee",
+  "JewelFarmer",
+  "Johnsons",
+  "Joiner_Drink",
+  "Joshina_Hamdard",
+  "Jovees",
+  "Joy",
+  "Jucie_CranberryDelight_Tropicana",
+  "Jucie_GuavaDelight_Tropicana",
+  "Jugnu_NapthaleneBalls",
+  "Juice_Apple_Natural",
+  "Juice_CoconutMilkDrink_MangoNataDeCoco_Uglobe",
+  "Juice_CoconutMilkDrink_PineappleNataDeCoco_Uglobe",
+  "Juice_CranberryDelight_Tropicana",
+  "Juice_Greenapple_Delmonte",
+  "Juice_Guava_Real",
+  "Juice_MixedFruit_Natural",
+  "Juice_Orange_Real",
+  "Juice_Peach_Delmonte",
+  "Juice_Peach_Rani",
+  "Juice_Pineapple_Delmonte",
+  "Juice_Pomegranate_Joiner",
+  "Juice_Pomegrate_Real",
+  "Juice_Pomegrate_Real_Dabur",
+  "Juice_Swing",
+  "Juice_coconut_Paper_Boat",
+  "JuniorTomatoKetchup_Tops",
+  "KTH",
+  "Kaffe",
+  "KajuMixture_Bikano",
+  "KalongiSeeds_Ajwah",
+  "Kamasutra",
+  "Kapiva",
+  "KashmiriMixture_Bicano",
+  "KashmiriMixture_Bikano",
+  "KashurDal_HudHud",
+  "KasundiSauce_Elmac",
+  "Kehwa_Aram-s",
+  "Kelloggs Corn Flakes Real Haney",
+  "Kellogs",
+  "KesarPeda_Ajwah",
+  "KesarQueen",
+  "KeshKing",
+  "Keya",
+  "KhattaMeetha_FirstCrop",
+  "KhattaMithaMix_Haldirams",
+  "Kids_Toothbrush_OralB",
+  "Kimchi",
+  "Kimchi_Samyang",
+  "KinderJoy",
+  "Kingfisher",
+  "Kinley",
+  "Kissan",
+  "KitchenAffairs",
+  "KitchenTreasures",
+  "Knorr",
+  "KuchKuch_Bikaji",
+  "Kurkure",
+  "Kurkure_ChatFills",
+  "Kurkure_SizzlinHot",
+  "Kwality",
+  "Kwality_Muesli",
+  "L-oreal",
+  "LactoCalamine",
+  "Lakme",
+  "Lal",
+  "LalMirchPaste_Aram-s",
+  "Lassi_WinkinCowClassic_Britannia",
+  "LaxmanRekhaa",
+  "Layka",
+  "Lays",
+  "Lays_American",
+  "Lays_ChilliLimbu",
+  "Lays_Indian",
+  "Lays_Spanish",
+  "Lays_WestIndies",
+  "LeCafe",
+  "Lehar",
+  "Lemon_Toothpaste_Colgate",
+  "Lifebuoy",
+  "Lime_Dishwash_Prill",
+  "Limonata",
+  "Lindt",
+  "Lipton",
+  "LiquidDeodrant_Lawman_pg3",
+  "LiquidDetergent_Patanjali",
+  "LiquidMosquitoRepellent_GoodNight",
+  "Liril",
+  "Listerine",
+  "Lite",
+  "LittleHearts",
+  "Lizol",
+  "LollipopTongueCleaner_DentoShine",
+  "LondonBubble",
+  "LondonMist_BodySpray_YardleyLondon",
+  "LondonMist_Perfume_Yardley",
+  "Lotte",
+  "Lotus",
+  "Lux",
+  "M-M",
+  "MB",
+  "MDH",
+  "MIxedFruit_Juice_Tropicana",
+  "MTR",
+  "Mad_Soap",
+  "Madhubani",
+  "Madhuri",
+  "MadrasiNamkeen_Brij",
+  "Maggi",
+  "Magnolia7GrapeFruit_Perfume_YardleyLondon",
+  "Mahakosh",
+  "Makhana",
+  "Makino Nacho chips",
+  "Mala_MixedFruitJam",
+  "Malas",
+  "Malkist",
+  "Mangalam",
+  "Mangaldeep",
+  "MangoBite_Parle",
+  "MangoFlavour_ToothpasteForKids_DentoShine",
+  "MangoJuice_Fresca",
+  "MangoMerry",
+  "MangoPickle_ImliTree",
+  "MangoPickle_Tops",
+  "Manna",
+  "Marie_McVities",
+  "Mars",
+  "Marvel",
+  "MarvelTea",
+  "MasalaMunch_Kurkure",
+  "MasalaNoodles_TopRamen",
+  "MasalaNoodles_Tops",
+  "MasalaNoodles_maggi",
+  "MasalaTikki_Kanwal",
+  "Masala_CuppaNoodles_Maggi",
+  "Masala_CuppaNoodles_Manchow",
+  "Masala_TopRamen",
+  "MaxFresh_Toothpaste_Colgate",
+  "MaxProtein",
+  "MazedaarMasala_CupNoodles",
+  "Mazic",
+  "McCain",
+  "McVities",
+  "MeatMasala_Orika",
+  "Meiji",
+  "MelonSeeds",
+  "MilkBooster_PureBurst",
+  "MilkCompound_Morde",
+  "MilkMagic",
+  "MilkShakti_Biscuit_Parle",
+  "Milk_Everyday_Nestle",
+  "Milk_Rusk_Mario",
+  "Milka",
+  "Milkshake_Badam_Cavins",
+  "Milkshake_Chocolate_Cavins",
+  "Milkshake_Straberry_Cavins",
+  "Milkshake_Vanilla_Cavins",
+  "Milky bar",
+  "MilkyMist",
+  "MiniChocolate_Oreo",
+  "MiniJumbo_MosquuitoCoil_GoodNight",
+  "MiniMeBakers",
+  "MinuteMaid",
+  "MirchiQuormaPaste_Kanwal",
+  "Mishri",
+  "MixedFruitJam_Sil",
+  "MixedPickle_Nilon-s",
+  "MixedPickle_Tops",
+  "Mixture_ALLINONE",
+  "Mixture_Ajwah",
+  "Mixture_Bicano",
+  "Mohuns",
+  "Mojito_Orange_Cravova",
+  "Mom",
+  "Moments_Chocolate",
+  "MomsMagic",
+  "MomsMagic_Biscuit_Sunfeast",
+  "Monaco_Biscuit_Parle",
+  "Monaco_Cheeslings",
+  "Monaco_PiriPiri",
+  "MongDal_Bikaji",
+  "Monita",
+  "Monster",
+  "MontexFoil",
+  "MoongDal_Bikaji",
+  "MoongFali",
+  "Mopz Floral Fresh",
+  "Mopz Lime Fresh",
+  "MorningDew_Perfume_Yardley",
+  "Morton",
+  "MosquitoKiller_Mortein",
+  "MosquitoOil_Genius_Maxo",
+  "MosquitoRepellent",
+  "MosquitoRepellentBlack_Hit",
+  "MosquitoRepellentRed_Hit",
+  "Mother-sRecipe",
+  "MotherDairy",
+  "MothersRecipe",
+  "Muesli_King",
+  "Munch Max",
+  "MuscleBlaze",
+  "Museli_Kellogs",
+  "MustardOil",
+  "MustardOil_ValleyKing",
+  "MutterPaneer_MinuteMeals_MTR",
+  "MutterPaneer_MinuteMeals_Mir",
+  "MyFruit",
+  "MysticWhite_Spinz",
+  "NANPro",
+  "Nafees",
+  "NailClipper",
+  "Nakoda",
+  "Namkeen",
+  "Namkeen_AllinOne_BC",
+  "Namkeen_BombayMixture_BC",
+  "Namkeen_KhattaMeetha_DC",
+  "Namkeen_MultiGrain_Jabson",
+  "NaturalChoice_ChannaDal",
+  "NaturalChoice_MixDal",
+  "NaturalChoice_MoongDal",
+  "NaturalChoice_RajmaChitra",
+  "NaturalColor_ColorMate",
+  "NaturalGlow_Deodrant_Nivea",
+  "NaturalHoney_Capilano",
+  "Nature-sChoice_Kismis",
+  "Nature-sChoice_MixedDryFruit",
+  "Navratan",
+  "Navratan_FirstCrop",
+  "NeemActive_Toothpaste",
+  "Neeraj",
+  "Neo_BodySpray_ParkAvenue",
+  "Nescafe",
+  "Nestle",
+  "Nestle Cerelac",
+  "Nilon-s_SAUCE",
+  "Nilons",
+  "Nissin",
+  "Nivea",
+  "Nongshim",
+  "NoodleSoup_ShinCup",
+  "Noodles_Chings_Hot Garlic",
+  "Noodles_Hakka Noodles",
+  "Noodles_Jumbo",
+  "Noodles_Yippee",
+  "NoonChai_Girnar",
+  "NoonChai_TezPremium",
+  "Nusobee_Dexolac",
+  "Nutella",
+  "Nutraj",
+  "Nutralite",
+  "Nutrela",
+  "Nutri choice thin arrowroot",
+  "NutriDelite",
+  "Nutricia",
+  "Nutricrunch_Biscuit_Britannia",
+  "Nutridelite",
+  "Nuts_Fruits_Berries_Gourmia",
+  "Nutveda",
+  "Nyle",
+  "OLiveOil_Sansu",
+  "Oats_CrunchyMuesli_Grry-s",
+  "Oats_ProteinRich_FirstCrop",
+  "Odonil",
+  "Oil_RefinedRice_Saffola",
+  "Oil_Sesame_FirstCrop",
+  "OldSpice",
+  "OleevActive",
+  "OleevSmart",
+  "OliveOil",
+  "OliveOil_Jivo",
+  "OliveOil_KeoKarpin",
+  "OrangeDrink_MinuteMaid",
+  "OrangeFlavour_ToothpasteForKids_DentoShine",
+  "OrangeJuice_Tropicana",
+  "OrangeSplash_Cookies_Unibic",
+  "Orbit",
+  "Oreo",
+  "OrganicAloeVera_Juice_Nimbark",
+  "OrganicAmla-Juice_Nimbark",
+  "OrganicIndia",
+  "OrganicTattva",
+  "Organica",
+  "Organicana",
+  "Oriental",
+  "OriginalChocolate_Pepero",
+  "Orion",
+  "Ortho_Oil_Zandu",
+  "Outshine_HandWash",
+  "Pack_BadamDrink_MTR",
+  "Pack_Chabaa_PineapplePulp",
+  "Pack_Chabaa_RedGrapeFruit",
+  "Pack_Coffee_Nescafe",
+  "Pack_Delmonte_GreenApple",
+  "Pack_Juice_Original",
+  "Pack_Juice_Peach_Delmonte",
+  "Pack_Juice_Pineapple_Delmonte",
+  "Pack_Juice_rani",
+  "PalakPaneer_MinuteMeals_MTR",
+  "Pancake Mix",
+  "Panchratna_Bikaji",
+  "PaneerBhujia_Bikano",
+  "Pansari_Poha",
+  "PaperBoat",
+  "PappaPig",
+  "PaprikaCashews_GoldenGate",
+  "Parachute",
+  "Paramute",
+  "ParisAgro",
+  "ParkAvenue",
+  "Parle",
+  "ParleGRoyale_Biscuit_Parle",
+  "ParleG_BigPack_Biscuit_Parle",
+  "ParleG_Biscuit_Parle",
+  "Parle_20_20_Nice_Biscuits",
+  "Parrot",
+  "Passion_Talc_Envy",
+  "Pasta Masala",
+  "Pasta_Fusilli_FirstCrop",
+  "Pasta_Wokifield",
+  "PastyPixel",
+  "Patanjali",
+  "Patisa_Haldirams",
+  "PauBhaji_Gits",
+  "Pav",
+  "PeanutButter_MyFitness",
+  "PeanutButter_funfoods",
+  "PeanutChikki_Parmod",
+  "Peanut_Butter_AmericanGarden",
+  "Peanutbutter_Alpino",
+  "Peanutbutter_FirstCrop",
+  "Peanutbutter_Pinotola",
+  "Pearl-Beauty_Deodrant_Nivea",
+  "Pears",
+  "PeasInBrine_GoldenCrown",
+  "PediaSure",
+  "PeppaPig",
+  "Peppy",
+  "Perfume_Denver",
+  "Pichkoo_TomatoKetchup_Maggi",
+  "Pickle_DoubleHorse",
+  "Pickle_Garlic_Mother-s",
+  "Pickle_KitchenTreasures",
+  "Pickle_Lime_Mother-s",
+  "Pickle_Mango_Mother-s",
+  "Pickle_Mother-sRecipe",
+  "Piknik",
+  "PineappleJam_FullBloom",
+  "PineappleJam_Kissan",
+  "PineappleSlice_GoldenCrown",
+  "PinkDelight",
+  "PinkSalt",
+  "Pinkrush_facewash",
+  "Pintola",
+  "Pipo",
+  "Pistachios_Wonderland",
+  "Pitambari",
+  "PizzaCheese_Go",
+  "PlainBhujia_Haldirams",
+  "PlaxSpicyFresh_Colgate",
+  "PlumCake",
+  "Poha_FirstCrop",
+  "Poha_Fortune",
+  "Ponds",
+  "Ponds_CharcoalFaceWash",
+  "Popcorn_Ajwah",
+  "Potata_Biscuit_Pran",
+  "PotatoChips_Cream - Onion_FirstCrop",
+  "Pramod",
+  "Pramod Tilkut",
+  "Pramod chikki Gaja",
+  "Pramod peanut chikki",
+  "Pramod sweet bliss",
+  "Pramod_Tilkut",
+  "Pran",
+  "PremiumTea_Mohan",
+  "Prestige",
+  "Primora",
+  "Princles",
+  "Pringles",
+  "ProNature",
+  "Protect-Care_Deodrant_NIvea",
+  "ProteinPlus",
+  "ProteinWater_Aquatein",
+  "Protinex",
+  "Prunes",
+  "Puffs_Funflips",
+  "Pulse_CoolTalc_Axe",
+  "Pulse_Fragrance_Axe",
+  "Pulses",
+  "Pulses_Goodlife",
+  "Pulses_SafeHarvest",
+  "Pulses_TataSampann",
+  "PunjabiChole_Gits",
+  "PunjabiTadka_Bikaji",
+  "PurHoney_Zandu",
+  "Pure",
+  "Purix",
+  "Pushp",
+  "Quaker",
+  "Queen_BodySpray_Vanesa",
+  "QuinoaSeeds_King",
+  "Raavi",
+  "RabdiDrink_Cavins",
+  "Racy",
+  "Rafaelo",
+  "Rajhans",
+  "RajmaMasala_Gits",
+  "RajmaMasala_RedayMeals_Gits",
+  "Ramu",
+  "Rani_Juice",
+  "RapidRelief_Sensodyne",
+  "Rasa",
+  "Rasna",
+  "RasoiMagic",
+  "RaspberryFlavour_ToothpasteForKids_DentoShine",
+  "Raw",
+  "RawPressery",
+  "ReadyToEatNoodles_WaiWai",
+  "Real",
+  "RealThai",
+  "RealThal",
+  "Real_Juice",
+  "RedBull",
+  "RedChilliPowder_Badshah",
+  "RedChilliPowder_Kanwal",
+  "RedChilliPowder_Rehmat",
+  "RedChilliPowder_c",
+  "RedChilliSauce",
+  "RedCurrents_St.John",
+  "RedLabel",
+  "RedTea_Mohan",
+  "RefinedOil",
+  "RefinedOil_Dhara",
+  "RefinedSoyabeanOil_NutriLive",
+  "Regular_GluconD",
+  "Repair-Protect_Sensodyne",
+  "RevitalH_Woman",
+  "Revive",
+  "Rex Baking Powder",
+  "Rice",
+  "Rice_Minimogra",
+  "Rich-Moist_PlumCake_Winkies",
+  "Rin",
+  "RiteBite",
+  "RoastedAndSaltedCashews_Happilo",
+  "RoastedChana_GoldenGate",
+  "RoastedFlexSeed_Ajwah",
+  "RockSalt",
+  "RoohAfza",
+  "RoomFreshner",
+  "Room_Mist_Lia",
+  "Rostaa",
+  "RoyalCupTea_Girnar",
+  "RoyalRatan",
+  "RoyalRedRoses_BodySpray_YardleyLondon",
+  "RoyaleGentleman_Perfume_Yardley",
+  "RuchiStar",
+  "Rusk_FirstCrop",
+  "Rusk_Toastea_Amul",
+  "SRK_AutographCollection_Denver",
+  "Sachamoti",
+  "SadaBahar",
+  "Sadabahar",
+  "Safal",
+  "SafeWash",
+  "Safffola",
+  "Saffola",
+  "Saffola Jammuni Veda",
+  "Saffola oil",
+  "SahiMixture_Bicano",
+  "Salt",
+  "Sams",
+  "Sanchi",
+  "Sapphire",
+  "SaraShree",
+  "SaunfPowder_Rehmat",
+  "Saunf_Everest",
+  "Savera",
+  "Savlon",
+  "Sayang",
+  "Schweppes",
+  "ScotchBrite",
+  "ScotchBrite_Scrubber",
+  "Seacod_CodLiverOilCapsules",
+  "SeedsandNuts_Happilo",
+  "SensitivePlus_Toothpaste_Colgate",
+  "Sensitive_Toothpaste_Colgate",
+  "Sensitivity-Gum_Sensodyne",
+  "ShahiMix_Bikano",
+  "Shampoo_Pantene",
+  "Shampoo_Shanelle",
+  "Shan",
+  "Shan_TandooriMasala",
+  "ShantaG",
+  "Shareat",
+  "ShavingFoam_ViJhon",
+  "Shero_Vanesa",
+  "Shot_Perfume_Layerr",
+  "ShudhUrja",
+  "Siddhayu",
+  "SilverCoin",
+  "Similac",
+  "Similac_FollowUpFormula_Abbot",
+  "Simple_GluconD",
+  "SizzlinHot_Kurkure",
+  "Skippi",
+  "Skittles",
+  "SlicedMushroom_Habit",
+  "SmartSecret_Fragrance_SummerSpring",
+  "SmithandJones",
+  "SnacTac",
+  "Snackible",
+  "Snacks_Ajwah",
+  "Snacks_Kuch-kuch_Bicano",
+  "Snacks_SnacLite_Haldirams",
+  "Snacks_ZigZag",
+  "Snactac",
+  "Snapin",
+  "Snicker",
+  "Snickers",
+  "Snug",
+  "SoYum",
+  "Soap",
+  "SoapNo1_Godrej",
+  "Soap_Aloevera_Dettol",
+  "Soap_Camay",
+  "Soap_Dettol cool",
+  "Soap_Fresh",
+  "Soap_IcyCool_Dettol",
+  "Soap_Jasmine_Lux",
+  "Soap_Liril",
+  "Soap_Neem_Dyna",
+  "Soap_Original_Dettol",
+  "Soap_Rose_Lux",
+  "Soap_Sandal_Dyna",
+  "Soap_fena",
+  "Society",
+  "SocietyTea",
+  "Sofit",
+  "SoftDrink",
+  "SoftDrink_Rasna",
+  "SoftDrink_cola_Campa",
+  "SoftDrink_sprite_Campa",
+  "Softdrink_thumbsup",
+  "Softouch",
+  "Soni Fresh",
+  "SoyaChunks_Ei8amhit",
+  "SoyaSticks-s_Bikaji",
+  "SoyaSticks_Bhujialalji",
+  "SoyaSticks_Jabsons",
+  "Soya_Sauce_Nilons",
+  "Spice_Everest",
+  "Spices_24MantraOrganic",
+  "Spices_Catch",
+  "Spices_FineLife",
+  "Spices_MDH",
+  "Spices_ProNature",
+  "SportingClub_Denver",
+  "Spreads_Hershey-s",
+  "SpringBlossom_perfume_YardleyLondon",
+  "SriSri",
+  "StarBlossom_Perfume_YardleyLondon",
+  "StarFlowerrazi_Perfume_Yardley",
+  "Stayfree",
+  "Sting",
+  "Storia",
+  "StrawberryFlovour_ToothpasteForKids_DentoShine",
+  "Streax",
+  "String",
+  "StrongTeeth_Toothpaste_Colgate",
+  "StuffedChilliPickle_Tops",
+  "SubhKamal",
+  "Suffola",
+  "Sugar",
+  "SugarFree",
+  "SugarFreeCookies_Gaia",
+  "SugarFree_BiskFarm",
+  "SugarLite",
+  "Suhana",
+  "Sultan_Scent_Fogg",
+  "Sundrop",
+  "Sunfeast",
+  "Sunfeast Glucose plus",
+  "SunflowerOil_Cargill",
+  "SunflowerSeed_Ajwah",
+  "SunflowerSeeds_King",
+  "Sunrich",
+  "Sunscream_Lakme",
+  "Sunsilk",
+  "SuperSarvottam",
+  "SurfExcel",
+  "SurfaceDisinfectantSpray_Savlon",
+  "Svami",
+  "Swach",
+  "Swadist",
+  "Swaminarayan",
+  "SweetCorn_Snacko",
+  "SwwetCorn_Sundrop",
+  "SyntheticClearVinegar_Solar",
+  "T-Boost_TrueBasics",
+  "TaaliPeanuts",
+  "Taaza",
+  "TabelSalt_catch",
+  "Tadaa",
+  "TajMahal",
+  "Talati",
+  "Talod",
+  "Tamarind",
+  "Tamarind_Dishwash_Prill",
+  "Tamarind_Priya",
+  "Tang",
+  "TastyNutes_Bikano",
+  "TastyNutes_FirstCrop",
+  "Tata",
+  "TataGold",
+  "TataPremium",
+  "TataTea taaza",
+  "Tata_Agni",
+  "TeaCountry",
+  "TeaTime",
+  "TeaValley",
+  "Tea_3Roses_BrookeBond",
+  "Tea_ElaichiChai_Shera",
+  "Tea_Mayur",
+  "Tea_Ruby_BrookeBond",
+  "Tea_Shera",
+  "Tea_royal",
+  "Tealeaves_DoubleDiamond",
+  "TedheMedhe_Bingo_AlooBhuja",
+  "TedheMedhe_Bingo_PulseMix",
+  "TeekhaMeetha_Bhujialalji",
+  "TeekhaMeetha_FirstCrop",
+  "Tetley",
+  "TheBakersDozen",
+  "TicTac",
+  "Tide",
+  "Tiffany",
+  "Tiger",
+  "Timios",
+  "Tingle",
+  "Toblerone",
+  "Toffee_Ajwah",
+  "ToiletCleaner_Expelz",
+  "TomatoChilliSauce_Maggi",
+  "TomatoDiscs_Peppy",
+  "TomatoKetchup",
+  "TomatoPuree_GodenCrown",
+  "TomatoPuree_Kissan",
+  "TomatoSoup_Bambino",
+  "TongGarden",
+  "TooYumm",
+  "ToothBrush",
+  "ToothPaste",
+  "ToothPaste_CloseUp_ExtremeCool",
+  "ToothPaste_PepsodentG",
+  "ToothPowder_Colgate",
+  "Toothbrush_Sensodyne",
+  "ToothpasteForLittleOnes_DentoShine",
+  "Toothpaste_Meswak",
+  "Toothpaste_Meswak_Dabur",
+  "Toothpaste_RedGel_Dabur",
+  "Toothpaste_Red_BAEFresh_Dabur",
+  "Toothpaste_tulsi_Dabur",
+  "TopBiscuit_Parle",
+  "TopNTown",
+  "TopNut",
+  "TopRamen",
+  "Top_Biscuit_Parle",
+  "Top_Ramen",
+  "Tops",
+  "Tops_DrinkingChocolate",
+  "Total12_Toothpaste_Colgate",
+  "TragacanthGum_Ajwah",
+  "Trail_Mix_DryFruit",
+  "Trapa",
+  "Treat_BasmatiRice",
+  "Trelish",
+  "Tresemme",
+  "Trident",
+  "TriphalaChurna_Baidyanath",
+  "Tropicana",
+  "Truffles_Joyland",
+  "Trump_Scent_Fogg",
+  "TulsiGingerTurmeric_OrganicIndia",
+  "TunaNaturalOil_GoldenCrown",
+  "TunaOil_GoldenCrown",
+  "TurkishApricot_Happilo",
+  "TurmericPowder_Kanwal",
+  "TurmericPowder_Rehmat",
+  "Twinings",
+  "TwistiesNamkeen_FirstCrop",
+  "Twix",
+  "Tycoon_Scent_Fogg",
+  "Ujala",
+  "UltraPowerBalm_Zandu",
+  "UltraSensual_Deodrant_WildStone",
+  "UncleChips",
+  "Unibic",
+  "Unibic_CashewBadam_Cookies",
+  "Upma_Bambino",
+  "Upma_MTR",
+  "Utsav",
+  "VJohn",
+  "VWash",
+  "Vahdam",
+  "VajradantiSF_Toothpaste_Vicco",
+  "Vanish",
+  "Vaseline",
+  "Vatika",
+  "VedShakti_Toothpaste_Colgate",
+  "Veeba",
+  "VegBiryani_Gits",
+  "VegMayonnaise_Imli_Tree",
+  "VegMayonnaise_Veeba",
+  "VeganMayo_HellManns",
+  "Veggi_CupNoodles_Manchow",
+  "VeggieManchow_CupNoodles",
+  "Vermicelli",
+  "Vicks",
+  "Vicks_VapoRub_SteamPads",
+  "Vidal",
+  "Vim",
+  "Vim Anti bac",
+  "Vinegar",
+  "Vinegar_Everest",
+  "VisibleWhite_Toothpaste_Colgate",
+  "VitaminCapsules_Seacod",
+  "Voilet-Raspberry_Perfume_YardleyLondon",
+  "Voyage_Perfume_ParkAvenue",
+  "WSQ_VICCO",
+  "WaferRoll_Champion",
+  "Wafer_Orange_FullBloom",
+  "Wafer_Strawberry_FullBloom",
+  "Wafers_Gastone_Lago",
+  "Wafers_Tirameso_Creweto",
+  "Wafers_Treat_Britannia",
+  "WaffleBites_Craveto",
+  "Waffy_Parle",
+  "WaghBakri",
+  "WaiWai",
+  "Walnut",
+  "WalnutKernels_Kohinoor",
+  "Walnut_Nutraj",
+  "Water",
+  "WatermelonFlovour_ToothpasteForKids_DentoShine",
+  "WeikField",
+  "Weikfield",
+  "WellnessCollection_TGLCo.",
+  "Wheel",
+  "Whisper",
+  "WhiteBread_Dewz",
+  "WhiteCurrents_St.John",
+  "WhiteningSmoothSkin_Deodrant_Nivea",
+  "Whitening_Sensodyne",
+  "Wilkinson_Sword_Gillette",
+  "Win2",
+  "Winegreens",
+  "Winkies",
+  "Women-sPlus_Horlicks",
+  "WomensPlus",
+  "WottaGirl_Perfume_Layerr",
+  "Yardley",
+  "Yeah",
+  "Yeast",
+  "YellowBananaChips_Haldirams",
+  "YellowBlossom_Spinz",
+  "YogaBar",
+  "Yogurt",
+  "Yummies",
+  "Yummiez",
+  "Yummy",
+  "Zandu",
+  "ZanduBalm",
+  "Zenzi",
+  "Zouk_BodySpray_ParkAvenue",
+  "ZzzQuil_Natura",
+  "aachar-sethi",
+  "aashirvaad milk",
+  "adzanarice_charminar",
+  "agra petha",
+  "aircleaner_airwick",
+  "aircleaner_campure",
+  "airfreshber_koparo",
+  "airfreshner_hifresh",
+  "airfreshner_lavender",
+  "airfreshner_wonderfresh",
+  "almondmilk_at",
+  "amul cow milk",
+  "amul masti dahi",
+  "amul moti milk",
+  "amulcow milk",
+  "amulmilk",
+  "amulmithai mate",
+  "amulspray infant milk food",
+  "ananda ghee",
+  "apple_fishermanfriend",
+  "appy",
+  "aquafly_water",
+  "asian cosmos",
+  "axe",
+  "bagrry-s corn flakes",
+  "bajaj hair oil",
+  "bajaj majesty RX11",
+  "banana fryums",
+  "bansal tableware",
+  "batook",
+  "bc bikaneri rasgulla",
+  "bc_rasgulla_ bikaneri",
+  "beer_oldammaiga",
+  "besen",
+  "betty crocker pancske mix",
+  "bicano kaju mixture",
+  "bikaneri bhujia first crop",
+  "bikaneri_bhujia_firstcrop",
+  "bikano bhelpuri mixture",
+  "bikano time mixture",
+  "biscuit_bourbon",
+  "biscuit_deluxe",
+  "biscuit_sugarcracker",
+  "bisk farn googly",
+  "biskfarm eat fit",
+  "biskfarm top herbs",
+  "black salt zoff",
+  "borges",
+  "boroplus_bodylotion",
+  "britannia cow ghee",
+  "britannia the laughiung cow",
+  "britannia tiger crunch",
+  "britannia50 50 time pass",
+  "britanniacheese slicess",
+  "brown - Polson custard powder",
+  "camembert",
+  "campoor",
+  "candy_kisses",
+  "candy_m-m",
+  "candy_nerds",
+  "candy_ricola",
+  "catch black papper",
+  "catch red chilli powder",
+  "catch_black_papper",
+  "cello",
+  "chabaa_redgrape",
+  "chabaa_whitegrapefruit",
+  "chana dal",
+  "cheese-corn bite",
+  "cheese_soignon",
+  "chile",
+  "chings soy sauce",
+  "chocalateHorlicks",
+  "choclate-gaia",
+  "choclate_ambriona",
+  "choclate_epigamia",
+  "choclate_lindberg",
+  "choclate_loacker",
+  "choclate_melt",
+  "classic bread",
+  "cobra",
+  "coconut milk_uglobe",
+  "cod Liver oil capsules",
+  "colgate max",
+  "conditioner natural",
+  "cookie_merba",
+  "cool freshness",
+  "cows_ghee",
+  "cream-vlcc",
+  "crop oats Yummy protein-rich oats",
+  "cup noodles italiano",
+  "dabar anmol hair oil",
+  "dabar champrash",
+  "dabar gulabari moisturizing",
+  "dabur babool",
+  "dahi_at",
+  "dalda",
+  "danaram",
+  "dantkanti",
+  "dark fantasy sandwich cream",
+  "delisoga",
+  "double diamond tea",
+  "dove daily shine shampoo",
+  "dove intense repair shampoo",
+  "dozo power wash",
+  "drink_toran",
+  "drink_torani",
+  "drools",
+  "dyna neem soap",
+  "elmac lemon",
+  "emami 7oils",
+  "expelz ultra clean",
+  "expelz_ultraclean",
+  "facewash_beardo",
+  "facewash_coffee",
+  "facewash_wow",
+  "fair and handsome",
+  "fanta",
+  "femina casserole",
+  "figo detergent",
+  "first crop all in one mixture",
+  "first crop aloo bhujia",
+  "first crop besan",
+  "first crop butter cookies",
+  "first crop cream onion potato chips",
+  "first crop dry fruit",
+  "first crop instant noodle",
+  "first crop navratan",
+  "first crop oats",
+  "first crop peanut butter",
+  "first crop tasty peanut",
+  "first crop tastypeanuts",
+  "first crop zahidi dates",
+  "fist crop navratan",
+  "fist crop oats",
+  "fortune indori poha",
+  "fresca_greenapple",
+  "freshNatural_Deodrant_Nivea",
+  "full bloom",
+  "full bloom classic tea",
+  "full bloom ketchup",
+  "full bloom strawberry",
+  "gaurianjeer",
+  "ghee_Patanjali",
+  "ghee_milkfood",
+  "ginger nuts",
+  "gits rabdi",
+  "glow-lovely_cream",
+  "glow-lovely_winter brightcream",
+  "gluta-hya_Vaseline",
+  "go cheese processed",
+  "go pizza cheese",
+  "godrej yummiez",
+  "godrejrich creme",
+  "goldengate roasted almonds",
+  "good life chana dal",
+  "good life jeera",
+  "greenmint_Mala-s",
+  "harpic",
+  "hathi brand mustard oil",
+  "hawkins",
+  "head and shoulder 2 in 1",
+  "highonCranberry_Beer_Coolberg",
+  "himalaya complete care",
+  "himalaya neem face wash",
+  "himalaya shampoo",
+  "himalayan body lotion",
+  "hotdog",
+  "icecream_creambell",
+  "imli treengreen chilli pickle",
+  "inchi",
+  "independence",
+  "independence biryani special",
+  "independence dal",
+  "india gate jeera rice",
+  "indica easy hair colour",
+  "intense engage",
+  "jira_goodlife",
+  "johnson",
+  "joyo clean max",
+  "juice_edwesis",
+  "juice_sante",
+  "junior horlicks",
+  "kachi ghani mustard oil first crop",
+  "kakaji",
+  "kelloggy-s corn flakes real honey",
+  "kellogs corn flakes",
+  "keya all purpose",
+  "keya piri piri",
+  "kissan mixed fruit jam",
+  "kissan peanut butter",
+  "knorr chicken soup",
+  "krack track",
+  "light frydrate_Vaseline",
+  "livon",
+  "loreal colour protect shampoo",
+  "lotus moisturiser",
+  "lotus whiteglow",
+  "making corn",
+  "mala jamun jam",
+  "malist cheese",
+  "mama earth baby body wash",
+  "mamaearth baby body wash",
+  "manish",
+  "manypoko pants",
+  "masala chai",
+  "masala_sbm",
+  "masti oye",
+  "max protein daily",
+  "maza",
+  "mdh amchur powder",
+  "mdh hing",
+  "mdh jeera powder",
+  "mdh kashmiri mirch",
+  "mdh lal mirch",
+  "melody",
+  "men turbobright_Garnier",
+  "meusli",
+  "mevities ginger nuts",
+  "milton",
+  "minakshi ghee",
+  "miranda",
+  "mixture_indiaah",
+  "mohan red tea",
+  "mosquito repllent_baygon",
+  "mosquito repllent_bbhome",
+  "mosquito repllent_mamaearth",
+  "mosquito repllents_bodyguard",
+  "mosquito repllents_campure",
+  "mother ginger pickle",
+  "mother recipe garlic pickle",
+  "mother teekha meeth",
+  "mothers upma",
+  "mothers_upma",
+  "mstick",
+  "munch the cashew",
+  "mushroom Soup_Knorr",
+  "my home",
+  "namaste india desi ghee",
+  "napthaleneballs_ultra",
+  "natural amla shikakai shampoo",
+  "nescafe classic blast roast",
+  "nestle lactogen pro",
+  "nestle nan pro",
+  "nimbu hi nimbu clanser",
+  "nivea men deep impact",
+  "noodles_MastiOye",
+  "nutralite sampriti ghee",
+  "oil_iris",
+  "organic suji",
+  "osam dahi",
+  "osam plain dahi",
+  "osam plaindahi",
+  "outshuine",
+  "pack_chabaa_whitegrapefruit",
+  "paint_asianpaint",
+  "paneer butterMasala_CuppaNoodles_Maggi",
+  "pantene advance hairfall solution shampoo",
+  "paraclute body lotion",
+  "parle jaggery",
+  "parle occasions",
+  "parth jaggery",
+  "patanjali saundarya face wash",
+  "peanut_Bicano",
+  "pedigree",
+  "phenyl_ambetol",
+  "poloqueen healthy",
+  "poloqueen jasmine",
+  "poloqueen lemon",
+  "poltcab",
+  "potato fryums",
+  "potatochips_kakaji",
+  "prabhaji  jhakaas mix",
+  "prabhaji  moong daal",
+  "prabhaji all in one",
+  "prabhaji bhujia",
+  "prabhaji chat pata",
+  "prabhaji khata meetha",
+  "prabhji khatta meetha",
+  "prabhuji moong dal -",
+  "priemer_terminator",
+  "priya gold CNC",
+  "punch the healthy crunch chile",
+  "punchthehealthy_crunch_chile",
+  "pure burst cow ghee",
+  "pureGhee_Amul",
+  "quickwrapp",
+  "real man",
+  "redpaste",
+  "rite bite nuts-seeds",
+  "riya bindas",
+  "riya hum tum",
+  "roasted vermicelli",
+  "roomfreshner_ambipur",
+  "ruchi kheer mix",
+  "ruchiayuna jaggery",
+  "saffalo masala oats",
+  "saffalo oats",
+  "saffola mustard oil",
+  "sauce_schezwan",
+  "savlon deep clean",
+  "sawan",
+  "shahi pariwar mustard oil",
+  "shampoo_Natural",
+  "shaving_foam_gillete",
+  "snac tac chana",
+  "snac tac hot -sour soup",
+  "snac tac moong dal",
+  "snac tac navratan mix",
+  "snac tac tomato ketchup",
+  "snac tac tomato soup",
+  "snacks_jabsons",
+  "snowhite detergent powder",
+  "soap-protam",
+  "soap-santoor",
+  "soap-savloon",
+  "soap_ghari",
+  "soap_nip",
+  "soap_no.1",
+  "softdrink_arupe",
+  "softdrink_edwises",
+  "softdrink_evocus",
+  "softdrink_raze",
+  "softdrink_v8",
+  "sofy anti bacteria",
+  "soya chunks",
+  "soya sauce_sam",
+  "spice chicken_CuppaNoodles_Maggi",
+  "spout",
+  "spray_moov",
+  "sudha lassi",
+  "sudha milk",
+  "sudha peda",
+  "sundrop oil",
+  "sunfeast dream",
+  "sweet bliss peanut chikki",
+  "sweet corn chicken soup_Knorr",
+  "sweet_marshmelts",
+  "tata samparn hing",
+  "tea-tgl",
+  "the man companyaloevera",
+  "tomato fryums",
+  "top Ramen masala",
+  "top herbs",
+  "top ramen curry",
+  "towels_blooms",
+  "troplcana",
+  "tsauce_ops",
+  "uncl",
+  "vaseline deep moisture",
+  "vasmol kesh kala",
+  "veg cheese finger",
+  "vicco",
+  "vim Maha bar",
+  "vintagecheddar_wyke",
+  "vlcc charco",
+  "vlcc eternal youth skin firming",
+  "vlcc insta glow diamonfd bleach",
+  "wafers-moneta",
+  "wafers_unibic",
+  "wal wal takka",
+  "weikfield",
+  "whippingcream",
+  "whiskas",
+  "white hansha jasmine fragrance",
+  "white hansha rose fragrance",
+  "white hansha sanitary",
+  "whiteningSensitive_Deodrant_Nivea",
+  "wiekfeild custard powder",
+  "wildstone code",
+  "woodpriemer_terminator",
+  "yardley talc",
+  "yardleys after shave lotionn",
+  "yippee",
+  "zoff",
+  "Kitchen",
+  "MixedPickle_Alps",
+  "jam",
+  "oil",
+  "undefined"
+]

requirements.txt CHANGED Viewed

@@ -1,5 +1,5 @@
 gradio==6.16.0
-llama-cpp-python==0.3.9
 onnxruntime==1.21.0
 Pillow==11.2.1
 PyMuPDF==1.25.5

 gradio==6.16.0
+llama-cpp-python==0.3.28
 onnxruntime==1.21.0
 Pillow==11.2.1
 PyMuPDF==1.25.5

tracer.py CHANGED Viewed

@@ -14,7 +14,7 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
-_HF_DATASET_REPO = "naazimsnh02/kirana-detective-traces"
 MAX_RETRIES = 3
 BACKOFF_BASE_SECONDS = 2  # sleeps 2s, 4s, 8s on successive failures

 logger = logging.getLogger(__name__)
+_HF_DATASET_REPO = "build-small-hackathon/kirana-detective-traces"
 MAX_RETRIES = 3
 BACKOFF_BASE_SECONDS = 2  # sleeps 2s, 4s, 8s on successive failures