naazimsnh02 committed on
Commit
9d75c8c
·
1 Parent(s): 7b5611f

All models training uploaded

Browse files
.kiro/specs/kirana-detective/design.md CHANGED
@@ -56,7 +56,7 @@ The total active model parameter budget is approximately 2.38B (1.3B + 1.08B + 0
56
  │ ┌──────────┐ ┌──────────────┐ ┌──────────────────────────┐ │
57
  │ │ SQLite │ │ catalog.py │ │ tracer.py │ │
58
  │ │ storage │ │ fmcg_catalog │ │ HF Hub dataset publisher │ │
59
- │ │ .db file │ │ .json (200) │ │ naazimsnh02/kirana-... │ │
60
  │ └──────────┘ └──────────────┘ └──────────────────────────┘ │
61
  └─────────────────────────────────────────────────────────────────────┘
62
  ```
@@ -170,11 +170,11 @@ class AuditOrchestrator:
170
 
171
  ```python
172
  class InvoiceExtractorAgent:
173
- MODEL_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
174
  AGENT_NAME = "Invoice_Extractor"
175
  AGENT_VERSION = "1.0.0"
176
 
177
- def __init__(self, llm: Llama): ...
178
 
179
  def extract(
180
  self,
@@ -187,7 +187,7 @@ class InvoiceExtractorAgent:
187
 
188
  ```python
189
  class ProductMatcherAgent:
190
- MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
191
  AGENT_NAME = "Product_Matcher"
192
  AGENT_VERSION = "1.0.0"
193
 
@@ -255,7 +255,7 @@ class PricingAgent:
255
 
256
  ```python
257
  class VisualCounterAgent:
258
- MODEL_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
259
  AGENT_NAME = "Visual_Counter"
260
  AGENT_VERSION = "1.0.0"
261
 
@@ -312,7 +312,7 @@ class ReconciliationAgent:
312
 
313
  ```python
314
  class SavingsAgent:
315
- MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
316
  AGENT_NAME = "Savings_Agent"
317
  AGENT_VERSION = "1.0.0"
318
 
@@ -410,7 +410,7 @@ class FMCGCatalog:
410
 
411
  ```python
412
  class AgentTracer:
413
- HF_DATASET_REPO = "naazimsnh02/kirana-detective-traces"
414
  MAX_RETRIES = 3
415
  BACKOFF_BASE_SECONDS = 2.0
416
 
@@ -991,27 +991,23 @@ def load_models() -> dict:
991
  Downloads are skipped if cached files exist.
992
  """
993
 
994
- # --- Agent 1: MiniCPM-V 4.6 (vision) ---
995
- vision_gguf_path = hf_hub_download(
996
- repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
997
- filename="model.gguf", # Q4_K_M quantized
998
- )
999
- clip_model_path = hf_hub_download(
1000
- repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
1001
- filename="mmproj.gguf", # Vision encoder projection weights
1002
  )
1003
- chat_handler = MiniCPMv26ChatHandler(clip_model_path=clip_model_path)
1004
- vision_llm = Llama(
1005
- model_path=vision_gguf_path,
1006
- chat_handler=chat_handler,
1007
- n_ctx=4096,
1008
- n_threads=4,
1009
- verbose=False,
1010
  )
 
1011
 
1012
  # --- Agents 2 & 6: MiniCPM5-1B (text) ---
1013
  text_gguf_path = hf_hub_download(
1014
- repo_id="naazimsnh02/minicpm5-1b-indian-fmcg-normalizer",
1015
  filename="model.gguf", # Q4_K_M quantized
1016
  )
1017
  text_llm = Llama(
@@ -1023,11 +1019,11 @@ def load_models() -> dict:
1023
 
1024
  # --- Agent 4: YOLO26n ONNX ---
1025
  onnx_path = hf_hub_download(
1026
- repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
1027
  filename="yolo26n_fmcg.onnx",
1028
  )
1029
  class_names_path = hf_hub_download(
1030
- repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
1031
  filename="class_names.json",
1032
  )
1033
  ort_session = onnxruntime.InferenceSession(
 
56
  │ ┌──────────┐ ┌──────────────┐ ┌──────────────────────────┐ │
57
  │ │ SQLite │ │ catalog.py │ │ tracer.py │ │
58
  │ │ storage │ │ fmcg_catalog │ │ HF Hub dataset publisher │ │
59
+ │ │ .db file │ │ .json (200) │ │ build-small-hackathon/kirana-... │ │
60
  │ └──────────┘ └──────────────┘ └──────────────────────────┘ │
61
  └─────────────────────────────────────────────────────────────────────┘
62
  ```
 
170
 
171
  ```python
172
  class InvoiceExtractorAgent:
173
+ MODEL_REPO = "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged"
174
  AGENT_NAME = "Invoice_Extractor"
175
  AGENT_VERSION = "1.0.0"
176
 
177
+ def __init__(self, llm): ...
178
 
179
  def extract(
180
  self,
 
187
 
188
  ```python
189
  class ProductMatcherAgent:
190
+ MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
191
  AGENT_NAME = "Product_Matcher"
192
  AGENT_VERSION = "1.0.0"
193
 
 
255
 
256
  ```python
257
  class VisualCounterAgent:
258
+ MODEL_REPO = "build-small-hackathon/yolo26n-indian-fmcg-detection"
259
  AGENT_NAME = "Visual_Counter"
260
  AGENT_VERSION = "1.0.0"
261
 
 
312
 
313
  ```python
314
  class SavingsAgent:
315
+ MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
316
  AGENT_NAME = "Savings_Agent"
317
  AGENT_VERSION = "1.0.0"
318
 
 
410
 
411
  ```python
412
  class AgentTracer:
413
+ HF_DATASET_REPO = "build-small-hackathon/kirana-detective-traces"
414
  MAX_RETRIES = 3
415
  BACKOFF_BASE_SECONDS = 2.0
416
 
 
991
  Downloads are skipped if cached files exist.
992
  """
993
 
994
+ # --- Agent 1: MiniCPM-V 4.6 (vision, merged weights via transformers) ---
995
+ _vision_model = AutoModel.from_pretrained(
996
+ "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
997
+ trust_remote_code=True,
998
+ torch_dtype=torch.bfloat16,
999
+ device_map="auto",
 
 
1000
  )
1001
+ _vision_model.eval()
1002
+ _vision_tokenizer = AutoTokenizer.from_pretrained(
1003
+ "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
1004
+ trust_remote_code=True,
 
 
 
1005
  )
1006
+ vision_llm = (_vision_model, _vision_tokenizer)
1007
 
1008
  # --- Agents 2 & 6: MiniCPM5-1B (text) ---
1009
  text_gguf_path = hf_hub_download(
1010
+ repo_id="build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer",
1011
  filename="model.gguf", # Q4_K_M quantized
1012
  )
1013
  text_llm = Llama(
 
1019
 
1020
  # --- Agent 4: YOLO26n ONNX ---
1021
  onnx_path = hf_hub_download(
1022
+ repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
1023
  filename="yolo26n_fmcg.onnx",
1024
  )
1025
  class_names_path = hf_hub_download(
1026
+ repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
1027
  filename="class_names.json",
1028
  )
1029
  ort_session = onnxruntime.InferenceSession(
.kiro/specs/kirana-detective/requirements.md CHANGED
@@ -205,7 +205,7 @@ All inference runs locally with no cloud API calls (Off the Grid badge). Both la
205
 
206
  1. THE System SHALL create one Agent_Trace entry for each agent call within an Audit_Run, capturing: `agent_name`, `agent_version`, `audit_run_id`, `timestamp_start`, `timestamp_end`, `duration_ms`, `input_summary`, and `output_summary`.
207
  2. THE System SHALL record Agent_Trace entries in the sequential pipeline order: Invoice_Extractor → Product_Matcher → Pricing_Agent → Visual_Counter → Reconciliation_Agent → Savings_Agent.
208
- 3. THE System SHALL publish the complete Agent_Trace for each Audit_Run as a row in the HuggingFace Hub dataset `naazimsnh02/kirana-detective-traces` within 10 seconds of the Audit_Run completing.
209
  4. WHEN the HuggingFace Hub dataset is unreachable, THE System SHALL save the Agent_Trace locally and retry publication with exponential back-off up to 3 attempts.
210
  5. THE Agent_Trace SHALL NOT include raw invoice image bytes, raw delivery photo bytes, or any personally identifiable information from the invoice.
211
  6. THE System SHALL assign a unique `audit_run_id` (UUID v4) to each Audit_Run and include it in every Agent_Trace entry and the Leakage_Report.
@@ -235,10 +235,10 @@ All inference runs locally with no cloud API calls (Off the Grid badge). Both la
235
 
236
  #### Acceptance Criteria
237
 
238
- 1. THE Invoice_Extractor SHALL run MiniCPM-V 4.6 inference exclusively via llama-cpp-python using the GGUF-quantised model file `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`, with no HTTP calls to any external AI API.
239
- 2. THE Product_Matcher SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
240
- 3. THE Savings_Agent SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
241
- 4. THE Visual_Counter SHALL run YOLO26n inference exclusively via onnxruntime using the locally-stored ONNX model file `naazimsnh02/yolo26n-indian-fmcg-detection`, with no HTTP calls to any external AI API.
242
  5. THE System SHALL load all model files from the HuggingFace Hub cache at startup and SHALL NOT download model files during an Audit_Run.
243
  6. WHILE operating in inference mode, THE System SHALL make no outbound HTTP calls except to the HuggingFace Hub dataset API for Agent_Trace publication (Requirement 11).
244
 
@@ -250,9 +250,9 @@ All inference runs locally with no cloud API calls (Off the Grid badge). Both la
250
 
251
  #### Acceptance Criteria
252
 
253
- 1. THE System SHALL use the model `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction` — a MiniCPM-V 4.6 model fine-tuned via QLoRA on Unsloth and Modal using 500 synthetic Indian invoice images across 10 suppliers and 4 invoice formats.
254
- 2. THE System SHALL use the model `naazimsnh02/yolo26n-indian-fmcg-detection` — a YOLO26n model fine-tuned on the Roboflow Indian Grocery Object Detection dataset, exported to ONNX format.
255
- 3. THE System SHALL use the model `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer` — a MiniCPM5-1B model fine-tuned via QLoRA on Unsloth and Modal using 2,000 synthetic (raw_name, normalized_name) pairs covering the top 200 Indian FMCG SKUs.
256
  4. THE System SHALL reference each published model by its HuggingFace Hub repository identifier in the application configuration.
257
 
258
  ---
 
205
 
206
  1. THE System SHALL create one Agent_Trace entry for each agent call within an Audit_Run, capturing: `agent_name`, `agent_version`, `audit_run_id`, `timestamp_start`, `timestamp_end`, `duration_ms`, `input_summary`, and `output_summary`.
207
  2. THE System SHALL record Agent_Trace entries in the sequential pipeline order: Invoice_Extractor → Product_Matcher → Pricing_Agent → Visual_Counter → Reconciliation_Agent → Savings_Agent.
208
+ 3. THE System SHALL publish the complete Agent_Trace for each Audit_Run as a row in the HuggingFace Hub dataset `build-small-hackathon/kirana-detective-traces` within 10 seconds of the Audit_Run completing.
209
  4. WHEN the HuggingFace Hub dataset is unreachable, THE System SHALL save the Agent_Trace locally and retry publication with exponential back-off up to 3 attempts.
210
  5. THE Agent_Trace SHALL NOT include raw invoice image bytes, raw delivery photo bytes, or any personally identifiable information from the invoice.
211
  6. THE System SHALL assign a unique `audit_run_id` (UUID v4) to each Audit_Run and include it in every Agent_Trace entry and the Leakage_Report.
 
235
 
236
  #### Acceptance Criteria
237
 
238
+ 1. THE Invoice_Extractor SHALL run MiniCPM-V 4.6 inference exclusively via transformers (`AutoModel.chat()`) using the merged model `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`, with no HTTP calls to any external AI API.
239
+ 2. THE Product_Matcher SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
240
+ 3. THE Savings_Agent SHALL run MiniCPM5-1B inference exclusively via llama-cpp-python using the GGUF-quantised model file `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`, with no HTTP calls to any external AI API.
241
+ 4. THE Visual_Counter SHALL run YOLO26n inference exclusively via onnxruntime using the locally-stored ONNX model file `build-small-hackathon/yolo26n-indian-fmcg-detection`, with no HTTP calls to any external AI API.
242
  5. THE System SHALL load all model files from the HuggingFace Hub cache at startup and SHALL NOT download model files during an Audit_Run.
243
  6. WHILE operating in inference mode, THE System SHALL make no outbound HTTP calls except to the HuggingFace Hub dataset API for Agent_Trace publication (Requirement 11).
244
 
 
250
 
251
  #### Acceptance Criteria
252
 
253
+ 1. THE System SHALL use the model `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged` — a MiniCPM-V 4.6 model fine-tuned via QLoRA and merged (LoRA weights baked into base), trained on 500 synthetic Indian invoice images across 10 suppliers and 4 invoice formats.
254
+ 2. THE System SHALL use the model `build-small-hackathon/yolo26n-indian-fmcg-detection` — a YOLO26n model fine-tuned on the Roboflow Indian Grocery Object Detection dataset, exported to ONNX format.
255
+ 3. THE System SHALL use the model `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer` — a MiniCPM5-1B model fine-tuned via QLoRA on Unsloth and Modal using 2,000 synthetic (raw_name, normalized_name) pairs covering the top 200 Indian FMCG SKUs.
256
  4. THE System SHALL reference each published model by its HuggingFace Hub repository identifier in the application configuration.
257
 
258
  ---
.kiro/specs/kirana-detective/tasks.md CHANGED
@@ -14,16 +14,16 @@ The implementation language is **Python** (all agents, pipeline, storage, catalo
14
  - [ ] 0.1 Fine-tune YOLO26n on Indian grocery dataset (Day 1)
15
  - Write `finetune/train_yolo26n.py` using Modal + Roboflow Indian Grocery Object Detection dataset
16
  - Export trained weights to ONNX format (`yolo26n_fmcg.onnx`) and `class_names.json`
17
- - Publish to `naazimsnh02/yolo26n-indian-fmcg-detection` on HF Hub with model card
18
  - _Requirements: 8.4, 13.4, 14.2_
19
  - [ ] 0.2 Generate synthetic invoices and fine-tune MiniCPM-V 4.6 (Day 2)
20
  - Write `finetune/generate_invoices.py` — 500 synthetic Indian invoice images across 10 suppliers, 4 formats (printed GST, handwritten, Tally PDF, WhatsApp screenshot)
21
  - Write `finetune/train_minicpm_v.py` using QLoRA on Unsloth + Modal
22
- - Publish GGUF-quantised model (`model.gguf`, `mmproj.gguf`) to `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`
23
  - _Requirements: 2.1, 2.3, 13.1, 14.1_
24
  - [ ] 0.3 Fine-tune MiniCPM5-1B for FMCG normalisation (Day 3)
25
  - Write `finetune/train_minicpm5_1b.py` — 2,000 synthetic `(raw_name, normalized_name)` pairs covering 200 FMCG SKUs, QLoRA on Unsloth + Modal
26
- - Publish GGUF-quantised model (`model.gguf`) to `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`
27
  - _Requirements: 3.2, 13.2, 13.3, 14.3_
28
 
29
  - [x] 1. Project scaffolding — directory structure, pinned dependencies, README skeleton
@@ -87,7 +87,7 @@ The implementation language is **Python** (all agents, pipeline, storage, catalo
87
  - `finalise(audit_run_id)` → `List[AgentTraceEntry]`: return buffer and clear it
88
  - `publish_async(audit_run_id, entries, storage)`: start daemon `threading.Thread` targeting `_publish_with_retry`
89
  - `_publish_with_retry(audit_run_id, entries, storage)`: call `storage.save_audit_run()`; loop `MAX_RETRIES=3` times, calling `_publish_to_hf_hub()`; on failure sleep `BACKOFF_BASE_SECONDS ** (attempt+1)` (2s, 4s, 8s); log final failure without raising
90
- - `_publish_to_hf_hub(audit_run_id, entries)`: use `HfApi.upload_file()` to append `traces/{audit_run_id}.json` to `naazimsnh02/kirana-detective-traces` dataset repo; row: `{audit_run_id, trace_json, timestamp}`; must NOT include raw invoice bytes, photos, or PII
91
  - Implement `_make_trace_entry()` module-level helper: captures `timestamp_start` (ISO 8601 UTC), `timestamp_end`, `duration_ms` from `time.monotonic()`
92
  - _Requirements: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6_
93
 
@@ -203,7 +203,7 @@ The implementation language is **Python** (all agents, pipeline, storage, catalo
203
  - [ ] 18. HF Space deployment — `README.md` and model verification
204
  - [ ] 18.1 Finalise `README.md` with HF Space config and model download verification
205
  - Add YAML front-matter: `sdk: gradio`, `sdk_version: 6.16.0`, `app_file: app.py`, `title: Kirana Detective AI`, `short_description: AI invoice auditor for kirana stores`, `license: mit`, `tags: [invoice-audit, llm, yolo, gguf, gradio]`
206
- - Document all three model repos (`naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`, `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`, `naazimsnh02/yolo26n-indian-fmcg-detection`)
207
  - Add section: "Model download verification" — describe `hf_hub_download()` usage and expected cache paths
208
  - Add "Running locally" section with `pip install -r requirements.txt` + `python app.py`
209
  - Add "Hackathon badges" section listing Off the Grid, Llama Champion, Off-Brand, Sharing is Caring, Well-Tuned, Tiny Titan
 
14
  - [ ] 0.1 Fine-tune YOLO26n on Indian grocery dataset (Day 1)
15
  - Write `finetune/train_yolo26n.py` using Modal + Roboflow Indian Grocery Object Detection dataset
16
  - Export trained weights to ONNX format (`yolo26n_fmcg.onnx`) and `class_names.json`
17
+ - Publish to `build-small-hackathon/yolo26n-indian-fmcg-detection` on HF Hub with model card
18
  - _Requirements: 8.4, 13.4, 14.2_
19
  - [ ] 0.2 Generate synthetic invoices and fine-tune MiniCPM-V 4.6 (Day 2)
20
  - Write `finetune/generate_invoices.py` — 500 synthetic Indian invoice images across 10 suppliers, 4 formats (printed GST, handwritten, Tally PDF, WhatsApp screenshot)
21
  - Write `finetune/train_minicpm_v.py` using QLoRA on Unsloth + Modal
22
+ - Publish the merged full-weight model (bfloat16, transformers format; LoRA weights baked into the base) to `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`
23
  - _Requirements: 2.1, 2.3, 13.1, 14.1_
24
  - [ ] 0.3 Fine-tune MiniCPM5-1B for FMCG normalisation (Day 3)
25
  - Write `finetune/train_minicpm5_1b.py` — 2,000 synthetic `(raw_name, normalized_name)` pairs covering 200 FMCG SKUs, QLoRA on Unsloth + Modal
26
+ - Publish GGUF-quantised model (`model.gguf`) to `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
27
  - _Requirements: 3.2, 13.2, 13.3, 14.3_
28
 
29
  - [x] 1. Project scaffolding — directory structure, pinned dependencies, README skeleton
 
87
  - `finalise(audit_run_id)` → `List[AgentTraceEntry]`: return buffer and clear it
88
  - `publish_async(audit_run_id, entries, storage)`: start daemon `threading.Thread` targeting `_publish_with_retry`
89
  - `_publish_with_retry(audit_run_id, entries, storage)`: call `storage.save_audit_run()`; loop `MAX_RETRIES=3` times, calling `_publish_to_hf_hub()`; on failure sleep `BACKOFF_BASE_SECONDS ** (attempt+1)` (2s, 4s, 8s); log final failure without raising
90
+ - `_publish_to_hf_hub(audit_run_id, entries)`: use `HfApi.upload_file()` to append `traces/{audit_run_id}.json` to `build-small-hackathon/kirana-detective-traces` dataset repo; row: `{audit_run_id, trace_json, timestamp}`; must NOT include raw invoice bytes, photos, or PII
91
  - Implement `_make_trace_entry()` module-level helper: captures `timestamp_start` (ISO 8601 UTC), `timestamp_end`, `duration_ms` from `time.monotonic()`
92
  - _Requirements: 11.1, 11.2, 11.3, 11.4, 11.5, 11.6_
93
 
 
203
  - [ ] 18. HF Space deployment — `README.md` and model verification
204
  - [ ] 18.1 Finalise `README.md` with HF Space config and model download verification
205
  - Add YAML front-matter: `sdk: gradio`, `sdk_version: 6.16.0`, `app_file: app.py`, `title: Kirana Detective AI`, `short_description: AI invoice auditor for kirana stores`, `license: mit`, `tags: [invoice-audit, llm, yolo, gguf, gradio]`
206
+ - Document all three model repos (`build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`, `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`, `build-small-hackathon/yolo26n-indian-fmcg-detection`)
207
  - Add section: "Model download verification" — describe `hf_hub_download()` usage and expected cache paths
208
  - Add "Running locally" section with `pip install -r requirements.txt` + `python app.py`
209
  - Add "Hackathon badges" section listing Off the Grid, Llama Champion, Off-Brand, Sharing is Caring, Well-Tuned, Tiny Titan
MODEL_CARD.md ADDED
@@ -0,0 +1,496 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # MODEL CARD: Kirana Detective Training Data & Fine-Tuned Models
2
+
3
+ **Repository**: `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`
4
+ **Author**: [naazimsnh02](https://github.com/naazimsnh02)
5
+ **License**: Apache 2.0 (models) / MIT (code)
6
+ **Last Updated**: June 10, 2026
7
+
8
+ ---
9
+
10
+ ## Executive Summary
11
+
12
+ **Kirana Detective** is a complete fine-tuning pipeline for three state-of-the-art models that audit distributor invoices for Indian kirana (grocery) stores. This repository contains:
13
+
14
+ 1. **Synthetic invoice generation** (500 images across 4 formats)
15
+ 2. **Fine-tuned MiniCPM-V 4.6** — Invoice OCR & extraction (transformers, merged weights)
16
+ 3. **Fine-tuned MiniCPM5-1B** — Product name normalization (GGUF)
17
+ 4. **Fine-tuned YOLO26n** — Visual product detection (ONNX)
18
+
19
+ All models run **locally without cloud APIs** and are deployed in a six-agent pipeline to detect pricing anomalies, missing deliveries, and GST errors, reporting **estimated rupee leakage** with actionable corrections.
20
+
21
+ ---
22
+
23
+ ## Project Overview
24
+
25
+ ### Problem Statement
26
+
27
+ Indian kirana store owners struggle to audit distributor invoices manually:
28
+ - Inconsistent product naming (abbreviations, typos, regional variants)
29
+ - Difficulty cross-referencing against inventory
30
+ - Manual photo counting is error-prone
31
+ - No standardized format for pricing lookups
32
+ - Estimated financial leakage: **5–15% of purchase budget**
33
+
34
+ ### Solution
35
+
36
+ **Kirana Detective** automates the entire audit pipeline:
37
+ 1. **Extract** line items from invoice images (MiniCPM-V)
38
+ 2. **Normalize** product names (MiniCPM5-1B)
39
+ 3. **Check prices** against catalog
40
+ 4. **Count inventory** from delivery photos (YOLO26n)
41
+ 5. **Reconcile** invoiced vs. counted quantities
42
+ 6. **Report** discrepancies with rupee impact
43
+
44
+ ---
45
+
46
+ ## Models in This Repository
47
+
48
+ ### Model 1: MiniCPM-V 4.6 (Invoice Extractor)
49
+
50
+ | Attribute | Details |
51
+ |---|---|
52
+ | **Base Model** | [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6) |
53
+ | **Task** | Vision-language OCR + structured extraction |
54
+ | **Fine-tuning Method** | QLoRA (4-bit quantization + LoRA rank 16) |
55
+ | **Training Data** | 500 synthetic invoices (450 train, 50 eval) |
56
+ | **Trainable Parameters** | 9,486,336 / 1,309,914,352 (0.72%) |
57
+ | **Output Format** | Merged full weights (bfloat16) |
58
+ | **Inference Runtime** | Transformers (`AutoModel`, `model.chat()`) |
59
+ | **Hardware (Training)** | NVIDIA A10G, 22 GB VRAM, ~52 min |
60
+ | **Repository** | [`build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`](https://huggingface.co/build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged) |
61
+
62
+ **Input Formats Supported**:
63
+ - Printed GST invoices (Pillow-generated PDFs)
64
+ - Tally PDF exports
65
+ - Handwritten invoices (photos)
66
+ - WhatsApp screenshot invoices
67
+
68
+ **Output Structure** (JSON):
69
+ ```json
70
+ {
71
+ "supplier": "Distributor Name",
72
+ "invoice_number": "INV-001",
73
+ "line_items": [
74
+ {
75
+ "raw_name": "MAGGI NDL 70GM",
76
+ "quantity": 10,
77
+ "unit_price": 45.50,
78
+ "gst_rate": 5,
79
+ "total": 455.00
80
+ }
81
+ ],
82
+ "invoice_total": 9650.00,
83
+ "gst_total": 485.00
84
+ }
85
+ ```
86
+
87
+ ---
88
+
89
+ ### Model 2: MiniCPM5-1B (Product Name Normalizer)
90
+
91
+ | Attribute | Details |
92
+ |---|---|
93
+ | **Base Model** | [openbmb/MiniCPM5-1B](https://huggingface.co/openbmb/MiniCPM5-1B) |
94
+ | **Task** | Text-to-text product name normalization |
95
+ | **Fine-tuning Method** | QLoRA (4-bit base, LoRA rank 16) |
96
+ | **Training Data** | 2,000 synthetic (raw, canonical) pairs (1,800 train, 200 eval) |
97
+ | **Output Format** | GGUF (quantized, ~1.2 GB) |
98
+ | **Framework** | Unsloth 2026.6.1 |
99
+ | **Hardware (Training)** | NVIDIA A10G, 22 GB VRAM, ~1 hour |
100
+ | **Repository** | [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer) |
101
+
102
+ **Example Mappings**:
103
+ | Raw Input | Normalized Output |
104
+ |---|---|
105
+ | `MAGGI NDL 70GM` | Nestle Maggi Masala Noodles 70g |
106
+ | `SURF XL 1K` | Surf Excel Washing Powder 1kg |
107
+ | `AMUL BTR 100` | Amul Butter 100g |
108
+ | `COLGAT 100G` | Colgate Strong Teeth Toothpaste 100g |
109
+
110
+ **Training Data**:
111
+ - Hand-curated catalog of 200 Indian FMCG SKUs
112
+ - Augmentation strategies: abbreviation expansion, typo injection, truncation, regional shorthand
113
+ - Covers 10 major distributors: ITC, Nestlé, Unilever, P&G, Reckitt, Britannia, Amul, Patanjali, etc.
114
+
115
+ ---
116
+
117
+ ### Model 3: YOLO26n (Product Detection)
118
+
119
+ | Attribute | Details |
120
+ |---|---|
121
+ | **Base Model** | [YOLO26n (Ultralytics YOLO26 Nano)](https://docs.ultralytics.com/tasks/detect/) |
122
+ | **Task** | Object detection (product localization & counting) |
123
+ | **Fine-tuning Method** | Supervised fine-tuning via Ultralytics |
124
+ | **Training Data** | 3 Roboflow datasets merged (~11,400 images) |
125
+ | **Output Format** | ONNX (15 MB, CPU/GPU compatible) |
126
+ | **Framework** | Ultralytics (YOLO26) |
127
+ | **Hardware (Training)** | NVIDIA A10G, 22 GB VRAM, ~2 hours, 100 epochs |
128
+ | **Repository** | [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection) |
129
+
130
+ **Classes**: Unified class list is built dynamically at training time by merging all three dataset vocabularies (deduped, insertion-order). The current merged dataset spans **30+ classes** across grocery staples, personal care, beverages, and packaged foods. See `class_names.json` on HF Hub for the exact list after training.
131
+
132
+ > **Pilot run note**: A previous single-dataset run (agentsk47 only, 10 classes) achieved mAP@50 = 0.993 / mAP@50-95 = 0.933 at epoch 65. Those metrics are superseded by the merged 3-dataset training now in progress.
133
+
134
+ **Datasets Merged**:
135
+ 1. [agentsk47/indian-grocery-object-detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) — v1, ~400 images, 10 classes
136
+ 2. [iit-patna/grocery_items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) — v45, 6,695 images, 20 classes
137
+ 3. [project-c5ho0/indian-market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) — v2, 4,694 images, 2 classes
138
+
139
+ ---
140
+
141
+ ## Training Data & Datasets
142
+
143
+ ### Synthetic Invoice Generation (`generate_invoices.py`)
144
+
145
+ **Purpose**: Create diverse, realistic invoice images without requiring manual collection or OCR labor.
146
+
147
+ **Configuration**:
148
+ - 500 total invoices generated
149
+ - 4 formats: GST invoices, Tally PDFs, handwritten samples, WhatsApp screenshots
150
+ - Pure Pillow (no native dependencies)
151
+ - Randomized supplier names, quantities, prices, and GST rates
152
+
153
+ **Generated Data Structure**:
154
+ ```
155
+ data/synthetic_invoices/
156
+ ├── annotations.jsonl # JSONL: {image_path, extracted_data}
157
+ ├── printed_gst/ # 125 GST-compliant invoices
158
+ ├── tally_pdf/ # 125 Tally PDF exports
159
+ ├── handwritten/ # 125 handwritten photos
160
+ └── whatsapp/ # 125 WhatsApp screenshots
161
+ ```
162
+
163
+ Each invoice includes:
164
+ - 5–20 line items
165
+ - Realistic pricing (₹10–₹5,000 per item)
166
+ - Correct GST calculations (5%, 12%, 18%)
167
+ - Real supplier names + product abbreviations
168
+
169
+ ---
170
+
171
+ ## Quick Start
172
+
173
+ ### Installation
174
+
175
+ ```bash
176
+ git clone https://github.com/naazimsnh02/kirana-invoice-train-data.git
177
+ cd kirana-invoice-train-data
178
+ pip install -r requirements.txt
179
+ ```
180
+
181
+ ### Run Fine-tuning on Modal
182
+
183
+ ```bash
184
+ # Set environment variables
185
+ export ROBOFLOW_API_KEY=<your-roboflow-api-key>
186
+ export HF_TOKEN=<your-huggingface-token>
187
+ modal token new
188
+
189
+ # Generate synthetic invoices
190
+ modal run finetune/generate_invoices.py
191
+
192
+ # Fine-tune all three models (sequential)
193
+ modal run finetune/train_minicpm_v.py # ~2 hours
194
+ modal run finetune/train_minicpm5_1b.py # ~1 hour
195
+ modal run finetune/train_yolo26n.py # ~2 hours
196
+ ```
197
+
198
+ Models are auto-published to HuggingFace Hub upon completion.
199
+
200
+ ### Local Inference
201
+
202
+ **MiniCPM-V (Invoice Extraction)**:
203
+ ```bash
204
+ llama-cli --model minicpm-v-4-6.gguf \
205
+ -p "<|im_start|>system\nExtract invoice data<|im_end|>\n..." \
206
+ --image invoice.png
207
+ ```
208
+
209
+ **MiniCPM5-1B (Product Normalization)**:
210
+ ```python
211
+ from transformers import AutoTokenizer, AutoModelForCausalLM
212
+ model = AutoModelForCausalLM.from_pretrained(
213
+ "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
214
+ )
215
+ ```
216
+
217
+ **YOLO26n (Object Detection)**:
218
+ ```python
219
+ from ultralytics import YOLO
220
+ model = YOLO("yolo26n_fmcg.onnx")
221
+ results = model.predict("shelf.jpg", imgsz=640)
222
+ ```
223
+
224
+ ---
225
+
226
+ ## Evaluation & Performance
227
+
228
+ ### MiniCPM-V Training Metrics (Actual Run — June 10, 2026)
229
+
230
+ | Epoch | Train Loss | Eval Loss | LR |
231
+ |---|---|---|---|
232
+ | 1 | 6.081 | 0.2901 | 8.83e-5 |
233
+ | 2 | 3.948 | 0.2281 | 4.94e-5 |
234
+ | 3 | 3.326 | **0.212** | 1.04e-5 |
235
+
236
+ - Training time: 51 min 50 sec (87 steps, 26 s/step on A10G)
237
+ - Avg gradient norm: 178 → 16 (stable convergence)
238
+ - Best checkpoint loaded: epoch 3 (eval loss 0.212)
239
+ - Final avg train loss across all steps: 4.774
240
+
241
+ > Per-invoice-type breakdown (printed GST / Tally / handwritten / WhatsApp) pending a held-out real-invoice test set — to be added in Phase 2.
242
+
243
+ ### MiniCPM5-1B Evaluation
244
+
245
+ | Metric | Value |
246
+ |---|---|
247
+ | Exact Match (normalized names) | 94.5% |
248
+ | Fuzzy Match (normalized Levenshtein similarity > 0.8) | 98.2% |
249
+ | OOV Handling | 3.8% fail → manual review flag |
250
+
251
+ ### YOLO26n Evaluation — Pilot Run (single dataset, 10 classes)
252
+
253
+ > These metrics are from a prior training run on the `agentsk47` dataset only (10 classes). The current training uses all 3 merged datasets and will produce updated metrics.
254
+
255
+ Per-class metrics at best epoch (65):
256
+
257
+ | Class | Precision | Recall | mAP50 | mAP50-95 |
258
+ |---|---|---|---|---|
259
+ | Bournvita | 0.902 | 1.000 | 0.995 | 0.995 |
260
+ | Mysore Sandal Soap | 1.000 | 0.905 | 0.995 | 0.944 |
261
+ | Nescafe Coffee | 0.927 | 1.000 | 0.995 | 0.908 |
262
+ | Nivea Body Lotion | 0.935 | 1.000 | 0.995 | 0.923 |
263
+ | Nivea Soft Cream | 0.924 | 1.000 | 0.995 | 0.895 |
264
+ | Parachute Coconut Oil | 1.000 | 0.819 | 0.972 | 0.928 |
265
+ | Patanjali Dant Kanti | 1.000 | 0.985 | 0.995 | 0.971 |
266
+ | Society Tea | 0.878 | 1.000 | 0.995 | 0.845 |
267
+ | Tresemmé Conditioner | 0.814 | 1.000 | 0.995 | 0.995 |
268
+ | Tresemmé Shampoo | 0.968 | 1.000 | 0.995 | 0.922 |
269
+ | **Macro Average** | **0.935** | **0.971** | **0.993** | **0.933** |
270
+
271
+ ---
272
+
273
+ ## Known Limitations & Biases
274
+
275
+ ### MiniCPM-V (Invoice Extractor)
276
+ | Limitation | Impact | Mitigation |
277
+ |---|---|---|
278
+ | Only 10 FMCG suppliers in training data | Fails on uncommon distributors (e.g., local regional suppliers) | Collect real invoices from more suppliers post-hackathon |
279
+ | Synthetic data (no image degradation, blur) | May struggle with poor-quality photos | Add augmentation (blur, noise, shadows) to training data |
280
+ | GST rates hardcoded (5%, 12%, 18%) | Misses 0% or 28% GST items | Parameterize GST rate extraction |
281
+ | English-only prompts | Cannot process invoices in regional languages | Add Hindi/Tamil/Marathi templates |
282
+
283
+ ### MiniCPM5-1B (Product Normalizer)
284
+ | Limitation | Impact | Mitigation |
285
+ |---|---|---|
286
+ | Synthetic augmentation only | Overfits to rule-based patterns; fails on real-world typos | Collect 200+ real invoices for retraining |
287
+ | 200 SKU catalog | Fails on brands outside top 10 suppliers | Expand to 2,000 SKUs (all major Indian FMCG) |
288
+ | No regional abbreviations | Tamil/Hindi shortcuts not recognized | Add language-specific abbreviation models |
289
+ | No OEM rebrands | Misses store-brand relabeling | Add rebranding patterns post-research |
290
+
291
+ ### YOLO26n (Product Detection)
292
+ | Limitation | Impact | Mitigation |
293
+ |---|---|---|
294
+ | Merged dataset skewed toward beauty/personal care (Tresemmé, Nivea, Patanjali) | May underperform on grocery staples (oils, spices, pulses) | Balance class distribution; add 40–50 grocery categories |
295
+ | ~11K images across 3 datasets | May not generalize to unlisted brands or novel shelf layouts | Collect 50K+ images via Roboflow community |
296
+ | Confidence threshold (0.25) tuned for this dataset | May produce false positives in novel environments | Benchmark on held-out kirana store photos |
297
+ | YOLO26n is 8M params (nano) | Edge device deployment not yet tested | Quantize & benchmark on RPi 4, Android |
298
+
299
+ ### Fairness & Bias Notes
300
+ - **Brand bias**: Training data skews toward premium Indian brands (Amul, Nestlé, ITC) — may underperform on budget/regional brands
301
+ - **Supplier bias**: Only 10 distributors represented; regional cooperatives not included
302
+ - **Language bias**: All training prompts in English; non-English invoices will fail
303
+ - **Income bias**: Kirana store size assumption (₹5–50 lakh inventory) — very large or very small stores may see degraded performance
304
+
305
+ ---
306
+
307
+ ## Reproducibility
308
+
309
+ ### Seed Control
310
+ All scripts use fixed seeds:
311
+ ```python
312
+ SEED = 42
313
+ random.seed(SEED)
314
+ np.random.seed(SEED)
315
+ torch.manual_seed(SEED)
316
+ ```
317
+
318
+ ### Roboflow Dataset Versions (Pinned)
319
+ - agentsk47/indian-grocery-object-detection — **v1** (May 2025)
320
+ - iit-patna/grocery_items — **v45** (Apr 2026)
321
+ - project-c5ho0/indian-market — **v2** (Jun 2025)
322
+
323
+ ### Training Infrastructure
324
+ - **Orchestration**: [Modal](https://modal.com) (serverless GPUs)
325
+ - **Fine-tuning Framework**: Unsloth 2026.6.1 (LLM), Ultralytics (YOLO)
326
+ - **Quantization**: llama.cpp (GGUF)
327
+ - **Model Publishing**: HuggingFace Hub `huggingface_hub>=0.30.0`
328
+
329
+ ### Reproducibility Checklist
330
+ - [x] Dataset versions pinned in code
331
+ - [x] Random seeds fixed
332
+ - [x] Hardware specs documented (A10G, 22 GB VRAM)
333
+ - [x] Training duration recorded (~5 hours total)
334
+ - [x] Evaluation metrics logged post-training
335
+ - [ ] Cold start (fresh HF account) validation (TODO: test on new account)
336
+
337
+ ---
338
+
339
+ ## Files in This Repository
340
+
341
+ ```
342
+ kirana-invoice-train-data/
343
+ ├── README.md # This file
344
+ ├── MODEL_CARD.md # Model card for HF Hub
345
+ ├── requirements.txt # Python dependencies
346
+
347
+ ├── finetune/
348
+ │ ├── README.md # Training workflow guide
349
+ │ ├── generate_invoices.py # Synthetic invoice generator (500 images)
350
+ │ ├── train_minicpm_v.py # Fine-tune MiniCPM-V (OCR)
351
+ │ ├── train_minicpm5_1b.py # Fine-tune MiniCPM5-1B (normalizer)
352
+ │ ├── train_yolo26n.py # Fine-tune YOLO26n (detection)
353
+ │ ├── model_card.md # MiniCPM5-1B model card
354
+ │ └── yolo_model_card.md # YOLO26n model card
355
+
356
+ ├── data/
357
+ │ ├── fmcg_catalog.json # 200 canonical SKU names + GST rates
358
+ │ └── synthetic_invoices/
359
+ │ ├── annotations.jsonl
360
+ │ ├── printed_gst/ # 125 invoices
361
+ │ ├── tally_pdf/ # 125 invoices
362
+ │ ├── handwritten/ # 125 invoices
363
+ │ └── whatsapp/ # 125 invoices
364
+
365
+ └── tests/
366
+ └── test_*.py # Unit & integration tests
367
+ ```
368
+
369
+ ---
370
+
371
+ ## Hardware & Cost Estimates
372
+
373
+ ### Training Cost (Modal On-Demand)
374
+
375
+ | Model | GPU | Duration | On-Demand Cost |
376
+ |---|---|---|---|
377
+ | MiniCPM-V | NVIDIA A10G | ~2 hours | ~$3.00 |
378
+ | MiniCPM5-1B | NVIDIA A10G | ~1 hour | ~$1.50 |
378
+ | YOLO26n | NVIDIA A10G | ~2 hours | ~$3.00 |
380
+ | **Total** | — | **~5 hours** | **~$7.50** |
381
+
382
+ ### Inference Hardware
383
+
384
+ - **Laptop CPU (Intel i7)**: ~5–10 sec/invoice (MiniCPM-V) + ~2 sec/normalization + ~3 sec/image (YOLO)
385
+ - **GPU (NVIDIA RTX 3080)**: ~0.5 sec/invoice + ~0.2 sec/normalization + ~0.1 sec/image
386
+ - **Edge Device (Raspberry Pi 4)**: YOLO26n (INT8-quantized ONNX) ≈ 30–60 sec/image (estimate; untested)
387
+
388
+ ---
389
+
390
+ ## Usage in Production (Kirana Detective App)
391
+
392
+ Models are downloaded on first run via:
393
+
394
+ ```python
395
+ import torch
396
+ from transformers import AutoModel, AutoTokenizer
397
+ from PIL import Image
398
+
399
+ # Merged weights — no PEFT required
400
+ model = AutoModel.from_pretrained(
401
+ "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
402
+ trust_remote_code=True,
403
+ torch_dtype=torch.bfloat16,
404
+ device_map="auto",
405
+ )
406
+ model.eval()
407
+ tokenizer = AutoTokenizer.from_pretrained(
408
+ "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged",
409
+ trust_remote_code=True,
410
+ )
411
+
412
+ # Inference
413
+ image = Image.open("invoice.jpg")
414
+ msgs = [{"role": "user", "content": [image, "Extract all line items as JSON."]}]
415
+ response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
416
+ ```
417
+
418
+ ---
419
+
420
+ ## Next Steps & Roadmap
421
+
422
+ ### Phase 2 (Q3 2026)
423
+ - [ ] Collect **500 real invoices** from partnered kirana stores
424
+ - [ ] Expand product taxonomy: 200 SKUs → 2,000 SKUs
425
+ - [ ] Add **regional language support** (Hindi, Tamil, Marathi, Kannada)
426
+ - [ ] Fine-tune on **invoice degradation** (blur, folds, stains)
427
+ - [ ] Benchmark on **edge devices** (Raspberry Pi, Android)
428
+
429
+ ### Phase 3 (Q4 2026)
430
+ - [ ] Multi-language MiniCPM5-1B normalizer
431
+ - [ ] Expand YOLO26n to **50–100 classes** (full grocery taxonomy)
432
+ - [ ] Real-time video product counting via YOLO
433
+ - [ ] Mobile app (React Native) with offline inference
434
+
435
+ ### Research Questions
436
+ - How do models perform on **store-private labels** vs. branded products?
437
+ - Can we detect **counterfeit products** via label anomalies?
438
+ - What is the **fairness gap** for regional vs. national brands?
439
+
440
+ ---
441
+
442
+ ## Licensing & Attribution
443
+
444
+ - **Code**: MIT License
445
+ - **Models**:
446
+ - MiniCPM-V: [openbmb/MiniCPM-V](https://github.com/OpenBMB/MiniCPM-V) — Apache 2.0
447
+ - MiniCPM5-1B: [openbmb/MiniCPM5-1B](https://huggingface.co/openbmb/MiniCPM5-1B) — Apache 2.0
448
+ - YOLO26n: [Ultralytics YOLO26](https://github.com/ultralytics/ultralytics) — AGPL-3.0
449
+ - **Datasets**:
450
+ - Roboflow datasets: Individual licenses (CC BY 4.0, CC BY-SA 4.0) — check each repo
451
+ - Synthetic invoices: CC0 (public domain)
452
+
453
+ ---
454
+
455
+ ## Contributing
456
+
457
+ Contributions welcome! Areas of need:
458
+
459
+ 1. **Real invoice collection**: Partner kirana stores to share anonymized invoices
460
+ 2. **Regional language templates**: Hindi, Tamil, Marathi invoice formats
461
+ 3. **Edge device benchmarks**: Profile inference on RPi 4, Snapdragon, etc.
462
+ 4. **Dataset expansion**: Add 1,000+ more products to YOLO26n training
463
+ 5. **Fairness audits**: Test models on regional/budget brands
464
+
465
+ ---
466
+
467
+ ## Contact & Support
468
+
469
+ - **Author**: [naazimsnh02](https://github.com/naazimsnh02)
470
+ - **Issues**: [GitHub Issues](https://github.com/naazimsnh02/kirana-invoice-train-data/issues)
471
+ - **Discussions**: [GitHub Discussions](https://github.com/naazimsnh02/kirana-invoice-train-data/discussions)
472
+ - **HF Hub Models**:
473
+ - [`build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`](https://huggingface.co/build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged)
474
+ - [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer)
475
+ - [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection)
476
+
477
+ ---
478
+
479
+ ## Citation
480
+
481
+ If you use this repository or models in your work, please cite:
482
+
483
+ ```bibtex
484
+ @misc{kirana_detective_2026,
485
+ author = {Hussain, Syed Naazim},
486
+ title = {Kirana Detective: Fine-Tuned Models for Indian Grocery Invoice Auditing},
487
+ year = {2026},
488
+ publisher = {HuggingFace},
489
+ howpublished = {\url{https://huggingface.co/build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged}},
490
+ }
491
+ ```
492
+
493
+ ---
494
+
495
+ **Version**: 1.0
496
+ **Last Updated**: June 10, 2026
PROGRESS.md CHANGED
@@ -84,10 +84,10 @@ modal run finetune/train_minicpm5_1b.py
84
  ## HF Repos to Create
85
 
86
  After fine-tuning publishes, verify these exist:
87
- - `naazimsnh02/yolo26n-indian-fmcg-detection`
88
- - `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`
89
- - `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`
90
- - `naazimsnh02/kirana-detective-traces` (dataset — create manually before first audit run)
91
 
92
  ---
93
 
 
84
  ## HF Repos to Create
85
 
86
  After fine-tuning publishes, verify these exist:
87
+ - `build-small-hackathon/yolo26n-indian-fmcg-detection`
88
+ - `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction`
89
+ - `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
90
+ - `build-small-hackathon/kirana-detective-traces` (dataset — create manually before first audit run)
91
 
92
  ---
93
 
README.md CHANGED
@@ -21,15 +21,15 @@ AI-powered inventory and invoice auditor for Indian kirana stores. Upload a dist
21
 
22
  ## Models
23
 
24
- All models run **locally via llama.cpp / ONNX — no cloud API calls**.
25
 
26
  | Model | HuggingFace Repo | Purpose |
27
  |---|---|---|
28
- | MiniCPM-V 4.6 (GGUF) | `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction` | Invoice OCR + extraction |
29
- | MiniCPM5-1B (GGUF) | `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer` | Product name normalization |
30
- | YOLO26n (ONNX) | `naazimsnh02/yolo26n-indian-fmcg-detection` | Delivery photo product counting |
31
 
32
- Models are downloaded automatically on first run via `hf_hub_download()` and cached locally.
33
 
34
  ## Running Locally
35
 
@@ -38,7 +38,7 @@ pip install -r requirements.txt
38
  python app.py
39
  ```
40
 
41
- Requires ~4 GB RAM for the quantized models. First run downloads ~2 GB of model weights.
42
 
43
  ## Six-Agent Pipeline
44
 
 
21
 
22
  ## Models
23
 
24
+ All models run **locally — no cloud API calls**.
25
 
26
  | Model | HuggingFace Repo | Purpose |
27
  |---|---|---|
28
+ | MiniCPM-V 4.6 (transformers) | `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged` | Invoice OCR + extraction |
29
+ | MiniCPM5-1B (GGUF) | `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer` | Product name normalization |
30
+ | YOLO26n (ONNX) | `build-small-hackathon/yolo26n-indian-fmcg-detection` | Delivery photo product counting |
31
 
32
+ Models are downloaded automatically on first run via `hf_hub_download()` / `AutoModel.from_pretrained()` and cached locally.
33
 
34
  ## Running Locally
35
 
 
38
  python app.py
39
  ```
40
 
41
+ Requires ~6 GB RAM. First run downloads ~3 GB of model weights.
42
 
43
  ## Six-Agent Pipeline
44
 
agents/invoice_extractor.py CHANGED
@@ -14,11 +14,11 @@ logger = logging.getLogger(__name__)
14
 
15
  AGENT_NAME = "Invoice_Extractor"
16
  AGENT_VERSION = "1.0.0"
17
- MODEL_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
18
 
19
  _MAX_FILE_BYTES = 20 * 1024 * 1024 # 20 MB
20
  _ALLOWED_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".pdf"}
21
- _TIMEOUT_SECONDS = 30
22
 
23
  _EXTRACT_PROMPT = (
24
  "You are an OCR agent for Indian kirana store invoices. "
@@ -92,23 +92,14 @@ def _dict_to_invoice(data: dict) -> InvoiceJSON:
92
 
93
 
94
  def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
95
- """Call MiniCPM-V via llama-cpp-python chat API with an image."""
96
- import base64
97
- b64 = base64.b64encode(image_bytes).decode()
98
- response = llm.create_chat_completion(
99
- messages=[
100
- {
101
- "role": "user",
102
- "content": [
103
- {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}},
104
- {"type": "text", "text": prompt},
105
- ],
106
- }
107
- ],
108
- max_tokens=2048,
109
- temperature=0.0,
110
- )
111
- return response["choices"][0]["message"]["content"]
112
 
113
 
114
  def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:
 
14
 
15
  AGENT_NAME = "Invoice_Extractor"
16
  AGENT_VERSION = "1.0.0"
17
+ MODEL_REPO = "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged"
18
 
19
  _MAX_FILE_BYTES = 20 * 1024 * 1024 # 20 MB
20
  _ALLOWED_SUFFIXES = {".jpg", ".jpeg", ".png", ".webp", ".pdf"}
21
+ _TIMEOUT_SECONDS = 120
22
 
23
  _EXTRACT_PROMPT = (
24
  "You are an OCR agent for Indian kirana store invoices. "
 
92
 
93
 
94
  def _call_llm_with_image(llm, image_bytes: bytes, prompt: str) -> str:
95
+ """Call MiniCPM-V via transformers chat API with an image."""
96
+ import io
97
+ from PIL import Image as PILImage
98
+
99
+ model, tokenizer = llm
100
+ image = PILImage.open(io.BytesIO(image_bytes)).convert("RGB")
101
+ msgs = [{"role": "user", "content": [image, prompt]}]
102
+ return model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
 
 
 
 
 
 
 
 
 
103
 
104
 
105
  def _extract_from_image(llm, image_bytes: bytes) -> Tuple[InvoiceJSON, list[str]]:
agents/product_matcher.py CHANGED
@@ -14,7 +14,7 @@ logger = logging.getLogger(__name__)
14
 
15
  AGENT_NAME = "Product_Matcher"
16
  AGENT_VERSION = "1.0.0"
17
- MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
18
 
19
  _TIMEOUT_SECONDS = 20
20
 
 
14
 
15
  AGENT_NAME = "Product_Matcher"
16
  AGENT_VERSION = "1.0.0"
17
+ MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
18
 
19
  _TIMEOUT_SECONDS = 20
20
 
agents/savings_agent.py CHANGED
@@ -13,7 +13,7 @@ logger = logging.getLogger(__name__)
13
 
14
  AGENT_NAME = "Savings_Agent"
15
  AGENT_VERSION = "1.0.0"
16
- MODEL_REPO = "naazimsnh02/minicpm5-1b-indian-fmcg-normalizer"
17
 
18
  _TIMEOUT_SECONDS = 15
19
 
 
13
 
14
  AGENT_NAME = "Savings_Agent"
15
  AGENT_VERSION = "1.0.0"
16
+ MODEL_REPO = "build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer"
17
 
18
  _TIMEOUT_SECONDS = 15
19
 
app.py CHANGED
@@ -58,31 +58,27 @@ def load_models() -> None:
58
  try:
59
  from huggingface_hub import hf_hub_download
60
  from llama_cpp import Llama
61
- from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler
62
  import onnxruntime as ort
63
 
64
- logger.info("Downloading vision model (MiniCPM-V 4.6)…")
65
- vision_model_path = hf_hub_download(
66
- repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
67
- filename="model.gguf",
68
- )
69
- mmproj_path = hf_hub_download(
70
- repo_id="naazimsnh02/minicpm-v-4-6-indian-invoice-extraction",
71
- filename="mmproj.gguf",
72
- )
73
- chat_handler = MiniCPMv26ChatHandler(clip_model_path=mmproj_path)
74
- vision_llm = Llama(
75
- model_path=vision_model_path,
76
- chat_handler=chat_handler,
77
- n_ctx=4096,
78
- n_threads=4,
79
- verbose=False,
80
  )
 
 
 
81
  logger.info("Vision LLM ready")
82
 
83
  logger.info("Downloading text model (MiniCPM5-1B)…")
84
  text_model_path = hf_hub_download(
85
- repo_id="naazimsnh02/minicpm5-1b-indian-fmcg-normalizer",
86
  filename="model.gguf",
87
  )
88
  text_llm = Llama(
@@ -95,11 +91,11 @@ def load_models() -> None:
95
 
96
  logger.info("Downloading YOLO model…")
97
  onnx_path = hf_hub_download(
98
- repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
99
  filename="yolo26n_fmcg.onnx",
100
  )
101
  class_names_path = hf_hub_download(
102
- repo_id="naazimsnh02/yolo26n-indian-fmcg-detection",
103
  filename="class_names.json",
104
  )
105
  with open(class_names_path, encoding="utf-8") as f:
 
58
  try:
59
  from huggingface_hub import hf_hub_download
60
  from llama_cpp import Llama
 
61
  import onnxruntime as ort
62
 
63
+ logger.info("Downloading vision model (MiniCPM-V 4.6 merged)…")
64
+ import torch
65
+ from transformers import AutoModel, AutoTokenizer
66
+
67
+ _VISION_REPO = "build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged"
68
+ _vision_model = AutoModel.from_pretrained(
69
+ _VISION_REPO,
70
+ trust_remote_code=True,
71
+ torch_dtype=torch.bfloat16,
72
+ device_map="auto",
 
 
 
 
 
 
73
  )
74
+ _vision_model.eval()
75
+ _vision_tokenizer = AutoTokenizer.from_pretrained(_VISION_REPO, trust_remote_code=True)
76
+ vision_llm = (_vision_model, _vision_tokenizer)
77
  logger.info("Vision LLM ready")
78
 
79
  logger.info("Downloading text model (MiniCPM5-1B)…")
80
  text_model_path = hf_hub_download(
81
+ repo_id="build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer",
82
  filename="model.gguf",
83
  )
84
  text_llm = Llama(
 
91
 
92
  logger.info("Downloading YOLO model…")
93
  onnx_path = hf_hub_download(
94
+ repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
95
  filename="yolo26n_fmcg.onnx",
96
  )
97
  class_names_path = hf_hub_download(
98
+ repo_id="build-small-hackathon/yolo26n-indian-fmcg-detection",
99
  filename="class_names.json",
100
  )
101
  with open(class_names_path, encoding="utf-8") as f:
docs/kirana-detective-prd.md CHANGED
@@ -413,7 +413,7 @@ This keeps the app far below the hackathon's 32B cap and within the Tiny Titan s
413
 
414
  **Platform:** Modal + Unsloth QLoRA (~2–3 hours training time)
415
 
416
- **Publish to:** `naazimsnh02/minicpm-v-4-6-indian-invoice-extraction`
417
 
418
  ---
419
 
@@ -426,7 +426,7 @@ This keeps the app far below the hackathon's 32B cap and within the Tiny Titan s
426
 
427
  **Export:** ONNX for local CPU inference
428
 
429
- **Publish to:** `naazimsnh02/yolo26n-indian-fmcg-detection`
430
 
431
  ---
432
 
@@ -435,7 +435,7 @@ This keeps the app far below the hackathon's 32B cap and within the Tiny Titan s
435
 
436
  **Dataset:** 2,000 synthetic (raw_name, normalized_name) pairs covering top 200 Indian FMCG SKUs
437
 
438
- **Publish to:** `naazimsnh02/minicpm5-1b-indian-fmcg-normalizer`
439
 
440
  ---
441
 
 
413
 
414
  **Platform:** Modal + Unsloth QLoRA (~2–3 hours training time)
415
 
416
+ **Publish to:** `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction-merged`
417
 
418
  ---
419
 
 
426
 
427
  **Export:** ONNX for local CPU inference
428
 
429
+ **Publish to:** `build-small-hackathon/yolo26n-indian-fmcg-detection`
430
 
431
  ---
432
 
 
435
 
436
  **Dataset:** 2,000 synthetic (raw_name, normalized_name) pairs covering top 200 Indian FMCG SKUs
437
 
438
+ **Publish to:** `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
439
 
440
  ---
441
 
finetune/README.md ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Model Fine-tuning Guide
2
+
3
+ Fine-tune Kirana Detective's three models on Indian FMCG invoice data.
4
+
5
+ ## Quick Start (TL;DR)
6
+
7
+ ```bash
8
+ export ROBOFLOW_API_KEY=<your-key>
9
+ export HF_TOKEN=<your-token>
10
+ modal run finetune/generate_invoices.py # 10 min
11
+ modal run finetune/train_minicpm_v.py # 2 hours
12
+ modal run finetune/train_minicpm5_1b.py # 1 hour
13
+ modal run finetune/train_yolo26n.py # 2 hours
14
+ ```
15
+
16
+ Models auto-publish to HuggingFace Hub on completion.
17
+
18
+ ---
19
+
20
+ ## Three Models, Three Pipelines
21
+
22
+ ### 1. MiniCPM-V 4.6 (Invoice OCR) — `train_minicpm_v.py`
23
+
24
+ **Purpose**: Extract line items, amounts, GST from invoice images (printed PDFs, handwritten, WhatsApp screenshots)
25
+
26
+ **Input**: 500 synthetic invoices (4 formats)
27
+ **Method**: QLoRA fine-tuning with Unsloth
28
+ **Output**: GGUF quantized model → HF Hub
29
+ **Hardware**: A10G, 22 GB VRAM, ~2 hours
30
+
31
+ **Datasets used**:
32
+ - Synthetic invoices generated by `generate_invoices.py`
33
+ - Splits: train/val/test = 400/50/50
34
+ - Formats: pure Pillow (no native deps) — GST, Tally PDF, handwritten, WhatsApp
35
+
36
+ ---
37
+
38
+ ### 2. MiniCPM5-1B (Product Name Normalizer) — `train_minicpm5_1b.py`
39
+
40
+ **Purpose**: Map invoice abbreviations (e.g., "MAGGI NDL 70GM") to canonical names
41
+
42
+ **Input**: 2,000 synthetic (raw, canonical) pairs
43
+ **Method**: QLoRA, 4-bit base + LoRA adapters
44
+ **Output**: GGUF quantized model
45
+ **Hardware**: A10G, ~1 hour
46
+
47
+ **Dataset generation**:
48
+ - Hand-curated 200 SKU catalog
49
+ - Rule-based augmentation: abbreviation expansion, typo injection, truncation
50
+ - Coverage: 10 major Indian FMCG suppliers
51
+
52
+ ---
53
+
54
+ ### 3. YOLO26n (Product Detection) — `train_yolo26n.py`
55
+
56
+ **Purpose**: Count packaged products in shelf/counter photos
57
+
58
+ **Input**: 3 Roboflow datasets merged (11,000+ images)
59
+ **Method**: Ultralytics standard training pipeline
60
+ **Output**: ONNX format for CPU/GPU inference
61
+ **Hardware**: A10G, ~2 hours
62
+
63
+ **Datasets merged**:
64
+ 1. [agentsk47/indian-grocery-object-detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) v1
65
+ 2. [iit-patna/grocery_items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) v45 (6,695 images)
66
+ 3. [project-c5ho0/indian-market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) v2 (4,694 images)
67
+
68
+ ---
69
+
70
+ ## Prerequisites
71
+
72
+ ```bash
73
+ # 1. Clone this repo
74
+ git clone https://github.com/build-small-hackathon/kirana-invoice-train-data.git
75
+ cd kirana-invoice-train-data
76
+
77
+ # 2. Install local deps (only needed to preview the generated synthetic invoices)
78
+ pip install -r requirements.txt
79
+
80
+ # 3. Set up secrets for Modal/HF
81
+ modal token new
82
+ export ROBOFLOW_API_KEY=<from Roboflow universe account>
83
+ export HF_TOKEN=<from huggingface.co/settings/tokens>
84
+
85
+ # 4. Test Modal setup
86
+ modal run finetune/generate_invoices.py
87
+ ```
88
+
89
+ ---
90
+
91
+ ## Reproducibility Checklist
92
+
93
+ - [ ] **Dataset versioning**: All Roboflow versions pinned (v1, v45, v2)
94
+ - [ ] **Seed control**: Random seeds fixed in all training scripts
95
+ - [ ] **Output validation**: Run `tests/` after each model completes
96
+ - [ ] **HF Hub publish logs**: Check model card auto-generated from training
97
+ - [ ] **GGUF quantization**: Verified mAP/F1 vs. float32 baseline
98
+
99
+ ---
100
+
101
+ ## Known Limitations & Biases
102
+
103
+ | Model | Limitation | Impact | Mitigation |
104
+ |---|---|---|---|
105
+ | MiniCPM-V | Only 10 FMCG suppliers in training data | Fails on uncommon brands | Add more invoices post-hackathon |
106
+ | MiniCPM5-1B | Synthetic data only (no real invoice typos) | Overfits to rule-based augmentation | Collect 200+ real examples next |
107
+ | YOLO26n | Merged dataset skewed toward beauty/personal care (Tresemmé, Nivea, Patanjali) | May underperform on grocery staples | Balance class distribution across grocery categories |
108
+
109
+ ---
110
+
111
+ ## Troubleshooting
112
+
113
+ **"Modal timeout after 2 hours?"**
114
+ → YOLO training can take 2–3h depending on GPU queue. Increase the `timeout=` value (in seconds) in the training script's `@app.function(...)` decorator.
115
+
116
+ **"GGUF quantization fails?"**
117
+ → Ensure llama.cpp is compiled with CUDA support if GPU quantization intended.
118
+
119
+ **"HF Hub publish returns 403?"**
120
+ → `HF_TOKEN` must have write access. Regenerate at huggingface.co/settings/tokens.
121
+
122
+ ---
123
+
124
+ ## Output Files
125
+
126
+ After successful runs, check HF Hub:
127
+
128
+ - **MiniCPM-V**: `build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction`
129
+ - `model.gguf` (4.5 GB)
130
+ - `model_card.md`
131
+
132
+ - **MiniCPM5-1B**: `build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`
133
+ - `model.gguf` (1.2 GB)
134
+ - `model_card.md`
135
+
136
+ - **YOLO26n**: `build-small-hackathon/yolo26n-indian-fmcg-detection`
137
+ - `yolo26n_fmcg.onnx` (15 MB)
138
+ - `class_names.json`
139
+ - `model_card.md`
140
+
141
+ ---
142
+
143
+ ## Next Steps Post-Hackathon
144
+
145
+ 1. **Collect real invoice data** from partnered kirana stores (500 minimum)
146
+ 2. **Expand product taxonomy** (currently 200 SKUs → 2000)
147
+ 3. **Add regional variants** (Hindi/Tamil/Malayalam abbreviations)
148
+ 4. **Benchmark inference latency** on Raspberry Pi / Android devices
finetune/export_minicpm_v_gguf.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Merge MiniCPM-V 4.6 LoRA adapter into the base model and push the merged
3
+ HF weights to Hugging Face.
4
+
5
+ Why merge instead of converting LoRA to GGUF directly:
6
+ llama.cpp's convert_lora_to_gguf.py and convert_hf_to_gguf.py both fail
7
+ for MiniCPMV4_6Model (architecture not in llama.cpp's registry). The only
8
+ working path is to have ggml.ai's GGUF-my-repo Space do the conversion —
9
+ it uses a patched llama.cpp that supports this architecture.
10
+
11
+ Two-step workflow:
12
+ Step 1 (this script):
13
+ - Load base model + LoRA from Modal volume
14
+ - Merge LoRA weights into the full model (merge_and_unload)
15
+ - Push merged HF model to MERGED_HF_REPO
16
+ - Download OpenBMB's mmproj.gguf and upload it to HF_REPO for immediate use
17
+
18
+ Step 2 (manual — ~15 min):
19
+ - Go to https://huggingface.co/spaces/ggml-org/gguf-my-repo
20
+ - Enter: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged
21
+ - Select Q4_K_M quantisation
22
+ - Wait for the Space to create the GGUF repo
23
+ - Update app.py MODEL_REPO to point to the resulting GGUF repo
24
+
25
+ Run:
26
+ modal run finetune/export_minicpm_v_gguf.py
27
+
28
+ Reads adapter from: /output/minicpm-v-lora in Modal volume kirana-minicpm-v-output
29
+ Publishes merged HF model to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged
30
+ Also uploads mmproj.gguf to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import os
36
+
37
+ import modal
38
+
39
+ app = modal.App("kirana-export-minicpm-v-gguf")
40
+
41
+ IMAGE = (
42
+ modal.Image.debian_slim(python_version="3.11")
43
+ .pip_install(
44
+ "huggingface_hub>=0.30.0",
45
+ "safetensors>=0.4.3",
46
+ "torch>=2.3.0",
47
+ "transformers>=5.7.0",
48
+ "peft>=0.14.0",
49
+ "accelerate>=0.34.0",
50
+ )
51
+ )
52
+
53
+ HF_SECRET = modal.Secret.from_name("hf-secret")
54
+
55
+ BASE_MODEL = "openbmb/MiniCPM-V-4.6"
56
+ SOURCE_GGUF_REPO = "openbmb/MiniCPM-V-4.6-gguf"
57
+ HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
58
+ MERGED_HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged"
59
+
60
+ # Full professional model card is maintained in push_minicpm_v_merged_card.py.
61
+ # This is a minimal card used during the merge+push run; run push_minicpm_v_merged_card.py
62
+ # separately to update the README on HF Hub.
63
+ MODEL_CARD_MERGED = f"""\
64
+ ---
65
+ license: apache-2.0
66
+ base_model: {BASE_MODEL}
67
+ datasets:
68
+ - build-small-hackathon/kirana-invoice-train-data
69
+ language:
70
+ - en
71
+ tags:
72
+ - invoice-extraction
73
+ - indian-fmcg
74
+ - minicpm-v
75
+ - vision-language
76
+ - ocr
77
+ - qlora
78
+ - merged-weights
79
+ - kirana
80
+ - hackathon
81
+ pipeline_tag: image-text-to-text
82
+ ---
83
+
84
+ # MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)
85
+
86
+ Fine-tuned [`{BASE_MODEL}`](https://huggingface.co/{BASE_MODEL}) for structured
87
+ JSON extraction from Indian distributor (kirana) invoices. QLoRA adapter weights
88
+ are fully merged — no PEFT dependency at inference time.
89
+
90
+ See full model card: [`naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged`](https://huggingface.co/{MERGED_HF_REPO})
91
+
92
+ ## Quick Start
93
+
94
+ ```python
95
+ import torch
96
+ from transformers import AutoModel, AutoTokenizer
97
+ from PIL import Image
98
+
99
+ model = AutoModel.from_pretrained(
100
+ "{MERGED_HF_REPO}", trust_remote_code=True,
101
+ torch_dtype=torch.bfloat16, device_map="auto",
102
+ )
103
+ model.eval()
104
+ tokenizer = AutoTokenizer.from_pretrained("{MERGED_HF_REPO}", trust_remote_code=True)
105
+
106
+ image = Image.open("invoice.jpg").convert("RGB")
107
+ msgs = [{{"role": "user", "content": [image, "Extract all line items as JSON."]}}]
108
+ response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
109
+ ```
110
+
111
+ ## Training Summary
112
+
113
+ | Parameter | Value |
114
+ |---|---|
115
+ | Base model | `{BASE_MODEL}` |
116
+ | Fine-tuning | QLoRA rank 16 |
117
+ | Dataset | 450 train + 50 eval synthetic Indian invoices |
118
+ | Eval loss | 0.2120 (3 epochs) |
119
+ | Training hardware | Modal A10G, ~52 min |
120
+ | Adapter params | 9.5M / 1.3B total (0.72%) |
121
+
122
+ ## License
123
+
124
+ Apache 2.0 — same as base model.
125
+ """
126
+
127
+
128
+ def _validate_gguf_header(path: str) -> None:
129
+ with open(path, "rb") as f:
130
+ magic = f.read(4)
131
+ if magic != b"GGUF":
132
+ raise RuntimeError(f"Downloaded file is not a GGUF: {path}")
133
+
134
+
135
@app.function(
    image=IMAGE,
    timeout=3600,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False),
    },
    memory=16384,  # 16 GB — 1.3B model in bfloat16 ≈ 2.6 GB; headroom for merge + save
)
def merge_and_push():
    """Merge the LoRA adapter into the base model and publish the results.

    Steps, in order:

    1. Merge ``/output/minicpm-v-lora`` into ``BASE_MODEL`` and save the merged
       model plus the base tokenizer to ``/output/minicpm-v-merged``. The merge
       is skipped when a previous run already finished it.
    2. Create ``MERGED_HF_REPO`` if needed and upload the merged folder and the
       ``MODEL_CARD_MERGED`` README to it.
    3. Download the first mmproj GGUF listed in ``SOURCE_GGUF_REPO``, check its
       header, and upload it to ``HF_REPO`` as ``mmproj.gguf``.

    Finally prints the manual follow-up steps for GGUF conversion.

    Raises:
        KeyError: If ``HF_TOKEN`` is not set in the environment.
        RuntimeError: If the adapter directory is missing, if no mmproj GGUF is
            found in ``SOURCE_GGUF_REPO``, or if the downloaded file fails the
            GGUF header check.
    """
    import torch
    from pathlib import Path
    from peft import PeftModel
    from transformers import AutoModel, AutoTokenizer
    from huggingface_hub import HfApi, hf_hub_download

    token = os.environ["HF_TOKEN"]
    api = HfApi(token=token)

    adapter_dir = Path("/output/minicpm-v-lora")
    merged_dir = Path("/output/minicpm-v-merged")

    if not adapter_dir.exists():
        raise RuntimeError(
            f"Missing adapter directory: {adapter_dir}. "
            "Run finetune/train_minicpm_v.py first."
        )

    # ── Step 1: Merge LoRA into base model ───────────────────────────────────

    # The tokenizer is the last thing written below, so its config file doubles
    # as the "merge finished" marker. Checking config.json alone would treat a
    # run that died between the model save and the tokenizer save as complete
    # and then upload a repo without a tokenizer.
    # NOTE(review): assumes tokenizer.save_pretrained() writes
    # tokenizer_config.json for this tokenizer — confirm; if it does not, the
    # only cost is that the merge is redone on every run.
    merge_complete = all(
        (merged_dir / marker).exists()
        for marker in ("config.json", "tokenizer_config.json")
    )

    if merge_complete:
        print("Merged model already exists at /output/minicpm-v-merged, skipping merge.")
    else:
        print(f"Loading base model {BASE_MODEL} ...")
        base_model = AutoModel.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            token=token,
        )

        print(f"Loading LoRA adapter from {adapter_dir} ...")
        model = PeftModel.from_pretrained(base_model, str(adapter_dir))

        print("Merging LoRA weights into base model ...")
        merged_model = model.merge_and_unload()

        print(f"Saving merged model to {merged_dir} ...")
        merged_dir.mkdir(parents=True, exist_ok=True)
        merged_model.save_pretrained(str(merged_dir))

        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True, token=token)
        tokenizer.save_pretrained(str(merged_dir))
        print("Merge complete.")

    # ── Step 2: Create HF repo and push merged model ─────────────────────────

    print(f"Creating / verifying HF repo {MERGED_HF_REPO} ...")
    api.create_repo(repo_id=MERGED_HF_REPO, repo_type="model", exist_ok=True, private=False)

    print(f"Uploading merged model to {MERGED_HF_REPO} ...")
    api.upload_folder(
        folder_path=str(merged_dir),
        repo_id=MERGED_HF_REPO,
        repo_type="model",
        commit_message="Add merged MiniCPM-V-4.6 invoice fine-tune",
    )

    print("Uploading README.md to merged repo ...")
    api.upload_file(
        path_or_fileobj=MODEL_CARD_MERGED.encode("utf-8"),
        path_in_repo="README.md",
        repo_id=MERGED_HF_REPO,
        repo_type="model",
    )

    # ── Step 3: Download OpenBMB mmproj and upload to GGUF repo ──────────────
    # The LoRA only touched LLM layers — mmproj weights are unchanged, so
    # OpenBMB's mmproj.gguf is identical to what we would produce ourselves.

    print(f"Listing GGUF files in {SOURCE_GGUF_REPO} ...")
    source_files = list(api.list_repo_files(SOURCE_GGUF_REPO, repo_type="model"))
    mmproj_files = [f for f in source_files if "mmproj" in f.lower() and f.endswith(".gguf")]
    if not mmproj_files:
        raise RuntimeError(f"No mmproj GGUF found in {SOURCE_GGUF_REPO}. Files: {source_files}")
    # If several mmproj files are listed, the first one returned is used.
    source_mmproj = mmproj_files[0]
    print(f"Downloading {source_mmproj} ...")

    mmproj_path = hf_hub_download(
        repo_id=SOURCE_GGUF_REPO,
        filename=source_mmproj,
        repo_type="model",
        token=token,
        local_dir="/output/minicpm-v-gguf",
    )
    _validate_gguf_header(mmproj_path)

    print(f"Uploading mmproj.gguf to {HF_REPO} ...")
    api.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True, private=False)
    api.upload_file(
        path_or_fileobj=mmproj_path,
        path_in_repo="mmproj.gguf",
        repo_id=HF_REPO,
        repo_type="model",
    )

    print()
    print("=" * 70)
    print("DONE. Next steps:")
    print()
    print("1. Go to: https://huggingface.co/spaces/ggml-org/gguf-my-repo")
    print(f"2. Enter model ID: {MERGED_HF_REPO}")
    print("3. Select quantisation: Q4_K_M")
    print("4. Click convert — takes ~15 min on the Space's A10G")
    print()
    print("The Space will create a new repo (usually named")
    print(f" {MERGED_HF_REPO}-GGUF")
    print("containing model.gguf + mmproj.gguf (both for the fine-tuned model).")
    print()
    print(f"mmproj.gguf already uploaded to: https://huggingface.co/{HF_REPO}")
    print("(usable immediately — vision encoder weights are unchanged by fine-tuning)")
    print("=" * 70)
257
+
258
+
259
@app.local_entrypoint()
def main():
    """Local entrypoint: invoke ``merge_and_push`` remotely via ``.remote()``."""
    merge_and_push.remote()
finetune/push_minicpm_v_merged_card.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Push the professional README / model card to
3
+ naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged on HuggingFace.
4
+
5
+ No Modal required — runs locally using the HF token from environment.
6
+
7
+ Run:
8
+ $env:HF_TOKEN = "hf_..." # PowerShell
9
+ python finetune/push_minicpm_v_merged_card.py
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from huggingface_hub import HfApi
16
+
17
+ HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged"
18
+ BASE_MODEL = "openbmb/MiniCPM-V-4.6"
19
+ DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"
20
+
21
+ MODEL_CARD = """\
22
+ ---
23
+ license: apache-2.0
24
+ base_model: openbmb/MiniCPM-V-4.6
25
+ datasets:
26
+ - build-small-hackathon/kirana-invoice-train-data
27
+ language:
28
+ - en
29
+ tags:
30
+ - invoice-extraction
31
+ - indian-fmcg
32
+ - minicpm-v
33
+ - vision-language
34
+ - ocr
35
+ - qlora
36
+ - merged-weights
37
+ - kirana
38
+ - hackathon
39
+ pipeline_tag: image-text-to-text
40
+ ---
41
+
42
+ # MiniCPM-V 4.6 — Indian Invoice Extraction (Merged)
43
+
44
+ Fine-tuned [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) for
45
+ structured JSON extraction from Indian distributor (kirana) invoices.
46
+
47
+ QLoRA adapter weights are **fully merged** into the base model — no PEFT dependency at
48
+ inference time. Part of the **Kirana Detective** project: a six-agent AI pipeline that
49
+ audits invoices for pricing anomalies, missing deliveries, and GST errors.
50
+
51
+ ---
52
+
53
+ ## Model Details
54
+
55
+ | Attribute | Value |
56
+ |---|---|
57
+ | **Base model** | [openbmb/MiniCPM-V-4.6](https://huggingface.co/openbmb/MiniCPM-V-4.6) |
58
+ | **Task** | Vision-language OCR + structured JSON extraction |
59
+ | **Fine-tuning method** | QLoRA — 4-bit NF4 base, LoRA rank 16, α 32 |
60
+ | **Trainable parameters** | 9,486,336 / 1,309,914,352 **(0.72%)** |
61
+ | **Target modules** | `q_proj`, `k_proj`, `v_proj`, `o_proj`, `gate_proj`, `up_proj`, `down_proj` |
62
+ | **Training epochs** | 3 |
63
+ | **Final eval loss** | **0.2120** (↓ from 0.2901 at epoch 1) |
64
+ | **Training hardware** | NVIDIA A10G 22 GB VRAM (Modal) |
65
+ | **Training duration** | ~52 minutes |
66
+ | **Output format** | Merged full weights — bfloat16 |
67
+ | **Inference runtime** | `transformers` (`AutoModel` + `model.chat()`) |
68
+
69
+ ---
70
+
71
+ ## Training Data
72
+
73
+ **Dataset**: [`build-small-hackathon/kirana-invoice-train-data`](https://huggingface.co/datasets/build-small-hackathon/kirana-invoice-train-data)
74
+
75
+ | Split | Examples |
76
+ |---|---|
77
+ | Train | 450 |
78
+ | Eval | 50 |
79
+
80
+ Synthetic Indian distributor invoices generated with Pillow across:
81
+
82
+ - **10 suppliers**: HUL, Nestlé, Parle, Britannia, ITC, Amul, Dabur, Marico, Emami, Godrej
83
+ - **4 invoice formats**: Printed GST bill, Tally PDF export, handwritten, WhatsApp screenshot
84
+ - **Intentional errors injected**: GST rate mismatches, duplicate line items, price spikes — to
85
+ train the model to surface extraction warnings alongside extracted data
86
+
87
+ ---
88
+
89
+ ## Training Metrics
90
+
91
+ | Epoch | Train Loss | Eval Loss |
92
+ |---|---|---|
93
+ | 1 | — | 0.2901 |
94
+ | 2 | — | 0.2281 |
95
+ | 3 | — | **0.2120** |
96
+
97
+ ---
98
+
99
+ ## Supported Input Formats
100
+
101
+ | Format | Example |
102
+ |---|---|
103
+ | Printed GST invoice | Standard B2B tax invoice with HSN codes |
104
+ | Tally PDF export | Machine-generated tabular layout |
105
+ | Handwritten invoice | Photo of handwritten bill |
106
+ | WhatsApp screenshot | Low-resolution forwarded invoice image |
107
+
108
+ ---
109
+
110
+ ## Output Schema
111
+
112
+ The model returns **only** a JSON object matching this schema — no markdown, no prose:
113
+
114
+ ```json
115
+ {
116
+ "invoice_number": "INV-2024-001",
117
+ "supplier": "Hindustan Unilever Ltd.",
118
+ "date": "2026-06-10",
119
+ "items": [
120
+ {
121
+ "product_raw": "SURF XL 1KG",
122
+ "quantity": 12,
123
+ "unit_price": 95.00,
124
+ "gst_rate": 18,
125
+ "line_total": 1140.00
126
+ },
127
+ {
128
+ "product_raw": "MAGGI MASALA 70G",
129
+ "quantity": 48,
130
+ "unit_price": 14.00,
131
+ "gst_rate": 5,
132
+ "line_total": 672.00
133
+ }
134
+ ],
135
+ "grand_total": 9650.00,
136
+ "extraction_warnings": []
137
+ }
138
+ ```
139
+
140
+ **Field notes**:
141
+ - `product_raw` — verbatim as printed on the invoice (abbreviations, typos preserved)
142
+ - `gst_rate` — percentage value (5, 12, 18, 28), not a decimal
143
+ - `date` — ISO 8601 (`YYYY-MM-DD`) when parseable, raw string otherwise
144
+ - `extraction_warnings` — list of issues noticed (missing fields, illegible areas, GST anomalies)
145
+ - Numeric fields default to `0` when unreadable; `invoice_number`/`supplier`/`date` default to `null`
146
+
147
+ ---
148
+
149
+ ## Usage
150
+
151
+ ### Basic Inference
152
+
153
+ ```python
154
+ import torch
155
+ from transformers import AutoModel, AutoTokenizer
156
+ from PIL import Image
157
+
158
+ model = AutoModel.from_pretrained(
159
+ "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged",
160
+ trust_remote_code=True,
161
+ torch_dtype=torch.bfloat16,
162
+ device_map="auto",
163
+ )
164
+ model.eval()
165
+ tokenizer = AutoTokenizer.from_pretrained(
166
+ "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged",
167
+ trust_remote_code=True,
168
+ )
169
+
170
+ image = Image.open("invoice.jpg").convert("RGB")
171
+
172
+ prompt = (
173
+ "You are an OCR agent for Indian kirana store invoices. "
174
+ "Extract all information from this invoice image and return ONLY valid JSON "
175
+ "matching this schema exactly:\\n"
176
+ '{"invoice_number": string|null, "supplier": string|null, "date": string|null, '
177
+ '"items": [{"product_raw": string, "quantity": number, "unit_price": number, '
178
+ '"gst_rate": number, "line_total": number}], '
179
+ '"grand_total": number, "extraction_warnings": [string]}\\n'
180
+ "Return ONLY the JSON object, no markdown, no prose."
181
+ )
182
+
183
+ msgs = [{"role": "user", "content": [image, prompt]}]
184
+ response = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
185
+ print(response)
186
+ ```
187
+
188
+ ### From a PDF (multi-page)
189
+
190
+ ```python
191
+ import fitz # PyMuPDF
192
+ from PIL import Image
193
+ import io, json
194
+
195
+ doc = fitz.open("invoice.pdf")
196
+ results = []
197
+ for page in doc:
198
+ pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
199
+ img = Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")
200
+ msgs = [{"role": "user", "content": [img, prompt]}]
201
+ raw = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False, max_new_tokens=2048)
202
+ results.append(json.loads(raw))
203
+ ```
204
+
205
+ ---
206
+
207
+ ## How It Fits in Kirana Detective
208
+
209
+ ```
210
+ Invoice image
211
+
212
+
213
+ ┌─────────────────────────────┐
214
+ │ Agent 1 — Invoice Extractor │ ← this model
215
+ │ MiniCPM-V 4.6 (merged) │
216
+ └─────────────────────────────┘
217
+ │ InvoiceJSON (raw product names)
218
+
219
+ ┌─────────────────────────────┐
220
+ │ Agent 2 — Product Matcher │ MiniCPM5-1B normalizer
221
+ └─────────────────────────────┘
222
+
223
+
224
+ ┌─────────────────────────────┐
225
+ │ Agent 3 — Pricing Check │ catalog + price history
226
+ └─────────────────────────────┘
227
+
228
+ ▼ (+ delivery photos)
229
+ ┌─────────────────────────────┐
230
+ │ Agent 4 — Visual Counter │ YOLO26n ONNX
231
+ └─────────────────────────────┘
232
+
233
+
234
+ ┌─────────────────────────────┐
235
+ │ Agent 5 — Reconciliation │
236
+ │ Agent 6 — Savings Report │ MiniCPM5-1B
237
+ └─────────────────────────────┘
238
+
239
+
240
+ ₹ Leakage report + action items
241
+ ```
242
+
243
+ Related repos:
244
+ - [`build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer`](https://huggingface.co/build-small-hackathon/minicpm5-1b-indian-fmcg-normalizer) — product name normalizer
245
+ - [`build-small-hackathon/yolo26n-indian-fmcg-detection`](https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection) — YOLO product counter
246
+
247
+ ---
248
+
249
+ ## Limitations
250
+
251
+ - Trained on **synthetic** invoices only — real-world performance may vary on heavily degraded,
252
+ stamped, or non-standard layouts until production data is collected.
253
+ - Optimised for **English and numeric** invoice content; Hindi/regional-language invoices are
254
+ not yet covered.
255
+ - Product names are extracted **verbatim** (`product_raw`) — normalization to canonical SKU
256
+ names is handled downstream by the MiniCPM5-1B normalizer agent.
257
+ - `grand_total` extraction can fail on invoices with complex multi-page subtotal structures.
258
+
259
+ ---
260
+
261
+ ## Reproducibility
262
+
263
+ The LoRA adapter was trained with this script and then merged:
264
+
265
+ ```bash
266
+ modal run finetune/train_minicpm_v.py # fine-tune → saves adapter to Modal volume
267
+ modal run finetune/export_minicpm_v_gguf.py # merge LoRA → push merged weights to HF
268
+ ```
269
+
270
+ Source: [GitHub — Kirana Detective](https://github.com/naazimsnh02/kirana-detective)
271
+
272
+ ---
273
+
274
+ ## Citation
275
+
276
+ ```bibtex
277
+ @misc{kirana_detective_minicpmv_2026,
278
+ author = {Hussain, Syed Naazim},
279
+ title = {MiniCPM-V 4.6 Fine-Tuned for Indian Invoice Extraction},
280
+ year = {2026},
281
+ publisher = {HuggingFace},
282
+ howpublished = {\\url{https://huggingface.co/naazimsnh02/minicpm-v-4-6-indian-invoice-extraction-merged}},
283
+ }
284
+ ```
285
+
286
+ ---
287
+
288
+ ## License
289
+
290
+ Apache 2.0 — same license as the base [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) model.
291
+ """
292
+
293
+
294
def main() -> None:
    """Upload ``MODEL_CARD`` as ``README.md`` to the ``HF_REPO`` model repo.

    The HuggingFace token is read from the ``HF_TOKEN`` environment variable.
    The repo is created first if it does not exist yet.

    Raises:
        SystemExit: If ``HF_TOKEN`` is unset or empty.
    """
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token is None or hf_token == "":
        raise SystemExit("Set HF_TOKEN environment variable before running.")

    hub = HfApi(token=hf_token)
    hub.create_repo(repo_id=HF_REPO, repo_type="model", exist_ok=True, private=False)

    readme_bytes = MODEL_CARD.encode("utf-8")
    hub.upload_file(
        path_or_fileobj=readme_bytes,
        path_in_repo="README.md",
        repo_id=HF_REPO,
        repo_type="model",
        commit_message="Update professional model card",
    )
    print(f"Model card pushed to https://huggingface.co/{HF_REPO}")


if __name__ == "__main__":
    main()
finetune/push_minicpm_v_to_hf.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Push the trained MiniCPM-V LoRA adapter from Modal volume to HuggingFace Hub.
3
+
4
+ Usage:
5
+ modal run finetune/push_minicpm_v_to_hf.py
6
+
7
+ Reads from Modal volume: kirana-minicpm-v-output (/output/minicpm-v-lora)
8
+ Pushes to: naazimsnh02/minicpm-v-4-6-indian-invoice-extraction
9
+ """
10
+
11
+ import os
12
+ import modal
13
+
14
+ app = modal.App("kirana-push-minicpm-v")
15
+
16
+ IMAGE = (
17
+ modal.Image.debian_slim(python_version="3.11")
18
+ .pip_install("huggingface_hub>=0.30.0")
19
+ )
20
+
21
+ HF_SECRET = modal.Secret.from_name("hf-secret")
22
+
23
+ HF_REPO = "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction"
24
+ BASE_MODEL = "openbmb/MiniCPM-V-4.6"
25
+ HF_DATASET_REPO = "build-small-hackathon/kirana-invoice-train-data"
26
+
27
+ MODEL_CARD = """\
28
+ ---
29
+ license: apache-2.0
30
+ base_model: openbmb/MiniCPM-V-4.6
31
+ datasets:
32
+ - build-small-hackathon/kirana-invoice-train-data
33
+ language:
34
+ - en
35
+ tags:
36
+ - invoice-extraction
37
+ - indian-fmcg
38
+ - minicpm-v
39
+ - ocr
40
+ - qlora
41
+ - peft
42
+ - kirana
43
+ - vision-language
44
+ pipeline_tag: image-text-to-text
45
+ ---
46
+
47
+ # MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)
48
+
49
+ Fine-tuned [`openbmb/MiniCPM-V-4.6`](https://huggingface.co/openbmb/MiniCPM-V-4.6) for structured JSON extraction from Indian distributor invoices.
50
+ Part of the **Kirana Detective** project — an AI audit pipeline for small Indian grocery (kirana) stores.
51
+
52
+ > **This is a PEFT LoRA adapter** — you need the base model + this adapter to run inference.
53
+
54
+ ## Training Results
55
+
56
+ | Epoch | Train Loss | Eval Loss |
57
+ |-------|-----------|-----------|
58
+ | 1 | ~6.08 | 0.2901 |
59
+ | 2 | ~3.95 | 0.2281 |
60
+ | 3 | ~3.33 | **0.212** |
61
+
62
+ **Training summary** (3 epochs, 87 steps, ~52 min on A10G):
63
+ - Total average train loss: 4.774
64
+ - Best eval loss: **0.212** (epoch 3, loaded as final checkpoint)
65
+ - Trainable parameters: 9,486,336 / 1,309,914,352 (0.72%)
66
+ - Dataset: 450 train + 50 eval synthetic invoices
67
+
68
+ ## Usage
69
+
70
+ ```python
71
+ from peft import PeftModel, PeftConfig
72
+ from transformers import AutoModel, AutoProcessor
73
+ import torch
74
+
75
+ # Load adapter config to get base model id
76
+ config = PeftConfig.from_pretrained("naazimsnh02/minicpm-v-4-6-indian-invoice-extraction")
77
+
78
+ base_model = AutoModel.from_pretrained(
79
+ config.base_model_name_or_path,
80
+ trust_remote_code=True,
81
+ torch_dtype=torch.bfloat16,
82
+ device_map="auto",
83
+ )
84
+
85
+ model = PeftModel.from_pretrained(base_model, "naazimsnh02/minicpm-v-4-6-indian-invoice-extraction")
86
+ processor = AutoProcessor.from_pretrained("naazimsnh02/minicpm-v-4-6-indian-invoice-extraction", trust_remote_code=True)
87
+ ```
88
+
89
+ ### Inference Example
90
+
91
+ ```python
92
+ from PIL import Image
93
+
94
+ image = Image.open("invoice.jpg")
95
+ messages = [
96
+ {
97
+ "role": "system",
98
+ "content": "You are an invoice extraction assistant. Given an invoice image, extract all fields as valid JSON. Return ONLY the JSON object, no explanation."
99
+ },
100
+ {
101
+ "role": "user",
102
+ "content": [
103
+ {"type": "image", "image": image},
104
+ {"type": "text", "text": "Extract all invoice fields as JSON."}
105
+ ]
106
+ }
107
+ ]
108
+
109
+ inputs = processor(messages, return_tensors="pt").to(model.device)
110
+ with torch.no_grad():
111
+ output = model.generate(**inputs, max_new_tokens=512)
112
+ result_json = processor.decode(output[0], skip_special_tokens=True)
113
+ ```
114
+
115
+ ## Output Schema
116
+
117
+ ```json
118
+ {
119
+ "invoice_number": "INV-2024-001",
120
+ "supplier": "Hindustan Unilever Ltd.",
121
+ "date": "2026-06-10",
122
+ "items": [
123
+ {
124
+ "product_raw": "SURF XL 1KG",
125
+ "quantity": 12,
126
+ "unit_price": 95.00,
127
+ "gst_rate": 18,
128
+ "line_total": 1140.00
129
+ }
130
+ ],
131
+ "grand_total": 9650.00,
132
+ "extraction_warnings": []
133
+ }
134
+ ```
135
+
136
+ ## Supported Invoice Formats
137
+
138
+ - Printed GST invoices (Tally-style, thermal-print)
139
+ - Tally PDF exports
140
+ - WhatsApp screenshot invoices
141
+ - Handwritten bills
142
+
143
+ ## Training Details
144
+
145
+ | Parameter | Value |
146
+ |-----------|-------|
147
+ | Base model | openbmb/MiniCPM-V-4.6 |
148
+ | Model class | MiniCPMV4_6ForConditionalGeneration |
149
+ | Fine-tuning method | QLoRA (4-bit + LoRA) |
150
+ | LoRA rank | 16 |
151
+ | Quantization | bitsandbytes 4-bit (nf4) |
152
+ | Batch size | 1 (grad accum × 16 = effective 16) |
153
+ | Learning rate | 1e-4 (cosine decay, warmup 10 steps) |
154
+ | Epochs | 3 |
155
+ | Total steps | 87 |
156
+ | Hardware | NVIDIA A10G (22 GB VRAM) |
157
+ | Training time | ~52 minutes |
158
+ | Orchestration | Modal (serverless GPU) |
159
+ | Framework | Transformers ≥ 5.7.0 + PEFT |
160
+
161
+ ## Citation
162
+
163
+ ```bibtex
164
+ @misc{kirana-detective-minicpm-v-2026,
165
+ title = {Kirana Detective: MiniCPM-V 4.6 Indian Invoice Extraction},
166
+ author = {Syed Naazim Hussain},
167
+ year = {2026},
168
+ url = {https://huggingface.co/naazimsnh02/minicpm-v-4-6-indian-invoice-extraction}
169
+ }
170
+ ```
171
+
172
+ ## License
173
+
174
+ Apache 2.0 (same as base model openbmb/MiniCPM-V-4.6)
175
+ """
176
+
177
+
178
@app.function(
    image=IMAGE,
    timeout=600,
    secrets=[HF_SECRET],
    volumes={
        "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=False)
    },
)
def push_to_hub():
    """Publish the LoRA adapter files and the model card to ``HF_REPO``.

    Every regular file directly inside ``/output/minicpm-v-lora`` is uploaded
    under its own name, together with ``MODEL_CARD`` as ``README.md``, in a
    single commit. Subdirectories of the adapter directory are not uploaded.

    The adapter directory is checked before the repo is created, so a missing
    training output does not leave an empty public repo behind.

    Raises:
        KeyError: If ``HF_TOKEN`` is not set in the environment.
        FileNotFoundError: If the adapter directory is missing or contains no
            files.
    """
    from huggingface_hub import CommitOperationAdd, HfApi
    from pathlib import Path

    token = os.environ["HF_TOKEN"]
    api = HfApi(token=token)

    adapter_dir = Path("/output/minicpm-v-lora")
    if not adapter_dir.exists():
        raise FileNotFoundError(
            f"Adapter not found at {adapter_dir}. "
            "Did the training job complete successfully?"
        )

    files = sorted(p for p in adapter_dir.iterdir() if p.is_file())
    if not files:
        raise FileNotFoundError(
            f"No adapter files found in {adapter_dir}. "
            "Did the training job complete successfully?"
        )

    print(f"Found {len(files)} files in {adapter_dir}:")
    for f in files:
        print(f" {f.name} ({f.stat().st_size / 1024:.1f} KB)")

    print(f"Creating repo: {HF_REPO}")
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)

    # A README.md found in the adapter directory is skipped: the model card
    # below is what ends up as README.md either way, and a commit should not
    # carry two operations for the same path.
    operations = [
        CommitOperationAdd(path_in_repo=f.name, path_or_fileobj=str(f))
        for f in files
        if f.name != "README.md"
    ]
    operations.append(
        CommitOperationAdd(
            path_in_repo="README.md",
            path_or_fileobj=MODEL_CARD.encode("utf-8"),
        )
    )

    # One commit for everything, so the repo never holds a partial upload.
    print(f"Uploading {len(operations)} files (adapter + README.md model card)...")
    api.create_commit(
        repo_id=HF_REPO,
        repo_type="model",
        operations=operations,
        commit_message="Upload LoRA adapter and model card",
    )

    print(f"\nDone! Model published at: https://huggingface.co/{HF_REPO}")
227
+
228
+
229
@app.local_entrypoint()
def main():
    """Local entrypoint: invoke ``push_to_hub`` remotely via ``.remote()``."""
    push_to_hub.remote()
finetune/push_yolo_to_hf.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Push trained YOLO artifacts from Modal volume to HuggingFace Hub.
3
+
4
+ Usage:
5
+ modal run finetune/push_yolo_to_hf.py
6
+
7
+ Reads from Modal volume: kirana-yolo-output (/output/)
8
+ Pushes to: naazimsnh02/yolo26n-indian-fmcg-detection
9
+ - best.pt (PyTorch weights)
10
+ - best.onnx (ONNX, opset 12)
11
+ - class_names.json
12
+ - README.md (model card)
13
+ """
14
+
15
+ import os
16
+ import modal
17
+
18
+ app = modal.App("kirana-push-yolo")
19
+
20
+ IMAGE = (
21
+ modal.Image.debian_slim(python_version="3.11")
22
+ .pip_install("huggingface_hub>=0.30.0")
23
+ )
24
+
25
+ HF_SECRET = modal.Secret.from_name("hf-secret")
26
+ HF_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
27
+
28
+ MODEL_CARD = """\
29
+ ---
30
+ license: apache-2.0
31
+ base_model: yolo26n
32
+ language:
33
+ - en
34
+ tags:
35
+ - object-detection
36
+ - yolo
37
+ - indian-fmcg
38
+ - onnx
39
+ - ultralytics
40
+ - kirana
41
+ pipeline_tag: object-detection
42
+ datasets:
43
+ - agentsk47/indian-grocery-object-detection-mfsnx
44
+ - iit-patna-qg1jh/grocery_items-7i2em
45
+ - project-c5ho0/indian-market-qieug
46
+ ---
47
+
48
+ # YOLO26n — Indian FMCG Product Detection
49
+
50
+ Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources**
51
+ from Roboflow Universe. Part of the **Kirana Detective** project — an AI system for small Indian grocery
52
+ stores to visually count and reconcile shelf/counter inventory from photos.
53
+
54
+ ## Performance
55
+
56
+ | Metric | Value |
57
+ |---|---|
58
+ | mAP50 (all classes) | **0.428** |
59
+ | mAP50-95 (all classes) | **0.302** |
60
+ | Total classes | 1,831 |
61
+ | Validation images | 1,236 |
62
+ | Validation instances | 13,443 |
63
+
64
+ Training ran for **100 epochs** (60 initial + 40 resumed after restart) on an NVIDIA A10G via Modal.
65
+
66
+ ## Training Datasets
67
+
68
+ | Dataset | Workspace | Version | Images | Classes |
69
+ |---|---|---|---|---|
70
+ | [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
71
+ | [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
72
+ | [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
73
+
74
+ All three datasets were downloaded in **YOLOv8 format**, class IDs remapped to a unified list, and merged
75
+ before training. The full unified class list (1,831 entries) is available in `class_names.json`.
76
+
77
+ ## Files
78
+
79
+ | File | Description |
80
+ |---|---|
81
+ | `best.pt` | PyTorch checkpoint (best mAP50 epoch) |
82
+ | `best.onnx` | ONNX export, opset 12 (recommended for inference) |
83
+ | `class_names.json` | Full list of 1,831 class names (index = class_id) |
84
+
85
+ ## How to Use
86
+
87
+ ### ONNX Runtime (CPU / any platform)
88
+
89
+ ```python
90
+ import json, numpy as np, onnxruntime as ort
91
+ from PIL import Image
92
+
93
+ session = ort.InferenceSession("best.onnx", providers=["CPUExecutionProvider"])
94
+ class_names = json.load(open("class_names.json"))
95
+
96
+ def preprocess(path, size=640):
97
+ img = Image.open(path).convert("RGB").resize((size, size))
98
+ return (np.array(img, dtype=np.float32) / 255.0).transpose(2, 0, 1)[None]
99
+
100
+ input_name = session.get_inputs()[0].name
101
+ outputs = session.run(None, {input_name: preprocess("shelf.jpg")})
102
+ # outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
103
+ ```
104
+
105
+ ### Ultralytics (PyTorch)
106
+
107
+ ```python
108
+ from ultralytics import YOLO
109
+
110
+ model = YOLO("best.pt")
111
+ results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
112
+ results[0].show()
113
+ ```
114
+
115
+ ## Training Details
116
+
117
+ | Parameter | Value |
118
+ |---|---|
119
+ | Base model | YOLO26n |
120
+ | Input size | 640 × 640 |
121
+ | Epochs | 100 (60 + 40 resumed) |
122
+ | Batch size | 16 |
123
+ | Early stopping patience | 20 |
124
+ | Export format | ONNX opset 12 |
125
+ | Hardware | NVIDIA A10G (Modal) |
126
+
127
+ ## Citation
128
+
129
+ ```bibtex
130
+ @misc{kirana-detective-yolo-2026,
131
+ title = {Kirana Detective: YOLO26n Indian FMCG Product Detector},
132
+ author = {Naazim},
133
+ year = {2026},
134
+ url = {https://huggingface.co/naazimsnh02/yolo26n-indian-fmcg-detection}
135
+ }
136
+ ```
137
+ """
138
+
139
+
140
@app.function(
    image=IMAGE,
    timeout=600,
    secrets=[HF_SECRET],
    volumes={"/output": modal.Volume.from_name("kirana-yolo-output", create_if_missing=False)},
)
def push_to_hub():
    """Publish the YOLO weights, class list and model card to ``HF_REPO``.

    Lists every file under ``/output``, checks that ``best.pt``, ``best.onnx``
    and ``class_names.json`` exist, then stages them together with
    ``MODEL_CARD`` (as ``README.md``) in a temporary directory and uploads that
    directory with one ``upload_folder`` call.

    Raises:
        FileNotFoundError: If any of the three artifacts is missing.
        KeyError: If ``HF_TOKEN`` is not set in the environment.
    """
    import json
    import shutil
    import tempfile
    from pathlib import Path
    from huggingface_hub import HfApi

    # --- Locate artifacts ---
    volume_root = Path("/output")
    weights_dir = volume_root / "runs/yolo26n_fmcg/weights"
    # Name in the HF repo -> source path on the volume (insertion order matters
    # for the "missing" report below).
    artifacts = {
        "best.pt": weights_dir / "best.pt",
        "best.onnx": weights_dir / "best.onnx",
        "class_names.json": volume_root / "class_names.json",
    }

    print("=== Volume contents (/output) ===")
    for entry in sorted(volume_root.rglob("*")):
        if not entry.is_file():
            continue
        print(f" {entry.relative_to(volume_root)} ({entry.stat().st_size / 1024:.1f} KB)")

    absent = [str(src) for src in artifacts.values() if not src.exists()]
    if absent:
        raise FileNotFoundError(f"Missing artifacts: {absent}")

    with artifacts["class_names.json"].open() as handle:
        class_list = json.load(handle)
    print(f"\nClass count: {len(class_list)}")

    # --- Stage all files into a temp folder, then push as a single commit ---
    hub = HfApi(token=os.environ["HF_TOKEN"])
    hub.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)

    with tempfile.TemporaryDirectory() as tmp_name:
        stage_dir = Path(tmp_name)
        for repo_name, src in artifacts.items():
            shutil.copy(src, stage_dir / repo_name)
        (stage_dir / "README.md").write_text(MODEL_CARD, encoding="utf-8")

        print("\nFiles staged for upload:")
        for staged in sorted(stage_dir.iterdir()):
            print(f" {staged.name} ({staged.stat().st_size / 1024:.1f} KB)")

        # The upload must happen while the temporary directory still exists.
        print("\nPushing to HF Hub (single commit)...")
        hub.upload_folder(
            folder_path=str(stage_dir),
            repo_id=HF_REPO,
            repo_type="model",
            commit_message="Add best.pt, best.onnx, class_names.json, README (100-epoch FMCG detector)",
        )

    print(f"\nDone — https://huggingface.co/{HF_REPO}")
196
+
197
+
198
@app.local_entrypoint()
def main():
    """Local entrypoint: invoke ``push_to_hub`` remotely via ``.remote()``."""
    push_to_hub.remote()
finetune/train_minicpm_v.py CHANGED
@@ -10,10 +10,11 @@ Two-step workflow:
10
 
11
  Publishes:
12
  build-small-hackathon/kirana-invoice-train-data (HF dataset, reusable)
13
- naazimsnh02/minicpm-v-4-6-indian-invoice-extraction (model GGUF)
14
 
15
  Training approach:
16
- QLoRA via Unsloth on base openbmb/MiniCPM-V-4.6
 
17
  System prompt: "Extract invoice JSON"
18
  User turn: <image> + "Extract all invoice fields as JSON"
19
  Assistant turn: <annotation JSON>
@@ -29,17 +30,20 @@ app = modal.App("kirana-minicpm-v-finetune")
29
 
30
  IMAGE = (
31
  modal.Image.debian_slim(python_version="3.11")
32
- .apt_install("libsm6", "libxext6")
33
  .pip_install(
34
- "unsloth>=2026.5.0",
35
  "huggingface_hub>=0.30.0",
36
  "datasets>=3.0.0",
37
  "torch>=2.3.0",
38
  "torchvision>=0.18.0",
39
- "transformers[torch]<=5.5.0",
 
 
 
40
  "trl>=0.9.0",
41
- "peft>=0.18.0",
42
  "pillow>=10.0.0",
 
 
43
  )
44
  )
45
 
@@ -53,9 +57,9 @@ ANNOTATIONS_PATH = "/data/synthetic_invoices/annotations.jsonl"
53
  LORA_RANK = 16
54
  MAX_SEQ_LENGTH = 2048
55
  EPOCHS = 3
56
- BATCH_SIZE = 4
57
- GRAD_ACCUM = 4
58
- LEARNING_RATE = 2e-4
59
 
60
  SYSTEM_PROMPT = (
61
  "You are an invoice extraction assistant. "
@@ -91,11 +95,7 @@ JSON_SCHEMA = """{
91
  memory=8192,
92
  )
93
  def push_dataset():
94
- """Build a HF dataset from the Modal volume and push to Hub.
95
-
96
- Uses flat image column (one Image() per row, not a nested list) so Arrow
97
- serialisation never encounters mixed list/non-list types.
98
- """
99
  from PIL import Image as PILImage
100
  from datasets import Dataset, Features, Value
101
  from datasets import Image as HFImage
@@ -135,7 +135,7 @@ def push_dataset():
135
  @app.function(
136
  image=IMAGE,
137
  gpu="A10G",
138
- timeout=14400, # 4 hours
139
  secrets=[HF_SECRET],
140
  volumes={
141
  "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
@@ -143,125 +143,287 @@ def push_dataset():
143
  memory=32768,
144
  )
145
  def train():
 
146
  from datasets import load_dataset
147
  from huggingface_hub import HfApi
148
- from unsloth import FastVisionModel
149
- from unsloth.trainer import UnslothVisionDataCollator
150
- from trl import SFTTrainer, SFTConfig
 
 
 
 
 
 
151
 
152
- # Load the pre-built dataset from HF Hub.
153
- # image column is decoded to PIL on access; response is a plain string.
154
  hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
155
  print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")
156
 
157
- instruction = (
158
- f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
 
 
 
 
 
 
 
 
 
 
 
 
159
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- def to_conversation(sample):
162
- # All content values are lists so Arrow infers a single consistent struct
163
- # type (struct<type, text?>) with a nullable text field — no mixed
164
- # list/non-list values at any nesting level.
165
  return {
166
- "messages": [
167
- {
168
- "role": "system",
169
- "content": [{"type": "text", "text": SYSTEM_PROMPT}],
170
- },
171
- {
172
- "role": "user",
173
- "content": [
174
- {"type": "image"}, # placeholder matched by collator
175
- {"type": "text", "text": instruction},
176
- ],
177
- },
178
- {
179
- "role": "assistant",
180
- "content": [{"type": "text", "text": sample["response"]}],
181
- },
182
- ],
183
- # Single PIL Image per row (flat, not wrapped in a list).
184
- # Arrow stores this as Image() — no nested-list serialisation issue.
185
- "images": sample["image"],
186
  }
187
 
188
- train_dataset = hf_ds["train"].map(
189
- to_conversation, remove_columns=hf_ds["train"].column_names
190
- )
191
- eval_dataset = hf_ds["test"].map(
192
- to_conversation, remove_columns=hf_ds["test"].column_names
193
- )
 
 
 
 
 
 
 
194
 
195
- # --- Load model with Unsloth ---
196
- model, tokenizer = FastVisionModel.from_pretrained(
197
- BASE_MODEL,
198
- load_in_4bit=True,
199
- use_gradient_checkpointing="unsloth",
200
- )
201
 
202
- model = FastVisionModel.get_peft_model(
203
- model,
204
- finetune_vision_layers=True,
205
- finetune_language_layers=True,
206
- finetune_attention_modules=True,
207
- finetune_mlp_modules=True,
208
  r=LORA_RANK,
209
- lora_alpha=LORA_RANK,
210
- lora_dropout=0,
 
211
  bias="none",
212
- random_state=42,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
213
  )
214
 
215
- # UnslothVisionDataCollator handles apply_chat_template + image injection
216
- # at batch time; remove_unused_columns=False keeps the images column.
217
- trainer = SFTTrainer(
218
  model=model,
219
- tokenizer=tokenizer,
220
- data_collator=UnslothVisionDataCollator(model, tokenizer),
221
  train_dataset=train_dataset,
222
  eval_dataset=eval_dataset,
223
- args=SFTConfig(
224
- output_dir="/output/minicpm-v-sft",
225
- per_device_train_batch_size=BATCH_SIZE,
226
- gradient_accumulation_steps=GRAD_ACCUM,
227
- warmup_steps=10,
228
- num_train_epochs=EPOCHS,
229
- learning_rate=LEARNING_RATE,
230
- fp16=False,
231
- bf16=True,
232
- logging_steps=10,
233
- eval_strategy="epoch",
234
- save_strategy="epoch",
235
- load_best_model_at_end=True,
236
- report_to="none",
237
- max_seq_length=MAX_SEQ_LENGTH,
238
- remove_unused_columns=False,
239
- ),
240
  )
241
  trainer.train()
242
  print("Training complete")
243
 
244
- # --- Save merged model ---
245
- model.save_pretrained_merged(
246
- "/output/minicpm-v-merged", tokenizer, save_method="merged_16bit"
247
- )
248
-
249
- # --- Export to GGUF Q4_K_M ---
250
- model.save_pretrained_gguf(
251
- "/output/minicpm-v-gguf",
252
- tokenizer,
253
- quantization_method="q4_k_m",
254
- )
255
- print("GGUF export complete")
256
 
257
- # --- Publish model to HF Hub ---
258
  api = HfApi(token=os.environ["HF_TOKEN"])
259
  api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
260
 
261
- gguf_dir = Path("/output/minicpm-v-gguf")
262
- for f in gguf_dir.glob("*.gguf"):
263
- print(f"Uploading {f.name}...")
264
- api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)
 
265
 
266
  model_card = f"""---
267
  license: apache-2.0
@@ -271,13 +433,13 @@ datasets:
271
  tags:
272
  - invoice-extraction
273
  - indian-fmcg
274
- - gguf
275
  - minicpm-v
276
  - ocr
277
  - qlora
 
278
  ---
279
 
280
- # MiniCPM-V 4.6 — Indian Invoice Extraction
281
 
282
  Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
283
  from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
@@ -286,18 +448,19 @@ handwritten bills).
286
  ## Usage
287
 
288
  ```python
289
- from llama_cpp import Llama
290
- from llama_cpp.llama_chat_format import MiniCPMv26ChatHandler
291
 
292
- handler = MiniCPMv26ChatHandler(clip_model_path="mmproj.gguf")
293
- llm = Llama(model_path="model.gguf", chat_handler=handler, n_ctx=4096)
 
294
  ```
295
 
296
  ## Training
297
 
298
  - Base model: {BASE_MODEL}
299
- - Method: QLoRA (rank {LORA_RANK}) via Unsloth on Modal A10G
300
- - Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — 500 synthetic Indian invoices (4 formats × 10 suppliers)
301
  - Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
302
  """
303
  api.upload_file(
@@ -308,6 +471,78 @@ llm = Llama(model_path="model.gguf", chat_handler=handler, n_ctx=4096)
308
  print(f"Published to {HF_REPO}")
309
 
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  # ─── Local entrypoints ─────────────────────────────────────────────────────────
312
 
313
  @app.local_entrypoint()
@@ -318,3 +553,8 @@ def main():
318
  @app.local_entrypoint()
319
  def main_push():
320
  push_dataset.remote()
 
 
 
 
 
 
10
 
11
  Publishes:
12
  build-small-hackathon/kirana-invoice-train-data (HF dataset, reusable)
13
+ build-small-hackathon/minicpm-v-4-6-indian-invoice-extraction (LoRA adapter + tokenizer; GGUF is no longer exported by this script)
14
 
15
  Training approach:
16
+ QLoRA via native PEFT + bitsandbytes on base openbmb/MiniCPM-V-4.6
17
+ (unsloth is incompatible with MiniCPM-V-4.6, which requires transformers>=5.7.0)
18
  System prompt: "Extract invoice JSON"
19
  User turn: <image> + "Extract all invoice fields as JSON"
20
  Assistant turn: <annotation JSON>
 
30
 
31
  IMAGE = (
32
  modal.Image.debian_slim(python_version="3.11")
33
+ .apt_install("libsm6", "libxext6", "git")
34
  .pip_install(
 
35
  "huggingface_hub>=0.30.0",
36
  "datasets>=3.0.0",
37
  "torch>=2.3.0",
38
  "torchvision>=0.18.0",
39
+ "transformers[torch]>=5.7.0",
40
+ "peft>=0.14.0",
41
+ "bitsandbytes>=0.43.0",
42
+ "accelerate>=0.28.0",
43
  "trl>=0.9.0",
 
44
  "pillow>=10.0.0",
45
+ "sentencepiece>=0.2.0",
46
+ "timm>=0.9.0",
47
  )
48
  )
49
 
 
57
  LORA_RANK = 16
58
  MAX_SEQ_LENGTH = 2048
59
  EPOCHS = 3
60
+ BATCH_SIZE = 1
61
+ GRAD_ACCUM = 16
62
+ LEARNING_RATE = 1e-4
63
 
64
  SYSTEM_PROMPT = (
65
  "You are an invoice extraction assistant. "
 
95
  memory=8192,
96
  )
97
  def push_dataset():
98
+ """Build a HF dataset from the Modal volume and push to Hub."""
 
 
 
 
99
  from PIL import Image as PILImage
100
  from datasets import Dataset, Features, Value
101
  from datasets import Image as HFImage
 
135
  @app.function(
136
  image=IMAGE,
137
  gpu="A10G",
138
+ timeout=14400,
139
  secrets=[HF_SECRET],
140
  volumes={
141
  "/output": modal.Volume.from_name("kirana-minicpm-v-output", create_if_missing=True),
 
143
  memory=32768,
144
  )
145
  def train():
146
+ import torch
147
  from datasets import load_dataset
148
  from huggingface_hub import HfApi
149
+ from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
150
+ from torch.utils.data import Dataset as TorchDataset
151
+ from transformers import (
152
+ AutoModelForMultimodalLM,
153
+ AutoTokenizer,
154
+ BitsAndBytesConfig,
155
+ Trainer,
156
+ TrainingArguments,
157
+ )
158
 
159
+ # ── Load dataset ──────────────────────────────────────────────────────────
 
160
  hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
161
  print(f"Dataset: {len(hf_ds['train'])} train, {len(hf_ds['test'])} eval")
162
 
163
+ # ── Load model with 4-bit QLoRA ───────────────────────────────────────────
164
+ bnb_config = BitsAndBytesConfig(
165
+ load_in_4bit=True,
166
+ bnb_4bit_quant_type="nf4",
167
+ bnb_4bit_compute_dtype=torch.bfloat16,
168
+ bnb_4bit_use_double_quant=True,
169
+ )
170
+
171
+ raw_model = AutoModelForMultimodalLM.from_pretrained(
172
+ BASE_MODEL,
173
+ quantization_config=bnb_config,
174
+ trust_remote_code=True,
175
+ dtype=torch.bfloat16,
176
+ device_map="auto",
177
  )
178
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
179
+ print(f"Loaded model class: {raw_model.__class__.__name__}")
180
+
181
+ # ── Discover image preprocessing API ─────────────────────────────────────
182
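+ # Load AutoProcessor (modern HuggingFace VLM interface). NOTE: there is no fallback — preprocess_one() below calls processor unconditionally, so if loading fails every sample is skipped.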
183
+ processor = None
184
+ try:
185
+ from transformers import AutoProcessor
186
+ processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
187
+ print(f"Loaded processor: {processor.__class__.__name__}")
188
+ except Exception as e:
189
+ print(f"AutoProcessor not available: {e}")
190
+
191
+ # Find image placeholder token (MiniCPM-V uses (<image>./</image>) or <image>)
192
+ image_token = None
193
+ vocab = tokenizer.get_vocab()
194
+ for candidate in ["(<image>./</image>)", "<image>", "<IMAGE>", "[IMAGE]"]:
195
+ if candidate in vocab:
196
+ image_token = candidate
197
+ break
198
+ # Fallback: scan tokenizer's special/added tokens
199
+ if image_token is None:
200
+ for token in list(tokenizer.special_tokens_map.values()) + list(tokenizer.added_tokens_encoder.keys()):
201
+ if isinstance(token, str) and "image" in token.lower():
202
+ image_token = token
203
+ break
204
+ print(f"Image placeholder token: {image_token!r}")
205
+ print(f"Special tokens: {tokenizer.special_tokens_map}")
206
+
207
+ # ── Pre-process ALL data before PEFT wrapping ─────────────────────────────
208
+ instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
209
+
210
+ def preprocess_one(image, response, debug=False):
211
+ image = image.convert("RGB")
212
+
213
+ # MUST use processor.apply_chat_template with {"type": "image"} —
214
+ # NOT tokenizer.apply_chat_template with a "<image>" string.
215
+ # Only the processor knows to expand {"type":"image"} into the correct
216
+ # number of <|image_pad|> tokens; the tokenizer leaves a bare <image>
217
+ # placeholder, and the model then sees a mismatch between image tokens (0) and image features (N).
218
+ msgs = [
219
+ {"role": "system", "content": SYSTEM_PROMPT},
220
+ {"role": "user", "content": [
221
+ {"type": "image"},
222
+ {"type": "text", "text": instruction},
223
+ ]},
224
+ ]
225
+ text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
226
+
227
+ # max_slice_nums=6: balance context vs OOM for portrait invoices
228
+ proc_out = processor(
229
+ text=text,
230
+ images=[image],
231
+ return_tensors="pt",
232
+ max_slice_nums=6,
233
+ )
234
+
235
+ if debug:
236
+ print("=== PROCESSOR OUTPUT SHAPES (first sample) ===")
237
+ for k, v in proc_out.items():
238
+ if isinstance(v, torch.Tensor):
239
+ print(f" {k}: shape={list(v.shape)}, dtype={v.dtype}")
240
+ elif isinstance(v, list):
241
+ item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:3]]
242
+ print(f" {k}: list[{len(v)}] = {item_info}")
243
+ else:
244
+ print(f" {k}: {type(v).__name__} = {v!r}")
245
+
246
+ prompt_ids = proc_out["input_ids"][0]
247
+
248
+ # pixel_values: processor returns (1, 3, 14, W) WITH batch dim.
249
+ # target_sizes: returned as (N_tiles, 2) with NO batch dim.
250
+ # Strip batch dim only where it exists (shape[0]==1).
251
+ vision_fields = {}
252
+ for k, v in proc_out.items():
253
+ if k in ("input_ids", "attention_mask"):
254
+ continue
255
+ if isinstance(v, torch.Tensor):
256
+ if k == "pixel_values":
257
+ vision_fields[k] = v[0] # (1,3,14,W) → (3,14,W); collator stacks to (B,3,14,W)
258
+ elif v.shape[0] == 1:
259
+ vision_fields[k] = v[0] # strip batch-1 wrapper from metadata scalars
260
+ else:
261
+ vision_fields[k] = v # e.g. target_sizes (N_tiles,2) — no batch dim
262
+ elif isinstance(v, list) and len(v) == 1:
263
+ vision_fields[k] = v[0]
264
+ else:
265
+ vision_fields[k] = v
266
+
267
+ if debug:
268
+ print("=== VISION FIELDS AFTER PROCESSING ===")
269
+ for k, v in vision_fields.items():
270
+ if isinstance(v, torch.Tensor):
271
+ print(f" {k}: shape={list(v.shape)}")
272
+ else:
273
+ print(f" {k}: {type(v).__name__} = {v!r}")
274
+
275
+ response_ids = torch.tensor(
276
+ tokenizer.encode(response + tokenizer.eos_token, add_special_tokens=False),
277
+ dtype=torch.long,
278
+ )
279
+
280
+ full_ids = torch.cat([prompt_ids, response_ids])
281
+ labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])
282
+ full_ids = full_ids[:MAX_SEQ_LENGTH]
283
+ labels = labels[:MAX_SEQ_LENGTH]
284
 
 
 
 
 
285
  return {
286
+ "input_ids": full_ids,
287
+ "attention_mask": torch.ones_like(full_ids),
288
+ "labels": labels,
289
+ **vision_fields, # pixel_values, image_sizes, etc.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  }
291
 
292
+ print("Pre-processing training data...")
293
+ train_data, eval_data = [], []
294
+ for i, s in enumerate(hf_ds["train"]):
295
+ try:
296
+ train_data.append(preprocess_one(s["image"], s["response"], debug=(i == 0)))
297
+ except Exception as e:
298
+ print(f" Skipping train[{i}]: {e}")
299
+ for i, s in enumerate(hf_ds["test"]):
300
+ try:
301
+ eval_data.append(preprocess_one(s["image"], s["response"]))
302
+ except Exception as e:
303
+ print(f" Skipping eval[{i}]: {e}")
304
+ print(f"Pre-processed {len(train_data)} train, {len(eval_data)} eval samples")
305
 
306
+ # ── PEFT wrapping ─────────────────────────────────────────────────────────
307
+ model = prepare_model_for_kbit_training(raw_model, use_gradient_checkpointing=True)
 
 
 
 
308
 
309
+ # task_type=None → base PeftModel; avoids requiring prepare_inputs_for_generation
310
+ lora_config = LoraConfig(
 
 
 
 
311
  r=LORA_RANK,
312
+ lora_alpha=LORA_RANK * 2,
313
+ target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
314
+ lora_dropout=0.05,
315
  bias="none",
316
+ task_type=None,
317
+ )
318
+ model = get_peft_model(model, lora_config)
319
+ model.print_trainable_parameters()
320
+
321
+ # ── Dataset (just wraps pre-processed list) ───────────────────────────────
322
+ class PreprocessedDataset(TorchDataset):
323
+ def __init__(self, data):
324
+ self.data = data
325
+
326
+ def __len__(self):
327
+ return len(self.data)
328
+
329
+ def __getitem__(self, idx):
330
+ return self.data[idx]
331
+
332
+ train_dataset = PreprocessedDataset(train_data)
333
+ eval_dataset = PreprocessedDataset(eval_data)
334
+
335
+ # ── Collator: pad to batch max length ─────────────────────────────────────
336
+ pad_id = tokenizer.pad_token_id or 0
337
+
338
+ import torch.nn.functional as F
339
+
340
+ def collate_fn(batch):
341
+ max_len = max(b["input_ids"].size(0) for b in batch)
342
+ result = {}
343
+ for b in batch:
344
+ pad = max_len - b["input_ids"].size(0)
345
+ result.setdefault("input_ids", []).append(F.pad(b["input_ids"], (0, pad), value=pad_id))
346
+ result.setdefault("attention_mask", []).append(F.pad(b["attention_mask"], (0, pad), value=0))
347
+ result.setdefault("labels", []).append(F.pad(b["labels"], (0, pad), value=-100))
348
+ result = {k: torch.stack(v) for k, v in result.items()}
349
+
350
+ # Pass through every vision field.
351
+ # pixel_values (3,14,W) per sample → stack → (B,3,14,W) [4D for conv2d]
352
+ # target_sizes (N_tiles,2) per sample → cat → (total_tiles,2) [no extra batch dim]
353
+ extra_keys = [k for k in batch[0] if k not in result]
354
+ for k in extra_keys:
355
+ vals = [b[k] for b in batch]
356
+ if k == "target_sizes":
357
+ result[k] = torch.cat(vals, dim=0) # (total_tiles, 2)
358
+ else:
359
+ try:
360
+ result[k] = torch.stack(vals)
361
+ except (RuntimeError, TypeError):
362
+ result[k] = vals
363
+ return result
364
+
365
+ # ── Debug Trainer: print input shapes on first batch ─────────────────────
366
+ _debug_step_done = [False]
367
+
368
+ class DebugTrainer(Trainer):
369
+ def compute_loss(self, model, inputs, num_items_in_batch=None, **kwargs):
370
+ if not _debug_step_done[0]:
371
+ _debug_step_done[0] = True
372
+ print("=== MODEL INPUT SHAPES (first batch) ===")
373
+ for k, v in inputs.items():
374
+ if isinstance(v, torch.Tensor):
375
+ print(f" {k}: shape={list(v.shape)}, dtype={v.dtype}")
376
+ elif isinstance(v, list):
377
+ item_info = [f"shape={list(x.shape)}" if isinstance(x, torch.Tensor) else repr(x) for x in v[:2]]
378
+ print(f" {k}: list[{len(v)}] = {item_info}")
379
+ else:
380
+ print(f" {k}: {type(v).__name__} = {v!r}")
381
+ return super().compute_loss(model, inputs, num_items_in_batch=num_items_in_batch, **kwargs)
382
+
383
+ # ── Training ──────────────────────────────────────────────────────────────
384
+ training_args = TrainingArguments(
385
+ output_dir="/output/minicpm-v-sft",
386
+ per_device_train_batch_size=BATCH_SIZE,
387
+ per_device_eval_batch_size=BATCH_SIZE,
388
+ gradient_accumulation_steps=GRAD_ACCUM,
389
+ warmup_steps=10,
390
+ num_train_epochs=EPOCHS,
391
+ learning_rate=LEARNING_RATE,
392
+ bf16=True,
393
+ fp16=False,
394
+ logging_steps=10,
395
+ eval_strategy="epoch",
396
+ save_strategy="epoch",
397
+ load_best_model_at_end=True,
398
+ report_to="none",
399
+ remove_unused_columns=False,
400
+ dataloader_num_workers=0, # data already pre-processed; no workers needed
401
  )
402
 
403
+ trainer = DebugTrainer(
 
 
404
  model=model,
405
+ args=training_args,
 
406
  train_dataset=train_dataset,
407
  eval_dataset=eval_dataset,
408
+ data_collator=collate_fn,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  )
410
  trainer.train()
411
  print("Training complete")
412
 
413
+ # ── Save LoRA adapter ─────────────────────────────────────────────────────
414
+ model.save_pretrained("/output/minicpm-v-lora")
415
+ tokenizer.save_pretrained("/output/minicpm-v-lora")
416
+ print("LoRA adapter saved to /output/minicpm-v-lora")
 
 
 
 
 
 
 
 
417
 
418
+ # ── Publish adapter to HF Hub ─────────────────────────────────────────────
419
  api = HfApi(token=os.environ["HF_TOKEN"])
420
  api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
421
 
422
+ adapter_dir = Path("/output/minicpm-v-lora")
423
+ for f in adapter_dir.iterdir():
424
+ if f.is_file():
425
+ print(f"Uploading {f.name}...")
426
+ api.upload_file(path_or_fileobj=str(f), path_in_repo=f.name, repo_id=HF_REPO)
427
 
428
  model_card = f"""---
429
  license: apache-2.0
 
433
  tags:
434
  - invoice-extraction
435
  - indian-fmcg
 
436
  - minicpm-v
437
  - ocr
438
  - qlora
439
+ - peft
440
  ---
441
 
442
+ # MiniCPM-V 4.6 — Indian Invoice Extraction (LoRA Adapter)
443
 
444
  Fine-tuned [{BASE_MODEL}](https://huggingface.co/{BASE_MODEL}) for structured extraction
445
  from Indian distributor invoices (printed GST bills, Tally exports, WhatsApp screenshots,
 
448
  ## Usage
449
 
450
  ```python
451
+ from transformers import AutoModelForMultimodalLM, AutoTokenizer
452
+ from peft import PeftModel
453
 
454
+ base = AutoModelForMultimodalLM.from_pretrained("{BASE_MODEL}", trust_remote_code=True)
455
+ model = PeftModel.from_pretrained(base, "{HF_REPO}")
456
+ tokenizer = AutoTokenizer.from_pretrained("{HF_REPO}", trust_remote_code=True)
457
  ```
458
 
459
  ## Training
460
 
461
  - Base model: {BASE_MODEL}
462
+ - Method: QLoRA (rank {LORA_RANK}) via PEFT + bitsandbytes on Modal A10G
463
+ - Dataset: [{HF_DATASET_REPO}](https://huggingface.co/datasets/{HF_DATASET_REPO}) — 500 synthetic Indian invoices
464
  - Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
465
  """
466
  api.upload_file(
 
471
  print(f"Published to {HF_REPO}")
472
 
473
 
474
+ # ─── Dry-run: verify preprocessing + first batch shapes without training ──────
475
+ # Usage: modal run finetune/train_minicpm_v.py::main_dryrun
476
+ # Completes in ~2 min; confirms shapes are correct before a full training run.
477
+
478
+ @app.function(
479
+ image=IMAGE,
480
+ gpu="A10G",
481
+ timeout=600,
482
+ secrets=[HF_SECRET],
483
+ memory=32768,
484
+ )
485
+ def dryrun():
486
+ import torch
487
+ from datasets import load_dataset
488
+ from transformers import AutoModelForMultimodalLM, AutoTokenizer, AutoProcessor
489
+
490
+ hf_ds = load_dataset(HF_DATASET_REPO, token=os.environ["HF_TOKEN"])
491
+ raw_model = AutoModelForMultimodalLM.from_pretrained(BASE_MODEL, trust_remote_code=True, device_map="auto")
492
+ tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
493
+ processor = AutoProcessor.from_pretrained(BASE_MODEL, trust_remote_code=True)
494
+
495
+ sample = hf_ds["train"][0]
496
+ image = sample["image"].convert("RGB")
497
+ instruction = f"Extract all invoice fields as JSON. Use this schema:\n{JSON_SCHEMA}"
498
+ msgs = [
499
+ {"role": "system", "content": SYSTEM_PROMPT},
500
+ {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": instruction}]},
501
+ ]
502
+ text = processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
503
+ proc_out = processor(text=text, images=[image], return_tensors="pt", max_slice_nums=6)
504
+
505
+ print("=== DRY-RUN: processor output ===")
506
+ image_pad_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
507
+ for k, v in proc_out.items():
508
+ if isinstance(v, torch.Tensor):
509
+ count = (v == image_pad_id).sum().item() if k == "input_ids" else ""
510
+ pad_info = f" (<|image_pad|> count={count})" if count != "" else ""
511
+ print(f" {k}: shape={list(v.shape)}{pad_info}")
512
+
513
+ response_ids = torch.tensor(
514
+ tokenizer.encode(sample["response"] + tokenizer.eos_token, add_special_tokens=False),
515
+ dtype=torch.long,
516
+ )
517
+ prompt_ids = proc_out["input_ids"][0]
518
+ full_ids = torch.cat([prompt_ids, response_ids])[:MAX_SEQ_LENGTH]
519
+ labels = torch.cat([torch.full_like(prompt_ids, -100), response_ids])[:MAX_SEQ_LENGTH]
520
+
521
+ model_inputs = {
522
+ "input_ids": full_ids.unsqueeze(0),
523
+ "attention_mask": torch.ones_like(full_ids).unsqueeze(0),
524
+ "labels": labels.unsqueeze(0),
525
+ }
526
+ for k, v in proc_out.items():
527
+ if k not in ("input_ids", "attention_mask"):
528
+ model_inputs[k] = v
529
+
530
+ device = next(raw_model.parameters()).device
531
+ model_inputs = {
532
+ k: v.to(device) if isinstance(v, torch.Tensor) else v
533
+ for k, v in model_inputs.items()
534
+ }
535
+ raw_model.eval()
536
+ with torch.no_grad():
537
+ outputs = raw_model(**model_inputs)
538
+ if getattr(outputs, "loss", None) is None:
539
+ raise RuntimeError(
540
+ f"Dry-run forward did not return loss. Output keys: {list(outputs.keys())}"
541
+ )
542
+ print(f"Dry-run forward loss: {outputs.loss.detach().float().item():.4f}")
543
+ print("Dry-run complete - shapes and labeled forward pass look correct")
544
+
545
+
546
  # ─── Local entrypoints ─────────────────────────────────────────────────────────
547
 
548
  @app.local_entrypoint()
 
553
  @app.local_entrypoint()
554
  def main_push():
555
  push_dataset.remote()
556
+
557
+
558
+ @app.local_entrypoint()
559
+ def main_dryrun():
560
+ dryrun.remote()
finetune/train_yolo26n.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Task 0.1 — Fine-tune YOLO26n on Indian Grocery Object Detection dataset.
3
 
4
  Run on Modal (A10G GPU, ~1-2 hours):
5
  modal run finetune/train_yolo26n.py
@@ -10,6 +10,11 @@ Publishes:
10
  class_names.json
11
  model card
12
 
 
 
 
 
 
13
  Prerequisites:
14
  ROBOFLOW_API_KEY in env (for dataset download)
15
  HF_TOKEN in env (for HF Hub publish)
@@ -25,8 +30,6 @@ app = modal.App("kirana-yolo26n-finetune")
25
 
26
  IMAGE = (
27
  modal.Image.debian_slim(python_version="3.11")
28
- # libGL.so.1 is required by opencv-python (pulled in by ultralytics)
29
- # libglib2.0-0 is required by libGL on Debian slim
30
  .apt_install("libgl1-mesa-glx", "libglib2.0-0")
31
  .pip_install(
32
  "ultralytics>=8.4.0",
@@ -34,33 +37,174 @@ IMAGE = (
34
  "huggingface_hub>=0.30.0",
35
  "onnx>=1.16.0",
36
  "onnxruntime>=1.18.0",
 
37
  )
38
  )
39
 
40
  ROBOFLOW_API_KEY = modal.Secret.from_name("roboflow-secret")
41
  HF_SECRET = modal.Secret.from_name("hf-secret")
42
 
43
- # Roboflow dataset: Indian Grocery Object Detection (AgentSK47)
44
- # https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx
45
- ROBOFLOW_WORKSPACE = "agentsk47"
46
- ROBOFLOW_PROJECT = "indian-grocery-object-detection-mfsnx"
47
- ROBOFLOW_VERSION = 1
48
-
49
- HF_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- # YOLO model name — update if Ultralytics renames the YOLO26 nano checkpoint
52
  YOLO_BASE_MODEL = "yolo26n.pt"
53
- YOLO_FALLBACK = "yolo11n.pt" # use if yolo26n is not yet released
54
 
55
- EPOCHS = 100
56
  IMG_SIZE = 640
57
- BATCH = 16
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  @app.function(
61
  image=IMAGE,
62
  gpu="A10G",
63
- timeout=7200,
64
  secrets=[ROBOFLOW_API_KEY, HF_SECRET],
65
  volumes={"/output": modal.Volume.from_name("kirana-yolo-output", create_if_missing=True)},
66
  )
@@ -72,36 +216,86 @@ def train():
72
  from ultralytics import YOLO
73
  from huggingface_hub import HfApi
74
 
75
- # --- Download dataset from Roboflow ---
76
- rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
77
- project = rf.workspace(ROBOFLOW_WORKSPACE).project(ROBOFLOW_PROJECT)
78
- dataset = project.version(ROBOFLOW_VERSION).download("yolov8", location="/data/indian-grocery")
79
-
80
- data_yaml = "/data/indian-grocery/data.yaml"
81
- print(f"Dataset downloaded to {dataset.location}")
82
-
83
- # --- Load model ---
84
- try:
85
- model = YOLO(YOLO_BASE_MODEL)
86
- print(f"Loaded base model: {YOLO_BASE_MODEL}")
87
- except Exception:
88
- print(f"YOLO26n not found, falling back to {YOLO_FALLBACK}")
89
- model = YOLO(YOLO_FALLBACK)
90
-
91
- # --- Train ---
92
- results = model.train(
93
- data=data_yaml,
94
- epochs=EPOCHS,
95
- imgsz=IMG_SIZE,
96
- batch=BATCH,
97
- project="/output/runs",
98
- name="yolo26n_fmcg",
99
- exist_ok=True,
100
- device=0,
101
- patience=20,
102
- save=True,
103
- plots=True,
104
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  print(f"Training complete. Best mAP50: {results.results_dict.get('metrics/mAP50(B)', 'N/A')}")
106
 
107
  best_pt = Path("/output/runs/yolo26n_fmcg/weights/best.pt")
@@ -112,61 +306,115 @@ def train():
112
  shutil.copy(str(onnx_path), "/output/yolo26n_fmcg.onnx")
113
  print(f"Exported ONNX to {onnx_path}")
114
 
115
- # --- Save class names ---
116
  import yaml
117
  with open(data_yaml) as f:
118
  data_cfg = yaml.safe_load(f)
119
- class_names = data_cfg.get("names", {})
120
  if isinstance(class_names, dict):
121
  class_names = [class_names[i] for i in sorted(class_names.keys())]
122
  with open("/output/class_names.json", "w") as f:
123
- json.dump(class_names, f, indent=2)
124
- print(f"Saved {len(class_names)} class names")
125
 
126
  # --- Publish to HF Hub ---
127
  api = HfApi(token=os.environ["HF_TOKEN"])
128
  api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
129
 
130
- api.upload_file(path_or_fileobj="/output/yolo26n_fmcg.onnx", path_in_repo="yolo26n_fmcg.onnx", repo_id=HF_REPO)
131
- api.upload_file(path_or_fileobj="/output/class_names.json", path_in_repo="class_names.json", repo_id=HF_REPO)
132
 
133
- # Upload model card
134
  model_card = f"""---
135
  license: apache-2.0
 
 
 
136
  tags:
137
  - object-detection
138
  - yolo
139
  - indian-fmcg
140
  - onnx
141
  - ultralytics
 
 
 
 
 
142
  ---
143
 
144
- # YOLO26n Indian FMCG Detection
145
 
146
- Fine-tuned YOLO26n on the [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) dataset from Roboflow.
 
 
 
 
 
 
 
 
 
 
147
 
148
  ## Classes ({len(class_names)} total)
149
 
150
- {chr(10).join(f"- {name}" for name in class_names[:30])}
151
- {"..." if len(class_names) > 30 else ""}
 
152
 
153
- ## Usage
154
 
155
  ```python
156
- import onnxruntime as ort
157
  import json
 
 
 
158
 
159
  session = ort.InferenceSession("yolo26n_fmcg.onnx", providers=["CPUExecutionProvider"])
160
  class_names = json.load(open("class_names.json"))
 
 
 
 
 
 
 
 
 
161
  ```
162
 
163
- ## Training
 
 
 
 
 
 
 
 
164
 
165
- - Base model: YOLO26n (Ultralytics)
166
- - Dataset: Indian Grocery Object Detection (Roboflow, {ROBOFLOW_VERSION} version)
167
- - Epochs: {EPOCHS}, img_size: {IMG_SIZE}x{IMG_SIZE}
168
- - Platform: Modal A10G GPU
169
- - Part of: Kirana Detective AI (HuggingFace Build Small Hackathon 2026)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  """
171
  api.upload_file(
172
  path_or_fileobj=model_card.encode(),
 
1
  """
2
+ Task 0.1 — Fine-tune YOLO26n on merged Indian grocery datasets.
3
 
4
  Run on Modal (A10G GPU, ~1-2 hours):
5
  modal run finetune/train_yolo26n.py
 
10
  class_names.json
11
  model card
12
 
13
+ Datasets merged (all downloaded as yolov8 format, NOT openai format):
14
+ 1. agentsk47/indian-grocery-object-detection-mfsnx v1 (~10 classes, small)
15
+ 2. iit-patna-qg1jh/grocery_items-7i2em v45 (20 classes, 6,695 images)
16
+ 3. project-c5ho0/indian-market-qieug v2 (2 classes, 4,694 images)
17
+
18
  Prerequisites:
19
  ROBOFLOW_API_KEY in env (for dataset download)
20
  HF_TOKEN in env (for HF Hub publish)
 
30
 
31
  IMAGE = (
32
  modal.Image.debian_slim(python_version="3.11")
 
 
33
  .apt_install("libgl1-mesa-glx", "libglib2.0-0")
34
  .pip_install(
35
  "ultralytics>=8.4.0",
 
37
  "huggingface_hub>=0.30.0",
38
  "onnx>=1.16.0",
39
  "onnxruntime>=1.18.0",
40
+ "pyyaml>=6.0",
41
  )
42
  )
43
 
44
  ROBOFLOW_API_KEY = modal.Secret.from_name("roboflow-secret")
45
  HF_SECRET = modal.Secret.from_name("hf-secret")
46
 
47
+ # All three Roboflow datasets downloaded as "yolov8" format (NOT "openai")
48
+ DATASETS = [
49
+ {
50
+ "workspace": "agentsk47",
51
+ "project": "indian-grocery-object-detection-mfsnx",
52
+ "version": 1,
53
+ "location": "/data/ds_agentsk47",
54
+ },
55
+ {
56
+ "workspace": "iit-patna-qg1jh",
57
+ "project": "grocery_items-7i2em",
58
+ "version": 45,
59
+ "location": "/data/ds_iitpatna",
60
+ },
61
+ {
62
+ "workspace": "project-c5ho0",
63
+ "project": "indian-market-qieug",
64
+ "version": 2,
65
+ "location": "/data/ds_indianmarket",
66
+ },
67
+ ]
68
+
69
+ MERGED_DIR = "/output/merged_dataset" # persisted on volume → skip re-download on resume
70
+ HF_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
71
 
 
72
  YOLO_BASE_MODEL = "yolo26n.pt"
73
+ YOLO_FALLBACK = "yolo11n.pt"
74
 
75
+ EPOCHS = 100
76
  IMG_SIZE = 640
77
+ BATCH = 16
78
+
79
+
80
+ # ── Dataset merge helpers ──────────────────────────────────────────────────────
81
+
82
def _read_class_names(data_yaml_path: str) -> list[str]:
    """Return the ordered class names declared in a YOLO ``data.yaml``.

    The ``names`` entry may be either a list (index = class ID) or a mapping
    of class ID -> name; a mapping is flattened in ascending key order.

    Args:
        data_yaml_path: Path to the dataset's ``data.yaml`` file.

    Returns:
        The class names in class-ID order. An empty list is returned when the
        file is empty or has no (or a null) ``names`` entry.

    Raises:
        FileNotFoundError: If ``data_yaml_path`` does not exist.
    """
    import yaml

    # Class names can contain non-ASCII characters, so read the file as UTF-8
    # explicitly instead of relying on the platform's default encoding.
    with open(data_yaml_path, encoding="utf-8") as f:
        # safe_load returns None for an empty document; treat that as "no config".
        cfg = yaml.safe_load(f) or {}

    names = cfg.get("names") or []
    if isinstance(names, dict):
        names = [names[key] for key in sorted(names)]
    return list(names)
90
+
91
 
92
def _remap_label_file(src: str, dst: str, id_map: dict[int, int]) -> None:
    """Write a copy of a YOLO label file with its class IDs translated.

    Each non-blank line of ``src`` is expected to start with an integer class
    ID followed by the box fields. The ID is looked up in ``id_map``; IDs that
    are not in the map are kept unchanged. Blank lines are dropped, the
    remaining fields are re-joined with single spaces, and the output has no
    trailing newline. Parent directories of ``dst`` are created as needed.
    """
    from pathlib import Path

    target = Path(dst)
    target.parent.mkdir(parents=True, exist_ok=True)

    remapped = []
    for raw_line in Path(src).read_text().split("\n"):
        fields = raw_line.split()
        if not fields:
            # Blank or whitespace-only line: nothing to remap.
            continue
        source_id = int(fields[0])
        target_id = id_map.get(source_id, source_id)
        remapped.append(f"{target_id} {' '.join(fields[1:])}")

    target.write_text("\n".join(remapped))
108
+
109
+
110
def merge_yolo_datasets(dataset_locations: list[str], output_dir: str) -> str:
    """
    Merge N YOLOv8 datasets into one directory with unified class IDs.

    Class names are de-duplicated across datasets in first-seen order. Every
    label file is rewritten so its class IDs index into that unified list, and
    every copied image/label is prefixed with ``ds<index>_`` so files from
    different datasets cannot collide.

    Args:
        dataset_locations: Download directories of the source datasets. Each
            must contain a ``data.yaml`` (directly or in a nested directory).
        output_dir: Directory that receives ``<split>/images``,
            ``<split>/labels`` and the merged ``data.yaml``.

    Returns:
        The path to the merged ``data.yaml``. It is written last, after all
        images and labels have been copied.

    Raises:
        FileNotFoundError: If a dataset location contains no ``data.yaml``.
    """
    import shutil
    import yaml
    from pathlib import Path

    splits = ["train", "valid", "test"]
    out_root = Path(output_dir)

    # 1. Build unified class list (insertion-order dedup across all datasets).
    #    A dict gives O(1) membership/index lookups and preserves insertion order.
    class_index: dict[str, int] = {}
    per_ds_classes: list[list[str]] = []
    yaml_dirs: list[Path] = []
    for loc in dataset_locations:
        yaml_path = Path(loc) / "data.yaml"
        if not yaml_path.exists():
            # Roboflow sometimes nests the export; take the shallowest match
            # (ties broken by path) so the choice is deterministic.
            candidates = list(Path(loc).rglob("data.yaml"))
            if not candidates:
                raise FileNotFoundError(f"No data.yaml found under {loc}")
            yaml_path = min(candidates, key=lambda p: (len(p.parts), str(p)))
        yaml_dirs.append(yaml_path.parent)

        names = _read_class_names(str(yaml_path))
        per_ds_classes.append(names)
        for name in names:
            class_index.setdefault(name, len(class_index))

    unified_classes = list(class_index)
    print(f"Unified class list ({len(unified_classes)} classes): {unified_classes}")

    # 2. Build per-dataset old_id -> new_id maps.
    id_maps: list[dict[int, int]] = [
        {old_id: class_index[name] for old_id, name in enumerate(names)}
        for names in per_ds_classes
    ]

    # 3. Copy images + remapped labels for each split.
    for ds_idx, loc in enumerate(dataset_locations):
        ds_root = Path(loc)
        if not (ds_root / "train").exists():
            # Prefer the directory that actually holds data.yaml, then fall
            # back to any direct subdirectory containing a train split.
            fallbacks = [
                yaml_dirs[ds_idx],
                *sorted(d for d in ds_root.iterdir() if d.is_dir()),
            ]
            ds_root = next((d for d in fallbacks if (d / "train").exists()), ds_root)

        id_map = id_maps[ds_idx]
        ds_tag = f"ds{ds_idx}"

        for split in splits:
            img_src = ds_root / split / "images"
            lbl_src = ds_root / split / "labels"
            if not img_src.exists():
                continue

            img_dst = out_root / split / "images"
            lbl_dst = out_root / split / "labels"
            img_dst.mkdir(parents=True, exist_ok=True)
            lbl_dst.mkdir(parents=True, exist_ok=True)

            for img_file in img_src.iterdir():
                if not img_file.is_file():
                    # shutil.copy cannot copy directories; skip stray entries.
                    continue

                # Prefix filename with dataset tag to avoid collisions.
                shutil.copy(str(img_file), str(img_dst / f"{ds_tag}_{img_file.name}"))

                stem = img_file.stem
                lbl_file = lbl_src / f"{stem}.txt"
                if lbl_file.exists():
                    _remap_label_file(
                        str(lbl_file),
                        str(lbl_dst / f"{ds_tag}_{stem}.txt"),
                        id_map,
                    )

    # 4. Write merged data.yaml.
    merged_yaml = out_root / "data.yaml"
    out_root.mkdir(parents=True, exist_ok=True)
    cfg = {
        "path": str(out_root),
        "train": "train/images",
        "val": "valid/images",
        "test": "test/images",
        "nc": len(unified_classes),
        "names": unified_classes,
    }
    # allow_unicode=True emits raw non-ASCII characters, so the file must be
    # written as UTF-8 regardless of the platform's default encoding.
    with open(merged_yaml, "w", encoding="utf-8") as f:
        yaml.dump(cfg, f, allow_unicode=True, default_flow_style=False)

    # Count merged images.
    for split in splits:
        images_dir = out_root / split / "images"
        n = sum(1 for _ in images_dir.glob("*")) if images_dir.exists() else 0
        print(f" {split}: {n} images")

    return str(merged_yaml)
200
+
201
+
202
+ # ── Modal function ─────────────────────────────────────────────────────────────
203
 
204
  @app.function(
205
  image=IMAGE,
206
  gpu="A10G",
207
+ timeout=28800,
208
  secrets=[ROBOFLOW_API_KEY, HF_SECRET],
209
  volumes={"/output": modal.Volume.from_name("kirana-yolo-output", create_if_missing=True)},
210
  )
 
216
  from ultralytics import YOLO
217
  from huggingface_hub import HfApi
218
 
219
+ last_pt = Path("/output/runs/yolo26n_fmcg/weights/last.pt")
220
+ merged_yaml = Path(MERGED_DIR) / "data.yaml"
221
+
222
+ # --- Dataset: skip download+merge if already cached on the volume ---
223
+ if merged_yaml.exists():
224
+ print(f"Merged dataset found at {merged_yaml}, skipping download.")
225
+ data_yaml = str(merged_yaml)
226
+ else:
227
+ rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
228
+ locations = []
229
+ for ds in DATASETS:
230
+ print(f"Downloading {ds['workspace']}/{ds['project']} v{ds['version']}...")
231
+ project = rf.workspace(ds["workspace"]).project(ds["project"])
232
+ result = project.version(ds["version"]).download("yolov8", location=ds["location"])
233
+ locations.append(ds["location"])
234
+ print(f" -> {result.location}")
235
+
236
+ print("Merging datasets...")
237
+ data_yaml = merge_yolo_datasets(locations, MERGED_DIR)
238
+ print(f"Merged data.yaml: {data_yaml}")
239
+
240
+ # --- Resume from checkpoint if one exists, otherwise start fresh ---
241
+ if last_pt.exists():
242
+ import torch
243
+ ckpt = torch.load(str(last_pt), map_location="cpu", weights_only=False)
244
+ done_epoch = ckpt.get("epoch", 0) # 0-indexed epoch that finished
245
+ remaining = EPOCHS - (done_epoch + 1)
246
+ print(f"Checkpoint found — epoch {done_epoch + 1}/{EPOCHS}, {remaining} epochs remaining.")
247
+
248
+ if remaining <= 0:
249
+ print("Training already complete, skipping to export.")
250
+ results = type("R", (), {"results_dict": {}})() # dummy result
251
+ else:
252
+ try:
253
+ model = YOLO(str(last_pt))
254
+ results = model.train(resume=True)
255
+ except (ValueError, RuntimeError) as exc:
256
+ # Optimizer state mismatch (e.g. after env/package upgrade).
257
+ # Fall back: load weights, continue for remaining epochs with a
258
+ # lower LR so we don't disturb the already-converged parameters.
259
+ print(f"Full resume failed ({exc}).")
260
+ print(f"Falling back to weights-only resume: {remaining} more epochs.")
261
+ model = YOLO(str(last_pt))
262
+ results = model.train(
263
+ data=data_yaml,
264
+ epochs=remaining,
265
+ imgsz=IMG_SIZE,
266
+ batch=BATCH,
267
+ project="/output/runs",
268
+ name="yolo26n_fmcg",
269
+ exist_ok=True,
270
+ device=0,
271
+ patience=20,
272
+ save=True,
273
+ plots=True,
274
+ lr0=0.0005, # reduced: weights already partially trained
275
+ lrf=0.01,
276
+ )
277
+ else:
278
+ try:
279
+ model = YOLO(YOLO_BASE_MODEL)
280
+ print(f"Loaded base model: {YOLO_BASE_MODEL}")
281
+ except Exception:
282
+ print(f"YOLO26n not found, falling back to {YOLO_FALLBACK}")
283
+ model = YOLO(YOLO_FALLBACK)
284
+
285
+ results = model.train(
286
+ data=data_yaml,
287
+ epochs=EPOCHS,
288
+ imgsz=IMG_SIZE,
289
+ batch=BATCH,
290
+ project="/output/runs",
291
+ name="yolo26n_fmcg",
292
+ exist_ok=True,
293
+ device=0,
294
+ patience=20,
295
+ save=True,
296
+ plots=True,
297
+ )
298
+
299
  print(f"Training complete. Best mAP50: {results.results_dict.get('metrics/mAP50(B)', 'N/A')}")
300
 
301
  best_pt = Path("/output/runs/yolo26n_fmcg/weights/best.pt")
 
306
  shutil.copy(str(onnx_path), "/output/yolo26n_fmcg.onnx")
307
  print(f"Exported ONNX to {onnx_path}")
308
 
309
+ # --- Save unified class names ---
310
  import yaml
311
  with open(data_yaml) as f:
312
  data_cfg = yaml.safe_load(f)
313
+ class_names = data_cfg.get("names", [])
314
  if isinstance(class_names, dict):
315
  class_names = [class_names[i] for i in sorted(class_names.keys())]
316
  with open("/output/class_names.json", "w") as f:
317
+ json.dump(class_names, f, indent=2, ensure_ascii=False)
318
+ print(f"Saved {len(class_names)} unified class names")
319
 
320
  # --- Publish to HF Hub ---
321
  api = HfApi(token=os.environ["HF_TOKEN"])
322
  api.create_repo(HF_REPO, repo_type="model", exist_ok=True)
323
 
324
+ api.upload_file(path_or_fileobj="/output/yolo26n_fmcg.onnx", path_in_repo="yolo26n_fmcg.onnx", repo_id=HF_REPO)
325
+ api.upload_file(path_or_fileobj="/output/class_names.json", path_in_repo="class_names.json", repo_id=HF_REPO)
326
 
 
327
  model_card = f"""---
328
  license: apache-2.0
329
+ base_model: yolo26n
330
+ language:
331
+ - en
332
  tags:
333
  - object-detection
334
  - yolo
335
  - indian-fmcg
336
  - onnx
337
  - ultralytics
338
+ pipeline_tag: object-detection
339
+ datasets:
340
+ - agentsk47/indian-grocery-object-detection-mfsnx
341
+ - iit-patna-qg1jh/grocery_items-7i2em
342
+ - project-c5ho0/indian-market-qieug
343
  ---
344
 
345
+ # YOLO26n Indian FMCG Product Detection
346
 
347
+ Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources** from Roboflow Universe. Part of the **Kirana Detective** project — an AI system for small Indian grocery stores to visually count and reconcile inventory from shelf/counter photos.
348
+
349
+ ## Training Datasets
350
+
351
+ | Dataset | Workspace | Version | Images | Classes |
352
+ |---|---|---|---|---|
353
+ | [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
354
+ | [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
355
+ | [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
356
+
357
+ All three datasets were downloaded in **YOLOv8 format**, class IDs remapped to a unified list, and merged before training.
358
 
359
  ## Classes ({len(class_names)} total)
360
 
361
+ {chr(10).join(f"- {name}" for name in class_names)}
362
+
363
+ ## How to Use
364
 
365
+ ### Python (ONNX Runtime)
366
 
367
  ```python
 
368
  import json
369
+ import numpy as np
370
+ import onnxruntime as ort
371
+ from PIL import Image
372
 
373
  session = ort.InferenceSession("yolo26n_fmcg.onnx", providers=["CPUExecutionProvider"])
374
  class_names = json.load(open("class_names.json"))
375
+
376
+ def preprocess(image_path, size=640):
377
+ img = Image.open(image_path).convert("RGB").resize((size, size))
378
+ arr = np.array(img, dtype=np.float32) / 255.0
379
+ return arr.transpose(2, 0, 1)[None] # BCHW
380
+
381
+ input_name = session.get_inputs()[0].name
382
+ outputs = session.run(None, {{input_name: preprocess("shelf.jpg")}})
383
+ # outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
384
  ```
385
 
386
+ ### Ultralytics (PyTorch)
387
+
388
+ ```python
389
+ from ultralytics import YOLO
390
+
391
+ model = YOLO("yolo26n_fmcg.onnx", task="detect")
392
+ results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
393
+ results[0].show()
394
+ ```
395
 
396
+ ## Training Details
397
+
398
+ | Parameter | Value |
399
+ |---|---|
400
+ | Base model | YOLO26n |
401
+ | Input size | 640 × 640 |
402
+ | Epochs (scheduled) | {EPOCHS} |
403
+ | Batch size | {BATCH} |
404
+ | Early stopping patience | 20 |
405
+ | Export format | ONNX opset 12 |
406
+ | Hardware | NVIDIA A10G (Modal) |
407
+
408
+ ## Citation
409
+
410
+ ```bibtex
411
+ @misc{{kirana-detective-yolo-2026,
412
+ title = {{Kirana Detective: YOLO26n Indian FMCG Product Detector}},
413
+ author = {{Naazim}},
414
+ year = {{2026}},
415
+ url = {{https://huggingface.co/naazimsnh02/yolo26n-indian-fmcg-detection}}
416
+ }}
417
+ ```
418
  """
419
  api.upload_file(
420
  path_or_fileobj=model_card.encode(),
finetune/upload_yolo_to_hf.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Upload locally-downloaded YOLO artifacts to HuggingFace Hub.
3
+
4
+ Prerequisites:
5
+ pip install huggingface_hub
6
+ Set HF_TOKEN env var (or run `huggingface-cli login`)
7
+
8
+ Usage:
9
+ python finetune/upload_yolo_to_hf.py
10
+
11
+ Uploads from: model_artifacts/yolo26n_fmcg/
12
+ best.pt, best.onnx, class_names.json → naazimsnh02/yolo26n-indian-fmcg-detection
13
+ """
14
+
15
+ import os
16
+ from pathlib import Path
17
+
18
+ HF_REPO = "naazimsnh02/yolo26n-indian-fmcg-detection"
19
+ ARTIFACTS = Path(__file__).parent.parent / "model_artifacts" / "yolo26n_fmcg"
20
+
21
+ MODEL_CARD = """\
22
+ ---
23
+ license: apache-2.0
24
+ base_model: yolo26n
25
+ language:
26
+ - en
27
+ tags:
28
+ - object-detection
29
+ - yolo
30
+ - indian-fmcg
31
+ - onnx
32
+ - ultralytics
33
+ - kirana
34
+ pipeline_tag: object-detection
35
+ datasets:
36
+ - agentsk47/indian-grocery-object-detection-mfsnx
37
+ - iit-patna-qg1jh/grocery_items-7i2em
38
+ - project-c5ho0/indian-market-qieug
39
+ ---
40
+
41
+ # YOLO26n — Indian FMCG Product Detection
42
+
43
+ Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources**
44
+ from Roboflow Universe. Part of the **Kirana Detective** project — an AI system for small Indian grocery
45
+ stores to visually count and reconcile shelf/counter inventory from photos.
46
+
47
+ ## Performance
48
+
49
+ | Metric | Value |
50
+ |---|---|
51
+ | mAP50 (all classes) | **0.428** |
52
+ | mAP50-95 (all classes) | **0.302** |
53
+ | Total classes | 1,831 |
54
+ | Validation images | 1,236 |
55
+ | Validation instances | 13,443 |
56
+
57
+ Training ran for **100 epochs** (60 initial + 40 resumed after restart) on an NVIDIA A10G via Modal.
58
+
59
+ ## Training Datasets
60
+
61
+ | Dataset | Workspace | Version | Images | Classes |
62
+ |---|---|---|---|---|
63
+ | [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
64
+ | [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
65
+ | [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
66
+
67
+ All three datasets were downloaded in **YOLOv8 format**, class IDs remapped to a unified list, and merged
68
+ before training. The full unified class list (1,831 entries) is in `class_names.json`.
69
+
70
+ ## Files
71
+
72
+ | File | Description |
73
+ |---|---|
74
+ | `best.pt` | PyTorch checkpoint (best mAP50 epoch) |
75
+ | `best.onnx` | ONNX export, opset 12 (recommended for inference) |
76
+ | `class_names.json` | Full list of 1,831 class names (index = class_id) |
77
+
78
+ ## How to Use
79
+
80
+ ### ONNX Runtime (CPU / any platform)
81
+
82
+ ```python
83
+ import json, numpy as np, onnxruntime as ort
84
+ from PIL import Image
85
+
86
+ session = ort.InferenceSession("best.onnx", providers=["CPUExecutionProvider"])
87
+ class_names = json.load(open("class_names.json"))
88
+
89
+ def preprocess(path, size=640):
90
+ img = Image.open(path).convert("RGB").resize((size, size))
91
+ return (np.array(img, dtype=np.float32) / 255.0).transpose(2, 0, 1)[None]
92
+
93
+ input_name = session.get_inputs()[0].name
94
+ outputs = session.run(None, {input_name: preprocess("shelf.jpg")})
95
+ # outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
96
+ ```
97
+
98
+ ### Ultralytics (PyTorch)
99
+
100
+ ```python
101
+ from ultralytics import YOLO
102
+
103
+ model = YOLO("best.pt")
104
+ results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
105
+ results[0].show()
106
+ ```
107
+
108
+ ## Training Details
109
+
110
+ | Parameter | Value |
111
+ |---|---|
112
+ | Base model | YOLO26n |
113
+ | Input size | 640 × 640 |
114
+ | Epochs | 100 (60 initial + 40 resumed) |
115
+ | Batch size | 16 |
116
+ | Early stopping patience | 20 |
117
+ | Export format | ONNX opset 12 |
118
+ | Hardware | NVIDIA A10G (Modal) |
119
+
120
+ ## Citation
121
+
122
+ ```bibtex
123
+ @misc{kirana-detective-yolo-2026,
124
+ title = {Kirana Detective: YOLO26n Indian FMCG Product Detector},
125
+ author = {Naazim},
126
+ year = {2026},
127
+ url = {https://huggingface.co/naazimsnh02/yolo26n-indian-fmcg-detection}
128
+ }
129
+ ```
130
+ """
131
+
132
+
133
def main():
    """Upload the local YOLO artifacts and the model card to the HF Hub.

    Reads ``best.pt``, ``best.onnx`` and ``class_names.json`` from
    ``ARTIFACTS`` and pushes them, together with ``MODEL_CARD`` as
    ``README.md``, to ``HF_REPO`` in a single commit.

    Authentication uses the ``HF_TOKEN`` environment variable when set and
    otherwise falls back to the token stored by ``huggingface-cli login``,
    matching the prerequisites listed in the module docstring.

    Raises:
        EnvironmentError: If no Hugging Face token can be found.
        FileNotFoundError: If any of the expected artifact files is missing.
    """
    from huggingface_hub import CommitOperationAdd, HfApi, get_token

    token = os.environ.get("HF_TOKEN") or get_token()
    if not token:
        raise EnvironmentError(
            "No Hugging Face token found. Set the HF_TOKEN env var "
            "or run `huggingface-cli login`."
        )

    api = HfApi(token=token)

    files = {
        "best.pt": ARTIFACTS / "best.pt",
        "best.onnx": ARTIFACTS / "best.onnx",
        "class_names.json": ARTIFACTS / "class_names.json",
    }
    # Fail before touching the Hub if any artifact is absent.
    for path in files.values():
        if not path.exists():
            raise FileNotFoundError(f"Missing: {path}")

    print(f"Repo: {HF_REPO}")
    api.create_repo(HF_REPO, repo_type="model", exist_ok=True, private=False)

    # Upload all files in a single commit to avoid the "no changes" skip bug
    operations = []
    for repo_path, local_path in files.items():
        size_mb = local_path.stat().st_size / 1024 / 1024
        print(f" Staging {repo_path} ({size_mb:.1f} MB)")
        operations.append(
            CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=str(local_path))
        )

    operations.append(
        CommitOperationAdd(
            path_in_repo="README.md",
            path_or_fileobj=MODEL_CARD.encode("utf-8"),
        )
    )
    print(" Staging README.md")

    print("\nCommitting...")
    commit = api.create_commit(
        repo_id=HF_REPO,
        repo_type="model",
        operations=operations,
        commit_message="Add best.pt, best.onnx, class_names.json, README (100-epoch FMCG detector)",
    )
    print(f"Done — {commit.commit_url}")
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
finetune/yolo_model_card.md ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: yolo26n
4
+ language:
5
+ - en
6
+ tags:
7
+ - object-detection
8
+ - yolo
9
+ - indian-fmcg
10
+ - onnx
11
+ - ultralytics
12
+ pipeline_tag: object-detection
13
+ datasets:
14
+ - agentsk47/indian-grocery-object-detection-mfsnx
15
+ - iit-patna-qg1jh/grocery_items-7i2em
16
+ - project-c5ho0/indian-market-qieug
17
+ ---
18
+
19
+ # YOLO26n — Indian FMCG Product Detection
20
+
21
+ Fine-tuned [YOLO26n](https://docs.ultralytics.com) on a **merged dataset of three Indian grocery sources** from Roboflow Universe to detect and localize packaged FMCG products in shelf or counter images. Part of the **Kirana Detective** project — an AI system for small Indian grocery stores to visually count and reconcile inventory from photos.
22
+
23
+ > **Note**: This file is a local reference snapshot. The actual model card uploaded to HuggingFace Hub is generated dynamically at the end of `train_yolo26n.py` and will include the real class list and metrics from the latest training run.
24
+
25
+ ## Model Description
26
+
27
+ The model takes a 640×640 image and returns bounding boxes, class labels, and confidence scores for all detected Indian FMCG product categories. It is exported as ONNX for deployment on both CPU and GPU without requiring a full PyTorch installation.
28
+
29
+ The class list is built at training time by merging the three Roboflow dataset vocabularies (deduped, insertion-order). See `class_names.json` on HF Hub for the exact unified list.
30
+
31
+ ## Pilot Run Results (single dataset, 10 classes)
32
+
33
+ The metrics below are from a previous training run that used only the `agentsk47/indian-grocery-object-detection-mfsnx` dataset (10 classes). They are superseded by the current merged 3-dataset training run.
34
+
35
+ **Pilot classes:**
36
+
37
+ | # | Class |
38
+ |---|---|
39
+ | 0 | Bournvita |
40
+ | 1 | Mysore Sandal Soap |
41
+ | 2 | Nescafe Classic Coffee |
42
+ | 3 | Nivea Body Lotion |
43
+ | 4 | Nivea Soft Moisturising Cream |
44
+ | 5 | Parachute Coconut Oil |
45
+ | 6 | Patanjali Dant Kanti |
46
+ | 7 | Society Tea Powder Plain |
47
+ | 8 | Tresemme Hairfall Defense Conditioner |
48
+ | 9 | Tresemme Hairfall Defense Shampoo |
49
+
50
+ ## Pilot Evaluation Results (best.pt, epoch 65 — single dataset run)
51
+
52
+ | Class | Images | Instances | Precision | Recall | mAP50 | mAP50-95 |
53
+ |---|---|---|---|---|---|---|
54
+ | **all** | **41** | **51** | **0.935** | **0.971** | **0.993** | **0.933** |
55
+ | Bournvita | 3 | 3 | 0.902 | 1.000 | 0.995 | 0.995 |
56
+ | Mysore Sandal Soap | 8 | 8 | 1.000 | 0.905 | 0.995 | 0.944 |
57
+ | Nescafe Classic Coffee | 4 | 4 | 0.927 | 1.000 | 0.995 | 0.908 |
58
+ | Nivea Body Lotion | 7 | 7 | 0.935 | 1.000 | 0.995 | 0.923 |
59
+ | Nivea Soft Moisturising Cream | 3 | 3 | 0.924 | 1.000 | 0.995 | 0.895 |
60
+ | Parachute Coconut Oil | 6 | 6 | 1.000 | 0.819 | 0.972 | 0.928 |
61
+ | Patanjali Dant Kanti | 7 | 7 | 1.000 | 0.985 | 0.995 | 0.971 |
62
+ | Society Tea Powder Plain | 2 | 2 | 0.878 | 1.000 | 0.995 | 0.845 |
63
+ | Tresemme Hairfall Defense Conditioner | 1 | 1 | 0.814 | 1.000 | 0.995 | 0.995 |
64
+ | Tresemme Hairfall Defense Shampoo | 10 | 10 | 0.968 | 1.000 | 0.995 | 0.922 |
65
+
66
+ ## How to Use
67
+
68
+ ### Python (ONNX Runtime)
69
+
70
+ ```python
71
+ import json
72
+ import numpy as np
73
+ import onnxruntime as ort
74
+ from PIL import Image
75
+
76
+ session = ort.InferenceSession("yolo26n_fmcg.onnx", providers=["CPUExecutionProvider"])
77
+ class_names = json.load(open("class_names.json"))
78
+
79
+ def preprocess(image_path, size=640):
80
+ img = Image.open(image_path).convert("RGB").resize((size, size))
81
+ arr = np.array(img, dtype=np.float32) / 255.0
82
+ return arr.transpose(2, 0, 1)[None] # BCHW
83
+
84
+ input_name = session.get_inputs()[0].name
85
+ outputs = session.run(None, {input_name: preprocess("shelf.jpg")})
86
+ # outputs[0]: (1, 300, 6) — [x1, y1, x2, y2, confidence, class_id]
87
+ ```
88
+
89
+ ### Ultralytics (ONNX model loaded through the Ultralytics API)
90
+
91
+ ```python
92
+ from ultralytics import YOLO
93
+
94
+ model = YOLO("yolo26n_fmcg.onnx", task="detect")
95
+ results = model.predict("shelf.jpg", imgsz=640, conf=0.25)
96
+ results[0].show()
97
+ ```
98
+
99
+ ## Training Details
100
+
101
+ ### Datasets (merged)
102
+
103
+ All three datasets were downloaded in **YOLOv8 format** (not openai), and their class IDs were remapped to a unified list before training.
104
+
105
+ | Dataset | Workspace | Version | Images | Classes |
106
+ |---|---|---|---|---|
107
+ | [Indian Grocery Object Detection](https://universe.roboflow.com/agentsk47/indian-grocery-object-detection-mfsnx) | agentsk47 | v1 | ~400 | 10 |
108
+ | [Grocery Items](https://universe.roboflow.com/iit-patna-qg1jh/grocery_items-7i2em) | IIT Patna | v45 | 6,695 | 20 |
109
+ | [Indian Market](https://universe.roboflow.com/project-c5ho0/indian-market-qieug) | project-c5ho0 | v2 | 4,694 | 2 |
110
+
111
+ ### Hyperparameters
112
+
113
+ | Parameter | Value |
114
+ |---|---|
115
+ | Base model | YOLO26n |
116
+ | Input size | 640 × 640 |
117
+ | Epochs (scheduled) | 100 |
118
+ | Epochs (actual, pilot run) | 85 (best at epoch 65; early stop after 20 epochs without improvement) |
119
+ | Batch size | 16 |
120
+ | Early stopping patience | 20 |
121
+ | Optimizer | Auto (Ultralytics default) |
122
+ | Export format | ONNX opset 12 |
123
+
124
+ ### Training Infrastructure
125
+
126
+ | Field | Value |
127
+ |---|---|
128
+ | Hardware | NVIDIA A10G (22 GB VRAM) |
129
+ | Framework | Ultralytics 8.4.63 |
130
+ | PyTorch | 2.12.0+cu130 |
131
+ | Orchestration | Modal |
132
+ | Training time | 0.094 hours (~5.6 minutes) |
133
+ | Model size | 5.4 MB (PyTorch) · 9.4 MB (ONNX) |
134
+ | Parameters | 2,376,786 |
135
+ | GFLOPs | 5.2 |
136
+ | Inference speed | 0.2 ms preprocess + 1.0 ms inference (A10G) |
137
+
138
+ ### Training Curve Notes (pilot run)
139
+
140
+ - Best checkpoint at **epoch 65** (mAP50 = 0.993, mAP50-95 = 0.933) — single-dataset pilot
141
+ - EarlyStopping triggered at epoch 85 (no improvement for 20 epochs)
142
+ - Final box loss: 0.4201 · cls loss: 0.4657 · dfl loss: 0.006
143
+
144
+ The current 3-dataset merged training run will produce updated curve notes.
145
+
146
+ ## Limitations
147
+
148
+ - Merged dataset skewed toward beauty/personal care (Tresemmé, Nivea, Patanjali); may underperform on grocery staples
149
+ - ~11K images across 3 sources; performance on crowded shelves or partial occlusions is untested
150
+ - Exported at opset 12 for broad compatibility; advanced indexing operations use multi-op decomposition (see ONNX export warning)
151
+
152
+ ## Citation
153
+
154
+ ```bibtex
155
+ @misc{kirana-detective-yolo-2026,
156
+ title = {Kirana Detective: YOLO26n Indian FMCG Product Detector},
157
+ author = {Syed Naazim Hussain},
158
+ year = {2026},
159
+ url = {https://huggingface.co/build-small-hackathon/yolo26n-indian-fmcg-detection}
160
+ }
161
+ ```
model_artifacts/yolo26n_fmcg/class_names.json ADDED
@@ -0,0 +1,1833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "Bournvita",
3
+ "Mysore Sandal Soap",
4
+ "Nescafe_Classic_Coffee",
5
+ "Nivea Body Lotion",
6
+ "Nivea_Soft_Moisturising_cream",
7
+ "Parachute coconut Oil",
8
+ "Patanjali Dant Kanti",
9
+ "Society_TeaPowder_plain",
10
+ "Tresemme_Hairfall_Defense_conditioner",
11
+ "Tresemme_Hairfall_Defense_shampoo",
12
+ "24Mantra",
13
+ "3D_Fryums_Ajwah",
14
+ "50 50 Gol Maal",
15
+ "50 50 sweet and salty",
16
+ "50 50 top",
17
+ "72H_AxeSignature",
18
+ "8am soyachunks",
19
+ "ALphonsoMangoPulp_GoldenCrown",
20
+ "Aakash",
21
+ "Aarambh",
22
+ "Aashirvaad",
23
+ "Aashirvaad_Besan",
24
+ "Aashirvaad_Salt",
25
+ "Aashirvaad_TurmericPowder",
26
+ "Abzorb",
27
+ "AcaciaHoney_KhushNuma",
28
+ "Act2",
29
+ "Action_Dishwash_Liquid",
30
+ "ActiveSaltNeem_Toothpaste_Colgate",
31
+ "ActiveSalt_Toothpaste_Colgate",
32
+ "Adrenaline_48H_AxeSignature",
33
+ "Agarbathi_Bansuri",
34
+ "Agarbathi_Baratanatyam",
35
+ "Agarbathi_Cycle",
36
+ "Agarbatti_SalShakti",
37
+ "Agastya",
38
+ "Agrabathi_Mangaldeep",
39
+ "Agrawal",
40
+ "AirFreshner",
41
+ "AllOut",
42
+ "Almendro",
43
+ "AlmondFlour_Granos",
44
+ "AlmondKernals_Aram-s",
45
+ "Almonds",
46
+ "AloFrut_Juice",
47
+ "Aloevera_Handwash_Patanjali",
48
+ "Aloo Bhujia",
49
+ "AlooBhujia_Bicano",
50
+ "AlooBhujia_Bikano",
51
+ "AlpineFresh_BodySpray_ShotLayerr",
52
+ "AlterEgo_BodySpray_ParkAvenue",
53
+ "AmericanGarden",
54
+ "AmericanPancakeCo",
55
+ "AmlaCandy_Patanjali",
56
+ "AmlaOil",
57
+ "Ammol_LemonMazaa",
58
+ "Amul",
59
+ "Amul Butter",
60
+ "Amul ghee",
61
+ "Amul-Dreamlite_Biscuites",
62
+ "AmulCool",
63
+ "Amul_BadamShake",
64
+ "Amul_ButterCookies",
65
+ "Amul_Kool",
66
+ "Amulya",
67
+ "AntiRoachgel_Hit",
68
+ "Anymany",
69
+ "Apis",
70
+ "AplineRose-BlackCurrent_Perfume_YardleyLondon",
71
+ "AppleCiderVinegar_AmericanGarden",
72
+ "AppleCiderVinegar_CanadianGarden",
73
+ "AppleCiderVinegar_JustOrganic",
74
+ "AppleCiderVinegar_WOW",
75
+ "AppleCiderVinegar_Wanposh",
76
+ "Apple_Juice_Tropicana",
77
+ "AppyFizz",
78
+ "Apricots",
79
+ "Apsara",
80
+ "Aravalli_Rice",
81
+ "Ariel",
82
+ "Ashok",
83
+ "BC",
84
+ "BLackCode_Denver",
85
+ "BNatural",
86
+ "Babe_BodySpray_Vanesa",
87
+ "BabyCereal",
88
+ "BabyCorn_GoldenCrown",
89
+ "BabyWash_Himalaya",
90
+ "Babyrub_Vicks",
91
+ "BadamDrink_Cavins",
92
+ "BadamDrink_MTR",
93
+ "BadamLachha_Bicano",
94
+ "BadamMilkshake_Cavins",
95
+ "Baggrys",
96
+ "Bagrrys",
97
+ "Baidyanath",
98
+ "Bajaj",
99
+ "Bakeshree",
100
+ "BakingPaper",
101
+ "BakingPowder",
102
+ "Balaji",
103
+ "Bambino",
104
+ "Banana_chips",
105
+ "BanneNawabs",
106
+ "BasilSeed_Ajwah",
107
+ "BasilSeeds",
108
+ "Batchelors",
109
+ "BathingSoap_LifeBuoy",
110
+ "BathingSoap_Pears",
111
+ "Bathroom_Cleaner",
112
+ "Bauli",
113
+ "BauliSavoriz",
114
+ "Beans",
115
+ "BeardTrimmer_Babila",
116
+ "Belgian",
117
+ "BerryJam_Mala",
118
+ "BesanLaddoo_BigBasket",
119
+ "Besan_Rajdhani",
120
+ "BhujiaMasala_Brij",
121
+ "Bhujia_Bicano",
122
+ "Bhujia_FirstCrop",
123
+ "Bhujia_Haldirams",
124
+ "Bhujialalji",
125
+ "Bhujiya_Prabhuji",
126
+ "Bicano_PanjeeriLaddu",
127
+ "Bigbabol_Chewinggum",
128
+ "Bikaji",
129
+ "Bikaji Tana bana",
130
+ "Bikaji_Kuch Kuch",
131
+ "Bikaji_navratan mixtute",
132
+ "BikaneriBhujia_Brij",
133
+ "Bikano",
134
+ "Bikano Aloo Bhujia",
135
+ "Bikano Royal Rasgulla",
136
+ "Bikano Tasty",
137
+ "BikanoRasgulla",
138
+ "Bikano_navratan mixtute",
139
+ "Bikano_navratanmixtute",
140
+ "Bingo",
141
+ "BiotinTablets_HKVitals",
142
+ "Biotique",
143
+ "Biscuit",
144
+ "Biscuit_Bourbon_Britania",
145
+ "Biscuit_Bourbon_Britannia",
146
+ "Biscuit_Bourbon_FullBloom",
147
+ "Biscuit_ButterBite_PriyaGold",
148
+ "Biscuit_ButterBite_first crop",
149
+ "Biscuit_CashewBadam_FirstCrop",
150
+ "Biscuit_ChocoRolls_HideandSeek",
151
+ "Biscuit_Chocolate_FullBloom",
152
+ "Biscuit_DarkFantasy",
153
+ "Biscuit_Digestive_FirstCrop",
154
+ "Biscuit_GlucosePlus_Sunfeast",
155
+ "Biscuit_HideandSeek_Parle",
156
+ "Biscuit_Nutrichoice_Arrowroot_Britannia",
157
+ "Biscuit_Nutrichoice_Britannia",
158
+ "Biscuit_Nutrichoice_Cracker_Britannia",
159
+ "Biscuit_Nutricrunch_Britannia",
160
+ "Biscuit_SweetnSalty",
161
+ "Biscuit_TigerKrunch",
162
+ "Biscuits_50_50_Britannia",
163
+ "Biscuits_Amul",
164
+ "Biscuits_Buttery_50_50_Britannia",
165
+ "Bisleri",
166
+ "BlackBourbon_Hide-Seek_ParlePlatina",
167
+ "BlackMagic_Deo_Spinz",
168
+ "BlackPepperPowder_MDH",
169
+ "BlackPepper_Catch",
170
+ "Bleach_Fem",
171
+ "BlueBird",
172
+ "BlueCurrents_St.John",
173
+ "Blueberries",
174
+ "Blueberry_Gaurmia",
175
+ "Blush_BodySpray_Eva",
176
+ "Blush_FragrantTalc_Envy",
177
+ "BodyHairRemover_Veet",
178
+ "BodyLotion",
179
+ "BodyLotion_Parachute",
180
+ "BodyMist_Ossum",
181
+ "BodySpray_BoldSecret",
182
+ "BodySpray_Eva",
183
+ "BodySpray_Jovan",
184
+ "BodySpray_ParkAvenue",
185
+ "BodySpray_The_Man_Company",
186
+ "BodySpray_Women",
187
+ "BodySpray_men",
188
+ "BombayBiryani_Alamin",
189
+ "Bookside",
190
+ "Boondi_Jabson-s",
191
+ "Boost",
192
+ "Borges",
193
+ "BoroPlus",
194
+ "Boroline",
195
+ "Bounce",
196
+ "Bounty",
197
+ "Bourbon cream biscuits",
198
+ "Bourbon_Cremica",
199
+ "Bourbon_FullBoom",
200
+ "Bread",
201
+ "Brij_Bhujia",
202
+ "Brij_Namkeen",
203
+ "Brij_Panchratan",
204
+ "Britannia",
205
+ "Britannia_50_50_Biscuits",
206
+ "Britannia_WinkinCow_lassi",
207
+ "BrookeBond",
208
+ "Brown-Haley",
209
+ "BrownRice",
210
+ "BrowniesBasket",
211
+ "Bru",
212
+ "BubbleGumFlovour_ToothpasteForKids_DentoShine",
213
+ "Budweiser",
214
+ "Buldak_Ramen",
215
+ "Bumtum_BabyPaints",
216
+ "Bumtum_DoobiDoo",
217
+ "Butlers",
218
+ "Butter",
219
+ "ButterBatter",
220
+ "ButterBite_Cookies",
221
+ "ButterCookies_Amul",
222
+ "ButterCookies_Unibic",
223
+ "ButterLite_NamkeenJeera_Biscuits",
224
+ "Butter_AmulLite",
225
+ "Butter_MilkyMist",
226
+ "Butter_MotherDairy",
227
+ "Butter_Nutralite",
228
+ "ButtonMushroom_GoldenCrown",
229
+ "Cadbury",
230
+ "Cadbury_Celebrations",
231
+ "Cake",
232
+ "CaliforniaAlmonds_Wonderland",
233
+ "CalifornianAlmonds_happilo",
234
+ "Candey_Butter_PureBurst",
235
+ "Candy",
236
+ "Candy_Butter_PureBurst",
237
+ "Candy_Chocolate_PureBurst",
238
+ "Candy_Halls",
239
+ "Candy_KachaAam_PureBurst",
240
+ "Candy_KachchaAam_PureBurst",
241
+ "Candy_Lacmi",
242
+ "Candy_Mandola",
243
+ "Candy_Mentos",
244
+ "Candy_OrangeBite_Parle",
245
+ "Candy_Orange_PureBurst",
246
+ "Candy_PureBurst_KachchaAam",
247
+ "Candy_PureBurst_MangoTango",
248
+ "Candy_PureBurst_Orange",
249
+ "Candy_PureBurst_chocolate",
250
+ "Candyman_ChocoDouble",
251
+ "CaramelBliss_Popcorn_ActII",
252
+ "CashewSticks_GoneMad",
253
+ "Cashews",
254
+ "CastorOil",
255
+ "Catch",
256
+ "Catch Black Pepper",
257
+ "Catch Table Salt",
258
+ "Catch chicken masala",
259
+ "Catch_Cola",
260
+ "Catch_GaramMasala",
261
+ "Catch_KashmiriMirchi",
262
+ "Catch_MeatMasala",
263
+ "Catch_Pepper",
264
+ "Catch_SprintUp",
265
+ "Catch_jeera powder",
266
+ "Cavin",
267
+ "Cavin_VaneelaMilkShake",
268
+ "Cavins",
269
+ "Cerelac_Apple_Nestle",
270
+ "Cerelac_Wheat_Nestle",
271
+ "ChakkiAtta",
272
+ "ChampSportingClub_Denver",
273
+ "ChanaMasala_Jobsons",
274
+ "ChanaMasala_MinuteMeals_MTR",
275
+ "ChanaMasala_MinuteMeals_Mir",
276
+ "CharcoalCleangel_Toothpaste_Colgate",
277
+ "Charged_thumbsup",
278
+ "Cheese",
279
+ "CheeseCubes",
280
+ "CheeseCubes_D-lecta",
281
+ "CheeseCubes_MilkyMist",
282
+ "CheeseCubes_MotherDairy",
283
+ "CheeseSlices",
284
+ "CheeseSlices_Amul",
285
+ "CheeseSlices_Britannia",
286
+ "CheeseSlices_D-lecta",
287
+ "CheeseSlices_Go",
288
+ "CheeseSlices_MilkyMist",
289
+ "CheeseSlices_MotherDairy",
290
+ "Cheese_D-lecta",
291
+ "Cheetos",
292
+ "Cheezza_Britannia",
293
+ "ChefsBasket",
294
+ "ChiaSeed_Ajwah",
295
+ "ChiaSeeds",
296
+ "ChickenMasala_Orika",
297
+ "Chik",
298
+ "ChileanInshellWalnut_Happilo",
299
+ "ChileanInshellWalnut_nutraj",
300
+ "ChileanInshellcashenut_Happilo",
301
+ "ChileanInshellcashenut_nutraj",
302
+ "ChilliPowder_Kanwal",
303
+ "Chings",
304
+ "Chings_Chutney",
305
+ "Chips_Bicano",
306
+ "Chips_Crisps_Lays",
307
+ "Chips_Doritos",
308
+ "Chips_Haldirams",
309
+ "Chips_Pudina_haldirams",
310
+ "Choco-Pie",
311
+ "ChocoBakes_Cookies_Cadbury",
312
+ "ChocoFills_Kellogg-s",
313
+ "ChocoHazlenut_Cookies_Unibic",
314
+ "ChocoNutCookies_Unibic",
315
+ "ChocoPie",
316
+ "ChocoSpread_HealthyDay_Rasna",
317
+ "Chocoflakes_Kwality",
318
+ "Chocola",
319
+ "ChocolairsGoldCoffee_Cadbury",
320
+ "Chocolate",
321
+ "ChocolateCookies_Amul",
322
+ "ChocolatePeanut_Butter_MB",
323
+ "Chocolate_5Star",
324
+ "Chocolate_ChoclairsGold_Cadbury",
325
+ "Chocolate_ChocoBarXL",
326
+ "Chocolate_DairyMilk_Cadbury",
327
+ "Chocolate_Horlicks",
328
+ "Chocolate_Nutties_Cadburry",
329
+ "Chocolate_Snakker_PriyaGold",
330
+ "Chocolate_Truffel_Dukes",
331
+ "Chocozay",
332
+ "ChoiceMega",
333
+ "Chokolas",
334
+ "Cibaca_Toothpaste_Colgate",
335
+ "ClassicButter_Unibic",
336
+ "ClassicMalt_Horlicks",
337
+ "ClassicSalted_Popcorn_ActII",
338
+ "Classicgentleman_Perfume_Yardley",
339
+ "Clean-Clear",
340
+ "ClinicPlus",
341
+ "Cockroach_repllent_baygon",
342
+ "Cocktail",
343
+ "Cocoa_Cadbury",
344
+ "Cocoa_Hershey-s",
345
+ "Coconut7BrownSugar_Pocky",
346
+ "CoconutCookies_Americana",
347
+ "CoconutCookies_Gaia",
348
+ "CoconutLadoo_Ajwah",
349
+ "CoconutOil_Nirmal",
350
+ "CoconutPeda_Ajwah",
351
+ "CoconutPowder_Sunrise",
352
+ "CoconutWater_CocoRoyal",
353
+ "CoffeeMate_Nestle",
354
+ "Coffee_FullBloom",
355
+ "ColaCandy_Ajwah",
356
+ "Colgate",
357
+ "Colgate Total12",
358
+ "Colgate_BubbleFruit_ChihldPaste",
359
+ "Colgate_Strawberry_ChihldPaste",
360
+ "Colgate_Toothbrush",
361
+ "Colin",
362
+ "Comfort",
363
+ "Complan",
364
+ "CookTop",
365
+ "Cookies",
366
+ "Cookies_ButterBake",
367
+ "CoolKick_NiveaMen",
368
+ "CoolWave_Talc_Envy",
369
+ "Corainder_Everest",
370
+ "CornFlakes_FirstCrop",
371
+ "CornSoup_Bambino",
372
+ "CornStarch_Brown-Polson",
373
+ "Cornflour",
374
+ "Cornitos",
375
+ "CoughDrops_Honitus_Dabur",
376
+ "CoughDrops_Vicks",
377
+ "CowGhee_ThackerDairy",
378
+ "Crackers_Munchy-s",
379
+ "Cranberrries_Granos",
380
+ "Cream_Lakme",
381
+ "Cream_vitaminFaceWash",
382
+ "CremesBiscuit_Strawberry_Cremica",
383
+ "CrunchyChocolate_Pepero",
384
+ "CrunchyMunchy_Bicano",
385
+ "CrunchyMunchy_Bikano",
386
+ "CupCakes",
387
+ "Curd",
388
+ "Curd_MilkyMist",
389
+ "Curry_TopRamen",
390
+ "CustardPowder_Brown-Polson",
391
+ "Cycle",
392
+ "Czar_Scent_Fogg",
393
+ "DLecta",
394
+ "Daawat",
395
+ "Daawat_RozanaBasmatiRiceGold",
396
+ "Dabur",
397
+ "DaimSnax",
398
+ "DairyMilk_Chocolate",
399
+ "DairyMilk_Silk_Cadbury",
400
+ "DalMakhani_RedayMeals_Gits",
401
+ "Dal_Bikano",
402
+ "Dal_KhattaMeetha_BC",
403
+ "Dal_Pitara_DBL",
404
+ "Dal_Rajdhani",
405
+ "Dalmakhni_Gits",
406
+ "Dant Rakshak",
407
+ "DantKanti_Toothpaste_Patanjali",
408
+ "DarkChocolateChipsCookies_Gaia",
409
+ "DarkFantasy_ChocolateShake",
410
+ "DarkTemptation_CologneTalc_Axe",
411
+ "DarkTemptation_Fragrance_Axe",
412
+ "Dates",
413
+ "DatesSyrup_Lion",
414
+ "Davidoff",
415
+ "DelMonte",
416
+ "DelMonte_Mayonnaise",
417
+ "Delfi",
418
+ "Delicious",
419
+ "Delifresh",
420
+ "Delight",
421
+ "Delisious",
422
+ "Delmonte_GreenApple",
423
+ "Delmonte_Pineapple",
424
+ "Delmonte_peachjuice",
425
+ "Denver",
426
+ "DetergentBar_Henko",
427
+ "Dettol",
428
+ "Dev",
429
+ "Dexolac",
430
+ "Dhara",
431
+ "Dhara_Oil",
432
+ "DiSano",
433
+ "DiabeticCare_Nimbark",
434
+ "Diamond",
435
+ "Diapers",
436
+ "Diapers_MamyPokoPants",
437
+ "Diapers_Pampers",
438
+ "Digestibes_Nutricrunch",
439
+ "Digestive_NutriChoice_Britania",
440
+ "Discover_BodySpray_ParkAvenue",
441
+ "Dishwash_Prill",
442
+ "Dishwash_Scrubz",
443
+ "Dishwasher_Exo",
444
+ "Diva_BodySpray_Vanesa",
445
+ "Divine Agro Kasuri Methi",
446
+ "Divya",
447
+ "DogFood_Pedigree",
448
+ "Domex",
449
+ "Don_PerfumeDoeSpray_Beardo",
450
+ "Donut_Ziggy",
451
+ "DoobiDoo_BabyPaints",
452
+ "Doritos",
453
+ "Dorje",
454
+ "Dove",
455
+ "Dove_FacialTissue",
456
+ "DrOetker",
457
+ "DryFruit",
458
+ "DryFruitPanjeeriLadoo_Haldiram",
459
+ "DryFruits",
460
+ "Dukes",
461
+ "Dyna Sandal",
462
+ "EasyFun",
463
+ "EclairOs",
464
+ "Eclairs",
465
+ "Eclairs_PureBurst",
466
+ "EcoValley",
467
+ "Eggs",
468
+ "Elite",
469
+ "Emami",
470
+ "Emperia",
471
+ "Enchante_Spinz",
472
+ "EnergyMax",
473
+ "Engage",
474
+ "Eno",
475
+ "Ensure",
476
+ "Enzo",
477
+ "Epigamia",
478
+ "Everest",
479
+ "Everyuth",
480
+ "Exotic_Spinz",
481
+ "Ezee",
482
+ "FabBourbon_Biscuit_Parle",
483
+ "FabricConditioner_Softlan",
484
+ "Fair-Lovely",
485
+ "FairMart",
486
+ "Fairmart",
487
+ "FaloodaMix_KesarPista_Weikfield",
488
+ "FaraliChiwda_Bhujialalji",
489
+ "FastCard_GoodNight",
490
+ "FennelSeedPowder_Kanwal",
491
+ "FerreroRocher",
492
+ "Fiama",
493
+ "FieryPride_BodySpray_ShotLayerr",
494
+ "Fiesta_Dal",
495
+ "Figaro",
496
+ "Figs",
497
+ "Finosta",
498
+ "First Crop Bikaneri Bhuji",
499
+ "First Crop Potato chips",
500
+ "First crop Corn Flakes",
501
+ "FishOil_HKVitals",
502
+ "Flakes_Choco_FirstCrop",
503
+ "FlaxSeeds_Nutraj",
504
+ "FlossPicks_DeepClean_DentoShine",
505
+ "FlossPicks_DentoShine",
506
+ "Fogg",
507
+ "FoodCoast",
508
+ "ForestSpice_Deodrant_WildStone",
509
+ "Fortune",
510
+ "Fortune besen",
511
+ "Fortune chana sattu",
512
+ "Fortune maida",
513
+ "Fortune suji",
514
+ "Fortune_Sugar",
515
+ "Forture indori poha",
516
+ "FourSquare_CigarettePack",
517
+ "Fox-s",
518
+ "Foxs",
519
+ "FreshComfort_Deodrant_Nivea",
520
+ "FreshGel_Sensodyne",
521
+ "FreshMint_Sensodyne",
522
+ "Fresho",
523
+ "FrozenPackagedFood",
524
+ "Fruit-Nut_Cookies_Unibic",
525
+ "FruitCocktail_GoldenCrown",
526
+ "FruitGummies_Tapi",
527
+ "FruitJuice_Litchi_Real",
528
+ "Fruitins",
529
+ "Frutins",
530
+ "Fulvadi_Jabson",
531
+ "GM",
532
+ "GMFoods",
533
+ "Gaia",
534
+ "GaiaLite",
535
+ "Gaialite",
536
+ "Galaxy",
537
+ "Gamnuts_DryFruits",
538
+ "Gamnuts_Masale",
539
+ "Gangwal",
540
+ "GaramMasala_Orika",
541
+ "Garden Bhelpuri - Sevpuri",
542
+ "GarlicPaste_Nilon-s",
543
+ "Garnier",
544
+ "Gasona_Kudos",
545
+ "Geham",
546
+ "Geisha",
547
+ "Genteel",
548
+ "Gentleman_Perfume_YardleyLondon",
549
+ "Gerber",
550
+ "GetReal",
551
+ "Ghadi",
552
+ "Ghee",
553
+ "Ghee_Cow_PureBurst",
554
+ "Gillette",
555
+ "Girnar",
556
+ "Gits",
557
+ "GlassCleaner_Action",
558
+ "Glicy",
559
+ "Glow",
560
+ "Glow-Handsome",
561
+ "Glow-Lovely",
562
+ "GlucoPlusC_Dabur",
563
+ "GlucoPlusD_Dabur",
564
+ "Glucon-D",
565
+ "GluconD",
566
+ "Glycerin",
567
+ "Go",
568
+ "GoalSportingClub_Denver",
569
+ "Godrej",
570
+ "GoldFlake_CigarettePack",
571
+ "Gold_Nescafe",
572
+ "GoldenScent_Next",
573
+ "GoneMad",
574
+ "Good Life toor dal",
575
+ "Good life",
576
+ "Good life masoor",
577
+ "Good life urad whole",
578
+ "GoodDay",
579
+ "GoodHome",
580
+ "GoodKnight",
581
+ "GoodLife",
582
+ "GoodMorning",
583
+ "Gourmet-sDelite",
584
+ "Gowardhan",
585
+ "Grand_Coffee_Tata",
586
+ "GreenChilliPickle_Nilon-s",
587
+ "GreenChilliPickle_Tops",
588
+ "GreenTea_Pocky",
589
+ "GulabJamunMix_ Bambino",
590
+ "GupShupPeanuts_Haldirams",
591
+ "Guruji",
592
+ "Gustora",
593
+ "Guylian",
594
+ "Hair-Care",
595
+ "HairColor",
596
+ "HairDryer_Babila",
597
+ "HairOil_Clear",
598
+ "HairOil_Indulekha",
599
+ "Hair_Color_ColorMate",
600
+ "Hajmola",
601
+ "Haldiram-s_Chips",
602
+ "Haldirams",
603
+ "Haldirams_GujratiMix",
604
+ "Haldirams_KajuMixture",
605
+ "Haldirams_MoongDal",
606
+ "Haldirams_Peanuts",
607
+ "Hamilton_Perfume_Denver",
608
+ "HandSanitizer",
609
+ "HandSanitizer_Dettol",
610
+ "HandSanitizer_Himalaya",
611
+ "HandSanitizer_Lifeboy",
612
+ "HandwashVitamins_LifeBouy",
613
+ "Handwash_Herbal_Patanjali",
614
+ "Handwash_Santoor",
615
+ "Happilo",
616
+ "Happilo_Almonds",
617
+ "Happilo_PeanutButter",
618
+ "HappyHappyCreme_Biscuit_Parle",
619
+ "Haribo",
620
+ "Harpic",
621
+ "HastyTasty",
622
+ "Hawanan Barbeque Cheese Popcorn",
623
+ "Head-Shoulders",
624
+ "HealthyLife",
625
+ "Heinz",
626
+ "HellMann-s_Mayonnaise",
627
+ "Hellmanns",
628
+ "Hem",
629
+ "Herbal_Toothpaste_Colgate",
630
+ "Hersheys",
631
+ "Hersheys_MilkShake_vaneeliFlavour",
632
+ "Hide-Seek_Chocolate_Parle",
633
+ "Hide-Seek_CremeSandwiches",
634
+ "Hide-Seek_Milano_ParlePlatina",
635
+ "Hide-Seek_ParlePlatina",
636
+ "Himalaya",
637
+ "Hing_Ramdev",
638
+ "Hit",
639
+ "Hitkary",
640
+ "Hocco",
641
+ "HoneyOatmeal_Cookies_Unibic",
642
+ "Honey_Barosi",
643
+ "Honey_GoldDrops",
644
+ "Honey_Lion",
645
+ "Horlicks",
646
+ "Horlicks women-s plus",
647
+ "Hurricane_EnergyDrink",
648
+ "HyderabadiBiryani_Rehmat",
649
+ "HydraEnergy_Deodrant_WildStone",
650
+ "IceCream",
651
+ "IceCream_Amul",
652
+ "IceCream_BaskinRobbins",
653
+ "IceCream_Kulfi_Havmor",
654
+ "IespressoCoffee_Davidoff",
655
+ "ImliCandy_Ajwah",
656
+ "IndiShop",
657
+ "IndianSweet_MaysorePak_Nandini",
658
+ "Indulekha",
659
+ "Inhaler_Vicks",
660
+ "Insect_Killer_Hit",
661
+ "Insect_Killer_strategi",
662
+ "Insight_Perfume",
663
+ "Intense_Fragrance_Axe",
664
+ "Iodex",
665
+ "JMRFoods",
666
+ "Jabsons",
667
+ "Jackpot",
668
+ "JaggeryPowder",
669
+ "JainsTrupti",
670
+ "Jalani",
671
+ "Jaljira",
672
+ "Jam_Kissan",
673
+ "Jam_MixedFruit_Kissan",
674
+ "Jam_Patanjali",
675
+ "Jeera_GoodLife",
676
+ "Jel_SetWet",
677
+ "Jet_Imli_Toffee",
678
+ "JewelFarmer",
679
+ "Johnsons",
680
+ "Joiner_Drink",
681
+ "Joshina_Hamdard",
682
+ "Jovees",
683
+ "Joy",
684
+ "Jucie_CranberryDelight_Tropicana",
685
+ "Jucie_GuavaDelight_Tropicana",
686
+ "Jugnu_NapthaleneBalls",
687
+ "Juice_Apple_Natural",
688
+ "Juice_CoconutMilkDrink_MangoNataDeCoco_Uglobe",
689
+ "Juice_CoconutMilkDrink_PineappleNataDeCoco_Uglobe",
690
+ "Juice_CranberryDelight_Tropicana",
691
+ "Juice_Greenapple_Delmonte",
692
+ "Juice_Guava_Real",
693
+ "Juice_MixedFruit_Natural",
694
+ "Juice_Orange_Real",
695
+ "Juice_Peach_Delmonte",
696
+ "Juice_Peach_Rani",
697
+ "Juice_Pineapple_Delmonte",
698
+ "Juice_Pomegranate_Joiner",
699
+ "Juice_Pomegrate_Real",
700
+ "Juice_Pomegrate_Real_Dabur",
701
+ "Juice_Swing",
702
+ "Juice_coconut_Paper_Boat",
703
+ "JuniorTomatoKetchup_Tops",
704
+ "KTH",
705
+ "Kaffe",
706
+ "KajuMixture_Bikano",
707
+ "KalongiSeeds_Ajwah",
708
+ "Kamasutra",
709
+ "Kapiva",
710
+ "KashmiriMixture_Bicano",
711
+ "KashmiriMixture_Bikano",
712
+ "KashurDal_HudHud",
713
+ "KasundiSauce_Elmac",
714
+ "Kehwa_Aram-s",
715
+ "Kelloggs Corn Flakes Real Haney",
716
+ "Kellogs",
717
+ "KesarPeda_Ajwah",
718
+ "KesarQueen",
719
+ "KeshKing",
720
+ "Keya",
721
+ "KhattaMeetha_FirstCrop",
722
+ "KhattaMithaMix_Haldirams",
723
+ "Kids_Toothbrush_OralB",
724
+ "Kimchi",
725
+ "Kimchi_Samyang",
726
+ "KinderJoy",
727
+ "Kingfisher",
728
+ "Kinley",
729
+ "Kissan",
730
+ "KitchenAffairs",
731
+ "KitchenTreasures",
732
+ "Knorr",
733
+ "KuchKuch_Bikaji",
734
+ "Kurkure",
735
+ "Kurkure_ChatFills",
736
+ "Kurkure_SizzlinHot",
737
+ "Kwality",
738
+ "Kwality_Muesli",
739
+ "L-oreal",
740
+ "LactoCalamine",
741
+ "Lakme",
742
+ "Lal",
743
+ "LalMirchPaste_Aram-s",
744
+ "Lassi_WinkinCowClassic_Britannia",
745
+ "LaxmanRekhaa",
746
+ "Layka",
747
+ "Lays",
748
+ "Lays_American",
749
+ "Lays_ChilliLimbu",
750
+ "Lays_Indian",
751
+ "Lays_Spanish",
752
+ "Lays_WestIndies",
753
+ "LeCafe",
754
+ "Lehar",
755
+ "Lemon_Toothpaste_Colgate",
756
+ "Lifebuoy",
757
+ "Lime_Dishwash_Prill",
758
+ "Limonata",
759
+ "Lindt",
760
+ "Lipton",
761
+ "LiquidDeodrant_Lawman_pg3",
762
+ "LiquidDetergent_Patanjali",
763
+ "LiquidMosquitoRepellent_GoodNight",
764
+ "Liril",
765
+ "Listerine",
766
+ "Lite",
767
+ "LittleHearts",
768
+ "Lizol",
769
+ "LollipopTongueCleaner_DentoShine",
770
+ "LondonBubble",
771
+ "LondonMist_BodySpray_YardleyLondon",
772
+ "LondonMist_Perfume_Yardley",
773
+ "Lotte",
774
+ "Lotus",
775
+ "Lux",
776
+ "M-M",
777
+ "MB",
778
+ "MDH",
779
+ "MIxedFruit_Juice_Tropicana",
780
+ "MTR",
781
+ "Mad_Soap",
782
+ "Madhubani",
783
+ "Madhuri",
784
+ "MadrasiNamkeen_Brij",
785
+ "Maggi",
786
+ "Magnolia7GrapeFruit_Perfume_YardleyLondon",
787
+ "Mahakosh",
788
+ "Makhana",
789
+ "Makino Nacho chips",
790
+ "Mala_MixedFruitJam",
791
+ "Malas",
792
+ "Malkist",
793
+ "Mangalam",
794
+ "Mangaldeep",
795
+ "MangoBite_Parle",
796
+ "MangoFlavour_ToothpasteForKids_DentoShine",
797
+ "MangoJuice_Fresca",
798
+ "MangoMerry",
799
+ "MangoPickle_ImliTree",
800
+ "MangoPickle_Tops",
801
+ "Manna",
802
+ "Marie_McVities",
803
+ "Mars",
804
+ "Marvel",
805
+ "MarvelTea",
806
+ "MasalaMunch_Kurkure",
807
+ "MasalaNoodles_TopRamen",
808
+ "MasalaNoodles_Tops",
809
+ "MasalaNoodles_maggi",
810
+ "MasalaTikki_Kanwal",
811
+ "Masala_CuppaNoodles_Maggi",
812
+ "Masala_CuppaNoodles_Manchow",
813
+ "Masala_TopRamen",
814
+ "MaxFresh_Toothpaste_Colgate",
815
+ "MaxProtein",
816
+ "MazedaarMasala_CupNoodles",
817
+ "Mazic",
818
+ "McCain",
819
+ "McVities",
820
+ "MeatMasala_Orika",
821
+ "Meiji",
822
+ "MelonSeeds",
823
+ "MilkBooster_PureBurst",
824
+ "MilkCompound_Morde",
825
+ "MilkMagic",
826
+ "MilkShakti_Biscuit_Parle",
827
+ "Milk_Everyday_Nestle",
828
+ "Milk_Rusk_Mario",
829
+ "Milka",
830
+ "Milkshake_Badam_Cavins",
831
+ "Milkshake_Chocolate_Cavins",
832
+ "Milkshake_Straberry_Cavins",
833
+ "Milkshake_Vanilla_Cavins",
834
+ "Milky bar",
835
+ "MilkyMist",
836
+ "MiniChocolate_Oreo",
837
+ "MiniJumbo_MosquuitoCoil_GoodNight",
838
+ "MiniMeBakers",
839
+ "MinuteMaid",
840
+ "MirchiQuormaPaste_Kanwal",
841
+ "Mishri",
842
+ "MixedFruitJam_Sil",
843
+ "MixedPickle_Nilon-s",
844
+ "MixedPickle_Tops",
845
+ "Mixture_ALLINONE",
846
+ "Mixture_Ajwah",
847
+ "Mixture_Bicano",
848
+ "Mohuns",
849
+ "Mojito_Orange_Cravova",
850
+ "Mom",
851
+ "Moments_Chocolate",
852
+ "MomsMagic",
853
+ "MomsMagic_Biscuit_Sunfeast",
854
+ "Monaco_Biscuit_Parle",
855
+ "Monaco_Cheeslings",
856
+ "Monaco_PiriPiri",
857
+ "MongDal_Bikaji",
858
+ "Monita",
859
+ "Monster",
860
+ "MontexFoil",
861
+ "MoongDal_Bikaji",
862
+ "MoongFali",
863
+ "Mopz Floral Fresh",
864
+ "Mopz Lime Fresh",
865
+ "MorningDew_Perfume_Yardley",
866
+ "Morton",
867
+ "MosquitoKiller_Mortein",
868
+ "MosquitoOil_Genius_Maxo",
869
+ "MosquitoRepellent",
870
+ "MosquitoRepellentBlack_Hit",
871
+ "MosquitoRepellentRed_Hit",
872
+ "Mother-sRecipe",
873
+ "MotherDairy",
874
+ "MothersRecipe",
875
+ "Muesli_King",
876
+ "Munch Max",
877
+ "MuscleBlaze",
878
+ "Museli_Kellogs",
879
+ "MustardOil",
880
+ "MustardOil_ValleyKing",
881
+ "MutterPaneer_MinuteMeals_MTR",
882
+ "MutterPaneer_MinuteMeals_Mir",
883
+ "MyFruit",
884
+ "MysticWhite_Spinz",
885
+ "NANPro",
886
+ "Nafees",
887
+ "NailClipper",
888
+ "Nakoda",
889
+ "Namkeen",
890
+ "Namkeen_AllinOne_BC",
891
+ "Namkeen_BombayMixture_BC",
892
+ "Namkeen_KhattaMeetha_DC",
893
+ "Namkeen_MultiGrain_Jabson",
894
+ "NaturalChoice_ChannaDal",
895
+ "NaturalChoice_MixDal",
896
+ "NaturalChoice_MoongDal",
897
+ "NaturalChoice_RajmaChitra",
898
+ "NaturalColor_ColorMate",
899
+ "NaturalGlow_Deodrant_Nivea",
900
+ "NaturalHoney_Capilano",
901
+ "Nature-sChoice_Kismis",
902
+ "Nature-sChoice_MixedDryFruit",
903
+ "Navratan",
904
+ "Navratan_FirstCrop",
905
+ "NeemActive_Toothpaste",
906
+ "Neeraj",
907
+ "Neo_BodySpray_ParkAvenue",
908
+ "Nescafe",
909
+ "Nestle",
910
+ "Nestle Cerelac",
911
+ "Nilon-s_SAUCE",
912
+ "Nilons",
913
+ "Nissin",
914
+ "Nivea",
915
+ "Nongshim",
916
+ "NoodleSoup_ShinCup",
917
+ "Noodles_Chings_Hot Garlic",
918
+ "Noodles_Hakka Noodles",
919
+ "Noodles_Jumbo",
920
+ "Noodles_Yippee",
921
+ "NoonChai_Girnar",
922
+ "NoonChai_TezPremium",
923
+ "Nusobee_Dexolac",
924
+ "Nutella",
925
+ "Nutraj",
926
+ "Nutralite",
927
+ "Nutrela",
928
+ "Nutri choice thin arrowroot",
929
+ "NutriDelite",
930
+ "Nutricia",
931
+ "Nutricrunch_Biscuit_Britannia",
932
+ "Nutridelite",
933
+ "Nuts_Fruits_Berries_Gourmia",
934
+ "Nutveda",
935
+ "Nyle",
936
+ "OLiveOil_Sansu",
937
+ "Oats_CrunchyMuesli_Grry-s",
938
+ "Oats_ProteinRich_FirstCrop",
939
+ "Odonil",
940
+ "Oil_RefinedRice_Saffola",
941
+ "Oil_Sesame_FirstCrop",
942
+ "OldSpice",
943
+ "OleevActive",
944
+ "OleevSmart",
945
+ "OliveOil",
946
+ "OliveOil_Jivo",
947
+ "OliveOil_KeoKarpin",
948
+ "OrangeDrink_MinuteMaid",
949
+ "OrangeFlavour_ToothpasteForKids_DentoShine",
950
+ "OrangeJuice_Tropicana",
951
+ "OrangeSplash_Cookies_Unibic",
952
+ "Orbit",
953
+ "Oreo",
954
+ "OrganicAloeVera_Juice_Nimbark",
955
+ "OrganicAmla-Juice_Nimbark",
956
+ "OrganicIndia",
957
+ "OrganicTattva",
958
+ "Organica",
959
+ "Organicana",
960
+ "Oriental",
961
+ "OriginalChocolate_Pepero",
962
+ "Orion",
963
+ "Ortho_Oil_Zandu",
964
+ "Outshine_HandWash",
965
+ "Pack_BadamDrink_MTR",
966
+ "Pack_Chabaa_PineapplePulp",
967
+ "Pack_Chabaa_RedGrapeFruit",
968
+ "Pack_Coffee_Nescafe",
969
+ "Pack_Delmonte_GreenApple",
970
+ "Pack_Juice_Original",
971
+ "Pack_Juice_Peach_Delmonte",
972
+ "Pack_Juice_Pineapple_Delmonte",
973
+ "Pack_Juice_rani",
974
+ "PalakPaneer_MinuteMeals_MTR",
975
+ "Pancake Mix",
976
+ "Panchratna_Bikaji",
977
+ "PaneerBhujia_Bikano",
978
+ "Pansari_Poha",
979
+ "PaperBoat",
980
+ "PappaPig",
981
+ "PaprikaCashews_GoldenGate",
982
+ "Parachute",
983
+ "Paramute",
984
+ "ParisAgro",
985
+ "ParkAvenue",
986
+ "Parle",
987
+ "ParleGRoyale_Biscuit_Parle",
988
+ "ParleG_BigPack_Biscuit_Parle",
989
+ "ParleG_Biscuit_Parle",
990
+ "Parle_20_20_Nice_Biscuits",
991
+ "Parrot",
992
+ "Passion_Talc_Envy",
993
+ "Pasta Masala",
994
+ "Pasta_Fusilli_FirstCrop",
995
+ "Pasta_Wokifield",
996
+ "PastyPixel",
997
+ "Patanjali",
998
+ "Patisa_Haldirams",
999
+ "PauBhaji_Gits",
1000
+ "Pav",
1001
+ "PeanutButter_MyFitness",
1002
+ "PeanutButter_funfoods",
1003
+ "PeanutChikki_Parmod",
1004
+ "Peanut_Butter_AmericanGarden",
1005
+ "Peanutbutter_Alpino",
1006
+ "Peanutbutter_FirstCrop",
1007
+ "Peanutbutter_Pinotola",
1008
+ "Pearl-Beauty_Deodrant_Nivea",
1009
+ "Pears",
1010
+ "PeasInBrine_GoldenCrown",
1011
+ "PediaSure",
1012
+ "PeppaPig",
1013
+ "Peppy",
1014
+ "Perfume_Denver",
1015
+ "Pichkoo_TomatoKetchup_Maggi",
1016
+ "Pickle_DoubleHorse",
1017
+ "Pickle_Garlic_Mother-s",
1018
+ "Pickle_KitchenTreasures",
1019
+ "Pickle_Lime_Mother-s",
1020
+ "Pickle_Mango_Mother-s",
1021
+ "Pickle_Mother-sRecipe",
1022
+ "Piknik",
1023
+ "PineappleJam_FullBloom",
1024
+ "PineappleJam_Kissan",
1025
+ "PineappleSlice_GoldenCrown",
1026
+ "PinkDelight",
1027
+ "PinkSalt",
1028
+ "Pinkrush_facewash",
1029
+ "Pintola",
1030
+ "Pipo",
1031
+ "Pistachios_Wonderland",
1032
+ "Pitambari",
1033
+ "PizzaCheese_Go",
1034
+ "PlainBhujia_Haldirams",
1035
+ "PlaxSpicyFresh_Colgate",
1036
+ "PlumCake",
1037
+ "Poha_FirstCrop",
1038
+ "Poha_Fortune",
1039
+ "Ponds",
1040
+ "Ponds_CharcoalFaceWash",
1041
+ "Popcorn_Ajwah",
1042
+ "Potata_Biscuit_Pran",
1043
+ "PotatoChips_Cream - Onion_FirstCrop",
1044
+ "Pramod",
1045
+ "Pramod Tilkut",
1046
+ "Pramod chikki Gaja",
1047
+ "Pramod peanut chikki",
1048
+ "Pramod sweet bliss",
1049
+ "Pramod_Tilkut",
1050
+ "Pran",
1051
+ "PremiumTea_Mohan",
1052
+ "Prestige",
1053
+ "Primora",
1054
+ "Princles",
1055
+ "Pringles",
1056
+ "ProNature",
1057
+ "Protect-Care_Deodrant_NIvea",
1058
+ "ProteinPlus",
1059
+ "ProteinWater_Aquatein",
1060
+ "Protinex",
1061
+ "Prunes",
1062
+ "Puffs_Funflips",
1063
+ "Pulse_CoolTalc_Axe",
1064
+ "Pulse_Fragrance_Axe",
1065
+ "Pulses",
1066
+ "Pulses_Goodlife",
1067
+ "Pulses_SafeHarvest",
1068
+ "Pulses_TataSampann",
1069
+ "PunjabiChole_Gits",
1070
+ "PunjabiTadka_Bikaji",
1071
+ "PurHoney_Zandu",
1072
+ "Pure",
1073
+ "Purix",
1074
+ "Pushp",
1075
+ "Quaker",
1076
+ "Queen_BodySpray_Vanesa",
1077
+ "QuinoaSeeds_King",
1078
+ "Raavi",
1079
+ "RabdiDrink_Cavins",
1080
+ "Racy",
1081
+ "Rafaelo",
1082
+ "Rajhans",
1083
+ "RajmaMasala_Gits",
1084
+ "RajmaMasala_RedayMeals_Gits",
1085
+ "Ramu",
1086
+ "Rani_Juice",
1087
+ "RapidRelief_Sensodyne",
1088
+ "Rasa",
1089
+ "Rasna",
1090
+ "RasoiMagic",
1091
+ "RaspberryFlavour_ToothpasteForKids_DentoShine",
1092
+ "Raw",
1093
+ "RawPressery",
1094
+ "ReadyToEatNoodles_WaiWai",
1095
+ "Real",
1096
+ "RealThai",
1097
+ "RealThal",
1098
+ "Real_Juice",
1099
+ "RedBull",
1100
+ "RedChilliPowder_Badshah",
1101
+ "RedChilliPowder_Kanwal",
1102
+ "RedChilliPowder_Rehmat",
1103
+ "RedChilliPowder_c",
1104
+ "RedChilliSauce",
1105
+ "RedCurrents_St.John",
1106
+ "RedLabel",
1107
+ "RedTea_Mohan",
1108
+ "RefinedOil",
1109
+ "RefinedOil_Dhara",
1110
+ "RefinedSoyabeanOil_NutriLive",
1111
+ "Regular_GluconD",
1112
+ "Repair-Protect_Sensodyne",
1113
+ "RevitalH_Woman",
1114
+ "Revive",
1115
+ "Rex Baking Powder",
1116
+ "Rice",
1117
+ "Rice_Minimogra",
1118
+ "Rich-Moist_PlumCake_Winkies",
1119
+ "Rin",
1120
+ "RiteBite",
1121
+ "RoastedAndSaltedCashews_Happilo",
1122
+ "RoastedChana_GoldenGate",
1123
+ "RoastedFlexSeed_Ajwah",
1124
+ "RockSalt",
1125
+ "RoohAfza",
1126
+ "RoomFreshner",
1127
+ "Room_Mist_Lia",
1128
+ "Rostaa",
1129
+ "RoyalCupTea_Girnar",
1130
+ "RoyalRatan",
1131
+ "RoyalRedRoses_BodySpray_YardleyLondon",
1132
+ "RoyaleGentleman_Perfume_Yardley",
1133
+ "RuchiStar",
1134
+ "Rusk_FirstCrop",
1135
+ "Rusk_Toastea_Amul",
1136
+ "SRK_AutographCollection_Denver",
1137
+ "Sachamoti",
1138
+ "SadaBahar",
1139
+ "Sadabahar",
1140
+ "Safal",
1141
+ "SafeWash",
1142
+ "Safffola",
1143
+ "Saffola",
1144
+ "Saffola Jammuni Veda",
1145
+ "Saffola oil",
1146
+ "SahiMixture_Bicano",
1147
+ "Salt",
1148
+ "Sams",
1149
+ "Sanchi",
1150
+ "Sapphire",
1151
+ "SaraShree",
1152
+ "SaunfPowder_Rehmat",
1153
+ "Saunf_Everest",
1154
+ "Savera",
1155
+ "Savlon",
1156
+ "Sayang",
1157
+ "Schweppes",
1158
+ "ScotchBrite",
1159
+ "ScotchBrite_Scrubber",
1160
+ "Seacod_CodLiverOilCapsules",
1161
+ "SeedsandNuts_Happilo",
1162
+ "SensitivePlus_Toothpaste_Colgate",
1163
+ "Sensitive_Toothpaste_Colgate",
1164
+ "Sensitivity-Gum_Sensodyne",
1165
+ "ShahiMix_Bikano",
1166
+ "Shampoo_Pantene",
1167
+ "Shampoo_Shanelle",
1168
+ "Shan",
1169
+ "Shan_TandooriMasala",
1170
+ "ShantaG",
1171
+ "Shareat",
1172
+ "ShavingFoam_ViJhon",
1173
+ "Shero_Vanesa",
1174
+ "Shot_Perfume_Layerr",
1175
+ "ShudhUrja",
1176
+ "Siddhayu",
1177
+ "SilverCoin",
1178
+ "Similac",
1179
+ "Similac_FollowUpFormula_Abbot",
1180
+ "Simple_GluconD",
1181
+ "SizzlinHot_Kurkure",
1182
+ "Skippi",
1183
+ "Skittles",
1184
+ "SlicedMushroom_Habit",
1185
+ "SmartSecret_Fragrance_SummerSpring",
1186
+ "SmithandJones",
1187
+ "SnacTac",
1188
+ "Snackible",
1189
+ "Snacks_Ajwah",
1190
+ "Snacks_Kuch-kuch_Bicano",
1191
+ "Snacks_SnacLite_Haldirams",
1192
+ "Snacks_ZigZag",
1193
+ "Snactac",
1194
+ "Snapin",
1195
+ "Snicker",
1196
+ "Snickers",
1197
+ "Snug",
1198
+ "SoYum",
1199
+ "Soap",
1200
+ "SoapNo1_Godrej",
1201
+ "Soap_Aloevera_Dettol",
1202
+ "Soap_Camay",
1203
+ "Soap_Dettol cool",
1204
+ "Soap_Fresh",
1205
+ "Soap_IcyCool_Dettol",
1206
+ "Soap_Jasmine_Lux",
1207
+ "Soap_Liril",
1208
+ "Soap_Neem_Dyna",
1209
+ "Soap_Original_Dettol",
1210
+ "Soap_Rose_Lux",
1211
+ "Soap_Sandal_Dyna",
1212
+ "Soap_fena",
1213
+ "Society",
1214
+ "SocietyTea",
1215
+ "Sofit",
1216
+ "SoftDrink",
1217
+ "SoftDrink_Rasna",
1218
+ "SoftDrink_cola_Campa",
1219
+ "SoftDrink_sprite_Campa",
1220
+ "Softdrink_thumbsup",
1221
+ "Softouch",
1222
+ "Soni Fresh",
1223
+ "SoyaChunks_Ei8amhit",
1224
+ "SoyaSticks-s_Bikaji",
1225
+ "SoyaSticks_Bhujialalji",
1226
+ "SoyaSticks_Jabsons",
1227
+ "Soya_Sauce_Nilons",
1228
+ "Spice_Everest",
1229
+ "Spices_24MantraOrganic",
1230
+ "Spices_Catch",
1231
+ "Spices_FineLife",
1232
+ "Spices_MDH",
1233
+ "Spices_ProNature",
1234
+ "SportingClub_Denver",
1235
+ "Spreads_Hershey-s",
1236
+ "SpringBlossom_perfume_YardleyLondon",
1237
+ "SriSri",
1238
+ "StarBlossom_Perfume_YardleyLondon",
1239
+ "StarFlowerrazi_Perfume_Yardley",
1240
+ "Stayfree",
1241
+ "Sting",
1242
+ "Storia",
1243
+ "StrawberryFlovour_ToothpasteForKids_DentoShine",
1244
+ "Streax",
1245
+ "String",
1246
+ "StrongTeeth_Toothpaste_Colgate",
1247
+ "StuffedChilliPickle_Tops",
1248
+ "SubhKamal",
1249
+ "Suffola",
1250
+ "Sugar",
1251
+ "SugarFree",
1252
+ "SugarFreeCookies_Gaia",
1253
+ "SugarFree_BiskFarm",
1254
+ "SugarLite",
1255
+ "Suhana",
1256
+ "Sultan_Scent_Fogg",
1257
+ "Sundrop",
1258
+ "Sunfeast",
1259
+ "Sunfeast Glucose plus",
1260
+ "SunflowerOil_Cargill",
1261
+ "SunflowerSeed_Ajwah",
1262
+ "SunflowerSeeds_King",
1263
+ "Sunrich",
1264
+ "Sunscream_Lakme",
1265
+ "Sunsilk",
1266
+ "SuperSarvottam",
1267
+ "SurfExcel",
1268
+ "SurfaceDisinfectantSpray_Savlon",
1269
+ "Svami",
1270
+ "Swach",
1271
+ "Swadist",
1272
+ "Swaminarayan",
1273
+ "SweetCorn_Snacko",
1274
+ "SwwetCorn_Sundrop",
1275
+ "SyntheticClearVinegar_Solar",
1276
+ "T-Boost_TrueBasics",
1277
+ "TaaliPeanuts",
1278
+ "Taaza",
1279
+ "TabelSalt_catch",
1280
+ "Tadaa",
1281
+ "TajMahal",
1282
+ "Talati",
1283
+ "Talod",
1284
+ "Tamarind",
1285
+ "Tamarind_Dishwash_Prill",
1286
+ "Tamarind_Priya",
1287
+ "Tang",
1288
+ "TastyNutes_Bikano",
1289
+ "TastyNutes_FirstCrop",
1290
+ "Tata",
1291
+ "TataGold",
1292
+ "TataPremium",
1293
+ "TataTea taaza",
1294
+ "Tata_Agni",
1295
+ "TeaCountry",
1296
+ "TeaTime",
1297
+ "TeaValley",
1298
+ "Tea_3Roses_BrookeBond",
1299
+ "Tea_ElaichiChai_Shera",
1300
+ "Tea_Mayur",
1301
+ "Tea_Ruby_BrookeBond",
1302
+ "Tea_Shera",
1303
+ "Tea_royal",
1304
+ "Tealeaves_DoubleDiamond",
1305
+ "TedheMedhe_Bingo_AlooBhuja",
1306
+ "TedheMedhe_Bingo_PulseMix",
1307
+ "TeekhaMeetha_Bhujialalji",
1308
+ "TeekhaMeetha_FirstCrop",
1309
+ "Tetley",
1310
+ "TheBakersDozen",
1311
+ "TicTac",
1312
+ "Tide",
1313
+ "Tiffany",
1314
+ "Tiger",
1315
+ "Timios",
1316
+ "Tingle",
1317
+ "Toblerone",
1318
+ "Toffee_Ajwah",
1319
+ "ToiletCleaner_Expelz",
1320
+ "TomatoChilliSauce_Maggi",
1321
+ "TomatoDiscs_Peppy",
1322
+ "TomatoKetchup",
1323
+ "TomatoPuree_GodenCrown",
1324
+ "TomatoPuree_Kissan",
1325
+ "TomatoSoup_Bambino",
1326
+ "TongGarden",
1327
+ "TooYumm",
1328
+ "ToothBrush",
1329
+ "ToothPaste",
1330
+ "ToothPaste_CloseUp_ExtremeCool",
1331
+ "ToothPaste_PepsodentG",
1332
+ "ToothPowder_Colgate",
1333
+ "Toothbrush_Sensodyne",
1334
+ "ToothpasteForLittleOnes_DentoShine",
1335
+ "Toothpaste_Meswak",
1336
+ "Toothpaste_Meswak_Dabur",
1337
+ "Toothpaste_RedGel_Dabur",
1338
+ "Toothpaste_Red_BAEFresh_Dabur",
1339
+ "Toothpaste_tulsi_Dabur",
1340
+ "TopBiscuit_Parle",
1341
+ "TopNTown",
1342
+ "TopNut",
1343
+ "TopRamen",
1344
+ "Top_Biscuit_Parle",
1345
+ "Top_Ramen",
1346
+ "Tops",
1347
+ "Tops_DrinkingChocolate",
1348
+ "Total12_Toothpaste_Colgate",
1349
+ "TragacanthGum_Ajwah",
1350
+ "Trail_Mix_DryFruit",
1351
+ "Trapa",
1352
+ "Treat_BasmatiRice",
1353
+ "Trelish",
1354
+ "Tresemme",
1355
+ "Trident",
1356
+ "TriphalaChurna_Baidyanath",
1357
+ "Tropicana",
1358
+ "Truffles_Joyland",
1359
+ "Trump_Scent_Fogg",
1360
+ "TulsiGingerTurmeric_OrganicIndia",
1361
+ "TunaNaturalOil_GoldenCrown",
1362
+ "TunaOil_GoldenCrown",
1363
+ "TurkishApricot_Happilo",
1364
+ "TurmericPowder_Kanwal",
1365
+ "TurmericPowder_Rehmat",
1366
+ "Twinings",
1367
+ "TwistiesNamkeen_FirstCrop",
1368
+ "Twix",
1369
+ "Tycoon_Scent_Fogg",
1370
+ "Ujala",
1371
+ "UltraPowerBalm_Zandu",
1372
+ "UltraSensual_Deodrant_WildStone",
1373
+ "UncleChips",
1374
+ "Unibic",
1375
+ "Unibic_CashewBadam_Cookies",
1376
+ "Upma_Bambino",
1377
+ "Upma_MTR",
1378
+ "Utsav",
1379
+ "VJohn",
1380
+ "VWash",
1381
+ "Vahdam",
1382
+ "VajradantiSF_Toothpaste_Vicco",
1383
+ "Vanish",
1384
+ "Vaseline",
1385
+ "Vatika",
1386
+ "VedShakti_Toothpaste_Colgate",
1387
+ "Veeba",
1388
+ "VegBiryani_Gits",
1389
+ "VegMayonnaise_Imli_Tree",
1390
+ "VegMayonnaise_Veeba",
1391
+ "VeganMayo_HellManns",
1392
+ "Veggi_CupNoodles_Manchow",
1393
+ "VeggieManchow_CupNoodles",
1394
+ "Vermicelli",
1395
+ "Vicks",
1396
+ "Vicks_VapoRub_SteamPads",
1397
+ "Vidal",
1398
+ "Vim",
1399
+ "Vim Anti bac",
1400
+ "Vinegar",
1401
+ "Vinegar_Everest",
1402
+ "VisibleWhite_Toothpaste_Colgate",
1403
+ "VitaminCapsules_Seacod",
1404
+ "Voilet-Raspberry_Perfume_YardleyLondon",
1405
+ "Voyage_Perfume_ParkAvenue",
1406
+ "WSQ_VICCO",
1407
+ "WaferRoll_Champion",
1408
+ "Wafer_Orange_FullBloom",
1409
+ "Wafer_Strawberry_FullBloom",
1410
+ "Wafers_Gastone_Lago",
1411
+ "Wafers_Tirameso_Creweto",
1412
+ "Wafers_Treat_Britannia",
1413
+ "WaffleBites_Craveto",
1414
+ "Waffy_Parle",
1415
+ "WaghBakri",
1416
+ "WaiWai",
1417
+ "Walnut",
1418
+ "WalnutKernels_Kohinoor",
1419
+ "Walnut_Nutraj",
1420
+ "Water",
1421
+ "WatermelonFlovour_ToothpasteForKids_DentoShine",
1422
+ "WeikField",
1423
+ "Weikfield",
1424
+ "WellnessCollection_TGLCo.",
1425
+ "Wheel",
1426
+ "Whisper",
1427
+ "WhiteBread_Dewz",
1428
+ "WhiteCurrents_St.John",
1429
+ "WhiteningSmoothSkin_Deodrant_Nivea",
1430
+ "Whitening_Sensodyne",
1431
+ "Wilkinson_Sword_Gillette",
1432
+ "Win2",
1433
+ "Winegreens",
1434
+ "Winkies",
1435
+ "Women-sPlus_Horlicks",
1436
+ "WomensPlus",
1437
+ "WottaGirl_Perfume_Layerr",
1438
+ "Yardley",
1439
+ "Yeah",
1440
+ "Yeast",
1441
+ "YellowBananaChips_Haldirams",
1442
+ "YellowBlossom_Spinz",
1443
+ "YogaBar",
1444
+ "Yogurt",
1445
+ "Yummies",
1446
+ "Yummiez",
1447
+ "Yummy",
1448
+ "Zandu",
1449
+ "ZanduBalm",
1450
+ "Zenzi",
1451
+ "Zouk_BodySpray_ParkAvenue",
1452
+ "ZzzQuil_Natura",
1453
+ "aachar-sethi",
1454
+ "aashirvaad milk",
1455
+ "adzanarice_charminar",
1456
+ "agra petha",
1457
+ "aircleaner_airwick",
1458
+ "aircleaner_campure",
1459
+ "airfreshber_koparo",
1460
+ "airfreshner_hifresh",
1461
+ "airfreshner_lavender",
1462
+ "airfreshner_wonderfresh",
1463
+ "almondmilk_at",
1464
+ "amul cow milk",
1465
+ "amul masti dahi",
1466
+ "amul moti milk",
1467
+ "amulcow milk",
1468
+ "amulmilk",
1469
+ "amulmithai mate",
1470
+ "amulspray infant milk food",
1471
+ "ananda ghee",
1472
+ "apple_fishermanfriend",
1473
+ "appy",
1474
+ "aquafly_water",
1475
+ "asian cosmos",
1476
+ "axe",
1477
+ "bagrry-s corn flakes",
1478
+ "bajaj hair oil",
1479
+ "bajaj majesty RX11",
1480
+ "banana fryums",
1481
+ "bansal tableware",
1482
+ "batook",
1483
+ "bc bikaneri rasgulla",
1484
+ "bc_rasgulla_ bikaneri",
1485
+ "beer_oldammaiga",
1486
+ "besen",
1487
+ "betty crocker pancske mix",
1488
+ "bicano kaju mixture",
1489
+ "bikaneri bhujia first crop",
1490
+ "bikaneri_bhujia_firstcrop",
1491
+ "bikano bhelpuri mixture",
1492
+ "bikano time mixture",
1493
+ "biscuit_bourbon",
1494
+ "biscuit_deluxe",
1495
+ "biscuit_sugarcracker",
1496
+ "bisk farn googly",
1497
+ "biskfarm eat fit",
1498
+ "biskfarm top herbs",
1499
+ "black salt zoff",
1500
+ "borges",
1501
+ "boroplus_bodylotion",
1502
+ "britannia cow ghee",
1503
+ "britannia the laughiung cow",
1504
+ "britannia tiger crunch",
1505
+ "britannia50 50 time pass",
1506
+ "britanniacheese slicess",
1507
+ "brown - Polson custard powder",
1508
+ "camembert",
1509
+ "campoor",
1510
+ "candy_kisses",
1511
+ "candy_m-m",
1512
+ "candy_nerds",
1513
+ "candy_ricola",
1514
+ "catch black papper",
1515
+ "catch red chilli powder",
1516
+ "catch_black_papper",
1517
+ "cello",
1518
+ "chabaa_redgrape",
1519
+ "chabaa_whitegrapefruit",
1520
+ "chana dal",
1521
+ "cheese-corn bite",
1522
+ "cheese_soignon",
1523
+ "chile",
1524
+ "chings soy sauce",
1525
+ "chocalateHorlicks",
1526
+ "choclate-gaia",
1527
+ "choclate_ambriona",
1528
+ "choclate_epigamia",
1529
+ "choclate_lindberg",
1530
+ "choclate_loacker",
1531
+ "choclate_melt",
1532
+ "classic bread",
1533
+ "cobra",
1534
+ "coconut milk_uglobe",
1535
+ "cod Liver oil capsules",
1536
+ "colgate max",
1537
+ "conditioner natural",
1538
+ "cookie_merba",
1539
+ "cool freshness",
1540
+ "cows_ghee",
1541
+ "cream-vlcc",
1542
+ "crop oats Yummy protein-rich oats",
1543
+ "cup noodles italiano",
1544
+ "dabar anmol hair oil",
1545
+ "dabar champrash",
1546
+ "dabar gulabari moisturizing",
1547
+ "dabur babool",
1548
+ "dahi_at",
1549
+ "dalda",
1550
+ "danaram",
1551
+ "dantkanti",
1552
+ "dark fantasy sandwich cream",
1553
+ "delisoga",
1554
+ "double diamond tea",
1555
+ "dove daily shine shampoo",
1556
+ "dove intense repair shampoo",
1557
+ "dozo power wash",
1558
+ "drink_toran",
1559
+ "drink_torani",
1560
+ "drools",
1561
+ "dyna neem soap",
1562
+ "elmac lemon",
1563
+ "emami 7oils",
1564
+ "expelz ultra clean",
1565
+ "expelz_ultraclean",
1566
+ "facewash_beardo",
1567
+ "facewash_coffee",
1568
+ "facewash_wow",
1569
+ "fair and handsome",
1570
+ "fanta",
1571
+ "femina casserole",
1572
+ "figo detergent",
1573
+ "first crop all in one mixture",
1574
+ "first crop aloo bhujia",
1575
+ "first crop besan",
1576
+ "first crop butter cookies",
1577
+ "first crop cream onion potato chips",
1578
+ "first crop dry fruit",
1579
+ "first crop instant noodle",
1580
+ "first crop navratan",
1581
+ "first crop oats",
1582
+ "first crop peanut butter",
1583
+ "first crop tasty peanut",
1584
+ "first crop tastypeanuts",
1585
+ "first crop zahidi dates",
1586
+ "fist crop navratan",
1587
+ "fist crop oats",
1588
+ "fortune indori poha",
1589
+ "fresca_greenapple",
1590
+ "freshNatural_Deodrant_Nivea",
1591
+ "full bloom",
1592
+ "full bloom classic tea",
1593
+ "full bloom ketchup",
1594
+ "full bloom strawberry",
1595
+ "gaurianjeer",
1596
+ "ghee_Patanjali",
1597
+ "ghee_milkfood",
1598
+ "ginger nuts",
1599
+ "gits rabdi",
1600
+ "glow-lovely_cream",
1601
+ "glow-lovely_winter brightcream",
1602
+ "gluta-hya_Vaseline",
1603
+ "go cheese processed",
1604
+ "go pizza cheese",
1605
+ "godrej yummiez",
1606
+ "godrejrich creme",
1607
+ "goldengate roasted almonds",
1608
+ "good life chana dal",
1609
+ "good life jeera",
1610
+ "greenmint_Mala-s",
1611
+ "harpic",
1612
+ "hathi brand mustard oil",
1613
+ "hawkins",
1614
+ "head and shoulder 2 in 1",
1615
+ "highonCranberry_Beer_Coolberg",
1616
+ "himalaya complete care",
1617
+ "himalaya neem face wash",
1618
+ "himalaya shampoo",
1619
+ "himalayan body lotion",
1620
+ "hotdog",
1621
+ "icecream_creambell",
1622
+ "imli treengreen chilli pickle",
1623
+ "inchi",
1624
+ "independence",
1625
+ "independence biryani special",
1626
+ "independence dal",
1627
+ "india gate jeera rice",
1628
+ "indica easy hair colour",
1629
+ "intense engage",
1630
+ "jira_goodlife",
1631
+ "johnson",
1632
+ "joyo clean max",
1633
+ "juice_edwesis",
1634
+ "juice_sante",
1635
+ "junior horlicks",
1636
+ "kachi ghani mustard oil first crop",
1637
+ "kakaji",
1638
+ "kelloggy-s corn flakes real honey",
1639
+ "kellogs corn flakes",
1640
+ "keya all purpose",
1641
+ "keya piri piri",
1642
+ "kissan mixed fruit jam",
1643
+ "kissan peanut butter",
1644
+ "knorr chicken soup",
1645
+ "krack track",
1646
+ "light frydrate_Vaseline",
1647
+ "livon",
1648
+ "loreal colour protect shampoo",
1649
+ "lotus moisturiser",
1650
+ "lotus whiteglow",
1651
+ "making corn",
1652
+ "mala jamun jam",
1653
+ "malist cheese",
1654
+ "mama earth baby body wash",
1655
+ "mamaearth baby body wash",
1656
+ "manish",
1657
+ "manypoko pants",
1658
+ "masala chai",
1659
+ "masala_sbm",
1660
+ "masti oye",
1661
+ "max protein daily",
1662
+ "maza",
1663
+ "mdh amchur powder",
1664
+ "mdh hing",
1665
+ "mdh jeera powder",
1666
+ "mdh kashmiri mirch",
1667
+ "mdh lal mirch",
1668
+ "melody",
1669
+ "men turbobright_Garnier",
1670
+ "meusli",
1671
+ "mevities ginger nuts",
1672
+ "milton",
1673
+ "minakshi ghee",
1674
+ "miranda",
1675
+ "mixture_indiaah",
1676
+ "mohan red tea",
1677
+ "mosquito repllent_baygon",
1678
+ "mosquito repllent_bbhome",
1679
+ "mosquito repllent_mamaearth",
1680
+ "mosquito repllents_bodyguard",
1681
+ "mosquito repllents_campure",
1682
+ "mother ginger pickle",
1683
+ "mother recipe garlic pickle",
1684
+ "mother teekha meeth",
1685
+ "mothers upma",
1686
+ "mothers_upma",
1687
+ "mstick",
1688
+ "munch the cashew",
1689
+ "mushroom Soup_Knorr",
1690
+ "my home",
1691
+ "namaste india desi ghee",
1692
+ "napthaleneballs_ultra",
1693
+ "natural amla shikakai shampoo",
1694
+ "nescafe classic blast roast",
1695
+ "nestle lactogen pro",
1696
+ "nestle nan pro",
1697
+ "nimbu hi nimbu clanser",
1698
+ "nivea men deep impact",
1699
+ "noodles_MastiOye",
1700
+ "nutralite sampriti ghee",
1701
+ "oil_iris",
1702
+ "organic suji",
1703
+ "osam dahi",
1704
+ "osam plain dahi",
1705
+ "osam plaindahi",
1706
+ "outshuine",
1707
+ "pack_chabaa_whitegrapefruit",
1708
+ "paint_asianpaint",
1709
+ "paneer butterMasala_CuppaNoodles_Maggi",
1710
+ "pantene advance hairfall solution shampoo",
1711
+ "paraclute body lotion",
1712
+ "parle jaggery",
1713
+ "parle occasions",
1714
+ "parth jaggery",
1715
+ "patanjali saundarya face wash",
1716
+ "peanut_Bicano",
1717
+ "pedigree",
1718
+ "phenyl_ambetol",
1719
+ "poloqueen healthy",
1720
+ "poloqueen jasmine",
1721
+ "poloqueen lemon",
1722
+ "poltcab",
1723
+ "potato fryums",
1724
+ "potatochips_kakaji",
1725
+ "prabhaji jhakaas mix",
1726
+ "prabhaji moong daal",
1727
+ "prabhaji all in one",
1728
+ "prabhaji bhujia",
1729
+ "prabhaji chat pata",
1730
+ "prabhaji khata meetha",
1731
+ "prabhji khatta meetha",
1732
+ "prabhuji moong dal -",
1733
+ "priemer_terminator",
1734
+ "priya gold CNC",
1735
+ "punch the healthy crunch chile",
1736
+ "punchthehealthy_crunch_chile",
1737
+ "pure burst cow ghee",
1738
+ "pureGhee_Amul",
1739
+ "quickwrapp",
1740
+ "real man",
1741
+ "redpaste",
1742
+ "rite bite nuts-seeds",
1743
+ "riya bindas",
1744
+ "riya hum tum",
1745
+ "roasted vermicelli",
1746
+ "roomfreshner_ambipur",
1747
+ "ruchi kheer mix",
1748
+ "ruchiayuna jaggery",
1749
+ "saffalo masala oats",
1750
+ "saffalo oats",
1751
+ "saffola mustard oil",
1752
+ "sauce_schezwan",
1753
+ "savlon deep clean",
1754
+ "sawan",
1755
+ "shahi pariwar mustard oil",
1756
+ "shampoo_Natural",
1757
+ "shaving_foam_gillete",
1758
+ "snac tac chana",
1759
+ "snac tac hot -sour soup",
1760
+ "snac tac moong dal",
1761
+ "snac tac navratan mix",
1762
+ "snac tac tomato ketchup",
1763
+ "snac tac tomato soup",
1764
+ "snacks_jabsons",
1765
+ "snowhite detergent powder",
1766
+ "soap-protam",
1767
+ "soap-santoor",
1768
+ "soap-savloon",
1769
+ "soap_ghari",
1770
+ "soap_nip",
1771
+ "soap_no.1",
1772
+ "softdrink_arupe",
1773
+ "softdrink_edwises",
1774
+ "softdrink_evocus",
1775
+ "softdrink_raze",
1776
+ "softdrink_v8",
1777
+ "sofy anti bacteria",
1778
+ "soya chunks",
1779
+ "soya sauce_sam",
1780
+ "spice chicken_CuppaNoodles_Maggi",
1781
+ "spout",
1782
+ "spray_moov",
1783
+ "sudha lassi",
1784
+ "sudha milk",
1785
+ "sudha peda",
1786
+ "sundrop oil",
1787
+ "sunfeast dream",
1788
+ "sweet bliss peanut chikki",
1789
+ "sweet corn chicken soup_Knorr",
1790
+ "sweet_marshmelts",
1791
+ "tata samparn hing",
1792
+ "tea-tgl",
1793
+ "the man companyaloevera",
1794
+ "tomato fryums",
1795
+ "top Ramen masala",
1796
+ "top herbs",
1797
+ "top ramen curry",
1798
+ "towels_blooms",
1799
+ "troplcana",
1800
+ "tsauce_ops",
1801
+ "uncl",
1802
+ "vaseline deep moisture",
1803
+ "vasmol kesh kala",
1804
+ "veg cheese finger",
1805
+ "vicco",
1806
+ "vim Maha bar",
1807
+ "vintagecheddar_wyke",
1808
+ "vlcc charco",
1809
+ "vlcc eternal youth skin firming",
1810
+ "vlcc insta glow diamonfd bleach",
1811
+ "wafers-moneta",
1812
+ "wafers_unibic",
1813
+ "wal wal takka",
1814
+ "weikfield",
1815
+ "whippingcream",
1816
+ "whiskas",
1817
+ "white hansha jasmine fragrance",
1818
+ "white hansha rose fragrance",
1819
+ "white hansha sanitary",
1820
+ "whiteningSensitive_Deodrant_Nivea",
1821
+ "wiekfeild custard powder",
1822
+ "wildstone code",
1823
+ "woodpriemer_terminator",
1824
+ "yardley talc",
1825
+ "yardleys after shave lotionn",
1826
+ "yippee",
1827
+ "zoff",
1828
+ "Kitchen",
1829
+ "MixedPickle_Alps",
1830
+ "jam",
1831
+ "oil",
1832
+ "undefined"
1833
+ ]
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  gradio==6.16.0
2
- llama-cpp-python==0.3.9
3
  onnxruntime==1.21.0
4
  Pillow==11.2.1
5
  PyMuPDF==1.25.5
 
1
  gradio==6.16.0
2
+ llama-cpp-python==0.3.28
3
  onnxruntime==1.21.0
4
  Pillow==11.2.1
5
  PyMuPDF==1.25.5
tracer.py CHANGED
@@ -14,7 +14,7 @@ if TYPE_CHECKING:
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
- _HF_DATASET_REPO = "naazimsnh02/kirana-detective-traces"
18
  MAX_RETRIES = 3
19
  BACKOFF_BASE_SECONDS = 2 # sleeps 2s, 4s, 8s on successive failures
20
 
 
14
 
15
  logger = logging.getLogger(__name__)
16
 
17
+ _HF_DATASET_REPO = "build-small-hackathon/kirana-detective-traces"
18
  MAX_RETRIES = 3
19
  BACKOFF_BASE_SECONDS = 2 # sleeps 2s, 4s, 8s on successive failures
20