Spaces:

build-small-hackathon
/

kirana-detective

Sleeping

App Files Community

naazimsnh02 committed on 9 days ago

Commit

131f661

1 Parent(s): e4a9cee

Yolo model fixes

Browse files

Files changed (3) hide show

agents/visual_counter.py +33 -14
data/fmcg_catalog.json +6 -3
finetune/add_yolo_aliases.py +5 -0

agents/visual_counter.py CHANGED Viewed

@@ -62,36 +62,55 @@ class VisualCounterAgent:
     def _preprocess(self, image_path: str) -> Tuple[np.ndarray, Tuple[int, int]]:
         img = Image.open(image_path).convert("RGB")
         orig_shape = (img.height, img.width)
-        img = img.resize((INPUT_SIZE, INPUT_SIZE), Image.BILINEAR)
-        arr = np.array(img, dtype=np.float32) / 255.0  # [H, W, C]
-        arr = arr.transpose(2, 0, 1)                   # [C, H, W]
-        arr = np.expand_dims(arr, axis=0)              # [1, C, H, W]
         return arr, orig_shape
     def _postprocess(
         self, raw_output: np.ndarray, orig_shape: Tuple[int, int]
     ) -> Dict[str, int]:
-        """
-        raw_output shape: [1, num_preds, 4+1+num_classes]  (YOLOv8 ONNX format)
-        or [1, 4+num_classes, num_preds] (older export layout).
-        We handle both by checking shape.
         """
         out = raw_output[0]  # remove batch dim
-        # Normalise to [num_preds, 4+1+num_classes]
         if out.shape[0] < out.shape[1]:
-            # shape is [channels, num_preds] → transpose
             out = out.T
         num_classes = len(self._class_names)
-        # boxes (cx, cy, w, h), objectness, class scores
         boxes_raw = out[:, :4]
-        # YOLOv8 exports no explicit objectness; class scores are out[:,4:]
         class_scores = out[:, 4:4 + num_classes]
         scores = class_scores.max(axis=1)
         class_ids = class_scores.argmax(axis=1)
-        # Filter by confidence
         mask = scores > CONF_THRESHOLD
         if not mask.any():
             return {}
@@ -109,7 +128,7 @@ class VisualCounterAgent:
         kept = _nms(boxes, scores, IOU_THRESHOLD)
-        counts: Dict[str, int] = {}
         for idx in kept:
             cid = int(class_ids[idx])
             if 0 <= cid < len(self._class_names):

     def _preprocess(self, image_path: str) -> Tuple[np.ndarray, Tuple[int, int]]:
         img = Image.open(image_path).convert("RGB")
         orig_shape = (img.height, img.width)
+        # Letterbox (aspect-preserving resize + grey pad) — the format Ultralytics
+        # YOLO is trained/exported with. A plain squash-resize distorts objects and
+        # noticeably lowers detection confidence.
+        w, h = img.size
+        scale = min(INPUT_SIZE / w, INPUT_SIZE / h)
+        nw, nh = int(round(w * scale)), int(round(h * scale))
+        resized = img.resize((nw, nh), Image.BILINEAR)
+        canvas = Image.new("RGB", (INPUT_SIZE, INPUT_SIZE), (114, 114, 114))
+        canvas.paste(resized, ((INPUT_SIZE - nw) // 2, (INPUT_SIZE - nh) // 2))
+        arr = np.array(canvas, dtype=np.float32) / 255.0  # [H, W, C]
+        arr = arr.transpose(2, 0, 1)                      # [C, H, W]
+        arr = np.expand_dims(arr, axis=0)                 # [1, C, H, W]
         return arr, orig_shape
     def _postprocess(
         self, raw_output: np.ndarray, orig_shape: Tuple[int, int]
     ) -> Dict[str, int]:
+        """Tally class detections from the YOLO ONNX output.
+        This export uses YOLO's end-to-end (built-in NMS) format:
+            [1, num_preds, 6]  where each row is (x1, y1, x2, y2, confidence, class_id)
+        We also keep a fallback for the raw YOLOv8 head
+            [1, num_preds, 4+num_classes]  /  [1, 4+num_classes, num_preds].
         """
         out = raw_output[0]  # remove batch dim
+        # ── End-to-end NMS format: (x1, y1, x2, y2, confidence, class_id) ──
+        if out.ndim == 2 and out.shape[-1] == 6:
+            conf = out[:, 4]
+            class_ids = out[:, 5].astype(int)
+            mask = conf > CONF_THRESHOLD
+            counts: Dict[str, int] = {}
+            for cid in class_ids[mask]:
+                if 0 <= cid < len(self._class_names):
+                    name = self._class_names[cid]
+                    counts[name] = counts.get(name, 0) + 1
+            return counts
+        # ── Fallback: raw YOLOv8 head with per-class scores ──
+        # Normalise to [num_preds, 4+num_classes]
         if out.shape[0] < out.shape[1]:
             out = out.T
         num_classes = len(self._class_names)
         boxes_raw = out[:, :4]
         class_scores = out[:, 4:4 + num_classes]
         scores = class_scores.max(axis=1)
         class_ids = class_scores.argmax(axis=1)
         mask = scores > CONF_THRESHOLD
         if not mask.any():
             return {}
         kept = _nms(boxes, scores, IOU_THRESHOLD)
+        counts = {}
         for idx in kept:
             cid = int(class_ids[idx])
             if 0 <= cid < len(self._class_names):

data/fmcg_catalog.json CHANGED Viewed

@@ -28,7 +28,8 @@
       "PG 250",
       "P-G 250GM",
       "PARLE G 250",
-      "ParleG_Biscuit_Parle"
     ]
   },
   {
@@ -1802,7 +1803,8 @@
       "GDJ NO.1 100G",
       "GODREJ 1 SOAP 100",
       "GDJ NO 1 100",
-      "soap_no.1"
     ]
   },
   {
@@ -2756,7 +2758,8 @@
       "NESCAFE CLASSIC 100G",
       "NSCFE CLS 100G",
       "NESCAFE INST 100",
-      "Nescafe_Classic_Coffee"
     ]
   },
   {

       "PG 250",
       "P-G 250GM",
       "PARLE G 250",
+      "ParleG_Biscuit_Parle",
+      "ParleG"
     ]
   },
   {
       "GDJ NO.1 100G",
       "GODREJ 1 SOAP 100",
       "GDJ NO 1 100",
+      "soap_no.1",
+      "SoapNo1_Godrej"
     ]
   },
   {
       "NESCAFE CLASSIC 100G",
       "NSCFE CLS 100G",
       "NESCAFE INST 100",
+      "Nescafe_Classic_Coffee",
+      "Nescafe"
     ]
   },
   {

finetune/add_yolo_aliases.py CHANGED Viewed

@@ -29,12 +29,17 @@ CATALOG = Path(__file__).parent.parent / "data" / "fmcg_catalog.json"
 THRESHOLD = 0.34  # min token-Jaccard to auto-bridge a non-demo class
 # Demo items pinned to exact catalog sizes so the sample invoice reconciles.
 DEMO_MAP = {
     "ParleG_Biscuit_Parle": "parle_g_250g",
     "Amul Butter": "amul_butter_500g",
     "Parachute coconut Oil": "parachute_oil_500ml",
     "Soap_Original_Dettol": "dettol_soap_125g",
     "Nescafe_Classic_Coffee": "nescafe_classic_100g",
     "Biscuit_Bourbon_Britannia": "bourbon_cream_150g",
 }

 THRESHOLD = 0.34  # min token-Jaccard to auto-bridge a non-demo class
 # Demo items pinned to exact catalog sizes so the sample invoice reconciles.
+# Includes the generic class names the model actually emits in practice (verified
+# from real detections) alongside the fine-grained variants — both point to one SKU.
 DEMO_MAP = {
     "ParleG_Biscuit_Parle": "parle_g_250g",
+    "ParleG": "parle_g_250g",
     "Amul Butter": "amul_butter_500g",
     "Parachute coconut Oil": "parachute_oil_500ml",
     "Soap_Original_Dettol": "dettol_soap_125g",
     "Nescafe_Classic_Coffee": "nescafe_classic_100g",
+    "Nescafe": "nescafe_classic_100g",            # generic class the model emits
+    "SoapNo1_Godrej": "godrej_no1_soap_100g",     # generic class the model emits
     "Biscuit_Bourbon_Britannia": "bourbon_cream_150g",
 }