naazimsnh02 committed on
Commit
131f661
·
1 Parent(s): e4a9cee

Yolo model fixes

Browse files
agents/visual_counter.py CHANGED
@@ -62,36 +62,55 @@ class VisualCounterAgent:
62
  def _preprocess(self, image_path: str) -> Tuple[np.ndarray, Tuple[int, int]]:
63
  img = Image.open(image_path).convert("RGB")
64
  orig_shape = (img.height, img.width)
65
- img = img.resize((INPUT_SIZE, INPUT_SIZE), Image.BILINEAR)
66
- arr = np.array(img, dtype=np.float32) / 255.0 # [H, W, C]
67
- arr = arr.transpose(2, 0, 1) # [C, H, W]
68
- arr = np.expand_dims(arr, axis=0) # [1, C, H, W]
 
 
 
 
 
 
 
 
69
  return arr, orig_shape
70
 
71
  def _postprocess(
72
  self, raw_output: np.ndarray, orig_shape: Tuple[int, int]
73
  ) -> Dict[str, int]:
74
- """
75
- raw_output shape: [1, num_preds, 4+1+num_classes] (YOLOv8 ONNX format)
76
- or [1, 4+num_classes, num_preds] (older export layout).
77
- We handle both by checking shape.
 
 
78
  """
79
  out = raw_output[0] # remove batch dim
80
 
81
- # Normalise to [num_preds, 4+1+num_classes]
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  if out.shape[0] < out.shape[1]:
83
- # shape is [channels, num_preds] → transpose
84
  out = out.T
85
 
86
  num_classes = len(self._class_names)
87
- # boxes (cx, cy, w, h), objectness, class scores
88
  boxes_raw = out[:, :4]
89
- # YOLOv8 exports no explicit objectness; class scores are out[:,4:]
90
  class_scores = out[:, 4:4 + num_classes]
91
  scores = class_scores.max(axis=1)
92
  class_ids = class_scores.argmax(axis=1)
93
 
94
- # Filter by confidence
95
  mask = scores > CONF_THRESHOLD
96
  if not mask.any():
97
  return {}
@@ -109,7 +128,7 @@ class VisualCounterAgent:
109
 
110
  kept = _nms(boxes, scores, IOU_THRESHOLD)
111
 
112
- counts: Dict[str, int] = {}
113
  for idx in kept:
114
  cid = int(class_ids[idx])
115
  if 0 <= cid < len(self._class_names):
 
62
  def _preprocess(self, image_path: str) -> Tuple[np.ndarray, Tuple[int, int]]:
63
  img = Image.open(image_path).convert("RGB")
64
  orig_shape = (img.height, img.width)
65
+ # Letterbox (aspect-preserving resize + grey pad) — the format Ultralytics
66
+ # YOLO is trained/exported with. A plain squash-resize distorts objects and
67
+ # noticeably lowers detection confidence.
68
+ w, h = img.size
69
+ scale = min(INPUT_SIZE / w, INPUT_SIZE / h)
70
+ nw, nh = int(round(w * scale)), int(round(h * scale))
71
+ resized = img.resize((nw, nh), Image.BILINEAR)
72
+ canvas = Image.new("RGB", (INPUT_SIZE, INPUT_SIZE), (114, 114, 114))
73
+ canvas.paste(resized, ((INPUT_SIZE - nw) // 2, (INPUT_SIZE - nh) // 2))
74
+ arr = np.array(canvas, dtype=np.float32) / 255.0 # [H, W, C]
75
+ arr = arr.transpose(2, 0, 1) # [C, H, W]
76
+ arr = np.expand_dims(arr, axis=0) # [1, C, H, W]
77
  return arr, orig_shape
78
 
79
  def _postprocess(
80
  self, raw_output: np.ndarray, orig_shape: Tuple[int, int]
81
  ) -> Dict[str, int]:
82
+ """Tally class detections from the YOLO ONNX output.
83
+
84
+ This export uses YOLO's end-to-end (built-in NMS) format:
85
+ [1, num_preds, 6] where each row is (x1, y1, x2, y2, confidence, class_id)
86
+ We also keep a fallback for the raw YOLOv8 head
87
+ [1, num_preds, 4+num_classes] / [1, 4+num_classes, num_preds].
88
  """
89
  out = raw_output[0] # remove batch dim
90
 
91
+ # ── End-to-end NMS format: (x1, y1, x2, y2, confidence, class_id) ──
92
+ if out.ndim == 2 and out.shape[-1] == 6:
93
+ conf = out[:, 4]
94
+ class_ids = out[:, 5].astype(int)
95
+ mask = conf > CONF_THRESHOLD
96
+ counts: Dict[str, int] = {}
97
+ for cid in class_ids[mask]:
98
+ if 0 <= cid < len(self._class_names):
99
+ name = self._class_names[cid]
100
+ counts[name] = counts.get(name, 0) + 1
101
+ return counts
102
+
103
+ # ── Fallback: raw YOLOv8 head with per-class scores ──
104
+ # Normalise to [num_preds, 4+num_classes]
105
  if out.shape[0] < out.shape[1]:
 
106
  out = out.T
107
 
108
  num_classes = len(self._class_names)
 
109
  boxes_raw = out[:, :4]
 
110
  class_scores = out[:, 4:4 + num_classes]
111
  scores = class_scores.max(axis=1)
112
  class_ids = class_scores.argmax(axis=1)
113
 
 
114
  mask = scores > CONF_THRESHOLD
115
  if not mask.any():
116
  return {}
 
128
 
129
  kept = _nms(boxes, scores, IOU_THRESHOLD)
130
 
131
+ counts = {}
132
  for idx in kept:
133
  cid = int(class_ids[idx])
134
  if 0 <= cid < len(self._class_names):
data/fmcg_catalog.json CHANGED
@@ -28,7 +28,8 @@
28
  "PG 250",
29
  "P-G 250GM",
30
  "PARLE G 250",
31
- "ParleG_Biscuit_Parle"
 
32
  ]
33
  },
34
  {
@@ -1802,7 +1803,8 @@
1802
  "GDJ NO.1 100G",
1803
  "GODREJ 1 SOAP 100",
1804
  "GDJ NO 1 100",
1805
- "soap_no.1"
 
1806
  ]
1807
  },
1808
  {
@@ -2756,7 +2758,8 @@
2756
  "NESCAFE CLASSIC 100G",
2757
  "NSCFE CLS 100G",
2758
  "NESCAFE INST 100",
2759
- "Nescafe_Classic_Coffee"
 
2760
  ]
2761
  },
2762
  {
 
28
  "PG 250",
29
  "P-G 250GM",
30
  "PARLE G 250",
31
+ "ParleG_Biscuit_Parle",
32
+ "ParleG"
33
  ]
34
  },
35
  {
 
1803
  "GDJ NO.1 100G",
1804
  "GODREJ 1 SOAP 100",
1805
  "GDJ NO 1 100",
1806
+ "soap_no.1",
1807
+ "SoapNo1_Godrej"
1808
  ]
1809
  },
1810
  {
 
2758
  "NESCAFE CLASSIC 100G",
2759
  "NSCFE CLS 100G",
2760
  "NESCAFE INST 100",
2761
+ "Nescafe_Classic_Coffee",
2762
+ "Nescafe"
2763
  ]
2764
  },
2765
  {
finetune/add_yolo_aliases.py CHANGED
@@ -29,12 +29,17 @@ CATALOG = Path(__file__).parent.parent / "data" / "fmcg_catalog.json"
29
  THRESHOLD = 0.34 # min token-Jaccard to auto-bridge a non-demo class
30
 
31
  # Demo items pinned to exact catalog sizes so the sample invoice reconciles.
 
 
32
  DEMO_MAP = {
33
  "ParleG_Biscuit_Parle": "parle_g_250g",
 
34
  "Amul Butter": "amul_butter_500g",
35
  "Parachute coconut Oil": "parachute_oil_500ml",
36
  "Soap_Original_Dettol": "dettol_soap_125g",
37
  "Nescafe_Classic_Coffee": "nescafe_classic_100g",
 
 
38
  "Biscuit_Bourbon_Britannia": "bourbon_cream_150g",
39
  }
40
 
 
29
  THRESHOLD = 0.34 # min token-Jaccard to auto-bridge a non-demo class
30
 
31
  # Demo items pinned to exact catalog sizes so the sample invoice reconciles.
32
+ # Includes the generic class names the model actually emits in practice (verified
33
+ # from real detections) alongside the fine-grained variants — both point to one SKU.
34
  DEMO_MAP = {
35
  "ParleG_Biscuit_Parle": "parle_g_250g",
36
+ "ParleG": "parle_g_250g",
37
  "Amul Butter": "amul_butter_500g",
38
  "Parachute coconut Oil": "parachute_oil_500ml",
39
  "Soap_Original_Dettol": "dettol_soap_125g",
40
  "Nescafe_Classic_Coffee": "nescafe_classic_100g",
41
+ "Nescafe": "nescafe_classic_100g", # generic class the model emits
42
+ "SoapNo1_Godrej": "godrej_no1_soap_100g", # generic class the model emits
43
  "Biscuit_Bourbon_Britannia": "bourbon_cream_150g",
44
  }
45