Commit ·
131f661
1
Parent(s): e4a9cee
Yolo model fixes
Browse files- agents/visual_counter.py +33 -14
- data/fmcg_catalog.json +6 -3
- finetune/add_yolo_aliases.py +5 -0
agents/visual_counter.py
CHANGED
|
@@ -62,36 +62,55 @@ class VisualCounterAgent:
|
|
| 62 |
def _preprocess(self, image_path: str) -> Tuple[np.ndarray, Tuple[int, int]]:
|
| 63 |
img = Image.open(image_path).convert("RGB")
|
| 64 |
orig_shape = (img.height, img.width)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
return arr, orig_shape
|
| 70 |
|
| 71 |
def _postprocess(
|
| 72 |
self, raw_output: np.ndarray, orig_shape: Tuple[int, int]
|
| 73 |
) -> Dict[str, int]:
|
| 74 |
-
"""
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
| 78 |
"""
|
| 79 |
out = raw_output[0] # remove batch dim
|
| 80 |
|
| 81 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
if out.shape[0] < out.shape[1]:
|
| 83 |
-
# shape is [channels, num_preds] — transpose
|
| 84 |
out = out.T
|
| 85 |
|
| 86 |
num_classes = len(self._class_names)
|
| 87 |
-
# boxes (cx, cy, w, h), objectness, class scores
|
| 88 |
boxes_raw = out[:, :4]
|
| 89 |
-
# YOLOv8 exports no explicit objectness; class scores are out[:,4:]
|
| 90 |
class_scores = out[:, 4:4 + num_classes]
|
| 91 |
scores = class_scores.max(axis=1)
|
| 92 |
class_ids = class_scores.argmax(axis=1)
|
| 93 |
|
| 94 |
-
# Filter by confidence
|
| 95 |
mask = scores > CONF_THRESHOLD
|
| 96 |
if not mask.any():
|
| 97 |
return {}
|
|
@@ -109,7 +128,7 @@ class VisualCounterAgent:
|
|
| 109 |
|
| 110 |
kept = _nms(boxes, scores, IOU_THRESHOLD)
|
| 111 |
|
| 112 |
-
counts
|
| 113 |
for idx in kept:
|
| 114 |
cid = int(class_ids[idx])
|
| 115 |
if 0 <= cid < len(self._class_names):
|
|
|
|
| 62 |
def _preprocess(self, image_path: str) -> Tuple[np.ndarray, Tuple[int, int]]:
|
| 63 |
img = Image.open(image_path).convert("RGB")
|
| 64 |
orig_shape = (img.height, img.width)
|
| 65 |
+
# Letterbox (aspect-preserving resize + grey pad) — the format Ultralytics
|
| 66 |
+
# YOLO is trained/exported with. A plain squash-resize distorts objects and
|
| 67 |
+
# noticeably lowers detection confidence.
|
| 68 |
+
w, h = img.size
|
| 69 |
+
scale = min(INPUT_SIZE / w, INPUT_SIZE / h)
|
| 70 |
+
nw, nh = int(round(w * scale)), int(round(h * scale))
|
| 71 |
+
resized = img.resize((nw, nh), Image.BILINEAR)
|
| 72 |
+
canvas = Image.new("RGB", (INPUT_SIZE, INPUT_SIZE), (114, 114, 114))
|
| 73 |
+
canvas.paste(resized, ((INPUT_SIZE - nw) // 2, (INPUT_SIZE - nh) // 2))
|
| 74 |
+
arr = np.array(canvas, dtype=np.float32) / 255.0 # [H, W, C]
|
| 75 |
+
arr = arr.transpose(2, 0, 1) # [C, H, W]
|
| 76 |
+
arr = np.expand_dims(arr, axis=0) # [1, C, H, W]
|
| 77 |
return arr, orig_shape
|
| 78 |
|
| 79 |
def _postprocess(
|
| 80 |
self, raw_output: np.ndarray, orig_shape: Tuple[int, int]
|
| 81 |
) -> Dict[str, int]:
|
| 82 |
+
"""Tally class detections from the YOLO ONNX output.
|
| 83 |
+
|
| 84 |
+
This export uses YOLO's end-to-end (built-in NMS) format:
|
| 85 |
+
[1, num_preds, 6] where each row is (x1, y1, x2, y2, confidence, class_id)
|
| 86 |
+
We also keep a fallback for the raw YOLOv8 head
|
| 87 |
+
[1, num_preds, 4+num_classes] / [1, 4+num_classes, num_preds].
|
| 88 |
"""
|
| 89 |
out = raw_output[0] # remove batch dim
|
| 90 |
|
| 91 |
+
# ── End-to-end NMS format: (x1, y1, x2, y2, confidence, class_id) ──
|
| 92 |
+
if out.ndim == 2 and out.shape[-1] == 6:
|
| 93 |
+
conf = out[:, 4]
|
| 94 |
+
class_ids = out[:, 5].astype(int)
|
| 95 |
+
mask = conf > CONF_THRESHOLD
|
| 96 |
+
counts: Dict[str, int] = {}
|
| 97 |
+
for cid in class_ids[mask]:
|
| 98 |
+
if 0 <= cid < len(self._class_names):
|
| 99 |
+
name = self._class_names[cid]
|
| 100 |
+
counts[name] = counts.get(name, 0) + 1
|
| 101 |
+
return counts
|
| 102 |
+
|
| 103 |
+
# ── Fallback: raw YOLOv8 head with per-class scores ──
|
| 104 |
+
# Normalise to [num_preds, 4+num_classes]
|
| 105 |
if out.shape[0] < out.shape[1]:
|
|
|
|
| 106 |
out = out.T
|
| 107 |
|
| 108 |
num_classes = len(self._class_names)
|
|
|
|
| 109 |
boxes_raw = out[:, :4]
|
|
|
|
| 110 |
class_scores = out[:, 4:4 + num_classes]
|
| 111 |
scores = class_scores.max(axis=1)
|
| 112 |
class_ids = class_scores.argmax(axis=1)
|
| 113 |
|
|
|
|
| 114 |
mask = scores > CONF_THRESHOLD
|
| 115 |
if not mask.any():
|
| 116 |
return {}
|
|
|
|
| 128 |
|
| 129 |
kept = _nms(boxes, scores, IOU_THRESHOLD)
|
| 130 |
|
| 131 |
+
counts = {}
|
| 132 |
for idx in kept:
|
| 133 |
cid = int(class_ids[idx])
|
| 134 |
if 0 <= cid < len(self._class_names):
|
data/fmcg_catalog.json
CHANGED
|
@@ -28,7 +28,8 @@
|
|
| 28 |
"PG 250",
|
| 29 |
"P-G 250GM",
|
| 30 |
"PARLE G 250",
|
| 31 |
-
"ParleG_Biscuit_Parle"
|
|
|
|
| 32 |
]
|
| 33 |
},
|
| 34 |
{
|
|
@@ -1802,7 +1803,8 @@
|
|
| 1802 |
"GDJ NO.1 100G",
|
| 1803 |
"GODREJ 1 SOAP 100",
|
| 1804 |
"GDJ NO 1 100",
|
| 1805 |
-
"soap_no.1"
|
|
|
|
| 1806 |
]
|
| 1807 |
},
|
| 1808 |
{
|
|
@@ -2756,7 +2758,8 @@
|
|
| 2756 |
"NESCAFE CLASSIC 100G",
|
| 2757 |
"NSCFE CLS 100G",
|
| 2758 |
"NESCAFE INST 100",
|
| 2759 |
-
"Nescafe_Classic_Coffee"
|
|
|
|
| 2760 |
]
|
| 2761 |
},
|
| 2762 |
{
|
|
|
|
| 28 |
"PG 250",
|
| 29 |
"P-G 250GM",
|
| 30 |
"PARLE G 250",
|
| 31 |
+
"ParleG_Biscuit_Parle",
|
| 32 |
+
"ParleG"
|
| 33 |
]
|
| 34 |
},
|
| 35 |
{
|
|
|
|
| 1803 |
"GDJ NO.1 100G",
|
| 1804 |
"GODREJ 1 SOAP 100",
|
| 1805 |
"GDJ NO 1 100",
|
| 1806 |
+
"soap_no.1",
|
| 1807 |
+
"SoapNo1_Godrej"
|
| 1808 |
]
|
| 1809 |
},
|
| 1810 |
{
|
|
|
|
| 2758 |
"NESCAFE CLASSIC 100G",
|
| 2759 |
"NSCFE CLS 100G",
|
| 2760 |
"NESCAFE INST 100",
|
| 2761 |
+
"Nescafe_Classic_Coffee",
|
| 2762 |
+
"Nescafe"
|
| 2763 |
]
|
| 2764 |
},
|
| 2765 |
{
|
finetune/add_yolo_aliases.py
CHANGED
|
@@ -29,12 +29,17 @@ CATALOG = Path(__file__).parent.parent / "data" / "fmcg_catalog.json"
|
|
| 29 |
THRESHOLD = 0.34 # min token-Jaccard to auto-bridge a non-demo class
|
| 30 |
|
| 31 |
# Demo items pinned to exact catalog sizes so the sample invoice reconciles.
|
|
|
|
|
|
|
| 32 |
DEMO_MAP = {
|
| 33 |
"ParleG_Biscuit_Parle": "parle_g_250g",
|
|
|
|
| 34 |
"Amul Butter": "amul_butter_500g",
|
| 35 |
"Parachute coconut Oil": "parachute_oil_500ml",
|
| 36 |
"Soap_Original_Dettol": "dettol_soap_125g",
|
| 37 |
"Nescafe_Classic_Coffee": "nescafe_classic_100g",
|
|
|
|
|
|
|
| 38 |
"Biscuit_Bourbon_Britannia": "bourbon_cream_150g",
|
| 39 |
}
|
| 40 |
|
|
|
|
| 29 |
THRESHOLD = 0.34 # min token-Jaccard to auto-bridge a non-demo class
|
| 30 |
|
| 31 |
# Demo items pinned to exact catalog sizes so the sample invoice reconciles.
|
| 32 |
+
# Includes the generic class names the model actually emits in practice (verified
|
| 33 |
+
# from real detections) alongside the fine-grained variants — both point to one SKU.
|
| 34 |
DEMO_MAP = {
|
| 35 |
"ParleG_Biscuit_Parle": "parle_g_250g",
|
| 36 |
+
"ParleG": "parle_g_250g",
|
| 37 |
"Amul Butter": "amul_butter_500g",
|
| 38 |
"Parachute coconut Oil": "parachute_oil_500ml",
|
| 39 |
"Soap_Original_Dettol": "dettol_soap_125g",
|
| 40 |
"Nescafe_Classic_Coffee": "nescafe_classic_100g",
|
| 41 |
+
"Nescafe": "nescafe_classic_100g", # generic class the model emits
|
| 42 |
+
"SoapNo1_Godrej": "godrej_no1_soap_100g", # generic class the model emits
|
| 43 |
"Biscuit_Bourbon_Britannia": "bourbon_cream_150g",
|
| 44 |
}
|
| 45 |
|