"""
05b_inference_int8_raw.py — Inference using the RAW INT8 ONNX (3 outputs) plus
a Python decoder that reproduces gwinndr's YoloLayer transforms.

Decoder steps per scale:
  1. reshape (1,255,H,W) -> (1,3,85,H,W) -> (1,H,W,3,85)
  2. tx,ty = sigmoid(tx,ty) * scale_xy - (scale_xy-1)/2 + grid_offset
  3. tw,th = exp(tw,th) * anchor_wh
  4. obj = sigmoid(obj); cls = sigmoid(cls)
  5. xy,wh *= stride
"""
import argparse
import os
import sys
import time
from io import BytesIO

import cv2
import numpy as np
import onnxruntime as ort
import requests
from PIL import Image, ImageDraw, ImageFont

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Cfg-derived constants (yolov4-leaky-416)
ANCHORS = [(10, 13), (16, 30), (33, 23), (30, 61), (62, 45), (59, 119),
           (116, 90), (156, 198), (373, 326)]
# Order matches our DarknetRaw export:
# out0=stride8 (mask 0,1,2), out1=stride16 (3,4,5), out2=stride32 (6,7,8)
CFG_HEADS = [
    # (stride, anchor_indices, scale_xy)
    (8, [0, 1, 2], 1.2),
    (16, [3, 4, 5], 1.1),
    (32, [6, 7, 8], 1.05),
]
INPUT_SIZE = 416
SCORE_THR = 0.30
NMS_THR = 0.45

COCO = ["person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
        "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
        "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
        "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite",
        "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
        "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich",
        "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote",
        "keyboard", "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book",
        "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]

# Fixed seed so every run assigns the same colour to the same class index.
np.random.seed(42)
PALETTE = [(int(r), int(g), int(b)) for r, g, b in np.random.randint(60, 255, (80, 3))]


def get_font(size):
    """Return the first loadable TrueType font at *size*, else PIL's default font.

    Tries "arialbd.ttf", "arial.ttf" and "segoeui.ttf" in that order.
    """
    for font_name in ("arialbd.ttf", "arial.ttf", "segoeui.ttf"):
        try:
            return ImageFont.truetype(font_name, size)
        except (OSError, ImportError):
            # OSError: font file not found/unreadable on this machine.
            # ImportError: Pillow was built without FreeType support.
            continue
    return ImageFont.load_default()


def sigmoid(x):
    """Element-wise logistic function; input is clipped to [-50, 50] before exp."""
    return 1.0 / (1.0 + np.exp(-np.clip(x, -50, 50)))


def letterbox_nchw(rgb, size=INPUT_SIZE):
    """Resize *rgb* to fit a size x size canvas and convert it to a network blob.

    The image is scaled by a single factor (aspect ratio preserved), pasted at
    the top-left corner of a canvas filled with the value 114, scaled to
    [0, 1] float32 and laid out as (1, 3, size, size).

    Args:
        rgb: image array whose first two axes are (height, width) and which
            has 3 channels.
        size: side length of the square canvas.

    Returns:
        (blob, scale) where *scale* is the resize factor applied to the image;
        dividing canvas coordinates by it maps them back to the original image.
    """
    h, w = rgb.shape[:2]
    scale = min(size / h, size / w)
    # Clamp to 1 px: with an extreme aspect ratio the short side can round to
    # 0, which cv2.resize rejects.
    nh = max(1, int(round(h * scale)))
    nw = max(1, int(round(w * scale)))
    resized = cv2.resize(rgb, (nw, nh))
    canvas = np.full((size, size, 3), 114, np.uint8)
    canvas[:nh, :nw] = resized
    chw = canvas.astype(np.float32).transpose(2, 0, 1) / 255.0
    return np.expand_dims(chw, 0), scale


def decode_one_head(raw, stride, anchor_idxs, scale_xy):
    """Decode one raw YOLO head into absolute-pixel predictions.

    Args:
        raw: array of shape (1, n_anchors * (5 + n_classes), H, W),
            e.g. (1, 255, H, W).
        stride: downsampling factor of this head relative to the network input.
        anchor_idxs: indices into ANCHORS used by this head.
        scale_xy: the head's scale_x_y factor applied to the xy sigmoid.

    Returns:
        Array of shape (H * W * n_anchors, 5 + n_classes) whose columns are
        (cx, cy, w, h, obj, cls...) with the box in input (letterboxed) pixels.
    """
    _, ch, H, W = raw.shape
    n_anchors = len(anchor_idxs)
    n_classes = ch // n_anchors - 5  # 85 - 5 = 80

    # (1, n_anchors, 5+nc, H, W) -> (H, W, n_anchors, 5+nc)
    x = raw.reshape(1, n_anchors, 5 + n_classes, H, W)
    x = x.transpose(0, 3, 4, 1, 2)[0]

    # Sigmoid + scale_xy on tx, ty
    txty = sigmoid(x[..., 0:2]) * scale_xy - (scale_xy - 1) / 2
    # exp on logits, clipped to avoid overflow
    twth = np.exp(np.clip(x[..., 2:4], -10, 10))
    obj = sigmoid(x[..., 4:5])
    cls = sigmoid(x[..., 5:])

    # Multiply twth by anchor (anchor_w, anchor_h) in grid units
    anchors_grid = np.array(
        [[ANCHORS[i][0] / stride, ANCHORS[i][1] / stride] for i in anchor_idxs],
        dtype=np.float32,
    )  # (n_anchors, 2)
    twth = twth * anchors_grid[None, None, :, :]

    # Add grid offsets to txty (in grid units)
    yy, xx = np.meshgrid(np.arange(H), np.arange(W), indexing="ij")  # (H, W)
    grid_xy = np.stack([xx, yy], axis=-1).astype(np.float32)  # (H, W, 2)
    txty = txty + grid_xy[:, :, None, :]  # broadcast over anchors

    # Multiply xywh by stride to get absolute pixel coords in input
    # (416-letterboxed) space
    txty *= stride
    twth *= stride

    # Concat to (H*W*n_anchors, 4 + 1 + nc)
    pred = np.concatenate([txty, twth, obj, cls], axis=-1)
    return pred.reshape(-1, 5 + n_classes)


def _iou(box_a, box_b):
    """Return the IoU of two (x1, y1, x2, y2) boxes (epsilon-guarded division)."""
    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b
    iw = max(0, min(ax2, bx2) - max(ax1, bx1))
    ih = max(0, min(ay2, by2) - max(ay1, by1))
    inter = iw * ih
    area_a = max(0, (ax2 - ax1) * (ay2 - ay1))
    area_b = max(0, (bx2 - bx1) * (by2 - by1))
    return inter / (area_a + area_b - inter + 1e-9)


def _nms_per_class(dets, iou_thr):
    """Greedy per-class NMS; returns the surviving detections, best score first."""
    kept = []
    for det in sorted(dets, key=lambda d: -d["score"]):
        # A detection survives only if its IoU with every already-kept box of
        # the same class is strictly below the threshold.
        suppressed = any(
            k["class"] == det["class"] and not _iou(k["bbox"], det["bbox"]) < iou_thr
            for k in kept
        )
        if not suppressed:
            kept.append(det)
    return kept


def decode_all(raws, ratio):
    """Decode all heads, filter by score, and apply per-class NMS.

    Args:
        raws: raw head outputs, paired in order with CFG_HEADS:
            [out_stride8, out_stride16, out_stride32].
        ratio: letterbox scale factor; boxes are divided by it to map them
            back to original-image pixels.

    Returns:
        List of dicts with keys "class" (int), "score" (float) and "bbox"
        ([x1, y1, x2, y2] floats), sorted by descending score. Empty when
        nothing passes SCORE_THR.
    """
    all_pred = [
        decode_one_head(raw, stride, anc_idx, sxy)
        for raw, (stride, anc_idx, sxy) in zip(raws, CFG_HEADS)
    ]
    if not all_pred:
        # No outputs to decode; np.concatenate would raise on an empty list.
        return []
    pred = np.concatenate(all_pred, axis=0)  # (N, 85)

    obj = pred[:, 4]
    cls = pred[:, 5:]
    cls_id = np.argmax(cls, axis=1)
    cls_score = cls[np.arange(len(cls)), cls_id]
    score = obj * cls_score

    keep = score > SCORE_THR
    pred = pred[keep]
    score = score[keep]
    cls_id = cls_id[keep]
    if len(pred) == 0:
        return []

    # Centre/size -> corner boxes, mapped back to original-image pixels.
    cx, cy, w, h = pred[:, 0], pred[:, 1], pred[:, 2], pred[:, 3]
    x1 = (cx - w / 2) / ratio
    y1 = (cy - h / 2) / ratio
    x2 = (cx + w / 2) / ratio
    y2 = (cy + h / 2) / ratio

    dets = [
        {"class": int(c), "score": float(s),
         "bbox": [float(bx1), float(by1), float(bx2), float(by2)]}
        for c, s, bx1, by1, bx2, by2 in zip(cls_id, score, x1, y1, x2, y2)
    ]
    return _nms_per_class(dets, NMS_THR)


def draw(pil_img, dets):
    """Return a copy of *pil_img* with boxes and "<class> <score>%" labels drawn.

    Args:
        pil_img: PIL image to annotate (left unmodified).
        dets: detections as returned by decode_all.
    """
    img = pil_img.copy()
    canvas = ImageDraw.Draw(img)
    W, H = img.size
    thickness = max(3, min(W, H) // 200)
    font = get_font(max(14, min(W, H) // 40))

    for det in dets:
        x1, y1, x2, y2 = det["bbox"]
        x1 = max(0, min(x1, W - 1))
        y1 = max(0, min(y1, H - 1))
        x2 = max(0, min(x2, W - 1))
        y2 = max(0, min(y2, H - 1))
        cls = det["class"]
        cname = COCO[cls]
        color = PALETTE[cls % len(PALETTE)]

        # Concentric 1-px rectangles growing outwards give the box thickness.
        for t in range(thickness):
            canvas.rectangle([x1 - t, y1 - t, x2 + t, y2 + t], outline=color)

        label = f"{cname} {det['score']*100:.0f}%"
        # Measure the label so its placement follows the actual font size
        # instead of a fixed pixel offset.
        left, top, right, bottom = canvas.textbbox((0, 0), label, font=font)
        label_w, label_h = right - left, bottom - top
        # Prefer sitting just above the box; drop inside it when there is no
        # room above, and keep the label within the image horizontally.
        label_top = y1 - label_h if y1 - label_h >= 0 else y1
        label_left = max(0, min(x1, W - label_w))
        # Use one anchor for both the background and the text so they align.
        anchor = (label_left - left, label_top - top)
        bb = canvas.textbbox(anchor, label, font=font)
        canvas.rectangle(bb, fill=color)
        canvas.text(anchor, label, fill=(0, 0, 0), font=font)
    return img


def main():
    """Run the raw INT8 ONNX model on a set of test images and save annotations.

    Returns:
        Process exit status: 1 when the ONNX file is missing, 0 otherwise.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--onnx", default=os.path.join(SCRIPT_DIR, "out_onnx", "yolov4-leaky-416_int8_qop_raw.onnx"))
    ap.add_argument("--out-dir", default=os.path.join(SCRIPT_DIR, "inference_int8_raw"))
    args = ap.parse_args()

    if not os.path.isfile(args.onnx):
        print(f"[FAIL] {args.onnx} doesn't exist yet")
        return 1
    os.makedirs(args.out_dir, exist_ok=True)

    print(f"Loading ONNX: {args.onnx}")
    print(f" size: {os.path.getsize(args.onnx)/1e6:.1f} MB")
    sess = ort.InferenceSession(args.onnx, providers=["CPUExecutionProvider"])
    inp = sess.get_inputs()[0].name
    print(f" inputs: {[i.name for i in sess.get_inputs()]}")
    print(f" outputs: {[o.name for o in sess.get_outputs()]}")

    tests = [
        ("dog", "https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg"),
        ("traffic", "http://images.cocodataset.org/val2017/000000011197.jpg"),
        ("skaters", "http://images.cocodataset.org/val2017/000000087038.jpg"),
        ("bus", "https://ultralytics.com/images/bus.jpg"),
        ("kitchen", "http://images.cocodataset.org/val2017/000000037777.jpg"),
        ("market", "http://images.cocodataset.org/val2017/000000289343.jpg"),
        ("parking", "http://images.cocodataset.org/val2017/000000017627.jpg"),
        ("dining", "http://images.cocodataset.org/val2017/000000080340.jpg"),
    ]

    for name, url in tests:
        # Best effort: an image that cannot be fetched or decoded is reported
        # and skipped so the remaining images still run.
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            pil = Image.open(BytesIO(r.content)).convert("RGB")
        except Exception as e:
            print(f"[skip {name}] {e}")
            continue

        rgb = np.array(pil)
        blob, ratio = letterbox_nchw(rgb, INPUT_SIZE)

        # perf_counter is monotonic, so the interval cannot be skewed by
        # wall-clock adjustments. Only the session run is timed.
        t0 = time.perf_counter()
        outs = sess.run(None, {inp: blob})
        t = (time.perf_counter() - t0) * 1000

        dets = decode_all(outs, ratio)
        annotated = draw(pil, dets)
        out_path = os.path.join(args.out_dir, f"int8_raw_{name}.png")
        annotated.save(out_path)

        print(f" {name:>10s}: {len(dets):>2d} dets in {t:6.1f} ms")
        for d in dets[:8]:
            print(f" {COCO[d['class']]:>16s} {d['score']*100:5.1f}%")
        print(f" -> {out_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())