"""
05b_inference_int8_raw.py — Inference using the RAW INT8 ONNX (3 outputs)
plus a Python decoder that reproduces gwinndr's YoloLayer transforms.

Decoder steps per scale:
  1. reshape (1,255,H,W) -> (1,3,85,H,W) -> (1,H,W,3,85)
  2. tx,ty = sigmoid(tx,ty) * scale_xy - (scale_xy-1)/2  + grid_offset
  3. tw,th = exp(tw,th) * anchor_wh
  4. obj   = sigmoid(obj);  cls = sigmoid(cls)
  5. xy,wh *= stride
"""
import os, sys, time
import numpy as np
import cv2, requests
from io import BytesIO
from PIL import Image, ImageDraw, ImageFont
import onnxruntime as ort

# Directory containing this script; default model and output paths are built from it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Cfg-derived constants (yolov4-leaky-416)
# Anchor boxes as (width, height) pairs; the decoder divides them by the head
# stride to express them in grid units.
ANCHORS = [(10,13),(16,30),(33,23),(30,61),(62,45),(59,119),(116,90),(156,198),(373,326)]
# Order matches our DarknetRaw export: out0=stride8 (mask 0,1,2), out1=stride16 (3,4,5), out2=stride32 (6,7,8)
# decode_all() pairs these entries positionally with the model outputs.
CFG_HEADS = [
    # (stride, anchor_indices, scale_xy)
    (8,  [0, 1, 2], 1.2),
    (16, [3, 4, 5], 1.1),
    (32, [6, 7, 8], 1.05),
]

# Side length (pixels) of the square letterboxed network input.
INPUT_SIZE = 416
# A detection is kept only if objectness * best class score is strictly above this.
SCORE_THR  = 0.30
# Same-class boxes whose IoU with a kept box is not below this are suppressed.
NMS_THR    = 0.45

# Class names, indexed by the class id produced by the decoder (80 entries).
COCO = ["person","bicycle","car","motorcycle","airplane","bus","train","truck","boat",
        "traffic light","fire hydrant","stop sign","parking meter","bench","bird","cat",
        "dog","horse","sheep","cow","elephant","bear","zebra","giraffe","backpack","umbrella",
        "handbag","tie","suitcase","frisbee","skis","snowboard","sports ball","kite",
        "baseball bat","baseball glove","skateboard","surfboard","tennis racket","bottle",
        "wine glass","cup","fork","knife","spoon","bowl","banana","apple","sandwich",
        "orange","broccoli","carrot","hot dog","pizza","donut","cake","chair","couch",
        "potted plant","bed","dining table","toilet","tv","laptop","mouse","remote",
        "keyboard","cell phone","microwave","oven","toaster","sink","refrigerator","book",
        "clock","vase","scissors","teddy bear","hair drier","toothbrush"]

# Fixed seed so every run assigns the same colour to each class.
# NOTE: this seeds NumPy's global random state as an import-time side effect.
np.random.seed(42)
# 80 (r, g, b) tuples of plain ints, each channel drawn from [60, 255).
PALETTE = [(int(r), int(g), int(b)) for r, g, b in np.random.randint(60, 255, (80, 3))]


def get_font(size):
    """Return a TrueType font at ``size``, or PIL's default font as a fallback.

    The candidate font files are tried in order; the first one that loads
    wins. Any failure to load a candidate simply moves on to the next.
    """
    candidates = ("arialbd.ttf", "arial.ttf", "segoeui.ttf")
    for font_file in candidates:
        try:
            loaded = ImageFont.truetype(font_file, size)
        except Exception:
            continue
        else:
            return loaded
    return ImageFont.load_default()


def sigmoid(x):
    """Return the element-wise logistic function of ``x``.

    The input is clamped to [-50, 50] before exponentiation so that extreme
    logits cannot overflow ``np.exp``.
    """
    bounded = np.clip(x, -50, 50)
    return 1.0 / (np.exp(-bounded) + 1.0)


def letterbox_nchw(rgb, size=INPUT_SIZE):
    """Letterbox an image into a square network input blob.

    The image is scaled by a single factor so that it fits inside a
    ``size`` x ``size`` canvas, pasted into the top-left corner, and the
    remainder is filled with the value 114. Because the image is anchored at
    the origin, mapping coordinates back to the source image only requires
    dividing by the returned scale.

    Args:
        rgb: image array whose first two axes are (height, width).
            Assumed to be H x W x 3 uint8 -- TODO confirm against callers.
        size: side length of the square output canvas.

    Returns:
        ``(blob, scale)`` where ``blob`` is a float32 array of shape
        (1, 3, size, size) with values divided by 255, and ``scale`` is the
        resize factor that was applied to the source image.
    """
    h, w = rgb.shape[:2]
    s = min(size / h, size / w)
    # For very elongated images the short side can round down to 0, which
    # cv2.resize rejects; keep every target dimension within [1, size].
    nh = min(size, max(1, int(round(h * s))))
    nw = min(size, max(1, int(round(w * s))))
    resized = cv2.resize(rgb, (nw, nh))
    pad = np.full((size, size, 3), 114, np.uint8)
    pad[:nh, :nw] = resized
    chw = pad.astype(np.float32).transpose(2, 0, 1) / 255.0
    return np.expand_dims(chw, 0), s


def decode_one_head(raw, stride, anchor_idxs, scale_xy):
    """Decode one raw detection head into absolute-pixel predictions.

    Args:
        raw: array of shape (1, C, H, W) where C is
            ``len(anchor_idxs) * (5 + n_classes)``. Only batch entry 0 is used.
        stride: pixel size of one grid cell for this head.
        anchor_idxs: indices into ``ANCHORS`` used by this head.
        scale_xy: centre-offset scaling factor for this head.

    Returns:
        Array of shape (H * W * n_anchors, 5 + n_classes) with columns
        (cx, cy, w, h, objectness, class scores...). Box values are in
        pixels of the letterboxed network input.
    """
    _, channels, rows, cols = raw.shape
    num_anchors = len(anchor_idxs)
    fields = channels // num_anchors   # 5 box/objectness terms + class scores

    # (1, A, F, H, W) -> (1, H, W, A, F), then drop the batch axis.
    feats = raw.reshape(1, num_anchors, fields, rows, cols).transpose(0, 3, 4, 1, 2)[0]
    raw_xy = feats[..., 0:2]
    raw_wh = feats[..., 2:4]
    raw_obj = feats[..., 4:5]
    raw_cls = feats[..., 5:]

    # Per-cell (x, y) offsets; default "xy" meshgrid indexing yields (H, W) arrays.
    col_ids, row_ids = np.meshgrid(np.arange(cols), np.arange(rows))
    cell_xy = np.stack([col_ids, row_ids], axis=-1).astype(np.float32)   # (H, W, 2)

    # Box centres: scaled sigmoid offset inside the cell, plus the cell index,
    # then converted from grid units to pixels.
    centers = sigmoid(raw_xy) * scale_xy - (scale_xy - 1) / 2
    centers = centers + cell_xy[:, :, None, :]
    centers *= stride

    # Box sizes: exponential of the (clipped) logits times the anchor size in
    # grid units, then converted to pixels.
    anchor_wh = np.array(
        [(ANCHORS[k][0] / stride, ANCHORS[k][1] / stride) for k in anchor_idxs],
        dtype=np.float32,
    )   # (n_anchors, 2)
    sizes = np.exp(np.clip(raw_wh, -10, 10))
    sizes = sizes * anchor_wh[None, None, :, :]
    sizes *= stride

    objectness = sigmoid(raw_obj)
    class_scores = sigmoid(raw_cls)

    decoded = np.concatenate([centers, sizes, objectness, class_scores], axis=-1)
    return decoded.reshape(-1, fields)


def _box_iou(kept_box, cand_box):
    """Return the IoU of two (x1, y1, x2, y2) boxes."""
    ax1, ay1, ax2, ay2 = kept_box
    bx1, by1, bx2, by2 = cand_box
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    inter = overlap_w * overlap_h
    area_a = max(0, (ax2 - ax1) * (ay2 - ay1))
    area_b = max(0, (bx2 - bx1) * (by2 - by1))
    # The small epsilon keeps the division defined for degenerate boxes.
    return inter / (area_a + area_b - inter + 1e-9)


def decode_all(raws, ratio):
    """Decode every head, filter by score and apply per-class greedy NMS.

    Args:
        raws: raw head outputs, paired positionally with ``CFG_HEADS``
            (i.e. [out_stride8, out_stride16, out_stride32]).
        ratio: letterbox scale factor; box coordinates are divided by it to
            map them back to the source image.

    Returns:
        List of dicts with keys "class" (int), "score" (float) and "bbox"
        ([x1, y1, x2, y2] floats), ordered by descending score. Empty list if
        nothing exceeds ``SCORE_THR``.
    """
    heads = [
        decode_one_head(raw, stride, anchor_idxs, scale_xy)
        for raw, (stride, anchor_idxs, scale_xy) in zip(raws, CFG_HEADS)
    ]
    merged = np.concatenate(heads, axis=0)   # (N, 5 + n_classes)

    # Final confidence = objectness * best class score.
    class_probs = merged[:, 5:]
    labels = np.argmax(class_probs, axis=1)
    best_prob = class_probs[np.arange(len(class_probs)), labels]
    confidences = merged[:, 4] * best_prob

    mask = confidences > SCORE_THR
    if not mask.any():
        return []
    merged, confidences, labels = merged[mask], confidences[mask], labels[mask]

    # Centre/size -> corner form, mapped back to source-image pixels.
    half_w = merged[:, 2] / 2
    half_h = merged[:, 3] / 2
    left = (merged[:, 0] - half_w) / ratio
    top = (merged[:, 1] - half_h) / ratio
    right = (merged[:, 0] + half_w) / ratio
    bottom = (merged[:, 1] + half_h) / ratio

    candidates = [
        {"class": int(label), "score": float(conf),
         "bbox": [float(bx1), float(by1), float(bx2), float(by2)]}
        for label, conf, bx1, by1, bx2, by2
        in zip(labels, confidences, left, top, right, bottom)
    ]
    candidates.sort(key=lambda det: -det["score"])

    # Greedy NMS: walking candidates best-first, a box survives only if its
    # IoU with every already-kept box of the same class is below NMS_THR.
    survivors = []
    for cand in candidates:
        suppressed = any(
            kept["class"] == cand["class"]
            and not (_box_iou(kept["bbox"], cand["bbox"]) < NMS_THR)
            for kept in survivors
        )
        if not suppressed:
            survivors.append(cand)
    return survivors


def draw(pil_img, dets):
    """Return a copy of ``pil_img`` annotated with detection boxes and labels.

    Args:
        pil_img: PIL image to annotate; it is copied, not modified.
        dets: iterable of dicts with keys "class", "score" and "bbox"
            ([x1, y1, x2, y2]) as produced by ``decode_all``.

    Returns:
        The annotated copy of the image.
    """
    img = pil_img.copy()
    canvas = ImageDraw.Draw(img)
    W, H = img.size
    thickness = max(3, min(W, H) // 200)
    font = get_font(max(14, min(W, H) // 40))
    for det in dets:
        x1, y1, x2, y2 = det["bbox"]
        # Clamp the box to the image so outlines and labels stay visible.
        x1 = max(0, min(x1, W-1)); y1 = max(0, min(y1, H-1))
        x2 = max(0, min(x2, W-1)); y2 = max(0, min(y2, H-1))
        cls = det["class"]; cname = COCO[cls]; color = PALETTE[cls % len(PALETTE)]
        # Thick outline built from concentric 1-pixel rectangles growing outward.
        for t in range(thickness):
            canvas.rectangle([x1-t, y1-t, x2+t, y2+t], outline=color)
        label = f"{cname} {det['score']*100:.0f}%"

        # Measure the rendered text at the origin. The returned box includes
        # the font's own left/top offsets, which must be compensated for when
        # drawing so the glyphs land exactly on the background rectangle.
        left, top, right, bottom = canvas.textbbox((0, 0), label, font=font)
        text_w = right - left
        text_h = bottom - top

        # Put the label just above the box when it fits; otherwise (box near
        # the top edge) put it inside the box. A fixed pixel offset would not
        # track the font size, which scales with the image.
        bg_x = min(x1, max(0, W - text_w))
        bg_y = y1 - text_h if y1 - text_h >= 0 else y1

        canvas.rectangle([bg_x, bg_y, bg_x + text_w, bg_y + text_h], fill=color)
        canvas.text((bg_x - left, bg_y - top), label, fill=(0,0,0), font=font)
    return img


def main():
    """Run the raw INT8 ONNX model on a set of test images and save annotated copies.

    Command-line options:
        --onnx     path to the raw (3-output) ONNX model.
        --out-dir  directory that receives the annotated PNG files.

    Each test image is downloaded, letterboxed, run through the model, decoded
    and drawn. Images that cannot be downloaded or opened are skipped.

    Returns:
        0 on success, 1 if the model file is missing or does not expose enough
        outputs for the configured heads.
    """
    import argparse
    ap = argparse.ArgumentParser()
    ap.add_argument("--onnx", default=os.path.join(SCRIPT_DIR, "out_onnx", "yolov4-leaky-416_int8_qop_raw.onnx"))
    ap.add_argument("--out-dir", default=os.path.join(SCRIPT_DIR, "inference_int8_raw"))
    args = ap.parse_args()

    if not os.path.isfile(args.onnx):
        print(f"[FAIL] {args.onnx} doesn't exist yet"); return 1
    os.makedirs(args.out_dir, exist_ok=True)

    print(f"Loading ONNX: {args.onnx}")
    print(f"  size: {os.path.getsize(args.onnx)/1e6:.1f} MB")
    sess = ort.InferenceSession(args.onnx, providers=["CPUExecutionProvider"])
    inp = sess.get_inputs()[0].name
    print(f"  inputs:  {[i.name for i in sess.get_inputs()]}")
    print(f"  outputs: {[o.name for o in sess.get_outputs()]}")

    # decode_all() pairs outputs with CFG_HEADS positionally and would silently
    # ignore missing heads, so reject models with too few outputs up front.
    n_outputs = len(sess.get_outputs())
    if n_outputs < len(CFG_HEADS):
        print(f"[FAIL] model has {n_outputs} outputs, expected {len(CFG_HEADS)} raw heads"); return 1

    tests = [
        ("dog",     "https://raw.githubusercontent.com/pjreddie/darknet/master/data/dog.jpg"),
        ("traffic", "http://images.cocodataset.org/val2017/000000011197.jpg"),
        ("skaters", "http://images.cocodataset.org/val2017/000000087038.jpg"),
        ("bus",     "https://ultralytics.com/images/bus.jpg"),
        ("kitchen", "http://images.cocodataset.org/val2017/000000037777.jpg"),
        ("market",  "http://images.cocodataset.org/val2017/000000289343.jpg"),
        ("parking", "http://images.cocodataset.org/val2017/000000017627.jpg"),
        ("dining",  "http://images.cocodataset.org/val2017/000000080340.jpg"),
    ]
    for name, url in tests:
        # Best effort: a failed download or unreadable image skips that test only.
        try:
            r = requests.get(url, timeout=30); r.raise_for_status()
            pil = Image.open(BytesIO(r.content)).convert("RGB")
        except Exception as e:
            print(f"[skip {name}] {e}"); continue
        rgb = np.array(pil)
        blob, ratio = letterbox_nchw(rgb, INPUT_SIZE)
        # perf_counter is monotonic and high-resolution, unlike wall-clock
        # time.time(), so it is the right clock for measuring an interval.
        t0 = time.perf_counter()
        outs = sess.run(None, {inp: blob})
        t = (time.perf_counter() - t0) * 1000
        dets = decode_all(outs, ratio)
        annotated = draw(pil, dets)
        out_path = os.path.join(args.out_dir, f"int8_raw_{name}.png")
        annotated.save(out_path)
        print(f"  {name:>10s}: {len(dets):>2d} dets in {t:6.1f} ms")
        # Only the top-scoring detections are listed on the console.
        for d in dets[:8]:
            print(f"      {COCO[d['class']]:>16s}  {d['score']*100:5.1f}%")
        print(f"      -> {out_path}")
    return 0


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())