"""
PaddleOCR (PaddlePaddle Runtime, PP-OCRv5) — Standalone OCR Benchmark Space
"""

import os
import time
import json
import importlib
import importlib.metadata
import tempfile
from pathlib import Path
from collections import OrderedDict

import gradio as gr
import numpy as np
from PIL import Image
from jiwer import cer, wer
from datasets import load_dataset

# ---------------------------------------------------------------------------
# Dataset registry
# ---------------------------------------------------------------------------
# Benchmark datasets offered in the UI, in display order. Each label maps to:
#   hf_id       - Hugging Face dataset identifier passed to load_dataset
#   split       - dataset split to load
#   image_col   - name of the column holding the image
#   gt_fn       - key of the ground-truth extractor in GT_EXTRACTORS
#   description - short human-readable summary
DATASETS = OrderedDict(
    [
        (
            "FUNSD — Forms (50 test docs)",
            dict(
                hf_id="nielsr/funsd",
                split="test",
                image_col="image",
                gt_fn="funsd",
                description="Form Understanding in Noisy Scanned Documents. 50 test documents with word-level GT.",
            ),
        ),
        (
            "IAM — Handwriting lines (test set, 50 samples)",
            dict(
                hf_id="Teklia/IAM-line",
                split="test",
                image_col="image",
                gt_fn="iam",
                description="IAM handwriting database, line-level images with transcriptions.",
            ),
        ),
        (
            "CORD-v2 — Receipts (50 samples)",
            dict(
                hf_id="naver-clova-ix/cord-v2",
                split="test",
                image_col="image",
                gt_fn="cord",
                description="Consolidated Receipt Dataset v2. Complex receipt images with structured GT.",
            ),
        ),
        (
            "Invoices & Receipts (50 samples)",
            dict(
                hf_id="mychen76/invoices-and-receipts_ocr_v1",
                split="test",
                image_col="image",
                gt_fn="invoices",
                description="Invoices and receipts with OCR ground truth text.",
            ),
        ),
    ]
)

# Hard upper bound on the number of samples processed in one benchmark run.
MAX_SAMPLES = 50

# ---------------------------------------------------------------------------
# Ground-truth extraction helpers
# ---------------------------------------------------------------------------

def _gt_funsd(row):
    """Return the FUNSD ground truth: the row's ``words`` joined by single spaces.

    A row without a ``words`` entry yields an empty string.
    """
    separator = " "
    return separator.join(row.get("words", []))

def _gt_iam(row):
    """Return the IAM line transcription stored under ``text`` ("" when absent)."""
    transcription = row.get("text", "")
    return transcription

def _gt_cord(row):
    """Flatten a CORD-v2 ``ground_truth`` JSON string into one line of text.

    Collects, in order, the string values of the known menu-item fields
    (including strings nested one level down, e.g. under ``sub``) followed by
    the string values of the receipt-level summary sections.

    Both ``menu`` and every nested node may be either a single dict or a list
    of dicts; a lone dict is treated as a one-element list instead of being
    iterated key by key (which previously discarded the whole receipt).

    Args:
        row: Mapping with a ``ground_truth`` entry holding a JSON document
            whose ``gt_parse`` object carries the structured annotation.

    Returns:
        The collected values joined by single spaces, or "" when the
        annotation is missing, malformed, or contains no strings.
    """
    menu_keys = ("nm", "cnt", "price", "unitprice", "itemsubtotal", "sub", "etc")
    # NOTE(review): CORD-v2 annotations appear to name the subtotal section
    # "sub_total"; the original spellings are kept as well — confirm against
    # the dataset card.
    sections = ("subtotal", "sub_total", "total", "tax")

    def as_records(node):
        """Normalize a dict / list-of-dicts node to a list of dicts."""
        if isinstance(node, dict):
            return [node]
        if isinstance(node, list):
            return [item for item in node if isinstance(item, dict)]
        return []

    def strings_in(node):
        """Return the str values found directly inside a dict or list of dicts."""
        return [
            value
            for record in as_records(node)
            for value in record.values()
            if isinstance(value, str)
        ]

    try:
        gt = json.loads(row.get("ground_truth", "{}"))
    except (AttributeError, TypeError, ValueError):
        # Not a mapping, not a string, or not valid JSON: no usable ground truth.
        return ""

    parse = gt.get("gt_parse", {}) if isinstance(gt, dict) else {}
    if not isinstance(parse, dict):
        return ""

    parts = []
    for menu_item in as_records(parse.get("menu")):
        for key in menu_keys:
            val = menu_item.get(key)
            if isinstance(val, str):
                if val:  # skip empty top-level fields
                    parts.append(val)
            else:
                parts.extend(strings_in(val))
    for section in sections:
        parts.extend(strings_in(parse.get(section)))
    return " ".join(parts)

def _gt_invoices(row):
    """Extract the OCR word sequence from an invoices-and-receipts row.

    The row's ``raw_data`` entry is parsed as JSON and its ``ocr_words``
    field is turned into a space-separated string. ``ocr_words`` may be:

    * a string holding a Python-style list literal (``"['a', 'b']"``),
    * an actual list (previously rendered as its ``repr``),
    * any other string, which is returned unchanged,
    * ``null`` / missing, which yields "" (previously the text "None").

    List items are converted with ``str`` so a non-string item no longer
    discards the whole ground truth.

    Args:
        row: Mapping with a ``raw_data`` JSON string.

    Returns:
        The ground-truth text, or "" when it cannot be extracted.
    """
    import ast  # local import: only this extractor needs it

    try:
        raw = json.loads(row.get("raw_data", "{}"))
    except (AttributeError, TypeError, ValueError):
        return ""
    if not isinstance(raw, dict):
        return ""

    words = raw.get("ocr_words", "")
    if isinstance(words, str):
        literal = words.strip()
        if not literal.startswith("["):
            return words
        try:
            # literal_eval only evaluates literals, so dataset text is not executed.
            words = ast.literal_eval(literal)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return ""

    if words is None:
        return ""
    if isinstance(words, (list, tuple)):
        return " ".join(str(word) for word in words)
    return str(words)

# Dispatch table: maps a dataset's ``gt_fn`` key to the function that turns a
# dataset row into its ground-truth text.
GT_EXTRACTORS = dict(
    funsd=_gt_funsd,
    iam=_gt_iam,
    cord=_gt_cord,
    invoices=_gt_invoices,
)

# ---------------------------------------------------------------------------
# OCR engine
# ---------------------------------------------------------------------------

class PaddleOCREngine:
    """Thin wrapper around ``paddleocr.PaddleOCR`` returning flat text/score lists.

    The pipeline is built once per instance with document orientation
    classification, document unwarping and text-line orientation disabled.
    """

    def __init__(self, lang="en", ocr_version="PP-OCRv5"):
        """Create the underlying PaddleOCR pipeline.

        Args:
            lang: Language code forwarded to ``PaddleOCR``.
            ocr_version: Model generation forwarded to ``PaddleOCR``; also
                stored on the instance as ``version``.
        """
        # Imported here so the import cost is only paid when an engine is built.
        from paddleocr import PaddleOCR

        self.ocr = PaddleOCR(
            lang=lang,
            ocr_version=ocr_version,
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
        )
        self.version = ocr_version

    @staticmethod
    def _result_field(result, name):
        """Return ``result``'s ``name`` field as a list ([] when unavailable).

        Tries ``.get`` when the object has it, otherwise attribute access; if
        that yields nothing, plain item access is attempted as a last resort.
        """
        if hasattr(result, "get"):
            value = result.get(name, [])
        else:
            value = getattr(result, name, [])
        if value is None or len(value) == 0:
            try:
                value = result[name]
            except (KeyError, IndexError, TypeError, AttributeError):
                value = []
        return [] if value is None else list(value)

    def run(self, image: Image.Image):
        """Run OCR on one image.

        The image is written to a temporary PNG file whose path is handed to
        ``predict``; the file is removed afterwards, including when saving or
        prediction fails.

        Args:
            image: The image to recognise.

        Returns:
            ``(texts, scores, elapsed)``: recognised strings, their scores as
            plain floats, and the wall-clock seconds spent in ``predict``
            (saving the temporary file is not included).
        """
        tmp_path = None
        try:
            # delete=False: the file must outlive the `with` so predict can
            # open it by path; cleanup happens in `finally`.
            with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
                tmp_path = f.name
                image.save(f, format="PNG")

            t0 = time.perf_counter()
            results = self.ocr.predict(tmp_path)
            elapsed = time.perf_counter() - t0

            texts, scores = [], []
            for r in results:
                texts.extend(self._result_field(r, "rec_texts"))
                scores.extend(float(s) for s in self._result_field(r, "rec_scores"))
            return texts, scores, elapsed
        finally:
            if tmp_path is not None:
                os.unlink(tmp_path)

# ---------------------------------------------------------------------------
# Deployment size — REAL on-disk measurement
# ---------------------------------------------------------------------------

def _get_dist_dirs(dist_name: str) -> list[str]:
    """Find the top-level paths on disk that belong to a pip distribution.

    The result contains the distribution's metadata directory plus every
    top-level directory or file next to it that is named in ``top_level.txt``
    or is the first path component of an entry in the file list (RECORD).

    Entries that would resolve to the install directory itself (blank
    ``top_level.txt`` lines, absolute RECORD paths) and the top-level
    ``__pycache__`` directory, which is shared by every single-module
    distribution, are ignored so unrelated packages are not measured.

    Args:
        dist_name: Distribution name as known to ``importlib.metadata``.

    Returns:
        Sorted list of path strings; empty when the distribution is not
        installed or has no on-disk metadata location.
    """
    try:
        dist = importlib.metadata.distribution(dist_name)
    except importlib.metadata.PackageNotFoundError:
        return []

    # `_path` is a private attribute of path-based distributions; tolerate
    # distribution objects that do not provide it.
    meta_path = getattr(dist, "_path", None)
    if meta_path is None:
        return []

    dist_info_path = Path(meta_path)
    site_packages = dist_info_path.parent
    found: set[str] = {str(dist_info_path)}

    # top_level.txt lists the importable package names
    try:
        top_level = dist.read_text("top_level.txt")
    except Exception:
        # Best effort: the RECORD scan below covers unreadable metadata.
        top_level = None
    for line in (top_level or "").splitlines():
        name = line.strip()
        if not name:
            continue  # `site_packages / ""` would be site-packages itself
        candidate = site_packages / name
        if candidate.is_dir():
            found.add(str(candidate))
        elif candidate.with_suffix(".py").is_file():
            found.add(str(candidate.with_suffix(".py")))

    # Also check RECORD for top-level entries we may have missed
    skipped_tops = {"", ".", "..", "__pycache__"}
    for f in dist.files or ():
        top = str(f).split("/", 1)[0]
        if top in skipped_tops or top.endswith((".dist-info", ".egg-info")):
            continue
        candidate = site_packages / top
        if candidate.is_dir() or candidate.is_file():
            found.add(str(candidate))

    return sorted(found)


def _size_bytes(path: str) -> int:
    """Return the size in bytes of a file, or of all files below a directory.

    Files whose size cannot be read are skipped; a path that is neither a
    file nor a walkable directory contributes 0.
    """
    root = Path(path)
    if root.is_file():
        return root.stat().st_size

    def _file_sizes():
        # Yield the size of every regular entry os.walk reports under root.
        for folder, _subdirs, names in os.walk(root):
            for name in names:
                try:
                    yield os.path.getsize(os.path.join(folder, name))
                except OSError:
                    continue

    return sum(_file_sizes())


def get_package_real_size_mb(dist_name: str) -> float | None:
    """Return the on-disk installed size of a distribution in MB.

    Sums the byte sizes of every path reported by ``_get_dist_dirs``. Returns
    ``None`` when no paths are found for the distribution.
    """
    locations = _get_dist_dirs(dist_name)
    if not locations:
        return None
    # Only top-level paths are collected, so walking each one separately
    # does not visit any file twice.
    byte_count = 0
    for location in locations:
        byte_count += _size_bytes(location)
    return byte_count / (1024 * 1024)


def estimate_deployment_size():
    """Measure real installed sizes for the PaddleOCR deployment stack.

    Each listed distribution is resolved to its on-disk paths and measured.
    A path already attributed to an earlier distribution in the list is not
    measured again: several distributions can install into the same
    directory (the three opencv variants all provide ``cv2``), and counting
    it once per distribution would inflate the total.

    Returns:
        ``(total_mb, details)`` where ``total_mb`` is the summed size rounded
        to one decimal and ``details`` maps a display label to that
        package's size in MB. Packages that are not installed, or whose
        newly counted size is 0.1 MB or less, are left out of both.
    """
    packages = [
        ("paddleocr", "paddleocr"),
        ("paddlepaddle (+ paddlex)", "paddlepaddle"),
        ("paddlex", "paddlex"),
        ("opencv-contrib-python", "opencv-contrib-python"),
        ("opencv-python-headless", "opencv-python-headless"),
        ("opencv-python", "opencv-python"),
        ("numpy", "numpy"),
        ("Pillow", "Pillow"),
        ("shapely", "shapely"),
        ("pyclipper", "pyclipper"),
        ("scipy", "scipy"),
        ("scikit-learn", "scikit-learn"),
        ("lxml", "lxml"),
        ("onnxruntime", "onnxruntime"),
        ("protobuf", "protobuf"),
    ]
    total = 0.0
    details = {}
    counted: set[str] = set()  # paths already attributed to a package
    for label, dist_name in packages:
        fresh = [path for path in _get_dist_dirs(dist_name) if path not in counted]
        if not fresh:
            continue
        counted.update(fresh)
        size = sum(_size_bytes(path) for path in fresh) / (1024 * 1024)
        if size > 0.1:  # skip trivially small
            total += size
            details[label] = round(size, 1)
    return round(total, 1), details

# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------

def compute_metrics(gt_text: str, ocr_text: str):
    """Compute character and word error rates of OCR output against ground truth.

    Both texts are stripped of surrounding whitespace first.

    Args:
        gt_text: Reference (ground-truth) text.
        ocr_text: Recognised text.

    Returns:
        ``{"CER": ..., "WER": ...}``. Both values are ``None`` when the
        reference is empty (the rates are undefined). When the reference is
        non-empty but nothing was recognised, both are ``1.0``: every
        reference character and word is missing, and reporting ``None``
        would drop complete failures from any average built on the result.
        A metric whose computation raises is reported as ``None``.
    """
    reference = gt_text.strip()
    hypothesis = ocr_text.strip()

    if not reference:
        return {"CER": None, "WER": None}
    if not hypothesis:
        return {"CER": 1.0, "WER": 1.0}

    try:
        c = cer(reference, hypothesis)
    except Exception:
        # Deliberate best effort: one failing metric must not abort the run.
        c = None
    try:
        w = wer(reference, hypothesis)
    except Exception:
        w = None
    return {"CER": c, "WER": w}

# ---------------------------------------------------------------------------
# Benchmark runner
# ---------------------------------------------------------------------------

def _clip_text(text, limit=120):
    """Return ``text`` cut to ``limit`` characters, with "..." appended when cut."""
    return text[:limit] + "..." if len(text) > limit else text


def _table_rows(records):
    """Turn a list of same-keyed dicts into row lists (values in key order)."""
    return [list(record.values()) for record in records]


def run_benchmark(dataset_name, num_samples, progress=gr.Progress()):
    """Run PaddleOCR over a registered dataset and report accuracy, speed and size.

    Args:
        dataset_name: A key of ``DATASETS``.
        num_samples: Requested number of samples; capped by the dataset
            length and ``MAX_SAMPLES``.
        progress: Progress callback, called with a 0-1 fraction and a
            ``desc`` message.

    Returns:
        A 5-tuple ``(status, summary_rows, per_sample_rows, size_rows,
        verdict_markdown)``. The three tables are lists of row lists whose
        column order matches the headers declared on the corresponding
        ``gr.Dataframe`` components. When the dataset is unknown, loading or
        engine initialisation fails, or no sample produces a result, the
        status carries the message and the other four items are ``None``.
    """
    if dataset_name not in DATASETS:
        return "❌ Unknown dataset", None, None, None, None

    ds_info = DATASETS[dataset_name]
    progress(0, desc=f"Loading dataset: {ds_info['hf_id']}...")

    try:
        # NOTE(review): trust_remote_code=True allows the dataset repository
        # to run its own loading script; the ids come only from the fixed
        # DATASETS registry, never from user input.
        ds = load_dataset(ds_info["hf_id"], split=ds_info["split"], trust_remote_code=True)
    except Exception as e:
        return f"❌ Failed to load dataset: {e}", None, None, None, None

    n = min(int(num_samples), len(ds), MAX_SAMPLES)
    ds = ds.select(range(n))
    gt_fn = GT_EXTRACTORS[ds_info["gt_fn"]]

    progress(0.05, desc="Initializing PaddleOCR (PP-OCRv5) engine...")
    try:
        engine = PaddleOCREngine(lang="en", ocr_version="PP-OCRv5")
    except Exception as e:
        return f"❌ Failed to init PaddleOCR: {e}", None, None, None, None

    results = []      # numeric records used for the aggregate statistics
    per_sample = []   # display records, one per processed sample

    for i, row in enumerate(ds):
        progress((0.1 + 0.85 * i / n), desc=f"Processing sample {i+1}/{n}...")
        image = row[ds_info["image_col"]]
        if not isinstance(image, Image.Image):
            continue
        gt_text = gt_fn(row)
        if not gt_text.strip():
            continue

        sample = {"#": i, "Ground Truth": _clip_text(gt_text)}
        try:
            texts, scores, elapsed = engine.run(image)
            ocr_text = " ".join(texts)
            metrics = compute_metrics(gt_text, ocr_text)
            mean_conf = float(np.mean(scores)) if scores else None
        except Exception as e:
            sample.update(
                {
                    "OCR Text": f"ERROR: {e}",
                    "CER": "N/A",
                    "WER": "N/A",
                    "Confidence": "N/A",
                    "Time (s)": "N/A",
                }
            )
        else:
            # Record the result only once everything for this sample has
            # succeeded, so the statistics and the detail row cannot disagree.
            results.append({
                "elapsed": elapsed,
                "cer": metrics["CER"],
                "wer": metrics["WER"],
                "num_detections": len(texts),
                "mean_confidence": mean_conf if mean_conf is not None else 0,
            })
            sample.update(
                {
                    "OCR Text": _clip_text(ocr_text),
                    "CER": round(metrics["CER"], 4) if metrics["CER"] is not None else "N/A",
                    "WER": round(metrics["WER"], 4) if metrics["WER"] is not None else "N/A",
                    "Confidence": round(mean_conf, 4) if mean_conf is not None else "N/A",
                    "Time (s)": round(elapsed, 3),
                }
            )
        per_sample.append(sample)

    progress(0.97, desc="Computing summary...")

    if not results:
        return "❌ No valid results", None, None, None, None

    cers = [r["cer"] for r in results if r["cer"] is not None]
    wers = [r["wer"] for r in results if r["wer"] is not None]
    times = [r["elapsed"] for r in results]
    confs = [r["mean_confidence"] for r in results]

    summary = [
        {"Metric": "Mean CER ↓", "Value": f"{np.mean(cers):.4f}" if cers else "N/A"},
        {"Metric": "Median CER ↓", "Value": f"{np.median(cers):.4f}" if cers else "N/A"},
        {"Metric": "Mean WER ↓", "Value": f"{np.mean(wers):.4f}" if wers else "N/A"},
        {"Metric": "Median WER ↓", "Value": f"{np.median(wers):.4f}" if wers else "N/A"},
        {"Metric": "Mean inference time (s) ↓", "Value": f"{np.mean(times):.3f}"},
        {"Metric": "Median inference time (s) ↓", "Value": f"{np.median(times):.3f}"},
        {"Metric": "Total time (s)", "Value": f"{sum(times):.2f}"},
        {"Metric": "Mean confidence", "Value": f"{np.mean(confs):.4f}" if confs else "N/A"},
        {"Metric": "Samples processed", "Value": str(len(results))},
    ]

    progress(0.99, desc="Measuring deployment size (real on-disk)...")
    total_mb, pkg_details = estimate_deployment_size()
    size_rows = [{"Package": pkg, "Size (MB)": sz} for pkg, sz in pkg_details.items()]
    size_rows.append({"Package": "📦 TOTAL (installed)", "Size (MB)": total_mb})

    # Format each mean on its own so an empty list shows "N/A", not "nan".
    if cers or wers:
        mean_cer = f"{np.mean(cers):.4f}" if cers else "N/A"
        mean_wer = f"{np.mean(wers):.4f}" if wers else "N/A"
        accuracy_line = f"\n**Accuracy:** Mean CER = {mean_cer}, Mean WER = {mean_wer}"
    else:
        accuracy_line = "\n**Accuracy:** N/A"

    verdict_lines = [
        "## 📊 Summary\n",
        "**Engine:** PaddleOCR PP-OCRv5 (PaddlePaddle runtime)",
        accuracy_line,
        f"\n**Speed:** {np.mean(times):.3f}s avg per image ({len(results)} samples)",
        f"\n**Deployment footprint:** ~{total_mb} MB installed on disk",
        f"\n**AWS Lambda 250 MB zip limit:** {'Fits ✅' if total_mb < 250 else 'Exceeds ❌ — requires container image (10 GB limit)'}",
        "\n\n> ⚠️ Sizes are measured from **actual installed files** on disk via `os.walk`, not from pip metadata.",
    ]

    # The Dataframe components take row lists, so the dict records are
    # flattened here; every record's key order matches its table's headers.
    return (
        f"✅ Benchmark complete — {len(results)} samples processed",
        _table_rows(summary),
        _table_rows(per_sample),
        _table_rows(size_rows),
        "\n".join(verdict_lines),
    )

# ---------------------------------------------------------------------------
# Single image
# ---------------------------------------------------------------------------

def run_single_image(image):
    """Run PaddleOCR on one uploaded image and return the result as Markdown.

    Args:
        image: A PIL image, an array convertible with ``Image.fromarray``,
            or ``None`` when nothing was uploaded.

    Returns:
        A Markdown string: a prompt to upload an image, a heading with timing
        and detection count followed by one ``[score] text`` line per
        detection, or an ``### ERROR`` section when OCR fails.

        The detection lines are placed in a fenced code block. Single
        newlines do not produce line breaks in Markdown, so without the fence
        all detections would run together in one paragraph, and recognised
        text containing Markdown syntax would be interpreted as formatting.
    """
    if image is None:
        return "Upload an image first"
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    try:
        engine = PaddleOCREngine(lang="en", ocr_version="PP-OCRv5")
        texts, scores, elapsed = engine.run(image)
        lines = [f"[{s:.2f}] {t}" for t, s in zip(texts, scores)]
        header = f"### PaddleOCR (PP-OCRv5) — {elapsed:.3f}s — {len(texts)} detections\n"
        if not lines:
            return header + "(no text detected)"
        body = "\n".join(lines)
        # Lengthen the fence until the recognised text cannot close it early.
        fence = "```"
        while fence in body:
            fence += "`"
        return f"{header}\n{fence}text\n{body}\n{fence}"
    except Exception as e:
        return f"### ERROR\n{e}"

# ---------------------------------------------------------------------------
# UI
# ---------------------------------------------------------------------------

# Markdown rendered at the top of the page via gr.Markdown(HEADER).
# The property table is static text; it is not derived from measurements.
HEADER = """
# 🐉 PaddleOCR Benchmark (PP-OCRv5 — PaddlePaddle Runtime)

Benchmark **PaddleOCR** with the full **PaddlePaddle** inference runtime on public OCR datasets.

| Property | Value |
|---|---|
| **Engine** | PaddleOCR 3.5+ |
| **Model version** | PP-OCRv5 (latest) |
| **Runtime** | PaddlePaddle (native) |
| **AWS Lambda zip (250 MB)?** | ❌ Exceeds limit |
| **AWS Lambda container (10 GB)?** | ✅ Fits |

> 📏 Deployment sizes are **measured from actual installed files** on disk — not pip metadata.
>
> 💡 Compare with the [RapidOCR benchmark Space](https://huggingface.co/spaces/rbaks/rapidocr-benchmark) to see how ONNX Runtime reduces deployment size while preserving accuracy.
"""

# Build the Gradio UI: the page header followed by three tabs
# (dataset benchmark, single-image OCR, and an about page).
with gr.Blocks(title="PaddleOCR Benchmark") as demo:
    gr.Markdown(HEADER)

    with gr.Tabs():
        # Tab 1: run the benchmark on one of the registered datasets.
        with gr.Tab("📊 Dataset Benchmark"):
            gr.Markdown("### Run PaddleOCR on a benchmark dataset and measure accuracy, speed & deployment footprint.")
            with gr.Row():
                # Choices are the DATASETS labels; the first one is preselected.
                dataset_dd = gr.Dropdown(
                    choices=list(DATASETS.keys()),
                    value=list(DATASETS.keys())[0],
                    label="Select Benchmark Dataset",
                )
                # Sample count in steps of 5, capped at MAX_SAMPLES.
                num_slider = gr.Slider(minimum=5, maximum=MAX_SAMPLES, value=20, step=5, label="Number of samples")

            run_btn = gr.Button("🚀 Run Benchmark", variant="primary", size="lg")
            status_box = gr.Textbox(label="Status", interactive=False)

            with gr.Accordion("📈 Summary Metrics", open=True):
                summary_tbl = gr.Dataframe(headers=["Metric", "Value"], label="Metrics", wrap=True)

            verdict_md = gr.Markdown("")

            with gr.Accordion("📦 Deployment Size Breakdown (real on-disk)", open=False):
                size_tbl = gr.Dataframe(headers=["Package", "Size (MB)"], label="Installed sizes (os.walk)", wrap=True)

            with gr.Accordion("🔎 Per-Sample Details", open=False):
                detail_tbl = gr.Dataframe(
                    headers=["#", "Ground Truth", "OCR Text", "CER", "WER", "Confidence", "Time (s)"],
                    label="Per-sample results",
                    wrap=True,
                )

            # The outputs list follows the order of run_benchmark's 5-tuple:
            # status, summary, per-sample details, package sizes, verdict.
            run_btn.click(
                fn=run_benchmark,
                inputs=[dataset_dd, num_slider],
                outputs=[status_box, summary_tbl, detail_tbl, size_tbl, verdict_md],
            )

        # Tab 2: OCR a single uploaded image; the result is shown as Markdown.
        with gr.Tab("🖼️ Try Single Image"):
            gr.Markdown("### Upload an image to run PaddleOCR.")
            img_input = gr.Image(type="pil", label="Upload Image")
            single_btn = gr.Button("🔍 Run OCR", variant="primary")
            single_out = gr.Markdown("")
            single_btn.click(fn=run_single_image, inputs=[img_input], outputs=[single_out])

        # Tab 3: static documentation about the pipeline, metrics and datasets.
        with gr.Tab("ℹ️ About"):
            gr.Markdown("""
## About this Space

This Space benchmarks **PaddleOCR** using the **PaddlePaddle** inference runtime — the original, full-weight deployment.

### Pipeline
```
Image → [Text Detection (DB-Net)] → [Text Classification] → [Text Recognition (SVTR)] → Text
```

### PP-OCRv5
- Latest generation of PaddleOCR models (2025)
- Improved detection head and recognition accuracy
- Models are in PaddlePaddle native format (`.pdmodel` / `.pdiparams`)
- Requires the full PaddlePaddle framework to run

### Size measurement methodology
Deployment sizes are measured by walking the **actual installed directories** on disk using `os.walk()` and summing file sizes.
This is the real footprint you'd see on an EC2 instance or Lambda container — not the compressed wheel size from pip.

### Metrics
| Metric | Description | Good value |
|--------|-------------|------------|
| **CER** | Character Error Rate | Lower = better (0 = perfect) |
| **WER** | Word Error Rate | Lower = better (0 = perfect) |
| **Inference time** | Wall-clock time per image | Lower = better |
| **Confidence** | Mean OCR confidence score | Higher = better |

### Datasets
| Dataset | Type | Content |
|---------|------|---------|
| FUNSD | Forms | Noisy scanned business forms |
| IAM | Handwriting | English handwritten text lines |
| CORD-v2 | Receipts | Receipt images with structured GT |
| Invoices & Receipts | Documents | Synthetic invoices with OCR GT |
""")

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()