""" PaddleOCR (PaddlePaddle Runtime, PP-OCRv5) — Standalone OCR Benchmark Space """ import os import time import json import importlib import importlib.metadata import tempfile from pathlib import Path from collections import OrderedDict import gradio as gr import numpy as np from PIL import Image from jiwer import cer, wer from datasets import load_dataset # --------------------------------------------------------------------------- # Dataset registry # --------------------------------------------------------------------------- DATASETS = OrderedDict( { "FUNSD — Forms (50 test docs)": { "hf_id": "nielsr/funsd", "split": "test", "image_col": "image", "gt_fn": "funsd", "description": "Form Understanding in Noisy Scanned Documents. 50 test documents with word-level GT.", }, "IAM — Handwriting lines (test set, 50 samples)": { "hf_id": "Teklia/IAM-line", "split": "test", "image_col": "image", "gt_fn": "iam", "description": "IAM handwriting database, line-level images with transcriptions.", }, "CORD-v2 — Receipts (50 samples)": { "hf_id": "naver-clova-ix/cord-v2", "split": "test", "image_col": "image", "gt_fn": "cord", "description": "Consolidated Receipt Dataset v2. Complex receipt images with structured GT.", }, "Invoices & Receipts (50 samples)": { "hf_id": "mychen76/invoices-and-receipts_ocr_v1", "split": "test", "image_col": "image", "gt_fn": "invoices", "description": "Invoices and receipts with OCR ground truth text.", }, } ) MAX_SAMPLES = 50 # --------------------------------------------------------------------------- # Ground-truth extraction helpers # --------------------------------------------------------------------------- def _gt_funsd(row): words = row.get("words", []) return " ".join(words) def _gt_iam(row): return row.get("text", "") def _gt_cord(row): try: gt = json.loads(row.get("ground_truth", "{}")) parse = gt.get("gt_parse", {}) parts = [] for menu_item in parse.get("menu", []): for key in ("nm", "cnt", "price", "unitprice", "itemsubtotal", "sub", "etc"): val = menu_item.get(key) if val and isinstance(val, str): parts.append(val) elif isinstance(val, dict): for v2 in val.values(): if isinstance(v2, str): parts.append(v2) for section in ("subtotal", "total", "tax"): sec_data = parse.get(section, {}) if isinstance(sec_data, dict): for v in sec_data.values(): if isinstance(v, str): parts.append(v) elif isinstance(sec_data, list): for item in sec_data: if isinstance(item, dict): for v in item.values(): if isinstance(v, str): parts.append(v) return " ".join(parts) if parts else "" except Exception: return "" def _gt_invoices(row): try: raw = json.loads(row.get("raw_data", "{}")) words_str = raw.get("ocr_words", "") if isinstance(words_str, str) and words_str.startswith("["): import ast words = ast.literal_eval(words_str) return " ".join(words) return str(words_str) except Exception: return "" GT_EXTRACTORS = { "funsd": _gt_funsd, "iam": _gt_iam, "cord": _gt_cord, "invoices": _gt_invoices, } # --------------------------------------------------------------------------- # OCR engine # --------------------------------------------------------------------------- class PaddleOCREngine: def __init__(self, lang="en", ocr_version="PP-OCRv5"): from paddleocr import PaddleOCR self.ocr = PaddleOCR( lang=lang, ocr_version=ocr_version, use_doc_orientation_classify=False, use_doc_unwarping=False, use_textline_orientation=False, ) self.version = ocr_version def run(self, image: Image.Image): with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f: image.save(f, format="PNG") tmp_path = f.name try: t0 = time.perf_counter() results = self.ocr.predict(tmp_path) elapsed = time.perf_counter() - t0 texts, scores = [], [] for r in results: rec_texts = r.get("rec_texts", []) if hasattr(r, "get") else getattr(r, "rec_texts", []) rec_scores = r.get("rec_scores", []) if hasattr(r, "get") else getattr(r, "rec_scores", []) if not rec_texts: try: rec_texts = r["rec_texts"] rec_scores = r["rec_scores"] except Exception: pass texts.extend(rec_texts if rec_texts else []) scores.extend(list(rec_scores) if rec_scores is not None else []) return texts, scores, elapsed finally: os.unlink(tmp_path) # --------------------------------------------------------------------------- # Deployment size — REAL on-disk measurement # --------------------------------------------------------------------------- def _get_dist_dirs(dist_name: str) -> list[str]: """Find all directories on disk belonging to a pip distribution.""" try: dist = importlib.metadata.distribution(dist_name) except importlib.metadata.PackageNotFoundError: return [] dist_info_path = Path(dist._path) site_packages = dist_info_path.parent dirs: set[str] = set() dirs.add(str(dist_info_path)) # top_level.txt lists the importable package names try: top_level = dist.read_text("top_level.txt") if top_level: for name in top_level.strip().splitlines(): name = name.strip() candidate = site_packages / name if candidate.is_dir(): dirs.add(str(candidate)) elif candidate.with_suffix(".py").is_file(): dirs.add(str(candidate.with_suffix(".py"))) except Exception: pass # Also check RECORD for top-level dirs we may have missed if dist.files: for f in dist.files: parts = str(f).split("/") if parts and parts[0] not in (".", ".."): top = parts[0] if top.endswith((".dist-info", ".egg-info")): continue candidate = site_packages / top if candidate.is_dir(): dirs.add(str(candidate)) elif candidate.is_file(): dirs.add(str(candidate)) return list(dirs) def _size_bytes(path: str) -> int: """Recursively sum real file sizes under a path.""" p = Path(path) if p.is_file(): return p.stat().st_size total = 0 for dirpath, _, filenames in os.walk(p): for fname in filenames: try: total += os.path.getsize(os.path.join(dirpath, fname)) except OSError: pass return total def get_package_real_size_mb(dist_name: str) -> float | None: """Measure the REAL on-disk installed size of a package in MB.""" dirs = _get_dist_dirs(dist_name) if not dirs: return None # Deduplicate — a dir and its subdirs shouldn't be double-counted # since we only collect top-level dirs, os.walk handles the rest total = sum(_size_bytes(d) for d in dirs) return total / (1024 * 1024) def estimate_deployment_size(): """Measure real installed sizes for the PaddleOCR deployment stack.""" packages = [ ("paddleocr", "paddleocr"), ("paddlepaddle (+ paddlex)", "paddlepaddle"), ("paddlex", "paddlex"), ("opencv-contrib-python", "opencv-contrib-python"), ("opencv-python-headless", "opencv-python-headless"), ("opencv-python", "opencv-python"), ("numpy", "numpy"), ("Pillow", "Pillow"), ("shapely", "shapely"), ("pyclipper", "pyclipper"), ("scipy", "scipy"), ("scikit-learn", "scikit-learn"), ("lxml", "lxml"), ("onnxruntime", "onnxruntime"), ("protobuf", "protobuf"), ] total = 0.0 details = {} for label, dist_name in packages: size = get_package_real_size_mb(dist_name) if size is not None and size > 0.1: # skip trivially small total += size details[label] = round(size, 1) return round(total, 1), details # --------------------------------------------------------------------------- # Metrics # --------------------------------------------------------------------------- def compute_metrics(gt_text: str, ocr_text: str): if not gt_text.strip() or not ocr_text.strip(): return {"CER": None, "WER": None} try: c = cer(gt_text.strip(), ocr_text.strip()) except Exception: c = None try: w = wer(gt_text.strip(), ocr_text.strip()) except Exception: w = None return {"CER": c, "WER": w} # --------------------------------------------------------------------------- # Benchmark runner # --------------------------------------------------------------------------- def run_benchmark(dataset_name, num_samples, progress=gr.Progress()): if dataset_name not in DATASETS: return "❌ Unknown dataset", None, None, None, None ds_info = DATASETS[dataset_name] progress(0, desc=f"Loading dataset: {ds_info['hf_id']}...") try: ds = load_dataset(ds_info["hf_id"], split=ds_info["split"], trust_remote_code=True) except Exception as e: return f"❌ Failed to load dataset: {e}", None, None, None, None n = min(int(num_samples), len(ds), MAX_SAMPLES) ds = ds.select(range(n)) gt_fn = GT_EXTRACTORS[ds_info["gt_fn"]] progress(0.05, desc="Initializing PaddleOCR (PP-OCRv5) engine...") try: engine = PaddleOCREngine(lang="en", ocr_version="PP-OCRv5") except Exception as e: return f"❌ Failed to init PaddleOCR: {e}", None, None, None, None results = [] per_sample = [] for i, row in enumerate(ds): progress((0.1 + 0.85 * i / n), desc=f"Processing sample {i+1}/{n}...") image = row[ds_info["image_col"]] if not isinstance(image, Image.Image): continue gt_text = gt_fn(row) if not gt_text.strip(): continue sample = {"#": i, "Ground Truth": gt_text[:120] + "..." if len(gt_text) > 120 else gt_text} try: texts, scores, elapsed = engine.run(image) ocr_text = " ".join(texts) metrics = compute_metrics(gt_text, ocr_text) results.append({ "elapsed": elapsed, "cer": metrics["CER"], "wer": metrics["WER"], "num_detections": len(texts), "mean_confidence": float(np.mean(scores)) if scores else 0, }) sample["OCR Text"] = ocr_text[:120] + "..." if len(ocr_text) > 120 else ocr_text sample["CER"] = round(metrics["CER"], 4) if metrics["CER"] is not None else "N/A" sample["WER"] = round(metrics["WER"], 4) if metrics["WER"] is not None else "N/A" sample["Confidence"] = round(float(np.mean(scores)), 4) if scores else "N/A" sample["Time (s)"] = round(elapsed, 3) except Exception as e: sample["OCR Text"] = f"ERROR: {e}" sample["CER"] = "N/A" sample["WER"] = "N/A" sample["Confidence"] = "N/A" sample["Time (s)"] = "N/A" per_sample.append(sample) progress(0.97, desc="Computing summary...") if not results: return "❌ No valid results", None, None, None, None cers = [r["cer"] for r in results if r["cer"] is not None] wers = [r["wer"] for r in results if r["wer"] is not None] times = [r["elapsed"] for r in results] confs = [r["mean_confidence"] for r in results] summary = [ {"Metric": "Mean CER ↓", "Value": f"{np.mean(cers):.4f}" if cers else "N/A"}, {"Metric": "Median CER ↓", "Value": f"{np.median(cers):.4f}" if cers else "N/A"}, {"Metric": "Mean WER ↓", "Value": f"{np.mean(wers):.4f}" if wers else "N/A"}, {"Metric": "Median WER ↓", "Value": f"{np.median(wers):.4f}" if wers else "N/A"}, {"Metric": "Mean inference time (s) ↓", "Value": f"{np.mean(times):.3f}"}, {"Metric": "Median inference time (s) ↓", "Value": f"{np.median(times):.3f}"}, {"Metric": "Total time (s)", "Value": f"{sum(times):.2f}"}, {"Metric": "Mean confidence", "Value": f"{np.mean(confs):.4f}" if confs else "N/A"}, {"Metric": "Samples processed", "Value": str(len(results))}, ] progress(0.99, desc="Measuring deployment size (real on-disk)...") total_mb, pkg_details = estimate_deployment_size() size_rows = [{"Package": pkg, "Size (MB)": sz} for pkg, sz in pkg_details.items()] size_rows.append({"Package": "📦 TOTAL (installed)", "Size (MB)": total_mb}) verdict_lines = [ "## 📊 Summary\n", f"**Engine:** PaddleOCR PP-OCRv5 (PaddlePaddle runtime)", f"\n**Accuracy:** Mean CER = {np.mean(cers):.4f}, Mean WER = {np.mean(wers):.4f}" if cers else "\n**Accuracy:** N/A", f"\n**Speed:** {np.mean(times):.3f}s avg per image ({len(results)} samples)", f"\n**Deployment footprint:** ~{total_mb} MB installed on disk", f"\n**AWS Lambda 250 MB zip limit:** {'Fits ✅' if total_mb < 250 else 'Exceeds ❌ — requires container image (10 GB limit)'}", f"\n\n> ⚠️ Sizes are measured from **actual installed files** on disk via `os.walk`, not from pip metadata.", ] return ( f"✅ Benchmark complete — {len(results)} samples processed", summary, per_sample, size_rows, "\n".join(verdict_lines), ) # --------------------------------------------------------------------------- # Single image # --------------------------------------------------------------------------- def run_single_image(image): if image is None: return "Upload an image first" if not isinstance(image, Image.Image): image = Image.fromarray(image) try: engine = PaddleOCREngine(lang="en", ocr_version="PP-OCRv5") texts, scores, elapsed = engine.run(image) lines = [f"[{s:.2f}] {t}" for t, s in zip(texts, scores)] header = f"### PaddleOCR (PP-OCRv5) — {elapsed:.3f}s — {len(texts)} detections\n" return header + ("\n".join(lines) if lines else "(no text detected)") except Exception as e: return f"### ERROR\n{e}" # --------------------------------------------------------------------------- # UI # --------------------------------------------------------------------------- HEADER = """ # 🐉 PaddleOCR Benchmark (PP-OCRv5 — PaddlePaddle Runtime) Benchmark **PaddleOCR** with the full **PaddlePaddle** inference runtime on public OCR datasets. | Property | Value | |---|---| | **Engine** | PaddleOCR 3.5+ | | **Model version** | PP-OCRv5 (latest) | | **Runtime** | PaddlePaddle (native) | | **AWS Lambda zip (250 MB)?** | ❌ Exceeds limit | | **AWS Lambda container (10 GB)?** | ✅ Fits | > 📏 Deployment sizes are **measured from actual installed files** on disk — not pip metadata. > > 💡 Compare with the [RapidOCR benchmark Space](https://huggingface.co/spaces/rbaks/rapidocr-benchmark) to see how ONNX Runtime reduces deployment size while preserving accuracy. """ with gr.Blocks(title="PaddleOCR Benchmark") as demo: gr.Markdown(HEADER) with gr.Tabs(): with gr.Tab("📊 Dataset Benchmark"): gr.Markdown("### Run PaddleOCR on a benchmark dataset and measure accuracy, speed & deployment footprint.") with gr.Row(): dataset_dd = gr.Dropdown( choices=list(DATASETS.keys()), value=list(DATASETS.keys())[0], label="Select Benchmark Dataset", ) num_slider = gr.Slider(minimum=5, maximum=MAX_SAMPLES, value=20, step=5, label="Number of samples") run_btn = gr.Button("🚀 Run Benchmark", variant="primary", size="lg") status_box = gr.Textbox(label="Status", interactive=False) with gr.Accordion("📈 Summary Metrics", open=True): summary_tbl = gr.Dataframe(headers=["Metric", "Value"], label="Metrics", wrap=True) verdict_md = gr.Markdown("") with gr.Accordion("📦 Deployment Size Breakdown (real on-disk)", open=False): size_tbl = gr.Dataframe(headers=["Package", "Size (MB)"], label="Installed sizes (os.walk)", wrap=True) with gr.Accordion("🔎 Per-Sample Details", open=False): detail_tbl = gr.Dataframe( headers=["#", "Ground Truth", "OCR Text", "CER", "WER", "Confidence", "Time (s)"], label="Per-sample results", wrap=True, ) run_btn.click( fn=run_benchmark, inputs=[dataset_dd, num_slider], outputs=[status_box, summary_tbl, detail_tbl, size_tbl, verdict_md], ) with gr.Tab("🖼️ Try Single Image"): gr.Markdown("### Upload an image to run PaddleOCR.") img_input = gr.Image(type="pil", label="Upload Image") single_btn = gr.Button("🔍 Run OCR", variant="primary") single_out = gr.Markdown("") single_btn.click(fn=run_single_image, inputs=[img_input], outputs=[single_out]) with gr.Tab("ℹ️ About"): gr.Markdown(""" ## About this Space This Space benchmarks **PaddleOCR** using the **PaddlePaddle** inference runtime — the original, full-weight deployment. ### Pipeline ``` Image → [Text Detection (DB-Net)] → [Text Classification] → [Text Recognition (SVTR)] → Text ``` ### PP-OCRv5 - Latest generation of PaddleOCR models (2025) - Improved detection head and recognition accuracy - Models are in PaddlePaddle native format (`.pdmodel` / `.pdiparams`) - Requires the full PaddlePaddle framework to run ### Size measurement methodology Deployment sizes are measured by walking the **actual installed directories** on disk using `os.walk()` and summing file sizes. This is the real footprint you'd see on an EC2 instance or Lambda container — not the compressed wheel size from pip. ### Metrics | Metric | Description | Good value | |--------|-------------|------------| | **CER** | Character Error Rate | Lower = better (0 = perfect) | | **WER** | Word Error Rate | Lower = better (0 = perfect) | | **Inference time** | Wall-clock time per image | Lower = better | | **Confidence** | Mean OCR confidence score | Higher = better | ### Datasets | Dataset | Type | Content | |---------|------|---------| | FUNSD | Forms | Noisy scanned business forms | | IAM | Handwriting | English handwritten text lines | | CORD-v2 | Receipts | Receipt images with structured GT | | Invoices & Receipts | Documents | Synthetic invoices with OCR GT | """) if __name__ == "__main__": demo.launch()