"""HIPE-OCRepair leaderboard Space. Public Gradio leaderboard for the HIPE-OCRepair 2026 shared task. Reads scored results from `hipe-eval/hipe-ocrepair-results` (public), accepts new submissions and writes them to `hipe-eval/hipe-ocrepair-submissions` (private). Auto-refreshes every 5 minutes. Ranking and weighting logic mirrors the authoritative orchestrator at `HIPE-OCRepair-2026-eval/lib/build_rankings.py`. The DTA weighting (each of `dta19-l0`, `dta19-l1`, `dta19-l2` carries weight 1/3, every other cell weight 1) is inlined as `COMPETITION_CELLS`. Non-obvious decisions are logged in `./decisions.md`. """ from __future__ import annotations import logging import os import sys from datetime import datetime, timezone from pathlib import Path from typing import Any, Iterable import gradio as gr import pandas as pd from datasets import Dataset, load_dataset from datasets.exceptions import DatasetNotFoundError from huggingface_hub import HfApi from validate import validate_content, validate_filename # -- Configuration ----------------------------------------------------------- RESULTS_REPO = "hipe-eval/hipe-ocrepair-results" SUBMISSIONS_REPO = "hipe-eval/hipe-ocrepair-submissions" SUBMISSIONS_FILES_PREFIX = "submissions/" REFRESH_INTERVAL_SECONDS = 5 * 60 HF_TOKEN = os.environ.get("HF_TOKEN") # Competition cells: (dataset_versioned, language) -> weight. # Source: HIPE-OCRepair-2026-eval/lib/competition_config.json. COMPETITION_CELLS: dict[tuple[str, str], float] = { ("dta19-l0_v0.1", "de"): 1.0 / 3.0, ("dta19-l1_v0.1", "de"): 1.0 / 3.0, ("dta19-l2_v0.1", "de"): 1.0 / 3.0, ("impresso-snippets_v1.0", "de"): 1.0, ("impresso-snippets_v1.0", "en"): 1.0, ("impresso-snippets_v1.0", "fr"): 1.0, ("icdar2017_v1.1", "en"): 1.0, ("icdar2017_v1.1", "fr"): 1.0, } LANGUAGES = ("de", "en", "fr") LANGUAGE_NAMES = {"de": "German", "en": "English", "fr": "French"} # (base_dataset, dataset_version, language) — display order for the # "By dataset" sub-tabs and the Submit dropdown. 
DATASET_CELLS_ORDERED: list[tuple[str, str, str]] = [
    ("dta19-l0", "v0.1", "de"),
    ("dta19-l1", "v0.1", "de"),
    ("dta19-l2", "v0.1", "de"),
    ("icdar2017", "v1.1", "en"),
    ("icdar2017", "v1.1", "fr"),
    ("impresso-snippets", "v1.0", "de"),
    ("impresso-snippets", "v1.0", "en"),
    ("impresso-snippets", "v1.0", "fr"),
]

# Denominators for the "Test sets" / "Datasets submitted" columns.
N_TOTAL_OVERALL = len(COMPETITION_CELLS)
N_TOTAL_BY_LANG = {
    lang: sum(1 for (_, cell_lang) in COMPETITION_CELLS if cell_lang == lang)
    for lang in LANGUAGES
}

# Display columns
OVERALL_COLS = [
    "Rank",
    "System",
    "cMER ↓",
    "95% CI cMER",
    "prefScore ↑",
    "95% CI prefScore",
    "Test sets",
]
DATASET_COLS = [
    "Rank",
    "System",
    "cMER micro ↓",
    "95% CI cMER",
    "Pref cMER Macro ↑",
    "95% CI prefScore",
    "cMER macro",
    "wMER macro",
]
SYSTEMS_COLS = [
    "Team",
    "Run",
    "System description",
    "Datasets submitted",
    "First submitted",
]

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s] %(levelname)s %(message)s",
    stream=sys.stderr,
)
log = logging.getLogger("leaderboard")


# -- HF I/O ------------------------------------------------------------------


def _load_repo_df(repo_id: str, empty_msg: str, failure_label: str) -> pd.DataFrame:
    """Download the `train` split of `repo_id` as a DataFrame (best effort).

    Every failure is logged and converted to an empty DataFrame so that the
    periodic UI refresh never raises. `empty_msg` is logged when the dataset
    does not exist or has no data files; `failure_label` names the dataset in
    the warning emitted for any other error.
    """
    try:
        ds = load_dataset(
            repo_id,
            split="train",
            token=HF_TOKEN,
            # Always bypass the local cache so a refresh sees new rows.
            download_mode="force_redownload",
        )
        return ds.to_pandas()
    except (DatasetNotFoundError, FileNotFoundError):
        log.info(empty_msg)
    except Exception as e:  # deliberate catch-all: display path must not crash
        log.warning("Could not load %s: %r", failure_label, e)
    return pd.DataFrame()


def load_results_df() -> pd.DataFrame:
    """Load the results dataset; return an empty DataFrame on any failure."""
    return _load_repo_df(RESULTS_REPO, "Results dataset is empty", "results dataset")


def load_submissions_df() -> pd.DataFrame:
    """Load the submissions metadata index; empty if no token or dataset."""
    if not HF_TOKEN:
        log.info("No HF_TOKEN; skipping submissions index load")
        return pd.DataFrame()
    return _load_repo_df(
        SUBMISSIONS_REPO, "Submissions index is empty", "submissions index"
    )


# -- Weighted-mean and formatting helpers ------------------------------------
# Mirrors HIPE-OCRepair-2026-eval/lib/build_rankings.py:78-84


def _is_missing(v: Any) -> bool:
    """Return True for `None` and for scalar NA markers (NaN, `pd.NA`, `NaT`)."""
    if v is None:
        return True
    # `pd.isna` returns an array for list-likes, so restrict it to scalars.
    return bool(pd.api.types.is_scalar(v) and pd.isna(v))


def weighted_mean(pairs: Iterable[tuple[Any, float]]) -> float | None:
    """Return the weighted mean of `(value, weight)` pairs, skipping missing values.

    Returns `None` when no value is present or when the remaining weights sum
    to zero (the mean is undefined in both cases).
    """
    kept = [(v, w) for v, w in pairs if not _is_missing(v)]
    if not kept:
        return None
    total_w = sum(w for _, w in kept)
    if total_w == 0:
        return None
    return sum(v * w for v, w in kept) / total_w


def fmt(val: Any, precision: int = 4) -> str:
    """Format a number with `precision` decimals; missing values become an em dash."""
    if _is_missing(val):
        return "—"
    try:
        return f"{float(val):.{precision}f}"
    except (ValueError, TypeError):
        # Not numeric: show it verbatim rather than failing the whole table.
        return str(val)


def fmt_ci(low: Any, high: Any) -> str:
    """Format a confidence interval as `[low, high]`; em dash if both bounds are missing."""
    if _is_missing(low) and _is_missing(high):
        return "—"
    return f"[{fmt(low)}, {fmt(high)}]"


def system_display_name(team: str, bench_version: str, run: int) -> str:
    """Build the system label shown in the aggregated ranking tables."""
    return f"{team}_hipe-ocrepair-bench_{bench_version}_run{run}"


# -- Ranking builders --------------------------------------------------------


def _empty_overall_pair() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return two independent empty tables with the `OVERALL_COLS` header."""
    return pd.DataFrame(columns=OVERALL_COLS), pd.DataFrame(columns=OVERALL_COLS)


def _ranking_rows_to_df(items: list[dict[str, Any]]) -> pd.DataFrame:
    """Turn already-sorted aggregate rows into the display table, ranking from 1."""
    records = [
        {
            "Rank": rank,
            "System": r["system_display"],
            "cMER ↓": fmt(r["cmer"]),
            "95% CI cMER": fmt_ci(r["cmer_lo"], r["cmer_hi"]),
            "prefScore ↑": fmt(r["pref"]),
            "95% CI prefScore": fmt_ci(r["pref_lo"], r["pref_hi"]),
            "Test sets": f"{r['n_test_sets']}/{r['n_total']}",
        }
        for rank, r in enumerate(items, 1)
    ]
    return pd.DataFrame(records, columns=OVERALL_COLS)


def compute_weighted_ranking(
    results_df: pd.DataFrame,
    cells_filter: dict[tuple[str, str], float],
    n_total: int,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Return `(complete_df, incomplete_df)` for the given cell subset.

    `cells_filter` maps `(dataset_versioned, language) -> weight`. Rows whose
    `(dataset, language)` is not in `cells_filter` are dropped before
    aggregating. Rows are grouped by `(team, bench_version, run)`; each group's
    metrics are weighted means over its rows. A system is "complete" if it
    covers at least `n_total` distinct cells of the filter.

    Both tables are sorted by cMER ascending, then prefScore descending, with
    missing scores last, and ranked independently starting at 1.
    """
    if results_df.empty or not cells_filter:
        return _empty_overall_pair()

    # Plain dict lookups on the tuple keys; this avoids relying on how pandas
    # coerces tuple-keyed mappings in `isin` / `map`.
    all_keys = list(zip(results_df["dataset"], results_df["language"]))
    keep = [key in cells_filter for key in all_keys]
    if not any(keep):
        return _empty_overall_pair()

    df = results_df.loc[keep].copy()
    df["_key"] = [key for key, kept in zip(all_keys, keep) if kept]
    df["_weight"] = [cells_filter[key] for key in df["_key"]]

    def group_mean(grp: pd.DataFrame, col: str) -> float | None:
        return weighted_mean(zip(grp[col].tolist(), grp["_weight"].tolist()))

    rows: list[dict[str, Any]] = []
    for (team, bench_version, run), grp in df.groupby(
        ["team", "bench_version", "run"], as_index=False, sort=False
    ):
        rows.append(
            {
                "system_display": system_display_name(
                    str(team), str(bench_version), int(run)
                ),
                "cmer": group_mean(grp, "cmer_micro"),
                "cmer_lo": group_mean(grp, "cmer_micro_ci_low"),
                "cmer_hi": group_mean(grp, "cmer_micro_ci_high"),
                "pref": group_mean(grp, "pref_cmer_macro"),
                "pref_lo": group_mean(grp, "pref_cmer_macro_ci_low"),
                "pref_hi": group_mean(grp, "pref_cmer_macro_ci_high"),
                # Count distinct cells, not rows: a duplicated result row must
                # not push the count past `n_total`.
                "n_test_sets": len(set(grp["_key"])),
                "n_total": n_total,
            }
        )

    rows.sort(
        key=lambda r: (
            r["cmer"] if r["cmer"] is not None else float("inf"),
            -(r["pref"] if r["pref"] is not None else float("-inf")),
        )
    )

    # `>=` / `<` partition every system into exactly one table, so none can
    # silently vanish from the leaderboard.
    complete_rows = [r for r in rows if r["n_test_sets"] >= n_total]
    incomplete_rows = [r for r in rows if r["n_test_sets"] < n_total]
    return _ranking_rows_to_df(complete_rows), _ranking_rows_to_df(incomplete_rows)


def compute_per_dataset(
    results_df: pd.DataFrame,
    dataset_versioned: str,
    language: str,
) -> pd.DataFrame:
    """Return the ranked per-system table for one `(dataset, language)` cell.

    Rows are sorted by `cmer_micro` ascending, then `pref_cmer_macro`
    descending; missing scores sort last. The `System` column is taken from
    the results' `system` field.
    """
    if results_df.empty:
        return pd.DataFrame(columns=DATASET_COLS)

    sub = results_df[
        (results_df["dataset"] == dataset_versioned)
        & (results_df["language"] == language)
    ]
    if sub.empty:
        return pd.DataFrame(columns=DATASET_COLS)

    def sort_key(r: dict[str, Any]) -> tuple[float, float]:
        cmer = r.get("cmer_micro")
        pref = r.get("pref_cmer_macro")
        return (
            float("inf") if _is_missing(cmer) else cmer,
            -(float("-inf") if _is_missing(pref) else pref),
        )

    rows = sorted(sub.to_dict("records"), key=sort_key)
    out = [
        {
            "Rank": rank,
            "System": r["system"],
            "cMER micro ↓": fmt(r.get("cmer_micro")),
            "95% CI cMER": fmt_ci(
                r.get("cmer_micro_ci_low"), r.get("cmer_micro_ci_high")
            ),
            "Pref cMER Macro ↑": fmt(r.get("pref_cmer_macro")),
            "95% CI prefScore": fmt_ci(
                r.get("pref_cmer_macro_ci_low"),
                r.get("pref_cmer_macro_ci_high"),
            ),
            "cMER macro": fmt(r.get("cmer_macro")),
            "wMER macro": fmt(r.get("wmer_macro")),
        }
        for rank, r in enumerate(rows, 1)
    ]
    return pd.DataFrame(out, columns=DATASET_COLS)


def _team_run_keys(df: pd.DataFrame) -> set[tuple[str, int]]:
    """Collect distinct `(team, run)` pairs, ignoring rows where either is missing."""
    if df.empty or not {"team", "run"} <= set(df.columns):
        return set()
    pairs = df[["team", "run"]].dropna().drop_duplicates()
    return {(str(team), int(run)) for team, run in zip(pairs["team"], pairs["run"])}


def compute_systems_table(
    results_df: pd.DataFrame,
    submissions_df: pd.DataFrame,
) -> pd.DataFrame:
    """List every (team, run) pair known to either dataset.

    `Datasets submitted` is the number of competition cells with a results
    record or a pending submission, out of `N_TOTAL_OVERALL`. `First submitted`
    is the earliest `submitted_at` in the submissions index, falling back to
    the earliest `evaluated_at` for seed-only entries. Columns that are absent
    from either frame are treated as "no information" instead of raising.
    """
    has_results = not results_df.empty and {
        "team",
        "run",
        "dataset",
        "language",
    } <= set(results_df.columns)
    has_submissions = not submissions_df.empty and {"team", "run"} <= set(
        submissions_df.columns
    )

    keys: set[tuple[str, int]] = set()
    if has_results:
        keys |= _team_run_keys(results_df)
    if has_submissions:
        keys |= _team_run_keys(submissions_df)
    if not keys:
        return pd.DataFrame(columns=SYSTEMS_COLS)

    sub_has_cell_cols = has_submissions and {"dataset", "language"} <= set(
        submissions_df.columns
    )

    rows = []
    for team, run in sorted(keys, key=lambda k: (k[0].lower(), k[1])):
        cells: set[tuple[str, str]] = set()
        res_rows: pd.DataFrame | None = None
        if has_results:
            res_rows = results_df[
                (results_df["team"] == team) & (results_df["run"] == run)
            ]
            cells.update(zip(res_rows["dataset"], res_rows["language"]))

        description = ""
        first_submitted: str | None = None
        if has_submissions:
            sub_rows = submissions_df[
                (submissions_df["team"] == team) & (submissions_df["run"] == run)
            ]
            if "submitted_at" in sub_rows.columns:
                # Oldest first, so the first non-empty description wins.
                sub_rows = sub_rows.sort_values("submitted_at")
            for _, r in sub_rows.iterrows():
                if sub_has_cell_cols and not _is_missing(r.get("dataset")):
                    cells.add((r["dataset"], r["language"]))
                if not description and not _is_missing(r.get("system_description")):
                    description = str(r["system_description"]).strip()
                submitted_at = r.get("submitted_at")
                ts = None if _is_missing(submitted_at) else str(submitted_at)
                if ts and (first_submitted is None or ts < first_submitted):
                    first_submitted = ts

        if (
            first_submitted is None
            and res_rows is not None
            and "evaluated_at" in res_rows.columns
        ):
            evals = res_rows["evaluated_at"].dropna().astype(str)
            if not evals.empty:
                first_submitted = evals.min()

        # Count only competition cells so the numerator can never exceed the
        # `N_TOTAL_OVERALL` denominator.
        n_cells = sum(1 for cell in cells if cell in COMPETITION_CELLS)
        rows.append(
            {
                "Team": team,
                "Run": run,
                "System description": description or "—",
                "Datasets submitted": f"{n_cells}/{N_TOTAL_OVERALL}",
                "First submitted": first_submitted or "—",
            }
        )

    return pd.DataFrame(rows, columns=SYSTEMS_COLS)


# -- Refresh -----------------------------------------------------------------


def refresh_all() -> tuple:
    """Reload data from HF and return every display value for the UI.

    The tuple is ordered as: overall complete, overall incomplete, then
    (complete, incomplete) for each language in `LANGUAGES`, then one table per
    entry of `DATASET_CELLS_ORDERED`, then the systems table, then the footer
    HTML. The UI wiring must list its output components in the same order.
    """
    results_df = load_results_df()
    submissions_df = load_submissions_df()

    overall_complete, overall_incomplete = compute_weighted_ranking(
        results_df, COMPETITION_CELLS, N_TOTAL_OVERALL
    )

    lang_outputs: list[pd.DataFrame] = []
    for lang in LANGUAGES:
        lang_cells = {k: v for k, v in COMPETITION_CELLS.items() if k[1] == lang}
        lang_outputs.extend(
            compute_weighted_ranking(results_df, lang_cells, N_TOTAL_BY_LANG[lang])
        )

    dataset_outputs = [
        compute_per_dataset(results_df, f"{base_ds}_{ver}", lang)
        for base_ds, ver, lang in DATASET_CELLS_ORDERED
    ]

    systems = compute_systems_table(results_df, submissions_df)
    last_updated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

    return (
        overall_complete,
        overall_incomplete,
        *lang_outputs,
        *dataset_outputs,
        systems,
        FOOTER_TEMPLATE.format(
            ts=last_updated, minutes=REFRESH_INTERVAL_SECONDS // 60
        ),
    )


# -- Submission handling -----------------------------------------------------

# (label, value) pairs for the Submit dropdown; the value packs the cell as
# "base|version|language" and is decoded by `parse_dataset_choice`.
DROPDOWN_CHOICES: list[tuple[str, str]] = [
    (f"{base_ds}_{ver} — {lang}", f"{base_ds}|{ver}|{lang}")
    for (base_ds, ver, lang) in DATASET_CELLS_ORDERED
]


def parse_dataset_choice(value: str) -> tuple[str, str, str]:
    """Split a dropdown value into `(base_dataset, version, language)`.

    Raises:
        ValueError: if `value` does not consist of exactly three `|`-separated
            parts.
    """
    parts = value.split("|")
    if len(parts) != 3:
        raise ValueError(f"Invalid dataset choice: {value!r}")
    return parts[0], parts[1], parts[2]


def _existing_match(
    df: pd.DataFrame,
    team: str,
    run: int,
    dataset_versioned: str,
    language: str,
) -> bool:
    """Return True if `df` already has a row for this team/run/dataset/language.

    An empty frame, or one lacking any of the four key columns, never matches.
    """
    if df.empty:
        return False
    if not {"team", "run", "dataset", "language"} <= set(df.columns):
        return False
    mask = (
        (df["team"] == team)
        & (df["run"] == run)
        & (df["dataset"] == dataset_versioned)
        & (df["language"] == language)
    )
    return bool(mask.any())
def _sanitize(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return copies of `rows` with float NaN values replaced by `None`.

    Applied to the existing index records before they are rebuilt into a
    `Dataset` together with the new row.
    """
    return [
        {
            key: (None if isinstance(value, float) and pd.isna(value) else value)
            for key, value in row.items()
        }
        for row in rows
    ]


def _format_errors(errors: list[str]) -> str:
    """Render validation errors as a Markdown bullet list."""
    return "**Errors:**\n\n" + "\n".join(f"- {e}" for e in errors)


def _parse_run_number(run_number: float | int | None) -> int | None:
    """Return the run number as an int ≥ 1, or `None` if it is not valid.

    Non-integral values (e.g. `1.5`) are rejected rather than truncated, and
    NaN / infinity are rejected instead of raising.
    """
    if run_number is None:
        return None
    if isinstance(run_number, float) and not run_number.is_integer():
        # Also covers NaN and ±inf, for which `is_integer()` is False.
        return None
    try:
        run = int(run_number)
    except (ValueError, TypeError, OverflowError):
        return None
    return run if run >= 1 else None


def _load_submissions_index_strict() -> pd.DataFrame:
    """Load the submissions index, raising on any unexpected failure.

    Returns an empty DataFrame only when the dataset does not exist or has no
    data files yet. Every other error propagates to the caller: the index is
    rewritten in full on each submission, so treating a transient read failure
    as "empty" would overwrite all previously recorded rows.
    """
    try:
        ds = load_dataset(
            SUBMISSIONS_REPO,
            split="train",
            token=HF_TOKEN,
            download_mode="force_redownload",
        )
    except (DatasetNotFoundError, FileNotFoundError):
        return pd.DataFrame()
    return ds.to_pandas()


def _coverage_for_run(
    results_df: pd.DataFrame,
    submissions_df: pd.DataFrame,
    team_name: str,
    run: int,
    new_cell: tuple[str, str],
) -> int:
    """Count distinct competition cells tied to `(team_name, run)`.

    Combines scored results, previously indexed submissions and `new_cell`
    (the submission being accepted right now).
    """
    cells: set[tuple[str, str]] = {new_cell}
    for df in (results_df, submissions_df):
        if df.empty or not {"team", "run", "dataset", "language"} <= set(df.columns):
            continue
        mine = df[(df["team"] == team_name) & (df["run"] == run)]
        for ds_name, lang in zip(mine["dataset"], mine["language"]):
            if not _is_missing(ds_name):
                cells.add((ds_name, lang))
    # Count only competition cells so the figure matches its denominator.
    return sum(1 for cell in cells if cell in COMPETITION_CELLS)


def handle_submission(
    team_name: str,
    system_description: str,
    run_number: float | int | None,
    dataset_choice: str | None,
    file_path: str | None,
) -> str:
    """Validate and accept one submission file; return a Markdown status.

    Steps: validate the form fields, cross-check them against the metadata
    parsed from the filename, validate the file content, reject duplicates of
    an existing `(team, run, dataset, language)` entry, upload the file to
    `SUBMISSIONS_REPO` and append a `pending` row to the metadata index.

    Every failure is reported in the returned Markdown; nothing is raised to
    the UI.
    """
    if HF_TOKEN is None:
        return (
            "**Error**: This Space is missing the `HF_TOKEN` secret; submissions "
            "are disabled. Contact the maintainer."
        )

    errors: list[str] = []

    team_name = (team_name or "").strip()
    if not team_name:
        errors.append("Team name is required.")

    run = _parse_run_number(run_number)
    if run is None:
        errors.append("Run number must be an integer ≥ 1.")

    if not dataset_choice:
        errors.append("Dataset is required.")
    if not file_path:
        errors.append("A submission `.jsonl` file is required.")

    if errors or run is None or not dataset_choice or not file_path:
        return _format_errors(errors)

    try:
        base_ds, ver, lang = parse_dataset_choice(dataset_choice)
    except ValueError as e:
        return f"**Error**: {e}"
    dataset_versioned = f"{base_ds}_{ver}"

    file_p = Path(file_path)
    filename = file_p.name

    fname_errors, metadata = validate_filename(filename)
    errors.extend(fname_errors)

    if metadata:
        if metadata.get("team") != team_name:
            errors.append(
                f"Team in filename is `{metadata.get('team')}` but form team is "
                f"`{team_name}`."
            )
        if metadata.get("dataset") != base_ds:
            errors.append(
                f"Dataset in filename is `{metadata.get('dataset')}` but selected "
                f"dataset is `{base_ds}`."
            )
        if metadata.get("dataset_version") != ver:
            errors.append(
                f"Dataset version in filename is `{metadata.get('dataset_version')}` "
                f"but selected version is `{ver}`."
            )
        if metadata.get("language") != lang:
            errors.append(
                f"Language in filename is `{metadata.get('language')}` but selected "
                f"language is `{lang}`."
            )
        if metadata.get("run") != run:
            errors.append(
                f"Run number in filename is `{metadata.get('run')}` but form run is "
                f"`{run}`."
            )
        if metadata.get("split") != "masked-test":
            errors.append(
                f"Split in filename must be `masked-test` for official submissions; "
                f"got `{metadata.get('split')}`."
            )

    errors.extend(validate_content(file_p))

    if errors:
        return _format_errors(errors)

    # Guard against a validator that reports no errors but yields no metadata.
    bench_version = metadata.get("bench_version") if metadata else None
    if not bench_version:
        return (
            "**Error**: Could not determine the benchmark version from the "
            "filename."
        )

    results_df = load_results_df()
    try:
        submissions_df = _load_submissions_index_strict()
    except Exception as e:
        # Abort before uploading anything: continuing with an empty frame
        # would replace the whole index with just this submission.
        log.exception("Failed to read submissions index before accepting a submission")
        return (
            f"**Error**: Could not read the submissions index: `{e!r}`. "
            f"Nothing was uploaded; please try again in a few minutes."
        )

    dup_msg = (
        f"A submission already exists for `{team_name}` / `{dataset_versioned}` "
        f"(`{lang}`) / `run{run}`. Use a different run number to submit a new attempt."
    )
    if _existing_match(submissions_df, team_name, run, dataset_versioned, lang):
        return f"**Error**: {dup_msg}"
    if _existing_match(results_df, team_name, run, dataset_versioned, lang):
        return f"**Error**: {dup_msg}"

    submitted_at = datetime.now(timezone.utc)
    submitted_at_iso = submitted_at.strftime("%Y-%m-%dT%H:%M:%S")
    submitted_at_str = submitted_at.strftime("%Y-%m-%d %H:%M:%S")
    submission_id = (
        f"{team_name}_run{run}_{dataset_versioned}_{lang}_{submitted_at_iso}"
    )

    api = HfApi(token=HF_TOKEN)
    try:
        api.upload_file(
            path_or_fileobj=str(file_p),
            path_in_repo=f"{SUBMISSIONS_FILES_PREFIX}{filename}",
            repo_id=SUBMISSIONS_REPO,
            repo_type="dataset",
            commit_message=f"Add submission {submission_id}",
        )
    except Exception as e:
        log.exception("Failed to upload submission file")
        return f"**Error**: Could not upload submission file: `{e!r}`"

    new_row: dict[str, Any] = {
        "submission_id": submission_id,
        "team": team_name,
        "run": run,
        "system_description": (system_description or "").strip(),
        "submitted_at": submitted_at_str,
        "filename": filename,
        "status": "pending",
        "dataset": dataset_versioned,
        "language": lang,
        "bench_version": bench_version,
    }
    try:
        existing = (
            _sanitize(submissions_df.to_dict("records"))
            if not submissions_df.empty
            else []
        )
        # The index is rewritten in full: existing rows plus the new one.
        ds = Dataset.from_list(existing + [new_row])
        ds.push_to_hub(
            SUBMISSIONS_REPO,
            token=HF_TOKEN,
            commit_message=f"Add submission index row {submission_id}",
        )
    except Exception as e:
        log.exception("Failed to push submissions index")
        return (
            f"**Warning**: The submission file uploaded but the metadata index "
            f"could not be updated: `{e!r}`. Contact the maintainer with this "
            f"submission ID: `{submission_id}`."
        )

    coverage = _coverage_for_run(
        results_df, submissions_df, team_name, run, (dataset_versioned, lang)
    )

    return (
        f"**Submission accepted!**\n\n"
        f"- Submission ID: `{submission_id}`\n"
        f"- Coverage for `{team_name}` / `run{run}`: **{coverage}/{N_TOTAL_OVERALL}** datasets submitted.\n\n"
        f"Your submission is now in the queue. Scores will appear on the leaderboard "
        f"after the evaluator processes it (within ~15 minutes)."
    )


# -- UI ----------------------------------------------------------------------


def _empty(cols: list[str]) -> pd.DataFrame:
    """Return an empty table with the given header, used as a placeholder."""
    return pd.DataFrame(columns=cols)


CUSTOM_CSS = """
.gradio-container { font-size: 18px; }
.gradio-container p,
.gradio-container li,
.gradio-container label,
.gradio-container .markdown-body { font-size: 18px; }
.gradio-container h1 { font-size: 32px; }
.gradio-container h2 { font-size: 26px; }
.gradio-container h3 { font-size: 22px; }
.dataframe { font-size: 17px; }
.dataframe th { font-size: 17px; font-weight: bold; }
button, .tab-nav button { font-size: 17px; }
"""

# Filled with `ts` (timestamp text) and `minutes` (refresh period).
FOOTER_TEMPLATE = """
Last refreshed: {ts} UTC  •  next refresh in {minutes} min.
"""

BIBTEX_PLACEHOLDER = """```bibtex
@inproceedings{hipe-ocrepair-2026,
  title = {ICDAR 2026 HIPE-OCRepair Competition on LLM-Assisted OCR Post-Correction for Historical Documents},
  booktitle = {Proceedings of ICDAR 2026},
  year = {2026},
  url = {https://github.com/hipe-eval/HIPE-OCRepair-2026-eval},
  note = {Placeholder; the full citation will be added when the ICDAR 2026 proceedings are published.}
}
```"""


with gr.Blocks(title="HIPE-OCRepair leaderboard", css=CUSTOM_CSS) as demo:
    gr.HTML("""
HIPE-OCRepair — Historical OCR document

🏆 HIPE-OCRepair Leaderboard

Can LLMs address the OCR debt in digitized historical collections?

A multilingual benchmark for LLM-based OCR post-correction of historical documents.

Evaluating systems on historical newspapers and printed books in English, French, and German (17th–20th century) across 8 test sets and 3 languages.

📄 Competition report  ·  🌐 Benchmark website  ·  📦 Data  ·  ⚖️ Scorer

""")

    # Components that `refresh_all` fills, keyed for the wiring further down.
    lang_tables: dict[str, dict[str, gr.Dataframe]] = {}
    dataset_tables: dict[tuple[str, str], gr.Dataframe] = {}

    with gr.Tabs():
        # --- Tab 1: Overall ranking ---
        with gr.Tab("🏆 Overall Ranking"):
            gr.Markdown(
                "Weighted mean across all 8 test sets. **cMER (↓)** lower is "
                "better; **prefScore (↑)** higher is better. Only systems "
                "covering all 8 test sets are ranked here; partial-coverage "
                "systems are listed separately below."
            )
            overall_table = gr.Dataframe(
                _empty(OVERALL_COLS),
                interactive=False,
                wrap=True,
            )
            gr.Markdown(
                "### Incomplete submissions\n_Systems missing at least one test set._"
            )
            overall_incomplete_table = gr.Dataframe(
                _empty(OVERALL_COLS),
                interactive=False,
                wrap=True,
            )

        # --- Tab 2: By language ---
        with gr.Tab("🌍 By Language"):
            with gr.Tabs():
                for lang in LANGUAGES:
                    with gr.Tab(LANGUAGE_NAMES[lang]):
                        gr.Markdown(
                            f"Weighted mean across {LANGUAGE_NAMES[lang]} "
                            f"test sets only ({N_TOTAL_BY_LANG[lang]} cells). "
                            f"**cMER (↓)** lower is better; **prefScore (↑)** "
                            f"higher is better."
                        )
                        comp_tbl = gr.Dataframe(
                            _empty(OVERALL_COLS), interactive=False, wrap=True
                        )
                        gr.Markdown(
                            "### Incomplete submissions\n"
                            f"_Systems missing at least one {LANGUAGE_NAMES[lang]} test set._"
                        )
                        incomp_tbl = gr.Dataframe(
                            _empty(OVERALL_COLS), interactive=False, wrap=True
                        )
                        lang_tables[lang] = {
                            "complete": comp_tbl,
                            "incomplete": incomp_tbl,
                        }

        # --- Tab 3: By dataset ---
        with gr.Tab("📂 By Dataset"):
            with gr.Tabs():
                for base_ds, ver, lang in DATASET_CELLS_ORDERED:
                    ds_versioned = f"{base_ds}_{ver}"
                    with gr.Tab(f"{ds_versioned} ({lang})"):
                        gr.Markdown(
                            f"Per-system scores on `{ds_versioned}` "
                            f"({LANGUAGE_NAMES[lang]}). **cMER micro (↓)** "
                            f"is the primary metric; **Pref cMER Macro (↑)** "
                            f"is the preference score."
                        )
                        tbl = gr.Dataframe(
                            _empty(DATASET_COLS), interactive=False, wrap=True
                        )
                        dataset_tables[(ds_versioned, lang)] = tbl

        # --- Tab 4: Systems ---
        with gr.Tab("🤖 Systems"):
            gr.Markdown(
                "All registered systems — every `(team, run)` combination, "
                "complete or partial. _First submitted_ is the earliest upload "
                "timestamp; seeded ICDAR 2026 entries show their evaluation date."
            )
            systems_table = gr.Dataframe(
                _empty(SYSTEMS_COLS), interactive=False, wrap=True
            )

        # --- Tab 5: Submit ---
        with gr.Tab("📤 Submit"):
            gr.Markdown(
                "Upload one `.jsonl` file per test set. A team needs 8 files "
                "(one per test set) for full coverage.\n\n"
                "Filename pattern:\n\n"
                "```\n"
                # Placeholders name the fields that `handle_submission`
                # cross-checks against the form; they sit in a code fence so
                # the angle brackets render literally.
                "<team>_hipe-ocrepair-bench_<bench_version>_<dataset>_<dataset_version>"
                "_masked-test_<language>_run<N>.jsonl\n"
                "```\n\n"
                "Example: `bnf-mistral_hipe-ocrepair-bench_v0.9_impresso-snippets_v1.0_masked-test_fr_run1.jsonl`."
            )
            with gr.Row():
                with gr.Column():
                    team_input = gr.Textbox(
                        label="Team name", placeholder="e.g. bnf-mistral"
                    )
                    description_input = gr.Textbox(
                        label="System description (optional; per (team, run); first one wins)",
                        lines=3,
                    )
                    run_input = gr.Number(
                        label="Run number", value=1, precision=0, minimum=1
                    )
                    dataset_input = gr.Dropdown(
                        label="Dataset",
                        choices=DROPDOWN_CHOICES,
                        value=None,
                    )
                    file_input = gr.File(
                        label="Submission file (.jsonl)",
                        file_types=[".jsonl"],
                        type="filepath",
                    )
                    submit_btn = gr.Button("Submit", variant="primary")
                with gr.Column():
                    submit_status = gr.Markdown("_Awaiting submission…_")

            submit_btn.click(
                fn=handle_submission,
                inputs=[
                    team_input,
                    description_input,
                    run_input,
                    dataset_input,
                    file_input,
                ],
                outputs=submit_status,
            )

        # --- Tab 6: About ---
        with gr.Tab("ℹ️ About"):
            # NOTE(review): the three "Code and data" bullets below have no
            # link target after the colon — confirm the intended URLs.
            gr.Markdown(
                "## About HIPE-OCRepair 2026\n\n"
                "**HIPE-OCRepair 2026: Shared Task on OCR Post-Correction of "
                "Historical Documents** — part of the ICDAR 2026 competition series. "
                "Systems are evaluated on 8 test sets across 3 languages "
                "(German, English, French).\n\n"
                "### Metrics\n\n"
                "- **cMER (character-level Match Error Rate, micro-averaged)** "
                "— lower is better (↓). Proportion of mismatched characters between "
                "the predicted post-correction and the gold reference.\n"
                "- **prefScore (preference cMER, macro-averaged)** — higher is "
                "better (↑). Captures whether post-correction improves over "
                "leaving the OCR unchanged.\n\n"
                "Both metrics report 95% bootstrap confidence intervals.\n\n"
                "### Weighting rule\n\n"
                "Overall and per-language scores are weighted means. The three "
                "DTA cells (`dta19-l0_v0.1`, `dta19-l1_v0.1`, `dta19-l2_v0.1`) "
                "each carry weight 1/3 so that the three together count as one "
                "dataset. All other cells carry weight 1.\n\n"
                "### Completeness\n\n"
                "Only systems covering all 8 test sets appear in the overall "
                "ranking. Partial-coverage systems are listed in a separate "
                "_Incomplete submissions_ section under each ranking view. "
                "The _Systems_ tab shows every registered `(team, run)` pair.\n\n"
                "### Code and data\n\n"
                "- Evaluation orchestrator: \n"
                "- Scorer package: \n"
                "- Public data: \n\n"
            )

    # NOTE(review): placed at page level, below the tabs; confirm it was not
    # meant to sit inside the About tab.
    with gr.Accordion("📄 Citation", open=False):
        gr.Markdown(
            "If you use these results, please cite **HIPE-OCRepair-2026 "
            "Competition** (ICDAR 2026). Placeholder BibTeX — will be "
            "replaced once the ICDAR 2026 proceedings are published.\n\n"
            + BIBTEX_PLACEHOLDER
        )

    refresh_footer = gr.HTML(
        elem_id="refresh-footer",
        value=FOOTER_TEMPLATE.format(
            ts="—", minutes=REFRESH_INTERVAL_SECONDS // 60
        ),
    )

    # Wire refresh outputs in the exact order returned by `refresh_all`.
    refresh_outputs: list[gr.Dataframe | gr.HTML] = [
        overall_table,
        overall_incomplete_table,
    ]
    for lang in LANGUAGES:
        refresh_outputs.append(lang_tables[lang]["complete"])
        refresh_outputs.append(lang_tables[lang]["incomplete"])
    for base_ds, ver, lang in DATASET_CELLS_ORDERED:
        refresh_outputs.append(dataset_tables[(f"{base_ds}_{ver}", lang)])
    refresh_outputs.append(systems_table)
    refresh_outputs.append(refresh_footer)

    # Populate once on page load, then on every timer tick.
    demo.load(fn=refresh_all, inputs=None, outputs=refresh_outputs)
    refresh_timer = gr.Timer(REFRESH_INTERVAL_SECONDS)
    refresh_timer.tick(fn=refresh_all, inputs=None, outputs=refresh_outputs)


if __name__ == "__main__":
    if not HF_TOKEN:
        log.warning(
            "HF_TOKEN not set — the leaderboard will load public results "
            "but submissions and the submissions-index read will be disabled."
        )
    demo.launch()