"""Filename and content validation for HIPE-OCRepair leaderboard submissions. Mirrors the validation logic in `HIPE-OCRepair-2026-eval/lib/validate_submissions.py` but exposes structured return values rather than printing to stderr, so it can be called from a Gradio handler. """ from __future__ import annotations import json import re from pathlib import Path from typing import Iterable from jsonschema import Draft202012Validator SCHEMA_PATH = Path(__file__).parent / "schema" / "hipe-ocrepair.schema.json" KNOWN_LANGUAGES = ("de", "en", "fr") KNOWN_SPLITS = ("masked-test", "test") KNOWN_DATASETS = ( "dta19-l0", "dta19-l1", "dta19-l2", "icdar2017", "impresso-snippets", ) FILENAME_RE = re.compile( r"^(?P[A-Za-z0-9][A-Za-z0-9-]*)" r"_hipe-ocrepair-bench" r"_(?Pv[0-9][0-9.]*)" r"_(?P[A-Za-z0-9][A-Za-z0-9-]*)" r"_(?Pv[0-9][0-9.]*)" r"_(?P[a-z][a-z0-9-]*)" r"_(?P[a-z]{2,3})" r"_run(?P\d+)" r"\.jsonl$" ) EXPECTED_PATTERN = ( "_hipe-ocrepair-bench___" "___run.jsonl" ) def validate_filename(filename: str) -> tuple[list[str], dict]: """Validate a submission filename against the naming convention. Returns a tuple ``(errors, metadata)``. ``errors`` is empty when the filename matches the convention and the parsed fields use known values for split / language / dataset. ``metadata`` contains the parsed fields when the regex matched, otherwise an empty dict. """ name = Path(filename).name match = FILENAME_RE.match(name) if not match: return ( [ f"Filename {name!r} does not match the expected pattern " f"{EXPECTED_PATTERN!r}." ], {}, ) metadata = { "team": match.group("team"), "bench_version": match.group("bench_version"), "dataset": match.group("dataset"), "dataset_version": match.group("dataset_version"), "split": match.group("split"), "language": match.group("language"), "run": int(match.group("run")), } errors: list[str] = [] if metadata["split"] not in KNOWN_SPLITS: errors.append( f"Unknown split {metadata['split']!r}; expected one of {KNOWN_SPLITS}." ) if metadata["language"] not in KNOWN_LANGUAGES: errors.append( f"Unknown language {metadata['language']!r}; " f"expected one of {KNOWN_LANGUAGES}." ) if metadata["dataset"] not in KNOWN_DATASETS: errors.append( f"Unknown dataset {metadata['dataset']!r}; " f"expected one of {KNOWN_DATASETS}." ) if metadata["run"] < 1: errors.append(f"Run number must be >= 1, got {metadata['run']}.") return errors, metadata def _load_schema(schema_path: Path) -> dict: with open(schema_path, "r", encoding="utf-8") as handle: return json.load(handle) def _iter_jsonl(filepath: Path) -> Iterable[tuple[int, str]]: with open(filepath, "r", encoding="utf-8") as handle: for line_num, line in enumerate(handle, start=1): if line.strip(): yield line_num, line def validate_content(filepath: Path, schema_path: Path = SCHEMA_PATH) -> list[str]: """Validate the JSONL content of a submission file against the HIPE schema. Each non-empty line is parsed as JSON and validated independently. All errors across all lines are collected and returned as human-readable strings; an empty list means the file is valid. """ filepath = Path(filepath) if not filepath.is_file(): return [f"File not found: {filepath}"] try: schema = _load_schema(Path(schema_path)) except FileNotFoundError: return [f"Schema file not found: {schema_path}"] except json.JSONDecodeError as exc: return [f"Schema file is not valid JSON: {exc}"] validator = Draft202012Validator(schema) errors: list[str] = [] record_count = 0 for line_num, line in _iter_jsonl(filepath): try: record = json.loads(line) except json.JSONDecodeError as exc: errors.append(f"Line {line_num}: invalid JSON — {exc.msg} (col {exc.colno}).") continue record_count += 1 for err in validator.iter_errors(record): location = "/".join(str(p) for p in err.absolute_path) or "" errors.append(f"Line {line_num}: schema violation at {location} — {err.message}") if record_count == 0 and not errors: errors.append("File contains no JSON records.") return errors def validate_submission( filename: str, filepath: Path, schema_path: Path = SCHEMA_PATH, ) -> tuple[list[str], dict]: """Validate both filename and content of a submission. Convenience wrapper that runs :func:`validate_filename` and :func:`validate_content` and concatenates their errors. Returns ``(errors, metadata)``. """ filename_errors, metadata = validate_filename(filename) content_errors = validate_content(Path(filepath), schema_path=schema_path) return filename_errors + content_errors, metadata