Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

File size: 9,637 Bytes

acad3da

"""Génère les tableaux Markdown du README depuis le code réel.

Sprint A13 (item M-22 / M-23 / M-25 / M-26 du plan de remédiation).

Ce script remplace les listes manuelles qui dérivaient silencieusement
(le bug typique : un nouvel engine ajouté → README pas mis à jour →
``test_readme_consistency`` casse au prochain CI).

Trois tableaux sont produits :

1. **Engines** : un par fichier ``picarones/engines/*.py`` (hors base /
   factory / __init__).
2. **CLI commands** : depuis ``picarones --help``.
3. **API endpoints** : depuis ``app.openapi()["paths"]``.

Le script écrit chaque tableau dans le README entre des balises HTML
``<!-- generated:engines -->`` … ``<!-- /generated:engines -->`` (idem
``cli`` et ``endpoints``). En CI, un job re-exécute ce script et
échoue si le diff Git est non vide — garantissant l'absence de dérive.

Usage :

.. code-block:: bash

    python scripts/gen_readme_tables.py            # met à jour README.md
    python scripts/gen_readme_tables.py --check    # CI : exit 1 si diff
"""

from __future__ import annotations

import argparse
import re
import subprocess
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parent.parent
README = REPO_ROOT / "README.md"


# ---------------------------------------------------------------------------
# Engines
# ---------------------------------------------------------------------------


_ENGINE_DESCRIPTIONS: dict[str, tuple[str, str, str]] = {
    # name → (display_name, type, install_hint)
    "tesseract": ("Tesseract 5", "Local CLI", "`pip install pytesseract` + system binary"),
    "pero_ocr": ("Pero OCR", "Local Python", "`pip install -e .[pero]`"),
    "mistral_ocr": ("Mistral OCR", "Cloud API", "`MISTRAL_API_KEY` env var"),
    "google_vision": ("Google Vision", "Cloud API", "`GOOGLE_APPLICATION_CREDENTIALS` env var"),
    "azure_doc_intel": ("Azure Doc Intelligence", "Cloud API", "`AZURE_DOC_INTEL_ENDPOINT` + `AZURE_DOC_INTEL_KEY`"),
}


def _engine_files() -> list[str]:
    """Retourne la liste triée des modules d'engines (sans base / factory)."""
    out: list[str] = []
    for path in sorted((REPO_ROOT / "picarones" / "engines").glob("*.py")):
        name = path.stem
        if name in {"__init__", "base", "factory"}:
            continue
        out.append(name)
    return out


def build_engines_table() -> str:
    rows = [
        "| Engine | Type | Installation |",
        "|--------|------|-------------|",
    ]
    for name in _engine_files():
        display, kind, install = _ENGINE_DESCRIPTIONS.get(
            name,
            (name, "Unknown", "—"),
        )
        rows.append(f"| **{display}** | {kind} | {install} |")
    return "\n".join(rows)


# ---------------------------------------------------------------------------
# CLI commands
# ---------------------------------------------------------------------------


_CLI_DESCRIPTIONS: dict[str, str] = {
    "run": "Run a full benchmark on a corpus",
    "report": "Generate an HTML report from JSON results",
    "demo": "Generate a demo report with synthetic data (no engine required)",
    "metrics": "Compute CER/WER between two text files",
    "engines": "List available OCR engines and LLM adapters",
    "info": "Display version and system information",
    "serve": "Launch the FastAPI web interface",
    "history": "Query longitudinal benchmark history (SQLite)",
    "robustness": "Run robustness analysis with degraded images",
    "import": "Import a corpus from a remote source (IIIF, HF, HTR-United)",
    "compare": "Compare two benchmark JSON runs and flag regressions (Sprint 28)",
    "pipeline": "Run / compare composed pipelines from a YAML spec (Sprint 70)",
    "diagnose": "Pre-wired workflow: bench + improvement levers + factual recommendations",
    "economics": "Pre-wired workflow: bench + effective throughput + cost projection",
    "edition": "Pre-wired workflow: bench + philological metrics for critical editing",
}


def build_cli_table() -> str:
    from picarones.cli import cli

    rows = [
        "| Command | Description |",
        "|---------|-------------|",
    ]
    for name in sorted(cli.commands.keys()):
        desc = _CLI_DESCRIPTIONS.get(name, "—")
        rows.append(f"| `picarones {name}` | {desc} |")
    return "\n".join(rows)


# ---------------------------------------------------------------------------
# API endpoints
# ---------------------------------------------------------------------------


def build_endpoints_table() -> str:
    from picarones.web.app import app

    spec = app.openapi()
    rows = [
        "| Method | Endpoint | Summary |",
        "|--------|----------|---------|",
    ]
    for path in sorted(spec.get("paths", {})):
        methods = spec["paths"][path]
        for method, definition in sorted(methods.items()):
            if method.upper() not in ("GET", "POST", "PUT", "DELETE", "PATCH"):
                continue
            summary = (
                definition.get("summary")
                or (definition.get("description", "") or "—").split("\n")[0]
            )
            # Tronque à 60 caractères pour le tableau.
            if len(summary) > 80:
                summary = summary[:77] + "…"
            rows.append(
                f"| `{method.upper()}` | `{path}` | {summary} |"
            )
    return "\n".join(rows)


# ---------------------------------------------------------------------------
# Test count
# ---------------------------------------------------------------------------


def collect_test_count() -> int | None:
    """Lance ``pytest --collect-only`` et extrait le compteur."""
    try:
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "pytest",
                "--collect-only",
                "-q",
                "--no-cov",
                "-p",
                "no:cacheprovider",
                "tests/",
            ],
            capture_output=True,
            text=True,
            cwd=REPO_ROOT,
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        return None
    for line in reversed(result.stdout.strip().split("\n")):
        m = re.search(r"(\d+)\s+tests?\s+collected", line)
        if m:
            return int(m.group(1))
    return None


# ---------------------------------------------------------------------------
# Insertion dans le README
# ---------------------------------------------------------------------------


def _replace_section(text: str, marker: str, content: str) -> str:
    """Remplace le contenu entre ``<!-- generated:<marker> -->`` et
    ``<!-- /generated:<marker> -->`` ; conserve le reste du fichier
    intact. Si les balises sont absentes, retourne le texte inchangé
    (le README doit être mis à jour avec les balises au moins une fois
    manuellement avant que ce script puisse opérer)."""
    pattern = re.compile(
        rf"(<!--\s*generated:{marker}\s*-->)(.*?)(<!--\s*/generated:{marker}\s*-->)",
        re.DOTALL,
    )
    replacement = f"\\1\n\n{content}\n\n\\3"
    new_text, n = pattern.subn(replacement, text)
    if n == 0:
        sys.stderr.write(
            f"[gen_readme_tables] Marqueurs <!-- generated:{marker} --> "
            f"absents du README — section non mise à jour.\n"
        )
        return text
    return new_text


def _replace_test_count(text: str, count: int) -> str:
    """Remplace les mentions ``N tests`` ou ``N passed`` qui citent un
    nombre dans la fenêtre [count*0.5, count*2]. Garde la formulation
    exacte (espace, ponctuation) intacte."""
    def _sub(match: re.Match) -> str:
        cited = int(match.group(1))
        # Ne touche pas si le nombre cité est complètement hors plage —
        # c'est probablement une autre référence (un chiffre dans une
        # phrase qui parle d'autre chose).
        if cited < count * 0.5 or cited > count * 2:
            return match.group(0)
        return match.group(0).replace(str(cited), str(count))

    return re.sub(r"(\d{3,5})\s+(?:tests|passed)\b", _sub, text)


def render_readme(check_only: bool = False) -> int:
    """Met à jour les sections générées du README. Retourne 0 ou 1."""
    if not README.exists():
        sys.stderr.write(f"README absent : {README}\n")
        return 1

    original = README.read_text(encoding="utf-8")
    text = original
    text = _replace_section(text, "engines", build_engines_table())
    text = _replace_section(text, "cli", build_cli_table())
    text = _replace_section(text, "endpoints", build_endpoints_table())

    count = collect_test_count()
    if count is not None:
        text = _replace_test_count(text, count)

    if check_only:
        if text != original:
            sys.stderr.write(
                "[gen_readme_tables] README divergent du code généré. "
                "Lancer ``python scripts/gen_readme_tables.py`` puis "
                "committer.\n"
            )
            return 1
        return 0

    if text != original:
        README.write_text(text, encoding="utf-8")
        print(f"[gen_readme_tables] README mis à jour ({len(text)} octets).")
    else:
        print("[gen_readme_tables] README déjà à jour.")
    return 0


def main() -> int:
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--check",
        action="store_true",
        help="N'écrit rien ; sort 1 si le README diverge du code généré.",
    )
    args = parser.parse_args()
    return render_readme(check_only=args.check)


if __name__ == "__main__":
    sys.exit(main())