Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Running

Claude committed on Mar 4

Commit

28b6ae2

unverified ·

1 Parent(s): 6c55b7a

Sprint 3 — Pipelines OCR+LLM, adaptateurs LLM, sur-normalisation (classe 10)

Modules créés :
- picarones/llm/ : BaseLLMAdapter + adaptateurs OpenAI (GPT-4o), Anthropic
(Claude Sonnet), Mistral Large, Ollama (modèles locaux)
- picarones/pipelines/ : OCRLLMPipeline (3 modes : text_only, text_and_image,
zero_shot) + détection sur-normalisation (classe 10 de la taxonomie)
- picarones/prompts/ : bibliothèque de 5 prompts .txt versionnés pour
documents en français médiéval et imprimés anciens

Comportement :
- OCRLLMPipeline étend BaseOCREngine : un pipeline est un concurrent normal
dans run_benchmark (mêmes métriques CER/WER)
- Sur-normalisation (classe 10) calculée automatiquement par document et
agrégée dans EngineReport.pipeline_info
- Concurrent fictif « tesseract → gpt-4o » ajouté aux fixtures de démo

Rapport HTML :
- Badge « ⛓ pipeline » + étapes OCR→LLM dans le tableau de classement
- Colonne « Sur-norm. » dans le classement
- Triple-diff dans la vue Document (GT→OCR brut / OCR brut→LLM corrigé)
- Pipeline badge dans les cartes de la galerie

Tests : 154 tests passants (+46 nouveaux tests Sprint 3)

https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq

Files changed (21) hide show

picarones/core/results.py +26 -3
picarones/core/runner.py +88 -17
picarones/fixtures.py +109 -7
picarones/llm/__init__.py +16 -0
picarones/llm/anthropic_adapter.py +72 -0
picarones/llm/base.py +95 -0
picarones/llm/mistral_adapter.py +69 -0
picarones/llm/ollama_adapter.py +70 -0
picarones/llm/openai_adapter.py +68 -0
picarones/pipelines/__init__.py +14 -0
picarones/pipelines/base.py +243 -0
picarones/pipelines/over_normalization.py +121 -0
picarones/prompts/correction_image_medieval_french.txt +19 -0
picarones/prompts/correction_imprime_ancien.txt +20 -0
picarones/prompts/correction_medieval_french.txt +18 -0
picarones/prompts/zero_shot_imprime_ancien.txt +15 -0
picarones/prompts/zero_shot_medieval_french.txt +14 -0
picarones/report/generator.py +132 -7
rapport_demo.html +0 -0
tests/test_report.py +5 -3
tests/test_sprint3_llm_pipelines.py +441 -0

picarones/core/results.py CHANGED Viewed

@@ -30,9 +30,14 @@ class DocumentResult:
     metrics: MetricsResult
     duration_seconds: float
     engine_error: Optional[str] = None
     def as_dict(self) -> dict:
-        return {
             "doc_id": self.doc_id,
             "image_path": self.image_path,
             "ground_truth": self.ground_truth,
@@ -41,17 +46,27 @@ class DocumentResult:
             "duration_seconds": self.duration_seconds,
             "engine_error": self.engine_error,
         }
 @dataclass
 class EngineReport:
-    """Rapport complet d'un moteur sur l'ensemble du corpus."""
     engine_name: str
     engine_version: str
     engine_config: dict
     document_results: list[DocumentResult]
     aggregated_metrics: dict = field(default_factory=dict)
     def __post_init__(self) -> None:
         if not self.aggregated_metrics and self.document_results:
@@ -69,14 +84,22 @@ class EngineReport:
         wer_stats = self.aggregated_metrics.get("wer", {})
         return wer_stats.get("mean")
     def as_dict(self) -> dict:
-        return {
             "engine_name": self.engine_name,
             "engine_version": self.engine_version,
             "engine_config": self.engine_config,
             "aggregated_metrics": self.aggregated_metrics,
             "document_results": [dr.as_dict() for dr in self.document_results],
         }
 @dataclass

     metrics: MetricsResult
     duration_seconds: float
     engine_error: Optional[str] = None
+    # Champs spécifiques aux pipelines OCR+LLM
+    ocr_intermediate: Optional[str] = None
+    """Sortie OCR brute avant correction LLM (None pour les moteurs OCR seuls)."""
+    pipeline_metadata: dict = field(default_factory=dict)
+    """Métadonnées du pipeline : mode, prompt, over-normalization…"""
     def as_dict(self) -> dict:
+        d = {
             "doc_id": self.doc_id,
             "image_path": self.image_path,
             "ground_truth": self.ground_truth,
             "duration_seconds": self.duration_seconds,
             "engine_error": self.engine_error,
         }
+        if self.ocr_intermediate is not None:
+            d["ocr_intermediate"] = self.ocr_intermediate
+        if self.pipeline_metadata:
+            d["pipeline_metadata"] = self.pipeline_metadata
+        return d
 @dataclass
 class EngineReport:
+    """Rapport complet d'un moteur (ou pipeline) sur l'ensemble du corpus."""
     engine_name: str
     engine_version: str
     engine_config: dict
     document_results: list[DocumentResult]
     aggregated_metrics: dict = field(default_factory=dict)
+    pipeline_info: dict = field(default_factory=dict)
+    """Métadonnées du pipeline OCR+LLM (vide pour les moteurs OCR seuls).
+    Clés typiques : mode, prompt_file, llm_model, llm_provider, pipeline_steps,
+    over_normalization (score agrégé, classe 10 de la taxonomie).
+    """
     def __post_init__(self) -> None:
         if not self.aggregated_metrics and self.document_results:
         wer_stats = self.aggregated_metrics.get("wer", {})
         return wer_stats.get("mean")
+    @property
+    def is_pipeline(self) -> bool:
+        """Vrai si ce rapport correspond à un pipeline OCR+LLM."""
+        return bool(self.pipeline_info)
     def as_dict(self) -> dict:
+        d = {
             "engine_name": self.engine_name,
             "engine_version": self.engine_version,
             "engine_config": self.engine_config,
             "aggregated_metrics": self.aggregated_metrics,
             "document_results": [dr.as_dict() for dr in self.document_results],
         }
+        if self.pipeline_info:
+            d["pipeline_info"] = self.pipeline_info
+        return d
 @dataclass

picarones/core/runner.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Orchestrateur du benchmark : exécute les moteurs sur le corpus et agrège les résultats."""
 from __future__ import annotations
@@ -9,7 +9,7 @@ from typing import Optional
 from tqdm import tqdm
 from picarones.core.corpus import Corpus
-from picarones.core.metrics import compute_metrics
 from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
 from picarones.engines.base import BaseOCREngine
@@ -22,33 +22,37 @@ def run_benchmark(
     output_json: Optional[str | Path] = None,
     show_progress: bool = True,
 ) -> BenchmarkResult:
-    """Exécute le benchmark d'un ou plusieurs moteurs sur un corpus.
-    Pour chaque moteur, chaque document est traité séquentiellement.
-    Les sorties sont évaluées par rapport à la vérité terrain via
-    les métriques CER et WER.
     Parameters
     ----------
     corpus:
-        Corpus à évaluer (objet ``Corpus`` avec ses ``Document``).
     engines:
-        Liste d'adaptateurs moteurs à comparer.
     output_json:
-        Chemin optionnel pour écrire le résultat JSON. Si ``None``, pas
-        d'écriture disque.
     show_progress:
-        Affiche une barre de progression tqdm (défaut : True).
     Returns
     -------
     BenchmarkResult
-        Objet contenant tous les résultats, agrégations et classement.
     """
     engine_reports: list[EngineReport] = []
     for engine in engines:
-        logger.info("Démarrage moteur : %s", engine.name)
         document_results: list[DocumentResult] = []
         iterator = tqdm(
@@ -64,9 +68,6 @@ def run_benchmark(
             if ocr_result.success:
                 metrics = compute_metrics(doc.ground_truth, ocr_result.text)
             else:
-                # Moteur en erreur → métriques dégradées avec erreur tracée
-                from picarones.core.metrics import MetricsResult
                 metrics = MetricsResult(
                     cer=1.0, cer_nfc=1.0, cer_caseless=1.0,
                     wer=1.0, wer_normalized=1.0, mer=1.0, wil=1.0,
@@ -75,6 +76,27 @@ def run_benchmark(
                     error=ocr_result.error,
                 )
             document_results.append(
                 DocumentResult(
                     doc_id=doc.doc_id,
@@ -84,19 +106,24 @@ def run_benchmark(
                     metrics=metrics,
                     duration_seconds=ocr_result.duration_seconds,
                     engine_error=ocr_result.error,
                 )
             )
         engine_version = engine._safe_version()
         report = EngineReport(
             engine_name=engine.name,
             engine_version=engine_version,
             engine_config=engine.config,
             document_results=document_results,
         )
         engine_reports.append(report)
         logger.info(
-            "Moteur %s terminé — CER moyen : %.2f%%",
             engine.name,
             (report.mean_cer or 0) * 100,
         )
@@ -113,3 +140,47 @@ def run_benchmark(
         logger.info("Résultats écrits dans : %s", path)
     return benchmark

+"""Orchestrateur du benchmark : exécute les moteurs/pipelines sur le corpus et agrège les résultats."""
 from __future__ import annotations
 from tqdm import tqdm
 from picarones.core.corpus import Corpus
+from picarones.core.metrics import MetricsResult, compute_metrics
 from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
 from picarones.engines.base import BaseOCREngine
     output_json: Optional[str | Path] = None,
     show_progress: bool = True,
 ) -> BenchmarkResult:
+    """Exécute le benchmark d'un ou plusieurs moteurs/pipelines sur un corpus.
+    Les pipelines OCR+LLM (``OCRLLMPipeline``) sont traités exactement comme
+    les moteurs OCR classiques — ils implémentent la même interface
+    ``BaseOCREngine`` et produisent les mêmes métriques CER/WER.
+    En supplément, pour les pipelines :
+    - La sortie OCR intermédiaire est conservée dans ``DocumentResult.ocr_intermediate``
+    - La sur-normalisation LLM (classe 10) est calculée et stockée dans
+      ``DocumentResult.pipeline_metadata["over_normalization"]``
+    - Les stats agrégées de sur-normalisation figurent dans ``EngineReport.pipeline_info``
     Parameters
     ----------
     corpus:
+        Corpus à évaluer.
     engines:
+        Liste d'adaptateurs moteurs ou de pipelines OCR+LLM.
     output_json:
+        Chemin optionnel pour écrire le résultat JSON.
     show_progress:
+        Affiche une barre de progression tqdm.
     Returns
     -------
     BenchmarkResult
     """
     engine_reports: list[EngineReport] = []
     for engine in engines:
+        logger.info("Démarrage concurrent : %s", engine.name)
         document_results: list[DocumentResult] = []
         iterator = tqdm(
             if ocr_result.success:
                 metrics = compute_metrics(doc.ground_truth, ocr_result.text)
             else:
                 metrics = MetricsResult(
                     cer=1.0, cer_nfc=1.0, cer_caseless=1.0,
                     wer=1.0, wer_normalized=1.0, mer=1.0, wil=1.0,
                     error=ocr_result.error,
                 )
+            # Extraction des champs pipeline depuis les métadonnées EngineResult
+            ocr_intermediate = ocr_result.metadata.get("ocr_intermediate")
+            pipeline_meta: dict = {}
+            if ocr_result.metadata.get("is_pipeline"):
+                pipeline_meta = {
+                    "pipeline_mode": ocr_result.metadata.get("pipeline_mode"),
+                    "prompt_file": ocr_result.metadata.get("prompt_file"),
+                    "llm_model": ocr_result.metadata.get("llm_model"),
+                    "llm_provider": ocr_result.metadata.get("llm_provider"),
+                }
+                # Calcul de la sur-normalisation (classe 10) si OCR intermédiaire disponible
+                if ocr_intermediate is not None and ocr_result.success:
+                    from picarones.pipelines.over_normalization import detect_over_normalization
+                    over_norm = detect_over_normalization(
+                        ground_truth=doc.ground_truth,
+                        ocr_text=ocr_intermediate,
+                        llm_text=ocr_result.text,
+                    )
+                    pipeline_meta["over_normalization"] = over_norm.as_dict()
             document_results.append(
                 DocumentResult(
                     doc_id=doc.doc_id,
                     metrics=metrics,
                     duration_seconds=ocr_result.duration_seconds,
                     engine_error=ocr_result.error,
+                    ocr_intermediate=ocr_intermediate,
+                    pipeline_metadata=pipeline_meta,
                 )
             )
         engine_version = engine._safe_version()
+        pipeline_info = _build_pipeline_info(engine, document_results)
         report = EngineReport(
             engine_name=engine.name,
             engine_version=engine_version,
             engine_config=engine.config,
             document_results=document_results,
+            pipeline_info=pipeline_info,
         )
         engine_reports.append(report)
         logger.info(
+            "Concurrent %s terminé — CER moyen : %.2f%%",
             engine.name,
             (report.mean_cer or 0) * 100,
         )
         logger.info("Résultats écrits dans : %s", path)
     return benchmark
def _build_pipeline_info(engine: BaseOCREngine, doc_results: list[DocumentResult]) -> dict:
    """Build the ``pipeline_info`` dictionary attached to an ``EngineReport``.

    Parameters
    ----------
    engine:
        The engine (or pipeline) that produced ``doc_results``.
    doc_results:
        Per-document results of that engine.

    Returns
    -------
    dict
        An empty dict when no document carries pipeline metadata. Otherwise
        the descriptive keys (``pipeline_mode``, ``prompt_file``,
        ``llm_model``, ``llm_provider``) copied from the first document that
        has metadata, plus ``pipeline_steps`` / ``prompt_template`` when the
        engine is an ``OCRLLMPipeline``, plus an aggregated
        ``over_normalization`` entry when at least one document has one.
    """
    first_with_meta = next((dr for dr in doc_results if dr.pipeline_metadata), None)
    if first_with_meta is None:
        return {}

    meta = first_with_meta.pipeline_metadata
    info: dict = {
        "pipeline_mode": meta.get("pipeline_mode"),
        "prompt_file": meta.get("prompt_file"),
        "llm_model": meta.get("llm_model"),
        "llm_provider": meta.get("llm_provider"),
    }

    # Only the optional import is guarded: an ImportError raised by the
    # engine's own methods must surface instead of being silently swallowed.
    try:
        from picarones.pipelines.base import OCRLLMPipeline
    except ImportError:
        pipeline_cls = None
    else:
        pipeline_cls = OCRLLMPipeline

    if pipeline_cls is not None and isinstance(engine, pipeline_cls):
        info["pipeline_steps"] = engine._build_steps_info()
        info["prompt_template"] = engine._prompt_template

    # Aggregate the over-normalization stats over every document that has them.
    over_norm_results = [
        over_norm
        for over_norm in (
            dr.pipeline_metadata.get("over_normalization") for dr in doc_results
        )
        if over_norm is not None
    ]
    if over_norm_results:
        total_correct = sum(r["total_correct_ocr_words"] for r in over_norm_results)
        total_over = sum(r["over_normalized_count"] for r in over_norm_results)
        info["over_normalization"] = {
            # Guard against a corpus where the OCR got no word right.
            "score": round(total_over / total_correct, 4) if total_correct > 0 else 0.0,
            "total_correct_ocr_words": total_correct,
            "over_normalized_count": total_over,
            "document_count": len(over_norm_results),
        }
    return info

picarones/fixtures.py CHANGED Viewed

@@ -17,6 +17,7 @@ from typing import Optional
 from picarones.core.metrics import MetricsResult, aggregate_metrics
 from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
 # ---------------------------------------------------------------------------
 # Textes GT réalistes (documents patrimoniaux BnF)
@@ -76,6 +77,38 @@ def _pero_errors(text: str, rng: random.Random) -> str:
     return text
 def _bad_engine_errors(text: str, rng: random.Random) -> str:
     """Moteur de mauvaise qualité : nombreuses erreurs."""
     words = text.split()
@@ -182,16 +215,44 @@ def generate_sample_benchmark(
     n_docs = min(n_docs, len(_GT_TEXTS))
     gt_texts = _GT_TEXTS[:n_docs]
     engines_config = [
-        ("pero_ocr", "0.7.2", {"config": "/models/pero_printed.ini"}, _pero_errors),
-        ("tesseract", "5.3.3", {"lang": "fra", "psm": 6}, _tesseract_errors),
-        ("ancien_moteur", "2.1.0", {"lang": "fra"}, _bad_engine_errors),
     ]
     engine_reports: list[EngineReport] = []
     image_b64_cache: dict[str, str] = {}
-    for engine_name, engine_version, engine_cfg, error_fn in engines_config:
         doc_results: list[DocumentResult] = []
         for i, gt in enumerate(gt_texts):
@@ -203,8 +264,28 @@ def generate_sample_benchmark(
                 png = _make_placeholder_png(320, 220, gt[:20])
                 image_b64_cache[doc_id] = _png_to_data_uri(png)
-            # Générer la sortie OCR avec erreurs
-            hypothesis = error_fn(gt, rng)
             metrics = _make_metrics(gt, hypothesis)
@@ -215,15 +296,36 @@ def generate_sample_benchmark(
                     ground_truth=gt,
                     hypothesis=hypothesis,
                     metrics=metrics,
-                    duration_seconds=round(rng.uniform(0.3, 4.5), 3),
                 )
             )
         report = EngineReport(
             engine_name=engine_name,
             engine_version=engine_version,
             engine_config=engine_cfg,
             document_results=doc_results,
         )
         engine_reports.append(report)

 from picarones.core.metrics import MetricsResult, aggregate_metrics
 from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
+from picarones.pipelines.over_normalization import detect_over_normalization
 # ---------------------------------------------------------------------------
 # Textes GT réalistes (documents patrimoniaux BnF)
     return text
def _llm_correction(text: str, rng: random.Random) -> str:
    """Simulate a GPT-4o correction pass over a Tesseract output.

    Every known OCR confusion is fixed first. Then, with probability 0.45,
    one or two randomly chosen legitimate historical spellings are
    modernised (first occurrence only) to mimic LLM over-normalization
    (taxonomy class 10), e.g. nostre -> notre, maistre -> maître.
    """
    # OCR confusions the simulated LLM always repairs.
    ocr_fixes = (
        ("noltre", "nostre"),
        ("inaistre", "maistre"),
        ("faictcs", "faictes"),
        ("conlcillier", "conseillie"),
        ("confideration", "consideracion"),
        ("Froiflart", "Froissart"),
        ("8", "&"),
        ("oe", "œ"),
    )
    corrected = text
    for wrong, right in ocr_fixes:
        corrected = corrected.replace(wrong, right)

    # Wrongful modernisations, applied to the partially corrected text.
    modernisations = [
        ("nostre", "notre"),
        ("maistre", "maître"),
        ("faictes", "faites"),
        ("Donné", "donné"),
        ("conseillier", "conseiller"),
        ("consideracion", "considération"),
    ]
    if rng.random() >= 0.45:
        return corrected

    # Draw the count before sampling so the RNG is consumed in a fixed order.
    how_many = rng.randint(1, 2)
    for old, modern in rng.sample(modernisations, k=how_many):
        corrected = corrected.replace(old, modern, 1)
    return corrected
 def _bad_engine_errors(text: str, rng: random.Random) -> str:
     """Moteur de mauvaise qualité : nombreuses erreurs."""
     words = text.split()
     n_docs = min(n_docs, len(_GT_TEXTS))
     gt_texts = _GT_TEXTS[:n_docs]
+    # (name, version, config, error_fn, is_pipeline, pipeline_info)
     engines_config = [
+        ("pero_ocr", "0.7.2", {"config": "/models/pero_printed.ini"}, _pero_errors, False, {}),
+        ("tesseract", "5.3.3", {"lang": "fra", "psm": 6}, _tesseract_errors, False, {}),
+        ("ancien_moteur", "2.1.0", {"lang": "fra"}, _bad_engine_errors, False, {}),
+        # Pipeline fictif : tesseract → gpt-4o (post-correction image+texte)
+        (
+            "tesseract → gpt-4o",
+            "ocr=5.3.3; llm=gpt-4o",
+            {"lang": "fra", "psm": 6},
+            _llm_correction,  # appliqué sur la sortie tesseract
+            True,
+            {
+                "pipeline_mode": "text_and_image",
+                "prompt_file": "correction_medieval_french.txt",
+                "llm_model": "gpt-4o",
+                "llm_provider": "openai",
+                "pipeline_steps": [
+                    {"type": "ocr", "engine": "tesseract", "version": "5.3.3"},
+                    {
+                        "type": "llm",
+                        "model": "gpt-4o",
+                        "provider": "openai",
+                        "mode": "text_and_image",
+                        "prompt_file": "correction_medieval_french.txt",
+                    },
+                ],
+            },
+        ),
     ]
     engine_reports: list[EngineReport] = []
     image_b64_cache: dict[str, str] = {}
+    # Pré-calculer les sorties tesseract pour le pipeline
+    tess_outputs: dict[str, str] = {}
+    for engine_name, engine_version, engine_cfg, error_fn, is_pipeline, pipeline_info in engines_config:
         doc_results: list[DocumentResult] = []
         for i, gt in enumerate(gt_texts):
                 png = _make_placeholder_png(320, 220, gt[:20])
                 image_b64_cache[doc_id] = _png_to_data_uri(png)
+            if is_pipeline:
+                # Pour le pipeline : appliquer tesseract d'abord, puis LLM correction
+                ocr_intermediate = tess_outputs.get(doc_id) or _tesseract_errors(gt, random.Random(rng.randint(0, 9999)))
+                hypothesis = _llm_correction(ocr_intermediate, rng)
+                # Calcul de la sur-normalisation (classe 10)
+                over_norm = detect_over_normalization(gt, ocr_intermediate, hypothesis)
+                pipeline_meta = {
+                    "pipeline_mode": pipeline_info.get("pipeline_mode"),
+                    "prompt_file": pipeline_info.get("prompt_file"),
+                    "llm_model": pipeline_info.get("llm_model"),
+                    "llm_provider": pipeline_info.get("llm_provider"),
+                    "over_normalization": over_norm.as_dict(),
+                }
+                duration = round(rng.uniform(2.5, 12.0), 3)  # plus lent qu'un OCR seul
+            else:
+                ocr_intermediate = None
+                hypothesis = error_fn(gt, rng)
+                pipeline_meta = {}
+                duration = round(rng.uniform(0.3, 4.5), 3)
+                # Mémoriser la sortie tesseract pour le pipeline
+                if engine_name == "tesseract":
+                    tess_outputs[doc_id] = hypothesis
             metrics = _make_metrics(gt, hypothesis)
                     ground_truth=gt,
                     hypothesis=hypothesis,
                     metrics=metrics,
+                    duration_seconds=duration,
+                    ocr_intermediate=ocr_intermediate,
+                    pipeline_metadata=pipeline_meta,
                 )
             )
+        # Agréger les stats de sur-normalisation pour le pipeline
+        effective_pipeline_info = dict(pipeline_info)
+        if is_pipeline:
+            over_norms = [
+                dr.pipeline_metadata.get("over_normalization")
+                for dr in doc_results
+                if dr.pipeline_metadata.get("over_normalization")
+            ]
+            if over_norms:
+                total_correct = sum(r["total_correct_ocr_words"] for r in over_norms)
+                total_over = sum(r["over_normalized_count"] for r in over_norms)
+                effective_pipeline_info["over_normalization"] = {
+                    "score": round(total_over / total_correct, 4) if total_correct > 0 else 0.0,
+                    "total_correct_ocr_words": total_correct,
+                    "over_normalized_count": total_over,
+                    "document_count": len(over_norms),
+                }
         report = EngineReport(
             engine_name=engine_name,
             engine_version=engine_version,
             engine_config=engine_cfg,
             document_results=doc_results,
+            pipeline_info=effective_pipeline_info,
         )
         engine_reports.append(report)

picarones/llm/__init__.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""Adaptateurs LLM pour les pipelines OCR+LLM."""
+from picarones.llm.base import BaseLLMAdapter, LLMResult
+from picarones.llm.anthropic_adapter import AnthropicAdapter
+from picarones.llm.mistral_adapter import MistralAdapter
+from picarones.llm.ollama_adapter import OllamaAdapter
+from picarones.llm.openai_adapter import OpenAIAdapter
+__all__ = [
+    "BaseLLMAdapter",
+    "LLMResult",
+    "OpenAIAdapter",
+    "AnthropicAdapter",
+    "MistralAdapter",
+    "OllamaAdapter",
+]

picarones/llm/anthropic_adapter.py ADDED Viewed

	@@ -0,0 +1,72 @@

+"""Adaptateur LLM — Anthropic (Claude Sonnet, Claude Haiku)."""
+from __future__ import annotations
+import os
+from typing import Optional
+from picarones.llm.base import BaseLLMAdapter
class AnthropicAdapter(BaseLLMAdapter):
    """Adapter for Anthropic Claude models.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable.
    Supported modes: text_only, text_and_image, zero_shot.
    """

    @property
    def name(self) -> str:
        """Provider identifier."""
        return "anthropic"

    @property
    def default_model(self) -> str:
        """Model used when none is passed to the constructor."""
        return "claude-sonnet-4-6"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        # Read once at construction; a missing key is only reported by _call.
        self._api_key = os.environ.get("ANTHROPIC_API_KEY")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Send ``prompt`` (and optionally an image) to the Messages API.

        Parameters
        ----------
        prompt:
            Final prompt text.
        image_b64:
            Base64-encoded image sent as an ``image/png`` block placed before
            the prompt text; ``None`` or empty for a text-only call.

        Returns
        -------
        str
            Text of the first content block of the response, or ``""`` when
            the response carries no content block.

        Raises
        ------
        RuntimeError
            If the API key is missing or the ``anthropic`` package is not
            installed.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Anthropic manquante — définissez la variable d'environnement ANTHROPIC_API_KEY"
            )
        try:
            import anthropic
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'anthropic' n'est pas installé. Lancez : pip install anthropic"
            ) from exc

        client = anthropic.Anthropic(api_key=self._api_key)
        temperature = float(self.config.get("temperature", 0.0))
        max_tokens = int(self.config.get("max_tokens", 4096))

        if image_b64:
            content: list | str = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": image_b64,
                    },
                },
                {"type": "text", "text": prompt},
            ]
        else:
            content = prompt

        response = client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[{"role": "user", "content": content}],
        )
        # An empty content list would otherwise surface as a cryptic
        # IndexError; report an empty completion instead.
        blocks = response.content
        if not blocks:
            return ""
        return blocks[0].text

picarones/llm/base.py ADDED Viewed

	@@ -0,0 +1,95 @@

+"""Interface abstraite commune à tous les adaptateurs LLM."""
+from __future__ import annotations
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
@dataclass
class LLMResult:
    """Outcome of a single LLM call."""

    # Identifier of the model that was called.
    model_id: str
    # Generated text.
    text: str
    # Duration of the call, in seconds.
    duration_seconds: float
    # Token count, when available.
    tokens_used: Optional[int] = None
    # Error message; ``None`` means the call succeeded.
    error: Optional[str] = None

    @property
    def success(self) -> bool:
        """Return ``True`` when no error was recorded (``error`` is ``None``)."""
        failed = self.error is not None
        return not failed
class BaseLLMAdapter(ABC):
    """Common base class of every LLM adapter.

    Concrete adapters must provide:

    - ``name``          : provider identifier (e.g. 'openai')
    - ``default_model`` : the provider's default model
    - ``_call()``       : the actual API call, returning the raw text

    API keys are read from environment variables only.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        # A falsy config (None or empty) is replaced by a fresh dict.
        self.config: dict = config if config else {}
        # A falsy model falls back to the provider default.
        self.model: str = model if model else self.default_model

    @property
    @abstractmethod
    def name(self) -> str:
        """Provider identifier (e.g. 'openai', 'anthropic')."""

    @property
    @abstractmethod
    def default_model(self) -> str:
        """Model used when none is given explicitly."""

    @abstractmethod
    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Perform the actual LLM call.

        Parameters
        ----------
        prompt:
            Final prompt text (variables already substituted).
        image_b64:
            Base64-encoded image (without a data-URI prefix).
            ``None`` for text-only calls.

        Returns
        -------
        str
            Text generated by the LLM.
        """

    def complete(
        self,
        prompt: str,
        image_b64: Optional[str] = None,
    ) -> LLMResult:
        """Public entry point: call the LLM and time the call.

        Any exception raised by ``_call`` is caught and reported through
        ``LLMResult.error`` (with an empty ``text``) rather than propagated.
        """
        started = time.perf_counter()
        output, failure = "", None
        try:
            output = self._call(prompt, image_b64)
        except Exception as exc:  # noqa: BLE001
            failure = str(exc)
        elapsed = time.perf_counter() - started
        return LLMResult(
            model_id=self.model,
            text=output,
            duration_seconds=round(elapsed, 4),
            error=failure,
        )

    def __repr__(self) -> str:
        cls_name = self.__class__.__name__
        return f"{cls_name}(model={self.model!r})"

picarones/llm/mistral_adapter.py ADDED Viewed

	@@ -0,0 +1,69 @@

+"""Adaptateur LLM — Mistral AI (Mistral Large, Pixtral)."""
+from __future__ import annotations
+import os
+from typing import Optional
+from picarones.llm.base import BaseLLMAdapter
class MistralAdapter(BaseLLMAdapter):
    """Adapter for Mistral AI models.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.
    Supported modes: text_only (all models); text_and_image and zero_shot
    with multimodal models (pixtral-12b, pixtral-large).
    """

    @property
    def name(self) -> str:
        """Provider identifier."""
        return "mistral"

    @property
    def default_model(self) -> str:
        """Model used when none is passed to the constructor."""
        return "mistral-large-latest"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        # Read once at construction; a missing key is only reported by _call.
        self._api_key = os.environ.get("MISTRAL_API_KEY")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Send ``prompt`` (and optionally an image) to the Mistral chat API.

        The image, when given, is attached after the prompt text as a PNG
        data URI. Returns the first choice's message content, or ``""`` when
        that content is empty. Raises ``RuntimeError`` when the API key is
        missing or the ``mistralai`` package is not installed.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
            )
        try:
            from mistralai import Mistral
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
            ) from exc

        client = Mistral(api_key=self._api_key)
        options = self.config
        temperature = float(options.get("temperature", 0.0))
        max_tokens = int(options.get("max_tokens", 4096))

        # Plain string for text-only calls, list of parts when an image is attached.
        content: list | str = prompt
        if image_b64:
            image_part = {
                "type": "image_url",
                "image_url": f"data:image/png;base64,{image_b64}",
            }
            content = [{"type": "text", "text": prompt}, image_part]

        response = client.chat.complete(
            model=self.model,
            messages=[{"role": "user", "content": content}],
            temperature=temperature,
            max_tokens=max_tokens,
        )
        reply = response.choices[0].message.content
        return reply if reply else ""

picarones/llm/ollama_adapter.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""Adaptateur LLM — Ollama (modèles locaux : Llama 3, Gemma, Phi, Mistral local…)."""
+from __future__ import annotations
+from typing import Optional
+from picarones.llm.base import BaseLLMAdapter
class OllamaAdapter(BaseLLMAdapter):
    """Adapter for local models served by Ollama.

    No API key is required. A running Ollama server is needed (by default
    at http://localhost:11434).

    Supported modes:

    - text_only      : every Ollama model
    - text_and_image : multimodal models (llava, bakllava, moondream…)
    - zero_shot      : multimodal models only

    Configuration (through ``config``):

    - ``base_url``    : URL of the Ollama server (default: http://localhost:11434)
    - ``temperature`` : sampling temperature (default: 0.0)
    - ``timeout``     : request timeout in seconds (default: 120)
    """

    @property
    def name(self) -> str:
        """Provider identifier."""
        return "ollama"

    @property
    def default_model(self) -> str:
        """Model used when none is passed to the constructor."""
        return "llama3"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        # Trailing slashes are stripped so the endpoint path can be appended as-is.
        self._base_url = self.config.get("base_url", "http://localhost:11434").rstrip("/")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """POST ``prompt`` (and optionally an image) to ``/api/generate``.

        Returns the ``response`` field of the JSON reply, or ``""`` when the
        field is absent.

        Raises
        ------
        RuntimeError
            If the server answers with an HTTP error status, or if it cannot
            be reached at all.
        """
        import json
        import urllib.error
        import urllib.request

        temperature = float(self.config.get("temperature", 0.0))
        timeout = float(self.config.get("timeout", 120))
        payload: dict = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": temperature},
        }
        if image_b64:
            payload["images"] = [image_b64]

        data = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            f"{self._base_url}/api/generate",
            data=data,
            headers={"Content-Type": "application/json"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            # HTTPError subclasses URLError: handle it first so a reachable
            # server returning an error status is not reported as unreachable.
            raise RuntimeError(
                f"Le serveur Ollama ({self._base_url}) a répondu HTTP {exc.code} "
                f"pour le modèle {self.model!r}. Vérifiez que le modèle est installé "
                f"(ollama pull {self.model}). Erreur : {exc}"
            ) from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(
                f"Impossible de joindre le serveur Ollama sur {self._base_url}. "
                f"Vérifiez qu'Ollama est démarré (ollama serve). Erreur : {exc}"
            ) from exc
        return result.get("response", "")

picarones/llm/openai_adapter.py ADDED Viewed

	@@ -0,0 +1,68 @@

+"""Adaptateur LLM — OpenAI (GPT-4o, GPT-4o-mini)."""
+from __future__ import annotations
+import os
+from typing import Optional
+from picarones.llm.base import BaseLLMAdapter
class OpenAIAdapter(BaseLLMAdapter):
    """Adapter for OpenAI chat models (GPT-4o, GPT-4o-mini).

    The API key is taken from the ``OPENAI_API_KEY`` environment variable.
    Supported modes: text_only, text_and_image, zero_shot.
    """

    @property
    def name(self) -> str:
        """Provider identifier."""
        return "openai"

    @property
    def default_model(self) -> str:
        """Model name reported as this adapter's default."""
        return "gpt-4o"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        """Forward ``model``/``config`` to the base class and capture the API key."""
        super().__init__(model, config)
        self._api_key = os.getenv("OPENAI_API_KEY")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Send a single user message to the chat completions endpoint.

        When ``image_b64`` is given, the message content is a two-part list
        (text + ``image_url`` data URI declared as ``image/png``); otherwise
        it is the bare prompt string. Returns the content of the first
        choice, or ``""`` when it is empty.

        Raises ``RuntimeError`` when the API key is missing or the ``openai``
        package cannot be imported.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API OpenAI manquante — définissez la variable d'environnement OPENAI_API_KEY"
            )
        # Imported lazily so the package is only needed when this adapter runs.
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'openai' n'est pas installé. Lancez : pip install openai"
            ) from exc

        if image_b64:
            data_uri = f"data:image/png;base64,{image_b64}"
            user_content = [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ]
        else:
            user_content = prompt  # type: ignore[assignment]

        settings = self.config
        reply = OpenAI(api_key=self._api_key).chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": user_content}],
            temperature=float(settings.get("temperature", 0.0)),
            max_tokens=int(settings.get("max_tokens", 4096)),
        )
        return reply.choices[0].message.content or ""

picarones/pipelines/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+"""Pipelines OCR+LLM : combinent un moteur OCR avec un LLM de correction."""
+from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+from picarones.pipelines.over_normalization import (
+    OverNormalizationResult,
+    detect_over_normalization,
+)
+__all__ = [
+    "OCRLLMPipeline",
+    "PipelineMode",
+    "OverNormalizationResult",
+    "detect_over_normalization",
+]

picarones/pipelines/base.py ADDED Viewed

	@@ -0,0 +1,243 @@

+"""Pipeline OCR+LLM — présenté comme un concurrent normal dans les benchmarks.
+Un pipeline compose un moteur OCR et un LLM de correction selon trois modes :
+  text_only      → OCR brut ──► LLM (texte seul)
+  text_and_image → OCR brut + image ──► LLM multimodal
+  zero_shot      → image ──► LLM (pas d'OCR amont)
+La classe ``OCRLLMPipeline`` étend ``BaseOCREngine`` : un pipeline est
+un concurrent comme un autre dans ``run_benchmark``, avec les mêmes métriques
+CER/WER. Les métadonnées spécifiques (étapes, prompt, OCR intermédiaire) sont
+exposées via ``EngineResult.metadata``.
+"""
+from __future__ import annotations
+import base64
+import time
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+from picarones.engines.base import BaseOCREngine, EngineResult
+from picarones.llm.base import BaseLLMAdapter
class PipelineMode(str, Enum):
    """How the LLM is invoked inside the pipeline.

    Members are also ``str`` instances, so they compare equal to their
    raw string value.
    """

    # The LLM receives only the raw OCR text.
    TEXT_ONLY = "text_only"

    # The LLM receives the OCR text AND the image (multimodal mode).
    TEXT_AND_IMAGE = "text_and_image"

    # The LLM receives only the image; no upstream OCR is run.
    ZERO_SHOT = "zero_shot"
# Directory of the built-in prompt library
_PROMPTS_DIR = Path(__file__).parent.parent / "prompts"


def _load_prompt(prompt_path: str | Path) -> str:
    """Load a prompt template from disk.

    Lookup order:
    1. ``prompt_path`` itself (absolute, or relative to the current
       working directory);
    2. for relative paths only, the same path inside the built-in
       prompt library (``_PROMPTS_DIR``).

    Only regular files are accepted, so a directory that happens to share
    the requested name is skipped rather than read.

    Parameters
    ----------
    prompt_path:
        Path or file name of the prompt.

    Returns
    -------
    str
        File content decoded as UTF-8.

    Raises
    ------
    FileNotFoundError
        If no candidate location holds a regular file.
    """
    p = Path(prompt_path)
    # Joining an absolute path onto _PROMPTS_DIR would just yield the same
    # path again, so the library fallback only applies to relative paths.
    candidates = [p] if p.is_absolute() else [p, _PROMPTS_DIR / p]
    for candidate in candidates:
        if candidate.is_file():
            return candidate.read_text(encoding="utf-8")
    raise FileNotFoundError(
        f"Prompt introuvable : '{prompt_path}'. "
        f"Bibliothèque disponible dans : {_PROMPTS_DIR}"
    )
def _image_to_b64(image_path: Path) -> str:
    """Return the file's bytes as plain base64 text (no ``data:`` URI prefix)."""
    raw = image_path.read_bytes()
    encoded = base64.b64encode(raw)
    return str(encoded, "ascii")
class OCRLLMPipeline(BaseOCREngine):
    """OCR+LLM pipeline, interchangeable with any plain OCR engine.

    Parameters
    ----------
    llm_adapter:
        LLM adapter (OpenAI, Anthropic, Mistral, Ollama...).
    mode:
        Correction mode: text_only, text_and_image or zero_shot.
    prompt:
        Path to a .txt prompt file, or name of a file from the built-in
        library (e.g. ``"correction_medieval_french.txt"``). The template
        may contain the placeholders ``{ocr_output}`` and ``{image_b64}``.
    ocr_engine:
        Upstream OCR engine. Required for text_only and text_and_image;
        unused in zero_shot mode.
    pipeline_name:
        Name shown in the report (e.g. ``"tesseract → gpt-4o"``).
        Generated automatically when omitted.
    config:
        Extra parameters forwarded to the base class.

    Examples
    --------
    >>> from picarones.llm import OpenAIAdapter
    >>> from picarones.engines.tesseract import TesseractEngine
    >>> pipeline = OCRLLMPipeline(
    ...     ocr_engine=TesseractEngine({"lang": "fra"}),
    ...     llm_adapter=OpenAIAdapter(model="gpt-4o"),
    ...     mode=PipelineMode.TEXT_AND_IMAGE,
    ...     prompt="correction_medieval_french.txt",
    ... )
    """

    def __init__(
        self,
        llm_adapter: BaseLLMAdapter,
        mode: PipelineMode | str = PipelineMode.TEXT_ONLY,
        prompt: str | Path = "correction_medieval_french.txt",
        ocr_engine: Optional[BaseOCREngine] = None,
        pipeline_name: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(config)
        self.ocr_engine = ocr_engine
        self.llm_adapter = llm_adapter
        self.mode = PipelineMode(mode)
        self.prompt_path = str(prompt)
        # The template is read once, at construction time.
        self._prompt_template = _load_prompt(prompt)

        # Name shown in the report
        if pipeline_name:
            self._name = pipeline_name
        elif self.mode == PipelineMode.ZERO_SHOT:
            self._name = f"{llm_adapter.model} (zero-shot)"
        elif ocr_engine:
            self._name = f"{ocr_engine.name} → {llm_adapter.model}"
        else:
            self._name = f"pipeline → {llm_adapter.model}"

        # Intermediate OCR output of the latest run (used for over-normalization)
        self._last_ocr_text: Optional[str] = None

    # ------------------------------------------------------------------
    # BaseOCREngine interface
    # ------------------------------------------------------------------

    @property
    def name(self) -> str:
        """Display name of the pipeline."""
        return self._name

    def version(self) -> str:
        """Return a combined ``ocr=...; llm=...`` version string."""
        ocr_v = self.ocr_engine._safe_version() if self.ocr_engine else "—"
        return f"ocr={ocr_v}; llm={self.llm_adapter.model}"

    def _run_ocr(self, image_path: Path) -> str:
        """Run the OCR and LLM stages for one image; called by ``run()``.

        Returns the LLM output text.

        Raises
        ------
        ValueError
            If the mode needs an upstream OCR engine and none was given.
        RuntimeError
            If the upstream OCR run or the LLM call reports a failure.
        """
        self._last_ocr_text = None
        needs_ocr = self.mode != PipelineMode.ZERO_SHOT
        sends_image = self.mode != PipelineMode.TEXT_ONLY

        ocr_text = ""
        if needs_ocr:
            ocr_text = self._run_upstream_ocr(image_path)
            self._last_ocr_text = ocr_text

        if sends_image:
            image_b64 = _image_to_b64(image_path)
            prompt = self._build_prompt(ocr_text=ocr_text, image_b64=image_b64)
            result = self.llm_adapter.complete(prompt, image_b64=image_b64)
        else:
            prompt = self._build_prompt(ocr_text=ocr_text)
            result = self.llm_adapter.complete(prompt)

        if not result.success:
            raise RuntimeError(f"Erreur LLM ({self.llm_adapter.model}): {result.error}")
        return result.text

    def _run_upstream_ocr(self, image_path: Path) -> str:
        """Run the upstream OCR engine and return its text, failing loudly on error."""
        if self.ocr_engine is None:
            raise ValueError(f"ocr_engine est requis pour le mode {self.mode.value}")
        ocr_result = self.ocr_engine.run(image_path)
        # An OCR failure must not be hidden: sending an empty text to the LLM
        # would produce a "correction" of nothing and be scored as a normal run.
        ocr_error = getattr(ocr_result, "error", None)
        if ocr_error:
            raise RuntimeError(f"Erreur OCR ({self.ocr_engine.name}): {ocr_error}")
        return ocr_result.text

    # ------------------------------------------------------------------
    # run() is overridden to attach pipeline metadata to the result
    # ------------------------------------------------------------------

    def run(self, image_path: str | Path) -> EngineResult:
        """Execute the pipeline and return an ``EngineResult`` with pipeline metadata.

        Any exception raised by the stages is caught and reported through
        ``EngineResult.error`` with an empty ``text``. When an upstream OCR
        text was produced it is exposed as ``metadata["ocr_intermediate"]``.
        """
        image_path = Path(image_path)
        self._last_ocr_text = None
        start = time.perf_counter()
        try:
            text = self._run_ocr(image_path)
            error = None
        except Exception as exc:  # noqa: BLE001
            text = ""
            error = str(exc)
        duration = time.perf_counter() - start

        metadata: dict = {
            "engine_version": self._safe_version(),
            "pipeline_mode": self.mode.value,
            "prompt_file": self.prompt_path,
            "prompt_template": self._prompt_template,
            "llm_model": self.llm_adapter.model,
            "llm_provider": self.llm_adapter.name,
            "pipeline_steps": self._build_steps_info(),
            "is_pipeline": True,
        }
        if self._last_ocr_text is not None:
            metadata["ocr_intermediate"] = self._last_ocr_text

        return EngineResult(
            engine_name=self.name,
            image_path=str(image_path),
            text=text,
            duration_seconds=round(duration, 4),
            error=error,
            metadata=metadata,
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _build_prompt(self, ocr_text: str = "", image_b64: str = "") -> str:
        """Fill ``{ocr_output}`` and ``{image_b64}`` in the prompt template.

        Substitution is single-pass: placeholders are only recognised in the
        template itself, so a literal ``{image_b64}`` occurring inside the
        OCR text is left untouched instead of being replaced by the image.
        """
        pieces = self._prompt_template.split("{ocr_output}")
        return ocr_text.join(
            piece.replace("{image_b64}", image_b64) for piece in pieces
        )

    def _build_steps_info(self) -> list[dict]:
        """Describe the pipeline stages (optional OCR step, then LLM step)."""
        steps: list[dict] = []
        if self.ocr_engine:
            steps.append({
                "type": "ocr",
                "engine": self.ocr_engine.name,
                "version": self.ocr_engine._safe_version(),
            })
        steps.append({
            "type": "llm",
            "model": self.llm_adapter.model,
            "provider": self.llm_adapter.name,
            "mode": self.mode.value,
            "prompt_file": self.prompt_path,
        })
        return steps

picarones/pipelines/over_normalization.py ADDED Viewed

	@@ -0,0 +1,121 @@

+"""Détection de la sur-normalisation LLM — Classe 10 de la taxonomie des erreurs.
+La sur-normalisation désigne le cas où le LLM « corrige » à tort des passages
+déjà bien transcrits par l'OCR, en particulier :
+- modernisation de graphies médiévales légitimes (nostre → notre, faict → fait)
+- normalisation de variantes orthographiques historiques authentiques
+- modification de noms propres ou de termes rares sans erreur OCR initiale
+Mesure :
+    score = nombre de mots (OCR correct → LLM modifié) / nombre de mots OCR corrects
+Un score élevé indique que le prompt doit être affiné pour mieux préserver
+la graphie originale.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import Optional
@dataclass
class OverNormalizationResult:
    """Over-normalization detection outcome for a single document."""

    total_correct_ocr_words: int
    over_normalized_count: int
    # Each entry has the shape {"gt": str, "ocr": str, "llm": str}
    over_normalized_passages: list[dict] = field(default_factory=list)

    @property
    def score(self) -> float:
        """Ratio of over-normalized words to correct OCR words, rounded to 4 decimals.

        Returns 0.0 when there is no correct OCR word to compare against.
        """
        denominator = self.total_correct_ocr_words
        if denominator == 0:
            return 0.0
        ratio = self.over_normalized_count / denominator
        return round(ratio, 4)

    def as_dict(self) -> dict:
        """Return a plain-dict view; at most the first 20 passages are included."""
        summary: dict = {}
        summary["score"] = self.score
        summary["total_correct_ocr_words"] = self.total_correct_ocr_words
        summary["over_normalized_count"] = self.over_normalized_count
        summary["over_normalized_passages"] = self.over_normalized_passages[:20]
        return summary
def detect_over_normalization(
    ground_truth: str,
    ocr_text: str,
    llm_text: str,
    *,
    max_examples: int = 20,
) -> OverNormalizationResult:
    """Detect LLM over-normalization at word level.

    The three texts are split on whitespace and compared position by
    position, up to the length of the shortest word list:

      - a word counts as "correct in the OCR" when ``ocr[i] == gt[i]``;
      - such a word counts as over-normalized when ``llm[i] != gt[i]``.

    Parameters
    ----------
    ground_truth:
        Reference transcription.
    ocr_text:
        Raw OCR output (before LLM correction).
    llm_text:
        Output after LLM correction.
    max_examples:
        Maximum number of over-normalized examples kept in the result;
        the counter itself is not capped.

    Returns
    -------
    OverNormalizationResult
    """
    examples: list[dict] = []
    ocr_hits = 0
    degraded = 0

    # zip() stops at the shortest sequence, i.e. the positional window above.
    triples = zip(ground_truth.split(), ocr_text.split(), llm_text.split())
    for ref_word, ocr_word, llm_word in triples:
        if ocr_word != ref_word:
            continue
        ocr_hits += 1
        if llm_word == ref_word:
            continue
        degraded += 1
        if len(examples) < max_examples:
            examples.append({"gt": ref_word, "ocr": ocr_word, "llm": llm_word})

    return OverNormalizationResult(
        total_correct_ocr_words=ocr_hits,
        over_normalized_count=degraded,
        over_normalized_passages=examples,
    )
def aggregate_over_normalization(results: list[Optional[OverNormalizationResult]]) -> dict:
    """Aggregate per-document over-normalization results over a corpus.

    ``None`` entries are ignored.

    Parameters
    ----------
    results:
        Per-document results, possibly containing ``None``.

    Returns
    -------
    dict
        Always holds the keys ``score``, ``total_correct_ocr_words``,
        ``over_normalized_count`` and ``document_count``. ``score`` is
        ``None`` when no usable result was given, ``0.0`` when no OCR word
        was correct, and otherwise the pooled ratio rounded to 4 decimals.
    """
    valid = [r for r in results if r is not None]
    total_correct = sum(r.total_correct_ocr_words for r in valid)
    total_over = sum(r.over_normalized_count for r in valid)

    score: Optional[float]
    if not valid:
        # No document measured: the score is undefined rather than zero.
        score = None
    elif total_correct > 0:
        score = round(total_over / total_correct, 4)
    else:
        score = 0.0

    # Same key set on every path, so consumers can index without checking.
    return {
        "score": score,
        "total_correct_ocr_words": total_correct,
        "over_normalized_count": total_over,
        "document_count": len(valid),
    }

picarones/prompts/correction_image_medieval_french.txt ADDED Viewed

	@@ -0,0 +1,19 @@

+Tu es un expert en paléographie et en transcription de documents en français médiéval (XIIe–XVe siècle).
+On te fournit la sortie brute d'un moteur OCR ET l'image originale du document.
+Ta tâche est de corriger les erreurs de transcription en te basant sur :
+- L'image originale pour vérifier visuellement les passages ambigus
+- Le contexte linguistique et grammatical du français médiéval
+- Les confusions visuelles typiques de l'OCR sur documents anciens : rn/m, l/1, u/n, ſ/f, cl/d
+- Les abréviations et ligatures médiévales visibles sur l'image
+RÈGLES IMPÉRATIVES :
+1. Retourne UNIQUEMENT le texte corrigé — sans commentaire, sans explication, sans balise
+2. Conserve FIDÈLEMENT la graphie originale : ne modernise PAS l'orthographe
+   (nostre ≠ notre, faict ≠ fait, maistre ≠ maître, ledit ≠ le dit)
+3. Utilise l'image pour trancher les cas ambigus — pas pour « améliorer » le style
+4. Conserve la ponctuation et la capitalisation d'origine
+5. En cas de passage illisible sur l'image, conserve la forme OCR avec [?]
+OCR BRUT :
+{ocr_output}

picarones/prompts/correction_imprime_ancien.txt ADDED Viewed

	@@ -0,0 +1,20 @@

+Tu es un expert en typographie historique et en transcription d'imprimés anciens (XVe–XVIIIe siècle).
+On te fournit la sortie brute d'un moteur OCR appliqué à un imprimé ancien.
+Ta tâche est de corriger les erreurs de transcription en te basant sur :
+- Les conventions typographiques de l'imprimerie ancienne
+- L'usage du s long (ſ) en position initiale et médiane (ſon, maiſon, diſcours)
+- Les ligatures typographiques : fi, fl, ff, ffi, ffl, st, ct, ſt
+- Les confusions de fontes : romain/italique, capitales ornées
+- Les caractères spéciaux : & (et), ꝛ (r rotunda), ÿ, j/i, u/v
+RÈGLES IMPÉRATIVES :
+1. Retourne UNIQUEMENT le texte corrigé — sans commentaire, sans explication, sans balise
+2. Conserve la graphie de l'époque : ne modernise PAS l'orthographe
+   (ſon ≠ son seulement si l'OCR a mal transcrit ; conſeil ≠ conseil)
+3. Respecte les réclames (mots répétés en bas de page/colonne) tels quels
+4. Conserve les chiffres romains, foliotation et pagination d'origine
+5. En cas de doute sur un passage, conserve la forme OCR plutôt que d'inventer
+OCR BRUT :
+{ocr_output}

picarones/prompts/correction_medieval_french.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+Tu es un expert en paléographie et en transcription de documents en français médiéval (XIIe–XVe siècle).
+On te fournit la sortie brute d'un moteur OCR appliqué à un document patrimonial.
+Ta tâche est de corriger les erreurs de transcription en te basant sur :
+- Le contexte linguistique et grammatical du français médiéval
+- Les confusions visuelles typiques de l'OCR sur documents anciens : rn/m, l/1, u/n, ſ/f, cl/d, ri/n, ii/u
+- Les abréviations courantes : ꝑ (per/par), ꝓ (pro), q̃ (que), p̃ (pre), ā (an), m̃ (men)
+- Les ligatures fréquentes : ct, st, fi, fl, ff, œ, æ
+RÈGLES IMPÉRATIVES :
+1. Retourne UNIQUEMENT le texte corrigé — sans commentaire, sans explication, sans balise
+2. Conserve FIDÈLEMENT la graphie originale : ne modernise PAS l'orthographe
+   (nostre ≠ notre, faict ≠ fait, ledit ≠ le dit, maistre ≠ maître)
+3. Conserve la ponctuation et la capitalisation d'origine
+4. En cas de doute sur un passage, conserve la forme OCR plutôt que d'inventer
+OCR BRUT :
+{ocr_output}

picarones/prompts/zero_shot_imprime_ancien.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+Tu es un expert en typographie historique et en transcription d'imprimés anciens (XVe–XVIIIe siècle).
+On te fournit l'image d'une page d'imprimé ancien (incunable, livre du XVIe–XVIIIe siècle).
+Ta tâche est de transcrire fidèlement le texte imprimé visible sur l'image.
+RÈGLES IMPÉRATIVES :
+1. Retourne UNIQUEMENT la transcription — sans commentaire, sans titre, sans balise
+2. Conserve les conventions typographiques de l'époque :
+   - s long (ſ) en position initiale et médiane : ſon, maiſon, diſcours
+   - ligatures typographiques : fi, fl, ff, ffi, ffl, st, ct
+   - & pour et, ÿ, j/i interchangeables selon l'époque
+3. Respecte la mise en page : colonnes, titres courants, réclames, foliotation
+4. Conserve la capitalisation d'origine — ne la normalise pas
+5. Signale les passages illisibles par [illisible] plutôt que d'inventer
+6. Transcris les chiffres romains tels quels (iij, xiij, MCCCLx…)

picarones/prompts/zero_shot_medieval_french.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+Tu es un expert en paléographie médiévale spécialisé dans la transcription de manuscrits en français médiéval (XIIe–XVe siècle).
+On te fournit l'image d'un folio ou d'une page de document patrimonial.
+Ta tâche est de transcrire fidèlement le texte visible sur l'image.
+RÈGLES IMPÉRATIVES :
+1. Retourne UNIQUEMENT la transcription — sans commentaire, sans titre, sans balise
+2. Conserve la graphie médiévale exacte : ne modernise PAS l'orthographe
+   (nostre, maistre, faict, ledit, &, ꝑ, ꝓ…)
+3. Respecte les abréviations telles qu'elles apparaissent sur le document
+4. Conserve les sauts de ligne et la structure du texte original
+5. Signale les passages illisibles par [illisible] plutôt que d'inventer
+6. Ne transcris que le texte principal — ignore les annotations marginales tardives
+   sauf si elles font partie du texte courant

picarones/report/generator.py CHANGED Viewed

@@ -69,7 +69,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
     engines_summary = []
     for report in benchmark.engine_reports:
         agg = report.aggregated_metrics
-        engines_summary.append({
             "name": report.engine_name,
             "version": report.engine_version,
             "cer":  _safe(agg.get("cer", {}).get("mean")),
@@ -87,7 +87,11 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
                 for dr in report.document_results
                 if dr.metrics.error is None
             ],
-        })
     # Documents (vue galerie + vue détail)
     # On collecte tous les doc_ids depuis le premier moteur
@@ -113,7 +117,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
             gt = dr.ground_truth
             image_path = dr.image_path
             diff_ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
-            engine_results.append({
                 "engine": engine_name,
                 "hypothesis": dr.hypothesis,
                 "cer": _safe(dr.metrics.cer),
@@ -121,7 +125,18 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
                 "duration": dr.duration_seconds,
                 "error": dr.engine_error,
                 "diff": diff_ops,
-            })
         # CER moyen sur ce document (pour le badge galerie)
         cer_values = [er["cer"] for er in engine_results if er["error"] is None]
@@ -502,6 +517,42 @@ tbody tr:hover {{ background: #f8fafc; }}
 }}
 .chart-canvas-wrap {{ position: relative; height: 280px; }}
 /* ── Misc ─────────────────────────────────────────────────────────── */
 .badge {{
   display: inline-block; padding: .15rem .45rem;
@@ -570,7 +621,7 @@ footer {{
         <thead>
           <tr>
             <th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
-            <th data-col="name" class="sortable">Moteur<i class="sort-icon">↕</i></th>
             <th data-col="cer"  class="sortable">CER<i class="sort-icon">↕</i></th>
             <th data-col="wer"  class="sortable">WER<i class="sort-icon">↕</i></th>
             <th data-col="mer"  class="sortable">MER<i class="sort-icon">↕</i></th>
@@ -578,6 +629,7 @@ footer {{
             <th>CER médian</th>
             <th>CER min</th>
             <th>CER max</th>
             <th>Docs</th>
           </tr>
         </thead>
@@ -826,11 +878,41 @@ function renderRanking() {{
     const badgeClass = rank === 1 ? 'rank-badge rank-1' : 'rank-badge';
     const cerC = cerColor(e.cer); const cerB = cerBg(e.cer);
     const barW = Math.min(100, e.cer * 100 * 3);
     return `<tr>
       <td><span class="${{badgeClass}}">${{rank}}</span></td>
       <td>
         <span class="engine-name">${{esc(e.name)}}</span>
         <span class="engine-version">v${{esc(e.version)}}</span>
       </td>
       <td>
         <span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
@@ -842,16 +924,20 @@ function renderRanking() {{
       <td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
       <td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
       <td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
       <td><span class="pill">${{e.doc_count}}</span></td>
     </tr>`;
   }}).join('');
   // Stats globales
   const stats = document.getElementById('ranking-stats');
   stats.innerHTML = `
     <div class="stat">Corpus <b>${{esc(DATA.meta.corpus_name)}}</b></div>
     <div class="stat">Documents <b>${{DATA.meta.document_count}}</b></div>
-    <div class="stat">Moteurs <b>${{DATA.engines.length}}</b></div>
   `;
 }}
@@ -920,8 +1006,10 @@ function renderGallery() {{
     const badges = doc.engine_results.map(er => {{
       const c = cerColor(er.cer); const bg = cerBg(er.cer);
       return `<span class="engine-cer-badge" style="color:${{c}};background:${{bg}}"
-        title="${{esc(er.engine)}}">${{esc(er.engine.slice(0,6))}} ${{pct(er.cer,1)}}</span>`;
     }}).join('');
     return `<div class="gallery-card" onclick="openDocument('${{esc(doc.doc_id)}}')">
@@ -987,16 +1075,53 @@ function loadDocument(docId) {{
     const c = cerColor(er.cer); const bg = cerBg(er.cer);
     const diffHtml = renderDiff(er.diff);
     const errBadge = er.error ? `<span class="badge" style="background:#fee2e2;color:#dc2626">Erreur</span>` : '';
     return `<div class="diff-panel">
       <div class="diff-panel-header">
         <span class="diff-panel-title">${{esc(er.engine)}}</span>
         <span class="diff-panel-metrics">
           <span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
           <span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
           ${{errBadge}}
         </span>
       </div>
       <div class="diff-panel-body">${{diffHtml || '<em style="color:var(--text-muted)">Aucune sortie</em>'}}</div>
     </div>`;
   }}).join('');
 }}

     engines_summary = []
     for report in benchmark.engine_reports:
         agg = report.aggregated_metrics
+        entry: dict = {
             "name": report.engine_name,
             "version": report.engine_version,
             "cer":  _safe(agg.get("cer", {}).get("mean")),
                 for dr in report.document_results
                 if dr.metrics.error is None
             ],
+            # Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
+            "is_pipeline": report.is_pipeline,
+            "pipeline_info": report.pipeline_info,
+        }
+        engines_summary.append(entry)
     # Documents (vue galerie + vue détail)
     # On collecte tous les doc_ids depuis le premier moteur
             gt = dr.ground_truth
             image_path = dr.image_path
             diff_ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
+            er_entry: dict = {
                 "engine": engine_name,
                 "hypothesis": dr.hypothesis,
                 "cer": _safe(dr.metrics.cer),
                 "duration": dr.duration_seconds,
                 "error": dr.engine_error,
                 "diff": diff_ops,
+            }
+            # Champs spécifiques aux pipelines OCR+LLM
+            if dr.ocr_intermediate is not None:
+                er_entry["ocr_intermediate"] = dr.ocr_intermediate
+                er_entry["ocr_diff"] = compute_word_diff(dr.ground_truth, dr.ocr_intermediate)
+                er_entry["llm_correction_diff"] = compute_word_diff(dr.ocr_intermediate, dr.hypothesis)
+            if dr.pipeline_metadata:
+                on = dr.pipeline_metadata.get("over_normalization")
+                if on is not None:
+                    er_entry["over_normalization"] = on
+                er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
+            engine_results.append(er_entry)
         # CER moyen sur ce document (pour le badge galerie)
         cer_values = [er["cer"] for er in engine_results if er["error"] is None]
 }}
 .chart-canvas-wrap {{ position: relative; height: 280px; }}
+/* ── Pipeline badges ──────────────────────────────────────────────── */
+.pipeline-tag {{
+  display: inline-flex; align-items: center; gap: .25rem;
+  padding: .12rem .38rem;
+  border-radius: 4px; font-size: .67rem; font-weight: 700;
+  background: #ede9fe; color: #6d28d9;
+  letter-spacing: .02em; vertical-align: middle;
+}}
+.pipeline-tag .pipe-arrow {{ opacity: .7; }}
+.over-norm-badge {{
+  display: inline-block; padding: .12rem .38rem;
+  border-radius: 4px; font-size: .67rem; font-weight: 700;
+  background: #fef3c7; color: #b45309;
+}}
+.over-norm-badge.high {{ background: #fee2e2; color: #b91c1c; }}
+/* Vue triple-diff (pipeline) */
+.triple-diff-wrap {{
+  display: grid; grid-template-columns: 1fr 1fr; gap: .5rem;
+  margin-top: .5rem;
+}}
+.triple-diff-section {{ background: var(--bg); border-radius: 6px; padding: .5rem; }}
+.triple-diff-section h5 {{
+  font-size: .73rem; font-weight: 700; color: var(--text-muted);
+  margin-bottom: .35rem; text-transform: uppercase; letter-spacing: .04em;
+}}
+.pipeline-steps {{
+  display: flex; align-items: center; gap: .3rem; flex-wrap: wrap;
+  margin-top: .25rem;
+}}
+.step-chip {{
+  padding: .12rem .4rem; border-radius: 4px; font-size: .68rem; font-weight: 600;
+}}
+.step-chip.ocr  {{ background: #e0f2fe; color: #0369a1; }}
+.step-chip.llm  {{ background: #ede9fe; color: #6d28d9; }}
+.step-arrow {{ color: var(--text-muted); font-size: .8rem; }}
 /* ── Misc ─────────────────────────────────────────────────────────── */
 .badge {{
   display: inline-block; padding: .15rem .45rem;
         <thead>
           <tr>
             <th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
+            <th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
             <th data-col="cer"  class="sortable">CER<i class="sort-icon">↕</i></th>
             <th data-col="wer"  class="sortable">WER<i class="sort-icon">↕</i></th>
             <th data-col="mer"  class="sortable">MER<i class="sort-icon">↕</i></th>
             <th>CER médian</th>
             <th>CER min</th>
             <th>CER max</th>
+            <th title="Classe 10 — Sur-normalisation LLM : taux de mots corrects dégradés par le LLM">Sur-norm.</th>
             <th>Docs</th>
           </tr>
         </thead>
     const badgeClass = rank === 1 ? 'rank-badge rank-1' : 'rank-badge';
     const cerC = cerColor(e.cer); const cerB = cerBg(e.cer);
     const barW = Math.min(100, e.cer * 100 * 3);
+    // Badge pipeline
+    let pipelineBadge = '';
+    let pipelineStepsHtml = '';
+    if (e.is_pipeline && e.pipeline_info) {{
+      const pi = e.pipeline_info;
+      const modeLabel = {{text_only:'texte', text_and_image:'image+texte', zero_shot:'zero-shot'}}[pi.pipeline_mode] || pi.pipeline_mode || '';
+      pipelineBadge = `<span class="pipeline-tag" title="Pipeline OCR+LLM — mode ${{modeLabel}}">
+        ⛓ pipeline<span class="pipe-arrow">·${{modeLabel}}</span></span>`;
+      if (pi.pipeline_steps) {{
+        pipelineStepsHtml = `<div class="pipeline-steps">` +
+          pi.pipeline_steps.map(s => s.type === 'ocr'
+            ? `<span class="step-chip ocr">OCR: ${{esc(s.engine)}}</span>`
+            : `<span class="step-chip llm">LLM: ${{esc(s.model)}}</span>`
+          ).join(`<span class="step-arrow">→</span>`) +
+          `</div>`;
+      }}
+    }}
+    // Sur-normalisation (classe 10)
+    let overNormCell = '<td style="color:var(--text-muted)">—</td>';
+    if (e.is_pipeline && e.pipeline_info && e.pipeline_info.over_normalization) {{
+      const on = e.pipeline_info.over_normalization;
+      const onPct = (on.score * 100).toFixed(2);
+      const cls = on.score > 0.05 ? 'over-norm-badge high' : 'over-norm-badge';
+      overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
+    }}
     return `<tr>
       <td><span class="${{badgeClass}}">${{rank}}</span></td>
       <td>
         <span class="engine-name">${{esc(e.name)}}</span>
+        ${{pipelineBadge}}
         <span class="engine-version">v${{esc(e.version)}}</span>
+        ${{pipelineStepsHtml}}
       </td>
       <td>
         <span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
       <td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
       <td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
       <td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
+      ${{overNormCell}}
       <td><span class="pill">${{e.doc_count}}</span></td>
     </tr>`;
   }}).join('');
   // Stats globales
+  const pipelineCount = DATA.engines.filter(e => e.is_pipeline).length;
   const stats = document.getElementById('ranking-stats');
   stats.innerHTML = `
     <div class="stat">Corpus <b>${{esc(DATA.meta.corpus_name)}}</b></div>
     <div class="stat">Documents <b>${{DATA.meta.document_count}}</b></div>
+    <div class="stat">Concurrents <b>${{DATA.engines.length}}</b>
+      ${{pipelineCount ? `<span class="pipeline-tag" style="margin-left:.3rem">${{pipelineCount}} pipeline${{pipelineCount>1?'s':''}}</span>` : ''}}
+    </div>
   `;
 }}
     const badges = doc.engine_results.map(er => {{
       const c = cerColor(er.cer); const bg = cerBg(er.cer);
+      const isPipe = er.ocr_intermediate !== undefined;
+      const label = isPipe ? '⛓' + er.engine.slice(0,8) : er.engine.slice(0,8);
       return `<span class="engine-cer-badge" style="color:${{c}};background:${{bg}}"
+        title="${{esc(er.engine)}}${{isPipe?' (pipeline)':''}}">${{esc(label)}} ${{pct(er.cer,1)}}</span>`;
     }}).join('');
     return `<div class="gallery-card" onclick="openDocument('${{esc(doc.doc_id)}}')">
     const c = cerColor(er.cer); const bg = cerBg(er.cer);
     const diffHtml = renderDiff(er.diff);
     const errBadge = er.error ? `<span class="badge" style="background:#fee2e2;color:#dc2626">Erreur</span>` : '';
+    // Pipeline badge dans l'en-tête du panneau
+    const isPipeline = er.ocr_intermediate !== undefined;
+    const modeLabel = {{text_only:'texte seul', text_and_image:'image+texte', zero_shot:'zero-shot'}}[er.pipeline_mode] || '';
+    const pipeTagPanel = isPipeline
+      ? `<span class="pipeline-tag">⛓ ${{modeLabel || 'pipeline'}}</span>` : '';
+    // Sur-normalisation (classe 10)
+    let onBadge = '';
+    if (er.over_normalization) {{
+      const on = er.over_normalization;
+      const onPct = (on.score * 100).toFixed(2);
+      const cls = on.score > 0.05 ? 'over-norm-badge high' : 'over-norm-badge';
+      onBadge = `<span class="${{cls}}" title="Classe 10 — sur-normalisation LLM">Sur-norm. ${{onPct}}%</span>`;
+    }}
+    // Triple-diff (vue spécifique pipeline) : OCR brut / Correction LLM
+    let tripleDiffHtml = '';
+    if (isPipeline && er.ocr_intermediate) {{
+      const ocrDiffHtml   = renderDiff(er.ocr_diff);
+      const llmDiffHtml   = renderDiff(er.llm_correction_diff);
+      tripleDiffHtml = `
+        <div class="triple-diff-wrap">
+          <div class="triple-diff-section">
+            <h5>GT → OCR brut</h5>
+            ${{ocrDiffHtml || '<em style="color:var(--text-muted)">—</em>'}}
+          </div>
+          <div class="triple-diff-section">
+            <h5>OCR brut → Correction LLM</h5>
+            ${{llmDiffHtml || '<em style="color:var(--text-muted)">—</em>'}}
+          </div>
+        </div>`;
+    }}
     return `<div class="diff-panel">
       <div class="diff-panel-header">
         <span class="diff-panel-title">${{esc(er.engine)}}</span>
+        ${{pipeTagPanel}}
         <span class="diff-panel-metrics">
           <span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
           <span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
+          ${{onBadge}}
           ${{errBadge}}
         </span>
       </div>
       <div class="diff-panel-body">${{diffHtml || '<em style="color:var(--text-muted)">Aucune sortie</em>'}}</div>
+      ${{tripleDiffHtml}}
     </div>`;
   }}).join('');
 }}

rapport_demo.html CHANGED Viewed

The diff for this file is too large to render. See raw diff

tests/test_report.py CHANGED Viewed

@@ -32,7 +32,8 @@ class TestGenerateSampleBenchmark:
         assert isinstance(sample_benchmark, BenchmarkResult)
     def test_correct_engine_count(self, sample_benchmark):
-        assert len(sample_benchmark.engine_reports) == 3
     def test_correct_doc_count(self, sample_benchmark):
         assert sample_benchmark.document_count == 3
@@ -88,7 +89,8 @@ class TestBuildReportData:
     def test_engines_count(self, sample_benchmark):
         data = _build_report_data(sample_benchmark, {})
-        assert len(data["engines"]) == 3
     def test_engine_fields(self, sample_benchmark):
         data = _build_report_data(sample_benchmark, {})
@@ -219,7 +221,7 @@ class TestReportGenerator:
         data = json.loads(match.group(1))
         assert "engines" in data
         assert "documents" in data
-        assert len(data["engines"]) == 3
 # ---------------------------------------------------------------------------

         assert isinstance(sample_benchmark, BenchmarkResult)
     def test_correct_engine_count(self, sample_benchmark):
+        # 3 moteurs OCR + 1 pipeline tesseract → gpt-4o
+        assert len(sample_benchmark.engine_reports) == 4
     def test_correct_doc_count(self, sample_benchmark):
         assert sample_benchmark.document_count == 3
     def test_engines_count(self, sample_benchmark):
         data = _build_report_data(sample_benchmark, {})
+        # 3 moteurs OCR + 1 pipeline tesseract → gpt-4o
+        assert len(data["engines"]) == 4
     def test_engine_fields(self, sample_benchmark):
         data = _build_report_data(sample_benchmark, {})
         data = json.loads(match.group(1))
         assert "engines" in data
         assert "documents" in data
+        assert len(data["engines"]) == 4  # 3 OCR + 1 pipeline
 # ---------------------------------------------------------------------------

tests/test_sprint3_llm_pipelines.py ADDED Viewed

	@@ -0,0 +1,441 @@

+"""Tests Sprint 3 — Pipelines OCR+LLM, adaptateurs LLM, bibliothèque de prompts, sur-normalisation.
+Ces tests couvrent :
+- La détection de sur-normalisation LLM (classe 10)
+- L'OCRLLMPipeline : modes, chargement de prompts, métadonnées
+- Les adaptateurs LLM (instanciation, structure)
+- L'intégration dans les fixtures (tesseract → gpt-4o)
+- La présence des données pipeline dans le rapport HTML
+"""
+from __future__ import annotations
+import json
+import re
+from pathlib import Path
+import pytest
+# ---------------------------------------------------------------------------
+# Détection de sur-normalisation (classe 10)
+# ---------------------------------------------------------------------------
+class TestOverNormalization:
+    def test_no_over_normalization(self):
+        from picarones.pipelines.over_normalization import detect_over_normalization
+        gt  = "nostre seigneur le roy"
+        ocr = "noltre seigneur le roy"   # erreur OCR sur 'nostre'
+        llm = "nostre seigneur le roy"   # LLM corrige → correct
+        result = detect_over_normalization(gt, ocr, llm)
+        assert result.score == 0.0
+        assert result.over_normalized_count == 0
+    def test_perfect_llm_no_over_norm(self):
+        from picarones.pipelines.over_normalization import detect_over_normalization
+        gt  = "nostre seigneur le roy"
+        ocr = "nostre seigneur le roy"   # OCR correct
+        llm = "nostre seigneur le roy"   # LLM conserve
+        result = detect_over_normalization(gt, ocr, llm)
+        assert result.score == 0.0
+        assert result.total_correct_ocr_words == 4
+    def test_over_normalization_detected(self):
+        from picarones.pipelines.over_normalization import detect_over_normalization
+        gt  = "nostre seigneur le roy"
+        ocr = "nostre seigneur le roy"   # OCR correct
+        llm = "notre seigneur le roy"    # LLM modifie 'nostre' → 'notre' : sur-normalisation
+        result = detect_over_normalization(gt, ocr, llm)
+        assert result.over_normalized_count == 1
+        assert result.score > 0.0
+        assert len(result.over_normalized_passages) == 1
+        passage = result.over_normalized_passages[0]
+        assert passage["gt"] == "nostre"
+        assert passage["ocr"] == "nostre"
+        assert passage["llm"] == "notre"
+    def test_over_normalization_score_formula(self):
+        from picarones.pipelines.over_normalization import detect_over_normalization
+        # 4 mots, OCR correct sur tous, LLM modifie 2 → score = 2/4 = 0.5
+        gt  = "maistre jehan nostre dame"
+        ocr = "maistre jehan nostre dame"
+        llm = "maître jehan notre dame"
+        result = detect_over_normalization(gt, ocr, llm)
+        assert result.total_correct_ocr_words == 4
+        assert result.over_normalized_count == 2
+        assert result.score == pytest.approx(0.5)
+    def test_as_dict_keys(self):
+        from picarones.pipelines.over_normalization import detect_over_normalization
+        result = detect_over_normalization("foo bar", "foo baz", "foo baz")
+        d = result.as_dict()
+        assert "score" in d
+        assert "total_correct_ocr_words" in d
+        assert "over_normalized_count" in d
+        assert "over_normalized_passages" in d
+    def test_empty_texts(self):
+        from picarones.pipelines.over_normalization import detect_over_normalization
+        result = detect_over_normalization("", "", "")
+        assert result.score == 0.0
+    def test_aggregate_over_normalization(self):
+        from picarones.pipelines.over_normalization import (
+            OverNormalizationResult,
+            aggregate_over_normalization,
+        )
+        results = [
+            OverNormalizationResult(total_correct_ocr_words=10, over_normalized_count=1),
+            OverNormalizationResult(total_correct_ocr_words=10, over_normalized_count=2),
+            None,
+        ]
+        agg = aggregate_over_normalization(results)
+        assert agg["total_correct_ocr_words"] == 20
+        assert agg["over_normalized_count"] == 3
+        assert agg["score"] == pytest.approx(0.15)
+        assert agg["document_count"] == 2
+# ---------------------------------------------------------------------------
+# Bibliothèque de prompts
+# ---------------------------------------------------------------------------
+class TestPromptsLibrary:
+    _PROMPTS_DIR = Path(__file__).parent.parent / "picarones" / "prompts"
+    def test_prompts_directory_exists(self):
+        assert self._PROMPTS_DIR.is_dir()
+    def test_required_prompt_files_exist(self):
+        expected = [
+            "correction_medieval_french.txt",
+            "correction_imprime_ancien.txt",
+            "correction_image_medieval_french.txt",
+            "zero_shot_medieval_french.txt",
+            "zero_shot_imprime_ancien.txt",
+        ]
+        for fname in expected:
+            assert (self._PROMPTS_DIR / fname).exists(), f"Prompt manquant : {fname}"
+    def test_correction_prompt_has_ocr_variable(self):
+        text = (self._PROMPTS_DIR / "correction_medieval_french.txt").read_text(encoding="utf-8")
+        assert "{ocr_output}" in text
+    def test_image_prompt_has_both_variables(self):
+        text = (self._PROMPTS_DIR / "correction_image_medieval_french.txt").read_text(encoding="utf-8")
+        assert "{ocr_output}" in text
+    def test_zero_shot_prompt_has_no_ocr_variable(self):
+        text = (self._PROMPTS_DIR / "zero_shot_medieval_french.txt").read_text(encoding="utf-8")
+        assert "{ocr_output}" not in text
+    def test_prompts_not_empty(self):
+        for f in self._PROMPTS_DIR.glob("*.txt"):
+            assert len(f.read_text(encoding="utf-8").strip()) > 100, f"Prompt trop court : {f.name}"
+# ---------------------------------------------------------------------------
+# PipelineMode enum
+# ---------------------------------------------------------------------------
+class TestPipelineMode:
+    def test_enum_values(self):
+        from picarones.pipelines.base import PipelineMode
+        assert PipelineMode.TEXT_ONLY.value == "text_only"
+        assert PipelineMode.TEXT_AND_IMAGE.value == "text_and_image"
+        assert PipelineMode.ZERO_SHOT.value == "zero_shot"
+    def test_from_string(self):
+        from picarones.pipelines.base import PipelineMode
+        assert PipelineMode("text_only") == PipelineMode.TEXT_ONLY
+# ---------------------------------------------------------------------------
+# Adaptateurs LLM — structure
+# ---------------------------------------------------------------------------
+class TestLLMAdapters:
+    def test_openai_adapter_structure(self):
+        from picarones.llm.openai_adapter import OpenAIAdapter
+        adapter = OpenAIAdapter(model="gpt-4o")
+        assert adapter.name == "openai"
+        assert adapter.model == "gpt-4o"
+    def test_anthropic_adapter_structure(self):
+        from picarones.llm.anthropic_adapter import AnthropicAdapter
+        adapter = AnthropicAdapter()
+        assert adapter.name == "anthropic"
+        assert "claude" in adapter.model.lower()
+    def test_mistral_adapter_structure(self):
+        from picarones.llm.mistral_adapter import MistralAdapter
+        adapter = MistralAdapter()
+        assert adapter.name == "mistral"
+        assert "mistral" in adapter.model.lower()
+    def test_ollama_adapter_structure(self):
+        from picarones.llm.ollama_adapter import OllamaAdapter
+        adapter = OllamaAdapter(model="llama3")
+        assert adapter.name == "ollama"
+        assert adapter.model == "llama3"
+    def test_ollama_custom_base_url(self):
+        from picarones.llm.ollama_adapter import OllamaAdapter
+        adapter = OllamaAdapter(config={"base_url": "http://myserver:11434"})
+        assert adapter._base_url == "http://myserver:11434"
+    def test_llm_result_dataclass(self):
+        from picarones.llm.base import LLMResult
+        r = LLMResult(model_id="gpt-4o", text="bonjour", duration_seconds=1.2)
+        assert r.success is True
+        r_err = LLMResult(model_id="gpt-4o", text="", duration_seconds=0.1, error="fail")
+        assert r_err.success is False
+    def test_missing_api_key_raises(self):
+        import os
+        from picarones.llm.openai_adapter import OpenAIAdapter
+        adapter = OpenAIAdapter()
+        adapter._api_key = None  # simuler clé manquante
+        with pytest.raises(RuntimeError, match="OPENAI_API_KEY"):
+            adapter._call("test prompt")
+# ---------------------------------------------------------------------------
+# OCRLLMPipeline — prompt loading, name, steps
+# ---------------------------------------------------------------------------
+class TestOCRLLMPipeline:
+    def _mock_llm(self, response: str = "texte corrigé"):
+        """Crée un adaptateur LLM mock qui retourne toujours la même réponse."""
+        from picarones.llm.base import BaseLLMAdapter
+        class MockLLM(BaseLLMAdapter):
+            @property
+            def name(self): return "mock"
+            @property
+            def default_model(self): return "mock-v1"
+            def _call(self, prompt, image_b64=None): return response
+        return MockLLM()
+    def test_load_builtin_prompt(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.TEXT_ONLY,
+            prompt="correction_medieval_french.txt",
+        )
+        assert "{ocr_output}" in pipeline._prompt_template
+    def test_prompt_substitution_text_only(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.TEXT_ONLY,
+            prompt="correction_medieval_french.txt",
+        )
+        built = pipeline._build_prompt(ocr_text="mon texte ocr")
+        assert "mon texte ocr" in built
+        assert "{ocr_output}" not in built
+    def test_auto_name_text_only(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        from picarones.engines.tesseract import TesseractEngine
+        pipeline = OCRLLMPipeline(
+            ocr_engine=TesseractEngine(),
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.TEXT_ONLY,
+        )
+        assert "tesseract" in pipeline.name.lower()
+        assert "mock-v1" in pipeline.name
+    def test_auto_name_zero_shot(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.ZERO_SHOT,
+        )
+        assert "zero-shot" in pipeline.name
+    def test_custom_name(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.TEXT_ONLY,
+            pipeline_name="mon_pipeline_custom",
+        )
+        assert pipeline.name == "mon_pipeline_custom"
+    def test_pipeline_steps_without_ocr(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.ZERO_SHOT,
+        )
+        steps = pipeline._build_steps_info()
+        assert len(steps) == 1
+        assert steps[0]["type"] == "llm"
+        assert steps[0]["mode"] == "zero_shot"
+    def test_pipeline_steps_with_ocr(self):
+        from picarones.engines.tesseract import TesseractEngine
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            ocr_engine=TesseractEngine(),
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.TEXT_ONLY,
+        )
+        steps = pipeline._build_steps_info()
+        assert len(steps) == 2
+        assert steps[0]["type"] == "ocr"
+        assert steps[1]["type"] == "llm"
+    def test_load_nonexistent_prompt_raises(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        with pytest.raises(FileNotFoundError):
+            OCRLLMPipeline(
+                llm_adapter=self._mock_llm(),
+                mode=PipelineMode.TEXT_ONLY,
+                prompt="inexistant_prompt_xyz.txt",
+            )
+    def test_text_only_requires_ocr_engine(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.TEXT_ONLY,
+        )
+        with pytest.raises(ValueError, match="ocr_engine"):
+            pipeline._run_ocr(Path("/nonexistent/image.jpg"))
+    def test_is_pipeline_flag(self):
+        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
+        from picarones.engines.base import BaseOCREngine
+        pipeline = OCRLLMPipeline(
+            llm_adapter=self._mock_llm(),
+            mode=PipelineMode.ZERO_SHOT,
+        )
+        # Doit être utilisable comme BaseOCREngine
+        assert isinstance(pipeline, BaseOCREngine)
+# ---------------------------------------------------------------------------
+# Intégration fixtures — pipeline tesseract → gpt-4o
+# ---------------------------------------------------------------------------
+class TestFixturesPipeline:
+    @pytest.fixture(scope="class")
+    def benchmark(self):
+        from picarones.fixtures import generate_sample_benchmark
+        return generate_sample_benchmark(n_docs=3, seed=42)
+    def test_pipeline_engine_present(self, benchmark):
+        names = [r.engine_name for r in benchmark.engine_reports]
+        assert "tesseract → gpt-4o" in names
+    def test_pipeline_report_has_pipeline_info(self, benchmark):
+        report = next(r for r in benchmark.engine_reports if r.engine_name == "tesseract → gpt-4o")
+        assert report.is_pipeline
+        assert report.pipeline_info.get("pipeline_mode") == "text_and_image"
+        assert report.pipeline_info.get("llm_model") == "gpt-4o"
+    def test_pipeline_documents_have_ocr_intermediate(self, benchmark):
+        report = next(r for r in benchmark.engine_reports if r.engine_name == "tesseract → gpt-4o")
+        for dr in report.document_results:
+            assert dr.ocr_intermediate is not None, f"ocr_intermediate manquant sur {dr.doc_id}"
+            assert len(dr.ocr_intermediate) > 0
+    def test_pipeline_documents_have_over_normalization(self, benchmark):
+        report = next(r for r in benchmark.engine_reports if r.engine_name == "tesseract → gpt-4o")
+        for dr in report.document_results:
+            on = dr.pipeline_metadata.get("over_normalization")
+            assert on is not None, f"over_normalization manquant sur {dr.doc_id}"
+            assert "score" in on
+            assert "total_correct_ocr_words" in on
+    def test_pipeline_report_has_aggregated_over_normalization(self, benchmark):
+        report = next(r for r in benchmark.engine_reports if r.engine_name == "tesseract → gpt-4o")
+        on = report.pipeline_info.get("over_normalization")
+        assert on is not None
+        assert "score" in on
+        assert on["document_count"] == 3
+    def test_pipeline_pipeline_steps_in_info(self, benchmark):
+        report = next(r for r in benchmark.engine_reports if r.engine_name == "tesseract → gpt-4o")
+        steps = report.pipeline_info.get("pipeline_steps", [])
+        assert len(steps) == 2
+        assert steps[0]["type"] == "ocr"
+        assert steps[1]["type"] == "llm"
+    def test_non_pipeline_reports_empty_pipeline_info(self, benchmark):
+        for report in benchmark.engine_reports:
+            if report.engine_name != "tesseract → gpt-4o":
+                assert not report.is_pipeline
+                assert report.pipeline_info == {}
+# ---------------------------------------------------------------------------
+# Intégration rapport HTML — pipeline dans les données JSON
+# ---------------------------------------------------------------------------
+class TestReportWithPipeline:
+    @pytest.fixture(scope="class")
+    def report_data(self):
+        from picarones.fixtures import generate_sample_benchmark
+        from picarones.report.generator import _build_report_data
+        bm = generate_sample_benchmark(n_docs=3, seed=42)
+        images_b64 = bm.metadata.get("_images_b64", {})
+        return _build_report_data(bm, images_b64)
+    def test_pipeline_engine_in_data(self, report_data):
+        names = [e["name"] for e in report_data["engines"]]
+        assert "tesseract → gpt-4o" in names
+    def test_pipeline_engine_has_is_pipeline_flag(self, report_data):
+        pipeline_e = next(e for e in report_data["engines"] if e["name"] == "tesseract → gpt-4o")
+        assert pipeline_e["is_pipeline"] is True
+    def test_non_pipeline_engines_not_flagged(self, report_data):
+        for e in report_data["engines"]:
+            if e["name"] != "tesseract → gpt-4o":
+                assert e["is_pipeline"] is False
+    def test_pipeline_has_over_normalization_in_info(self, report_data):
+        pipeline_e = next(e for e in report_data["engines"] if e["name"] == "tesseract → gpt-4o")
+        pi = pipeline_e.get("pipeline_info", {})
+        assert pi.get("over_normalization") is not None
+    def test_document_results_have_ocr_intermediate(self, report_data):
+        for doc in report_data["documents"]:
+            pipeline_er = next(
+                (er for er in doc["engine_results"] if er["engine"] == "tesseract → gpt-4o"),
+                None,
+            )
+            assert pipeline_er is not None
+            assert "ocr_intermediate" in pipeline_er
+            assert "ocr_diff" in pipeline_er
+            assert "llm_correction_diff" in pipeline_er
+    def test_document_results_have_over_normalization(self, report_data):
+        for doc in report_data["documents"]:
+            pipeline_er = next(
+                (er for er in doc["engine_results"] if er["engine"] == "tesseract → gpt-4o"),
+                None,
+            )
+            assert pipeline_er is not None
+            assert "over_normalization" in pipeline_er
+    def test_html_contains_pipeline_tag(self, tmp_path):
+        from picarones.fixtures import generate_sample_benchmark
+        from picarones.report.generator import ReportGenerator
+        bm = generate_sample_benchmark(n_docs=3, seed=42)
+        out = tmp_path / "report.html"
+        ReportGenerator(bm).generate(out)
+        html = out.read_text(encoding="utf-8")
+        assert "pipeline" in html.lower()
+        assert "tesseract" in html