Spaces:
Running
Sprint 3 — Pipelines OCR+LLM, adaptateurs LLM, sur-normalisation (classe 10)
Browse files
Modules créés :
- picarones/llm/ : BaseLLMAdapter + adaptateurs OpenAI (GPT-4o), Anthropic
(Claude Sonnet), Mistral Large, Ollama (modèles locaux)
- picarones/pipelines/ : OCRLLMPipeline (3 modes : text_only, text_and_image,
zero_shot) + détection sur-normalisation (classe 10 de la taxonomie)
- picarones/prompts/ : bibliothèque de 5 prompts .txt versionnés pour
documents en français médiéval et imprimés anciens
Comportement :
- OCRLLMPipeline étend BaseOCREngine : un pipeline est un concurrent normal
dans run_benchmark (mêmes métriques CER/WER)
- Sur-normalisation (classe 10) calculée automatiquement par document et
agrégée dans EngineReport.pipeline_info
- Concurrent fictif « tesseract → gpt-4o » ajouté aux fixtures de démo
Rapport HTML :
- Badge « ⛓ pipeline » + étapes OCR→LLM dans le tableau de classement
- Colonne « Sur-norm. » dans le classement
- Triple-diff dans la vue Document (GT→OCR brut / OCR brut→LLM corrigé)
- Pipeline badge dans les cartes de la galerie
Tests : 154 tests passants (+46 nouveaux tests Sprint 3)
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/core/results.py +26 -3
- picarones/core/runner.py +88 -17
- picarones/fixtures.py +109 -7
- picarones/llm/__init__.py +16 -0
- picarones/llm/anthropic_adapter.py +72 -0
- picarones/llm/base.py +95 -0
- picarones/llm/mistral_adapter.py +69 -0
- picarones/llm/ollama_adapter.py +70 -0
- picarones/llm/openai_adapter.py +68 -0
- picarones/pipelines/__init__.py +14 -0
- picarones/pipelines/base.py +243 -0
- picarones/pipelines/over_normalization.py +121 -0
- picarones/prompts/correction_image_medieval_french.txt +19 -0
- picarones/prompts/correction_imprime_ancien.txt +20 -0
- picarones/prompts/correction_medieval_french.txt +18 -0
- picarones/prompts/zero_shot_imprime_ancien.txt +15 -0
- picarones/prompts/zero_shot_medieval_french.txt +14 -0
- picarones/report/generator.py +132 -7
- rapport_demo.html +0 -0
- tests/test_report.py +5 -3
- tests/test_sprint3_llm_pipelines.py +441 -0
|
@@ -30,9 +30,14 @@ class DocumentResult:
|
|
| 30 |
metrics: MetricsResult
|
| 31 |
duration_seconds: float
|
| 32 |
engine_error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def as_dict(self) -> dict:
|
| 35 |
-
|
| 36 |
"doc_id": self.doc_id,
|
| 37 |
"image_path": self.image_path,
|
| 38 |
"ground_truth": self.ground_truth,
|
|
@@ -41,17 +46,27 @@ class DocumentResult:
|
|
| 41 |
"duration_seconds": self.duration_seconds,
|
| 42 |
"engine_error": self.engine_error,
|
| 43 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
|
| 45 |
|
| 46 |
@dataclass
|
| 47 |
class EngineReport:
|
| 48 |
-
"""Rapport complet d'un moteur sur l'ensemble du corpus."""
|
| 49 |
|
| 50 |
engine_name: str
|
| 51 |
engine_version: str
|
| 52 |
engine_config: dict
|
| 53 |
document_results: list[DocumentResult]
|
| 54 |
aggregated_metrics: dict = field(default_factory=dict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def __post_init__(self) -> None:
|
| 57 |
if not self.aggregated_metrics and self.document_results:
|
|
@@ -69,14 +84,22 @@ class EngineReport:
|
|
| 69 |
wer_stats = self.aggregated_metrics.get("wer", {})
|
| 70 |
return wer_stats.get("mean")
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
def as_dict(self) -> dict:
|
| 73 |
-
|
| 74 |
"engine_name": self.engine_name,
|
| 75 |
"engine_version": self.engine_version,
|
| 76 |
"engine_config": self.engine_config,
|
| 77 |
"aggregated_metrics": self.aggregated_metrics,
|
| 78 |
"document_results": [dr.as_dict() for dr in self.document_results],
|
| 79 |
}
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
@dataclass
|
|
|
|
| 30 |
metrics: MetricsResult
|
| 31 |
duration_seconds: float
|
| 32 |
engine_error: Optional[str] = None
|
| 33 |
+
# Champs spécifiques aux pipelines OCR+LLM
|
| 34 |
+
ocr_intermediate: Optional[str] = None
|
| 35 |
+
"""Sortie OCR brute avant correction LLM (None pour les moteurs OCR seuls)."""
|
| 36 |
+
pipeline_metadata: dict = field(default_factory=dict)
|
| 37 |
+
"""Métadonnées du pipeline : mode, prompt, over-normalization…"""
|
| 38 |
|
| 39 |
def as_dict(self) -> dict:
|
| 40 |
+
d = {
|
| 41 |
"doc_id": self.doc_id,
|
| 42 |
"image_path": self.image_path,
|
| 43 |
"ground_truth": self.ground_truth,
|
|
|
|
| 46 |
"duration_seconds": self.duration_seconds,
|
| 47 |
"engine_error": self.engine_error,
|
| 48 |
}
|
| 49 |
+
if self.ocr_intermediate is not None:
|
| 50 |
+
d["ocr_intermediate"] = self.ocr_intermediate
|
| 51 |
+
if self.pipeline_metadata:
|
| 52 |
+
d["pipeline_metadata"] = self.pipeline_metadata
|
| 53 |
+
return d
|
| 54 |
|
| 55 |
|
| 56 |
@dataclass
|
| 57 |
class EngineReport:
|
| 58 |
+
"""Rapport complet d'un moteur (ou pipeline) sur l'ensemble du corpus."""
|
| 59 |
|
| 60 |
engine_name: str
|
| 61 |
engine_version: str
|
| 62 |
engine_config: dict
|
| 63 |
document_results: list[DocumentResult]
|
| 64 |
aggregated_metrics: dict = field(default_factory=dict)
|
| 65 |
+
pipeline_info: dict = field(default_factory=dict)
|
| 66 |
+
"""Métadonnées du pipeline OCR+LLM (vide pour les moteurs OCR seuls).
|
| 67 |
+
Clés typiques : mode, prompt_file, llm_model, llm_provider, pipeline_steps,
|
| 68 |
+
over_normalization (score agrégé, classe 10 de la taxonomie).
|
| 69 |
+
"""
|
| 70 |
|
| 71 |
def __post_init__(self) -> None:
|
| 72 |
if not self.aggregated_metrics and self.document_results:
|
|
|
|
| 84 |
wer_stats = self.aggregated_metrics.get("wer", {})
|
| 85 |
return wer_stats.get("mean")
|
| 86 |
|
| 87 |
+
@property
def is_pipeline(self) -> bool:
    """Return True when this report describes an OCR+LLM pipeline.

    The test is simply whether ``pipeline_info`` is non-empty.
    """
    return bool(self.pipeline_info)
|
| 91 |
+
|
| 92 |
def as_dict(self) -> dict:
    """Serialize the report into a plain dictionary.

    The five core keys are always present. ``pipeline_info`` is added only
    when it is non-empty, so the output for reports without pipeline
    information contains exactly the five core keys.
    """
    payload = dict(
        engine_name=self.engine_name,
        engine_version=self.engine_version,
        engine_config=self.engine_config,
        aggregated_metrics=self.aggregated_metrics,
        document_results=list(doc.as_dict() for doc in self.document_results),
    )
    if not self.pipeline_info:
        return payload
    payload["pipeline_info"] = self.pipeline_info
    return payload
|
| 103 |
|
| 104 |
|
| 105 |
@dataclass
|
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Orchestrateur du benchmark : exécute les moteurs sur le corpus et agrège les résultats."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
@@ -9,7 +9,7 @@ from typing import Optional
|
|
| 9 |
from tqdm import tqdm
|
| 10 |
|
| 11 |
from picarones.core.corpus import Corpus
|
| 12 |
-
from picarones.core.metrics import compute_metrics
|
| 13 |
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
|
| 14 |
from picarones.engines.base import BaseOCREngine
|
| 15 |
|
|
@@ -22,33 +22,37 @@ def run_benchmark(
|
|
| 22 |
output_json: Optional[str | Path] = None,
|
| 23 |
show_progress: bool = True,
|
| 24 |
) -> BenchmarkResult:
|
| 25 |
-
"""Exécute le benchmark d'un ou plusieurs moteurs sur un corpus.
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
les métriques CER
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
Parameters
|
| 32 |
----------
|
| 33 |
corpus:
|
| 34 |
-
Corpus à évaluer
|
| 35 |
engines:
|
| 36 |
-
Liste d'adaptateurs moteurs
|
| 37 |
output_json:
|
| 38 |
-
Chemin optionnel pour écrire le résultat JSON.
|
| 39 |
-
d'écriture disque.
|
| 40 |
show_progress:
|
| 41 |
-
Affiche une barre de progression tqdm
|
| 42 |
|
| 43 |
Returns
|
| 44 |
-------
|
| 45 |
BenchmarkResult
|
| 46 |
-
Objet contenant tous les résultats, agrégations et classement.
|
| 47 |
"""
|
| 48 |
engine_reports: list[EngineReport] = []
|
| 49 |
|
| 50 |
for engine in engines:
|
| 51 |
-
logger.info("Démarrage
|
| 52 |
document_results: list[DocumentResult] = []
|
| 53 |
|
| 54 |
iterator = tqdm(
|
|
@@ -64,9 +68,6 @@ def run_benchmark(
|
|
| 64 |
if ocr_result.success:
|
| 65 |
metrics = compute_metrics(doc.ground_truth, ocr_result.text)
|
| 66 |
else:
|
| 67 |
-
# Moteur en erreur → métriques dégradées avec erreur tracée
|
| 68 |
-
from picarones.core.metrics import MetricsResult
|
| 69 |
-
|
| 70 |
metrics = MetricsResult(
|
| 71 |
cer=1.0, cer_nfc=1.0, cer_caseless=1.0,
|
| 72 |
wer=1.0, wer_normalized=1.0, mer=1.0, wil=1.0,
|
|
@@ -75,6 +76,27 @@ def run_benchmark(
|
|
| 75 |
error=ocr_result.error,
|
| 76 |
)
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
document_results.append(
|
| 79 |
DocumentResult(
|
| 80 |
doc_id=doc.doc_id,
|
|
@@ -84,19 +106,24 @@ def run_benchmark(
|
|
| 84 |
metrics=metrics,
|
| 85 |
duration_seconds=ocr_result.duration_seconds,
|
| 86 |
engine_error=ocr_result.error,
|
|
|
|
|
|
|
| 87 |
)
|
| 88 |
)
|
| 89 |
|
| 90 |
engine_version = engine._safe_version()
|
|
|
|
|
|
|
| 91 |
report = EngineReport(
|
| 92 |
engine_name=engine.name,
|
| 93 |
engine_version=engine_version,
|
| 94 |
engine_config=engine.config,
|
| 95 |
document_results=document_results,
|
|
|
|
| 96 |
)
|
| 97 |
engine_reports.append(report)
|
| 98 |
logger.info(
|
| 99 |
-
"
|
| 100 |
engine.name,
|
| 101 |
(report.mean_cer or 0) * 100,
|
| 102 |
)
|
|
@@ -113,3 +140,47 @@ def run_benchmark(
|
|
| 113 |
logger.info("Résultats écrits dans : %s", path)
|
| 114 |
|
| 115 |
return benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Orchestrateur du benchmark : exécute les moteurs/pipelines sur le corpus et agrège les résultats."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
|
|
|
| 9 |
from tqdm import tqdm
|
| 10 |
|
| 11 |
from picarones.core.corpus import Corpus
|
| 12 |
+
from picarones.core.metrics import MetricsResult, compute_metrics
|
| 13 |
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
|
| 14 |
from picarones.engines.base import BaseOCREngine
|
| 15 |
|
|
|
|
| 22 |
output_json: Optional[str | Path] = None,
|
| 23 |
show_progress: bool = True,
|
| 24 |
) -> BenchmarkResult:
|
| 25 |
+
"""Exécute le benchmark d'un ou plusieurs moteurs/pipelines sur un corpus.
|
| 26 |
|
| 27 |
+
Les pipelines OCR+LLM (``OCRLLMPipeline``) sont traités exactement comme
|
| 28 |
+
les moteurs OCR classiques — ils implémentent la même interface
|
| 29 |
+
``BaseOCREngine`` et produisent les mêmes métriques CER/WER.
|
| 30 |
+
|
| 31 |
+
En supplément, pour les pipelines :
|
| 32 |
+
- La sortie OCR intermédiaire est conservée dans ``DocumentResult.ocr_intermediate``
|
| 33 |
+
- La sur-normalisation LLM (classe 10) est calculée et stockée dans
|
| 34 |
+
``DocumentResult.pipeline_metadata["over_normalization"]``
|
| 35 |
+
- Les stats agrégées de sur-normalisation figurent dans ``EngineReport.pipeline_info``
|
| 36 |
|
| 37 |
Parameters
|
| 38 |
----------
|
| 39 |
corpus:
|
| 40 |
+
Corpus à évaluer.
|
| 41 |
engines:
|
| 42 |
+
Liste d'adaptateurs moteurs ou de pipelines OCR+LLM.
|
| 43 |
output_json:
|
| 44 |
+
Chemin optionnel pour écrire le résultat JSON.
|
|
|
|
| 45 |
show_progress:
|
| 46 |
+
Affiche une barre de progression tqdm.
|
| 47 |
|
| 48 |
Returns
|
| 49 |
-------
|
| 50 |
BenchmarkResult
|
|
|
|
| 51 |
"""
|
| 52 |
engine_reports: list[EngineReport] = []
|
| 53 |
|
| 54 |
for engine in engines:
|
| 55 |
+
logger.info("Démarrage concurrent : %s", engine.name)
|
| 56 |
document_results: list[DocumentResult] = []
|
| 57 |
|
| 58 |
iterator = tqdm(
|
|
|
|
| 68 |
if ocr_result.success:
|
| 69 |
metrics = compute_metrics(doc.ground_truth, ocr_result.text)
|
| 70 |
else:
|
|
|
|
|
|
|
|
|
|
| 71 |
metrics = MetricsResult(
|
| 72 |
cer=1.0, cer_nfc=1.0, cer_caseless=1.0,
|
| 73 |
wer=1.0, wer_normalized=1.0, mer=1.0, wil=1.0,
|
|
|
|
| 76 |
error=ocr_result.error,
|
| 77 |
)
|
| 78 |
|
| 79 |
+
# Extraction des champs pipeline depuis les métadonnées EngineResult
|
| 80 |
+
ocr_intermediate = ocr_result.metadata.get("ocr_intermediate")
|
| 81 |
+
pipeline_meta: dict = {}
|
| 82 |
+
|
| 83 |
+
if ocr_result.metadata.get("is_pipeline"):
|
| 84 |
+
pipeline_meta = {
|
| 85 |
+
"pipeline_mode": ocr_result.metadata.get("pipeline_mode"),
|
| 86 |
+
"prompt_file": ocr_result.metadata.get("prompt_file"),
|
| 87 |
+
"llm_model": ocr_result.metadata.get("llm_model"),
|
| 88 |
+
"llm_provider": ocr_result.metadata.get("llm_provider"),
|
| 89 |
+
}
|
| 90 |
+
# Calcul de la sur-normalisation (classe 10) si OCR intermédiaire disponible
|
| 91 |
+
if ocr_intermediate is not None and ocr_result.success:
|
| 92 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 93 |
+
over_norm = detect_over_normalization(
|
| 94 |
+
ground_truth=doc.ground_truth,
|
| 95 |
+
ocr_text=ocr_intermediate,
|
| 96 |
+
llm_text=ocr_result.text,
|
| 97 |
+
)
|
| 98 |
+
pipeline_meta["over_normalization"] = over_norm.as_dict()
|
| 99 |
+
|
| 100 |
document_results.append(
|
| 101 |
DocumentResult(
|
| 102 |
doc_id=doc.doc_id,
|
|
|
|
| 106 |
metrics=metrics,
|
| 107 |
duration_seconds=ocr_result.duration_seconds,
|
| 108 |
engine_error=ocr_result.error,
|
| 109 |
+
ocr_intermediate=ocr_intermediate,
|
| 110 |
+
pipeline_metadata=pipeline_meta,
|
| 111 |
)
|
| 112 |
)
|
| 113 |
|
| 114 |
engine_version = engine._safe_version()
|
| 115 |
+
pipeline_info = _build_pipeline_info(engine, document_results)
|
| 116 |
+
|
| 117 |
report = EngineReport(
|
| 118 |
engine_name=engine.name,
|
| 119 |
engine_version=engine_version,
|
| 120 |
engine_config=engine.config,
|
| 121 |
document_results=document_results,
|
| 122 |
+
pipeline_info=pipeline_info,
|
| 123 |
)
|
| 124 |
engine_reports.append(report)
|
| 125 |
logger.info(
|
| 126 |
+
"Concurrent %s terminé — CER moyen : %.2f%%",
|
| 127 |
engine.name,
|
| 128 |
(report.mean_cer or 0) * 100,
|
| 129 |
)
|
|
|
|
| 140 |
logger.info("Résultats écrits dans : %s", path)
|
| 141 |
|
| 142 |
return benchmark
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def _build_pipeline_info(engine: BaseOCREngine, doc_results: list[DocumentResult]) -> dict:
    """Assemble the ``pipeline_info`` dictionary stored on an ``EngineReport``.

    Parameters
    ----------
    engine:
        Engine (or pipeline) that produced ``doc_results``. When it is an
        ``OCRLLMPipeline`` its step description and prompt template are added.
    doc_results:
        Per-document results of that engine.

    Returns
    -------
    dict
        Empty when no document carries pipeline metadata. Otherwise the
        descriptive fields of the first document that has metadata, plus an
        ``over_normalization`` summary when at least one document has one.
    """
    # Descriptive fields are taken from the first document with metadata.
    for candidate in doc_results:
        if candidate.pipeline_metadata:
            source_meta = candidate.pipeline_metadata
            break
    else:
        return {}

    info: dict = {
        field_name: source_meta.get(field_name)
        for field_name in ("pipeline_mode", "prompt_file", "llm_model", "llm_provider")
    }

    # Best effort: the pipelines package may be unavailable, in which case the
    # step description is simply left out.
    try:
        from picarones.pipelines.base import OCRLLMPipeline
        if isinstance(engine, OCRLLMPipeline):
            info["pipeline_steps"] = engine._build_steps_info()
            info["prompt_template"] = engine._prompt_template
    except ImportError:
        pass

    # Pool the per-document over-normalization counters (taxonomy class 10).
    collected = []
    for result in doc_results:
        entry = result.pipeline_metadata.get("over_normalization")
        if entry is not None:
            collected.append(entry)

    if collected:
        correct_words = sum([entry["total_correct_ocr_words"] for entry in collected])
        flagged_words = sum([entry["over_normalized_count"] for entry in collected])
        if correct_words > 0:
            score = round(flagged_words / correct_words, 4)
        else:
            score = 0.0  # nothing to over-normalize: avoid dividing by zero
        info["over_normalization"] = {
            "score": score,
            "total_correct_ocr_words": correct_words,
            "over_normalized_count": flagged_words,
            "document_count": len(collected),
        }

    return info
|
|
@@ -17,6 +17,7 @@ from typing import Optional
|
|
| 17 |
|
| 18 |
from picarones.core.metrics import MetricsResult, aggregate_metrics
|
| 19 |
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
|
|
|
|
| 20 |
|
| 21 |
# ---------------------------------------------------------------------------
|
| 22 |
# Textes GT réalistes (documents patrimoniaux BnF)
|
|
@@ -76,6 +77,38 @@ def _pero_errors(text: str, rng: random.Random) -> str:
|
|
| 76 |
return text
|
| 77 |
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def _bad_engine_errors(text: str, rng: random.Random) -> str:
|
| 80 |
"""Moteur de mauvaise qualité : nombreuses erreurs."""
|
| 81 |
words = text.split()
|
|
@@ -182,16 +215,44 @@ def generate_sample_benchmark(
|
|
| 182 |
n_docs = min(n_docs, len(_GT_TEXTS))
|
| 183 |
gt_texts = _GT_TEXTS[:n_docs]
|
| 184 |
|
|
|
|
| 185 |
engines_config = [
|
| 186 |
-
("pero_ocr", "0.7.2", {"config": "/models/pero_printed.ini"}, _pero_errors),
|
| 187 |
-
("tesseract", "5.3.3", {"lang": "fra", "psm": 6}, _tesseract_errors),
|
| 188 |
-
("ancien_moteur", "2.1.0", {"lang": "fra"}, _bad_engine_errors),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
]
|
| 190 |
|
| 191 |
engine_reports: list[EngineReport] = []
|
| 192 |
image_b64_cache: dict[str, str] = {}
|
| 193 |
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
| 195 |
doc_results: list[DocumentResult] = []
|
| 196 |
|
| 197 |
for i, gt in enumerate(gt_texts):
|
|
@@ -203,8 +264,28 @@ def generate_sample_benchmark(
|
|
| 203 |
png = _make_placeholder_png(320, 220, gt[:20])
|
| 204 |
image_b64_cache[doc_id] = _png_to_data_uri(png)
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
metrics = _make_metrics(gt, hypothesis)
|
| 210 |
|
|
@@ -215,15 +296,36 @@ def generate_sample_benchmark(
|
|
| 215 |
ground_truth=gt,
|
| 216 |
hypothesis=hypothesis,
|
| 217 |
metrics=metrics,
|
| 218 |
-
duration_seconds=
|
|
|
|
|
|
|
| 219 |
)
|
| 220 |
)
|
| 221 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
report = EngineReport(
|
| 223 |
engine_name=engine_name,
|
| 224 |
engine_version=engine_version,
|
| 225 |
engine_config=engine_cfg,
|
| 226 |
document_results=doc_results,
|
|
|
|
| 227 |
)
|
| 228 |
engine_reports.append(report)
|
| 229 |
|
|
|
|
| 17 |
|
| 18 |
from picarones.core.metrics import MetricsResult, aggregate_metrics
|
| 19 |
from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
|
| 20 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 21 |
|
| 22 |
# ---------------------------------------------------------------------------
|
| 23 |
# Textes GT réalistes (documents patrimoniaux BnF)
|
|
|
|
| 77 |
return text
|
| 78 |
|
| 79 |
|
| 80 |
+
def _llm_correction(text: str, rng: random.Random) -> str:
    """Simulate GPT-4o post-correction of a Tesseract output (demo fixture).

    Two passes are applied to *text*:

    1. Every known OCR misreading listed below is replaced, everywhere it
       occurs, by the intended historical spelling.
    2. With probability 0.45, one or two randomly chosen historical spellings
       are modernised (first occurrence only) to mimic LLM over-normalization
       (taxonomy class 10), e.g. ``nostre`` -> ``notre``.

    Parameters
    ----------
    text:
        Raw OCR text to "correct".
    rng:
        Random generator driving the over-normalization pass; passing a seeded
        generator makes the output reproducible.

    Returns
    -------
    str
        The simulated LLM output.
    """
    # Typical OCR errors the LLM fixes successfully.
    # NOTE(review): the last two pairs are plain substring replacements, so
    # every "8" and every "oe" in the text is rewritten, not only misreadings.
    good_corrections = [
        ("noltre", "nostre"), ("inaistre", "maistre"),
        # Target is "conseillier" (final "r" included) so that it matches the
        # spelling the over-normalization table below expects.
        ("faictcs", "faictes"), ("conlcillier", "conseillier"),
        ("confideration", "consideracion"), ("Froiflart", "Froissart"),
        ("8", "&"), ("oe", "œ"),
    ]
    for src, tgt in good_corrections:
        text = text.replace(src, tgt)

    # Over-normalization: legitimate historical spellings wrongly modernised.
    # These run on the text already partially corrected by the pass above.
    over_normalizations = [
        ("nostre", "notre"), ("maistre", "maître"),
        ("faictes", "faites"), ("Donné", "donné"),
        ("conseillier", "conseiller"), ("consideracion", "considération"),
    ]
    # Roughly 45% of the documents get over-normalized.
    if rng.random() < 0.45:
        for src, tgt in rng.sample(over_normalizations, k=rng.randint(1, 2)):
            # Only the first occurrence is modernised.
            text = text.replace(src, tgt, 1)

    return text
|
| 110 |
+
|
| 111 |
+
|
| 112 |
def _bad_engine_errors(text: str, rng: random.Random) -> str:
|
| 113 |
"""Moteur de mauvaise qualité : nombreuses erreurs."""
|
| 114 |
words = text.split()
|
|
|
|
| 215 |
n_docs = min(n_docs, len(_GT_TEXTS))
|
| 216 |
gt_texts = _GT_TEXTS[:n_docs]
|
| 217 |
|
| 218 |
+
# (name, version, config, error_fn, is_pipeline, pipeline_info)
|
| 219 |
engines_config = [
|
| 220 |
+
("pero_ocr", "0.7.2", {"config": "/models/pero_printed.ini"}, _pero_errors, False, {}),
|
| 221 |
+
("tesseract", "5.3.3", {"lang": "fra", "psm": 6}, _tesseract_errors, False, {}),
|
| 222 |
+
("ancien_moteur", "2.1.0", {"lang": "fra"}, _bad_engine_errors, False, {}),
|
| 223 |
+
# Pipeline fictif : tesseract → gpt-4o (post-correction image+texte)
|
| 224 |
+
(
|
| 225 |
+
"tesseract → gpt-4o",
|
| 226 |
+
"ocr=5.3.3; llm=gpt-4o",
|
| 227 |
+
{"lang": "fra", "psm": 6},
|
| 228 |
+
_llm_correction, # appliqué sur la sortie tesseract
|
| 229 |
+
True,
|
| 230 |
+
{
|
| 231 |
+
"pipeline_mode": "text_and_image",
|
| 232 |
+
"prompt_file": "correction_medieval_french.txt",
|
| 233 |
+
"llm_model": "gpt-4o",
|
| 234 |
+
"llm_provider": "openai",
|
| 235 |
+
"pipeline_steps": [
|
| 236 |
+
{"type": "ocr", "engine": "tesseract", "version": "5.3.3"},
|
| 237 |
+
{
|
| 238 |
+
"type": "llm",
|
| 239 |
+
"model": "gpt-4o",
|
| 240 |
+
"provider": "openai",
|
| 241 |
+
"mode": "text_and_image",
|
| 242 |
+
"prompt_file": "correction_medieval_french.txt",
|
| 243 |
+
},
|
| 244 |
+
],
|
| 245 |
+
},
|
| 246 |
+
),
|
| 247 |
]
|
| 248 |
|
| 249 |
engine_reports: list[EngineReport] = []
|
| 250 |
image_b64_cache: dict[str, str] = {}
|
| 251 |
|
| 252 |
+
# Pré-calculer les sorties tesseract pour le pipeline
|
| 253 |
+
tess_outputs: dict[str, str] = {}
|
| 254 |
+
|
| 255 |
+
for engine_name, engine_version, engine_cfg, error_fn, is_pipeline, pipeline_info in engines_config:
|
| 256 |
doc_results: list[DocumentResult] = []
|
| 257 |
|
| 258 |
for i, gt in enumerate(gt_texts):
|
|
|
|
| 264 |
png = _make_placeholder_png(320, 220, gt[:20])
|
| 265 |
image_b64_cache[doc_id] = _png_to_data_uri(png)
|
| 266 |
|
| 267 |
+
if is_pipeline:
|
| 268 |
+
# Pour le pipeline : appliquer tesseract d'abord, puis LLM correction
|
| 269 |
+
ocr_intermediate = tess_outputs.get(doc_id) or _tesseract_errors(gt, random.Random(rng.randint(0, 9999)))
|
| 270 |
+
hypothesis = _llm_correction(ocr_intermediate, rng)
|
| 271 |
+
# Calcul de la sur-normalisation (classe 10)
|
| 272 |
+
over_norm = detect_over_normalization(gt, ocr_intermediate, hypothesis)
|
| 273 |
+
pipeline_meta = {
|
| 274 |
+
"pipeline_mode": pipeline_info.get("pipeline_mode"),
|
| 275 |
+
"prompt_file": pipeline_info.get("prompt_file"),
|
| 276 |
+
"llm_model": pipeline_info.get("llm_model"),
|
| 277 |
+
"llm_provider": pipeline_info.get("llm_provider"),
|
| 278 |
+
"over_normalization": over_norm.as_dict(),
|
| 279 |
+
}
|
| 280 |
+
duration = round(rng.uniform(2.5, 12.0), 3) # plus lent qu'un OCR seul
|
| 281 |
+
else:
|
| 282 |
+
ocr_intermediate = None
|
| 283 |
+
hypothesis = error_fn(gt, rng)
|
| 284 |
+
pipeline_meta = {}
|
| 285 |
+
duration = round(rng.uniform(0.3, 4.5), 3)
|
| 286 |
+
# Mémoriser la sortie tesseract pour le pipeline
|
| 287 |
+
if engine_name == "tesseract":
|
| 288 |
+
tess_outputs[doc_id] = hypothesis
|
| 289 |
|
| 290 |
metrics = _make_metrics(gt, hypothesis)
|
| 291 |
|
|
|
|
| 296 |
ground_truth=gt,
|
| 297 |
hypothesis=hypothesis,
|
| 298 |
metrics=metrics,
|
| 299 |
+
duration_seconds=duration,
|
| 300 |
+
ocr_intermediate=ocr_intermediate,
|
| 301 |
+
pipeline_metadata=pipeline_meta,
|
| 302 |
)
|
| 303 |
)
|
| 304 |
|
| 305 |
+
# Agréger les stats de sur-normalisation pour le pipeline
|
| 306 |
+
effective_pipeline_info = dict(pipeline_info)
|
| 307 |
+
if is_pipeline:
|
| 308 |
+
over_norms = [
|
| 309 |
+
dr.pipeline_metadata.get("over_normalization")
|
| 310 |
+
for dr in doc_results
|
| 311 |
+
if dr.pipeline_metadata.get("over_normalization")
|
| 312 |
+
]
|
| 313 |
+
if over_norms:
|
| 314 |
+
total_correct = sum(r["total_correct_ocr_words"] for r in over_norms)
|
| 315 |
+
total_over = sum(r["over_normalized_count"] for r in over_norms)
|
| 316 |
+
effective_pipeline_info["over_normalization"] = {
|
| 317 |
+
"score": round(total_over / total_correct, 4) if total_correct > 0 else 0.0,
|
| 318 |
+
"total_correct_ocr_words": total_correct,
|
| 319 |
+
"over_normalized_count": total_over,
|
| 320 |
+
"document_count": len(over_norms),
|
| 321 |
+
}
|
| 322 |
+
|
| 323 |
report = EngineReport(
|
| 324 |
engine_name=engine_name,
|
| 325 |
engine_version=engine_version,
|
| 326 |
engine_config=engine_cfg,
|
| 327 |
document_results=doc_results,
|
| 328 |
+
pipeline_info=effective_pipeline_info,
|
| 329 |
)
|
| 330 |
engine_reports.append(report)
|
| 331 |
|
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateurs LLM pour les pipelines OCR+LLM."""
|
| 2 |
+
|
| 3 |
+
from picarones.llm.base import BaseLLMAdapter, LLMResult
|
| 4 |
+
from picarones.llm.anthropic_adapter import AnthropicAdapter
|
| 5 |
+
from picarones.llm.mistral_adapter import MistralAdapter
|
| 6 |
+
from picarones.llm.ollama_adapter import OllamaAdapter
|
| 7 |
+
from picarones.llm.openai_adapter import OpenAIAdapter
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"BaseLLMAdapter",
|
| 11 |
+
"LLMResult",
|
| 12 |
+
"OpenAIAdapter",
|
| 13 |
+
"AnthropicAdapter",
|
| 14 |
+
"MistralAdapter",
|
| 15 |
+
"OllamaAdapter",
|
| 16 |
+
]
|
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur LLM — Anthropic (Claude Sonnet, Claude Haiku)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from picarones.llm.base import BaseLLMAdapter
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AnthropicAdapter(BaseLLMAdapter):
    """Adapter for Anthropic Claude models.

    The API key is read from the ``ANTHROPIC_API_KEY`` environment variable
    when the adapter is constructed.

    Supported modes: text_only, text_and_image, zero_shot.

    Recognised ``config`` keys: ``temperature`` (default 0.0), ``max_tokens``
    (default 4096) and ``image_media_type`` (overrides the media type that is
    otherwise guessed from the image data, with PNG as the fallback).
    """

    # Base64 encodings of the leading magic bytes of common image formats,
    # used to pick a media type matching the payload.
    _MEDIA_TYPE_BY_B64_PREFIX = (
        ("iVBORw0KGgo", "image/png"),
        ("/9j/", "image/jpeg"),
        ("R0lGOD", "image/gif"),
        ("UklGR", "image/webp"),
    )

    @property
    def name(self) -> str:
        return "anthropic"

    @property
    def default_model(self) -> str:
        return "claude-sonnet-4-6"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        self._api_key = os.environ.get("ANTHROPIC_API_KEY")

    @classmethod
    def _guess_media_type(cls, image_b64: str) -> str:
        """Guess the image media type from its base64 prefix (PNG if unknown)."""
        for prefix, media_type in cls._MEDIA_TYPE_BY_B64_PREFIX:
            if image_b64.startswith(prefix):
                return media_type
        return "image/png"

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Send *prompt* (and optionally an image) to the Messages API.

        Parameters
        ----------
        prompt:
            Final prompt text.
        image_b64:
            Base64-encoded image, or None for a text-only call.

        Returns
        -------
        str
            Concatenation of the text blocks of the reply.

        Raises
        ------
        RuntimeError
            If the API key is missing, the ``anthropic`` package is not
            installed, or the reply contains no text block.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Anthropic manquante — définissez la variable d'environnement ANTHROPIC_API_KEY"
            )
        try:
            import anthropic
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'anthropic' n'est pas installé. Lancez : pip install anthropic"
            ) from exc

        client = anthropic.Anthropic(api_key=self._api_key)
        temperature = float(self.config.get("temperature", 0.0))
        max_tokens = int(self.config.get("max_tokens", 4096))

        if image_b64:
            # An explicit configuration wins; otherwise match the payload so a
            # non-PNG image is not declared as PNG.
            media_type = self.config.get("image_media_type") or self._guess_media_type(image_b64)
            content: list | str = [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": image_b64,
                    },
                },
                {"type": "text", "text": prompt},
            ]
        else:
            content = prompt

        response = client.messages.create(
            model=self.model,
            max_tokens=max_tokens,
            temperature=temperature,
            messages=[{"role": "user", "content": content}],
        )

        # Keep only text blocks; an empty or text-less reply gets an explicit
        # error instead of an opaque IndexError/AttributeError.
        text_parts = [
            part.text
            for part in (response.content or [])
            if getattr(part, "type", None) == "text"
        ]
        if not text_parts:
            raise RuntimeError(
                "Réponse Anthropic sans bloc de texte "
                f"(stop_reason={getattr(response, 'stop_reason', None)!r})"
            )
        return "".join(text_parts)
|
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Interface abstraite commune à tous les adaptateurs LLM."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
from abc import ABC, abstractmethod
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class LLMResult:
    """Outcome of a single LLM call."""

    model_id: str  # model the request was sent to
    text: str  # generated text ("" when the call failed)
    duration_seconds: float  # wall-clock duration of the call
    tokens_used: Optional[int] = None  # token count, when the adapter reports one
    error: Optional[str] = None  # error description, None on success

    @property
    def success(self) -> bool:
        """Return True when no error was recorded."""
        return self.error is None


class BaseLLMAdapter(ABC):
    """Base class for all LLM adapters.

    Each adapter must implement:
    - ``name``: provider identifier (e.g. 'openai')
    - ``default_model``: model used when none is given explicitly
    - ``_call()``: the actual API call, returning the raw text

    API keys are read from environment variables only.
    """

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        self.config: dict = config or {}
        self.model: str = model or self.default_model

    @property
    @abstractmethod
    def name(self) -> str:
        """Provider identifier (e.g. 'openai', 'anthropic')."""

    @property
    @abstractmethod
    def default_model(self) -> str:
        """Model used when none is provided explicitly."""

    @abstractmethod
    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Perform the actual LLM call.

        Parameters
        ----------
        prompt:
            Final prompt text (variables already substituted).
        image_b64:
            Base64-encoded image (without data-URI prefix).
            None for text-only calls.

        Returns
        -------
        str
            Text generated by the LLM.
        """

    def complete(
        self,
        prompt: str,
        image_b64: Optional[str] = None,
    ) -> LLMResult:
        """Public entry point: call the LLM and time the call.

        Any exception raised by ``_call`` is captured and reported through
        ``LLMResult.error`` (with an empty ``text``) instead of propagating.
        """
        start = time.perf_counter()
        try:
            text = self._call(prompt, image_b64)
            error = None
        except Exception as exc:  # noqa: BLE001
            text = ""
            # Exceptions raised without a message stringify to ""; fall back
            # to the exception type so the failure stays identifiable.
            error = str(exc) or type(exc).__name__
        duration = time.perf_counter() - start
        return LLMResult(
            model_id=self.model,
            text=text,
            duration_seconds=round(duration, 4),
            error=error,
        )

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(model={self.model!r})"
|
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur LLM — Mistral AI (Mistral Large, Pixtral)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from picarones.llm.base import BaseLLMAdapter
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class MistralAdapter(BaseLLMAdapter):
    """Adapter for Mistral AI models.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.

    Supported modes: text_only (all models); text_and_image and zero_shot
    with multimodal models (pixtral-12b, pixtral-large).
    """

    @property
    def name(self) -> str:
        return "mistral"

    @property
    def default_model(self) -> str:
        return "mistral-large-latest"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        # Read once at construction; a missing key is only reported in _call().
        self._api_key = os.getenv("MISTRAL_API_KEY")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Send ``prompt`` (and optionally an image) to the Mistral chat API.

        Reads ``temperature`` (default 0.0) and ``max_tokens`` (default 4096)
        from ``self.config``. Raises ``RuntimeError`` when the API key or the
        ``mistralai`` package is missing.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
            )
        # Imported lazily so the package is only required when actually used.
        try:
            from mistralai import Mistral
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
            ) from exc

        client = Mistral(api_key=self._api_key)
        sampling = {
            "temperature": float(self.config.get("temperature", 0.0)),
            "max_tokens": int(self.config.get("max_tokens", 4096)),
        }

        user_content: list | str = prompt
        if image_b64:
            # The data URI always declares image/png, whatever the source format.
            user_content = [
                {"type": "text", "text": prompt},
                {
                    "type": "image_url",
                    "image_url": f"data:image/png;base64,{image_b64}",
                },
            ]

        reply = client.chat.complete(
            model=self.model,
            messages=[{"role": "user", "content": user_content}],
            **sampling,
        )
        first_choice = reply.choices[0]
        # A None/empty content is normalised to an empty string.
        return first_choice.message.content or ""
|
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur LLM — Ollama (modèles locaux : Llama 3, Gemma, Phi, Mistral local…)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
from picarones.llm.base import BaseLLMAdapter
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class OllamaAdapter(BaseLLMAdapter):
    """Adapter for local models served by Ollama.

    No API key is required. A running Ollama server is needed (by default
    at http://localhost:11434).

    Supported modes:
    - text_only      : all Ollama models
    - text_and_image : multimodal models (llava, bakllava, moondream…)
    - zero_shot      : multimodal models only

    Configuration (via ``config``):
    - ``base_url``    : URL of the Ollama server (default: http://localhost:11434)
    - ``temperature`` : sampling temperature (default: 0.0)
    - ``timeout``     : HTTP timeout in seconds (default: 120)
    """

    @property
    def name(self) -> str:
        return "ollama"

    @property
    def default_model(self) -> str:
        return "llama3"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        # Trailing slashes are stripped so the endpoint path can be appended as-is.
        self._base_url = self.config.get("base_url", "http://localhost:11434").rstrip("/")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """POST the prompt to ``/api/generate`` and return the ``response`` field.

        Raises
        ------
        RuntimeError
            When the server answers with an HTTP error status, or when it
            cannot be reached at all.
        """
        import json
        import urllib.error
        import urllib.request

        temperature = float(self.config.get("temperature", 0.0))
        timeout = float(self.config.get("timeout", 120))
        payload: dict = {
            "model": self.model,
            "prompt": prompt,
            "stream": False,
            "options": {"temperature": temperature},
        }
        if image_b64:
            payload["images"] = [image_b64]

        req = urllib.request.Request(
            f"{self._base_url}/api/generate",
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        try:
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            # HTTPError is a subclass of URLError: the server WAS reached but
            # rejected the request (e.g. unknown model). Report it as such
            # instead of as a connectivity problem.
            try:
                detail = exc.read().decode("utf-8", errors="replace").strip()
            except OSError:
                detail = ""
            suffix = f" : {detail}" if detail else ""
            raise RuntimeError(
                f"Le serveur Ollama sur {self._base_url} a répondu HTTP {exc.code}{suffix}"
            ) from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(
                f"Impossible de joindre le serveur Ollama sur {self._base_url}. "
                f"Vérifiez qu'Ollama est démarré (ollama serve). Erreur : {exc}"
            ) from exc
        return result.get("response", "")
|
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur LLM — OpenAI (GPT-4o, GPT-4o-mini)."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
from typing import Optional
|
| 7 |
+
|
| 8 |
+
from picarones.llm.base import BaseLLMAdapter
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class OpenAIAdapter(BaseLLMAdapter):
    """Adapter for OpenAI models (GPT-4o, GPT-4o-mini).

    The API key is read from the ``OPENAI_API_KEY`` environment variable.

    Supported modes: text_only, text_and_image, zero_shot.
    """

    @property
    def name(self) -> str:
        return "openai"

    @property
    def default_model(self) -> str:
        return "gpt-4o"

    def __init__(
        self,
        model: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(model, config)
        # Read once at construction; a missing key is only reported in _call().
        self._api_key = os.getenv("OPENAI_API_KEY")

    def _call(self, prompt: str, image_b64: Optional[str] = None) -> str:
        """Send ``prompt`` (and optionally an image) to the chat completions API.

        Reads ``temperature`` (default 0.0) and ``max_tokens`` (default 4096)
        from ``self.config``. Raises ``RuntimeError`` when the API key or the
        ``openai`` package is missing.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API OpenAI manquante — définissez la variable d'environnement OPENAI_API_KEY"
            )
        # Imported lazily so the package is only required when actually used.
        try:
            from openai import OpenAI
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'openai' n'est pas installé. Lancez : pip install openai"
            ) from exc

        client = OpenAI(api_key=self._api_key)
        sampling = {
            "temperature": float(self.config.get("temperature", 0.0)),
            "max_tokens": int(self.config.get("max_tokens", 4096)),
        }

        if not image_b64:
            user_content = prompt
        else:
            # The data URI always declares image/png, whatever the source format.
            data_uri = f"data:image/png;base64,{image_b64}"
            user_content = [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": data_uri}},
            ]

        reply = client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": user_content}],
            **sampling,
        )
        first_choice = reply.choices[0]
        # A None/empty content is normalised to an empty string.
        return first_choice.message.content or ""
|
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pipelines OCR+LLM : combinent un moteur OCR avec un LLM de correction."""
|
| 2 |
+
|
| 3 |
+
from picarones.pipelines.base import OCRLLMPipeline, PipelineMode
|
| 4 |
+
from picarones.pipelines.over_normalization import (
|
| 5 |
+
OverNormalizationResult,
|
| 6 |
+
detect_over_normalization,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
__all__ = [
|
| 10 |
+
"OCRLLMPipeline",
|
| 11 |
+
"PipelineMode",
|
| 12 |
+
"OverNormalizationResult",
|
| 13 |
+
"detect_over_normalization",
|
| 14 |
+
]
|
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Pipeline OCR+LLM — présenté comme un concurrent normal dans les benchmarks.
|
| 2 |
+
|
| 3 |
+
Un pipeline compose un moteur OCR et un LLM de correction selon trois modes :
|
| 4 |
+
|
| 5 |
+
text_only → OCR brut ──► LLM (texte seul)
|
| 6 |
+
text_and_image → OCR brut + image ──► LLM multimodal
|
| 7 |
+
zero_shot → image ──► LLM (pas d'OCR amont)
|
| 8 |
+
|
| 9 |
+
La classe ``OCRLLMPipeline`` étend ``BaseOCREngine`` : un pipeline est
|
| 10 |
+
un concurrent comme un autre dans ``run_benchmark``, avec les mêmes métriques
|
| 11 |
+
CER/WER. Les métadonnées spécifiques (étapes, prompt, OCR intermédiaire) sont
|
| 12 |
+
exposées via ``EngineResult.metadata``.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import base64
|
| 18 |
+
import time
|
| 19 |
+
from enum import Enum
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
from typing import Optional
|
| 22 |
+
|
| 23 |
+
from picarones.engines.base import BaseOCREngine, EngineResult
|
| 24 |
+
from picarones.llm.base import BaseLLMAdapter
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class PipelineMode(str, Enum):
    """How the LLM is invoked inside the pipeline."""

    # The LLM receives only the raw OCR text.
    TEXT_ONLY = "text_only"

    # The LLM receives the OCR text AND the image (multimodal mode).
    TEXT_AND_IMAGE = "text_and_image"

    # The LLM receives only the image — no upstream OCR.
    ZERO_SHOT = "zero_shot"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# Directory holding the built-in prompt library.
_PROMPTS_DIR = Path(__file__).parent.parent / "prompts"


def _load_prompt(prompt_path: str | Path) -> str:
    """Load a prompt template from disk.

    Lookup order:
    1. the path as given (absolute, or relative to the current directory);
    2. the same path resolved inside the built-in prompt library.

    Only regular files are accepted, so a directory that happens to carry
    the requested name neither raises ``IsADirectoryError`` nor shadows a
    built-in prompt.

    Parameters
    ----------
    prompt_path:
        Path or file name of the prompt to load.

    Returns
    -------
    str
        Content of the prompt file, decoded as UTF-8.

    Raises
    ------
    FileNotFoundError
        When no candidate location holds a regular file.
    """
    requested = Path(prompt_path)
    # For an absolute path, ``_PROMPTS_DIR / requested`` is ``requested``
    # itself, so the second candidate is a harmless repeat.
    for candidate in (requested, _PROMPTS_DIR / requested):
        if candidate.is_file():
            return candidate.read_text(encoding="utf-8")
    raise FileNotFoundError(
        f"Prompt introuvable : '{prompt_path}'. "
        f"Bibliothèque disponible dans : {_PROMPTS_DIR}"
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def _image_to_b64(image_path: Path) -> str:
    """Return the file's bytes as plain base64 text (no data-URI prefix)."""
    raw_bytes = image_path.read_bytes()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "ascii")
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class OCRLLMPipeline(BaseOCREngine):
    """OCR+LLM pipeline, interchangeable with any OCR engine.

    Parameters
    ----------
    llm_adapter:
        LLM adapter (OpenAI, Anthropic, Mistral, Ollama…).
    mode:
        Correction mode — text_only, text_and_image, or zero_shot.
    prompt:
        Path to a .txt prompt file, or name of a file from the built-in
        library (e.g. ``"correction_medieval_french.txt"``).
        Variables available in the file: ``{ocr_output}`` and ``{image_b64}``.
    ocr_engine:
        Upstream OCR engine. Required for text_only and text_and_image.
        Not used in zero_shot mode.
    pipeline_name:
        Name shown in the report (e.g. ``"tesseract → gpt-4o"``).
        Generated automatically when not provided.
    config:
        Extra parameters forwarded to the base class.

    Examples
    --------
    >>> from picarones.llm import OpenAIAdapter
    >>> from picarones.engines.tesseract import TesseractEngine
    >>> pipeline = OCRLLMPipeline(
    ...     ocr_engine=TesseractEngine({"lang": "fra"}),
    ...     llm_adapter=OpenAIAdapter(model="gpt-4o"),
    ...     mode=PipelineMode.TEXT_AND_IMAGE,
    ...     prompt="correction_medieval_french.txt",
    ... )
    """

    def __init__(
        self,
        llm_adapter: BaseLLMAdapter,
        mode: PipelineMode | str = PipelineMode.TEXT_ONLY,
        prompt: str | Path = "correction_medieval_french.txt",
        ocr_engine: Optional[BaseOCREngine] = None,
        pipeline_name: Optional[str] = None,
        config: Optional[dict] = None,
    ) -> None:
        super().__init__(config)
        self.ocr_engine = ocr_engine
        self.llm_adapter = llm_adapter
        self.mode = PipelineMode(mode)
        self.prompt_path = str(prompt)
        # The template is loaded eagerly, so a missing prompt fails here.
        self._prompt_template = _load_prompt(prompt)

        # Name shown in the report.
        if pipeline_name:
            self._name = pipeline_name
        elif self.mode == PipelineMode.ZERO_SHOT:
            self._name = f"{llm_adapter.model} (zero-shot)"
        elif ocr_engine:
            self._name = f"{ocr_engine.name} → {llm_adapter.model}"
        else:
            self._name = f"pipeline → {llm_adapter.model}"

        # Intermediate OCR output of the latest run (used for over-normalization).
        self._last_ocr_text: Optional[str] = None

    # ------------------------------------------------------------------
    # BaseOCREngine interface
    # ------------------------------------------------------------------

    @property
    def name(self) -> str:
        return self._name

    def version(self) -> str:
        """Return a combined "ocr=…; llm=…" version string."""
        ocr_v = self.ocr_engine._safe_version() if self.ocr_engine else "—"
        return f"ocr={ocr_v}; llm={self.llm_adapter.model}"

    def _run_ocr(self, image_path: Path) -> str:
        """Internal pipeline logic — called by ``run()``.

        Raises
        ------
        ValueError
            When the mode needs an upstream OCR engine and none was given.
        RuntimeError
            When the upstream OCR engine or the LLM reports an error.
        """
        self._last_ocr_text = None

        # Step 1 — upstream OCR (skipped in zero-shot mode).
        ocr_text = ""
        if self.mode != PipelineMode.ZERO_SHOT:
            ocr_text = self._run_upstream_ocr(image_path)
            self._last_ocr_text = ocr_text

        # Step 2 — the image is only sent in the two multimodal modes.
        image_b64: Optional[str] = None
        if self.mode != PipelineMode.TEXT_ONLY:
            image_b64 = _image_to_b64(image_path)

        # Step 3 — LLM call.
        prompt = self._build_prompt(ocr_text=ocr_text, image_b64=image_b64 or "")
        result = self.llm_adapter.complete(prompt, image_b64=image_b64)
        if not result.success:
            raise RuntimeError(f"Erreur LLM ({self.llm_adapter.model}): {result.error}")

        return result.text

    def _run_upstream_ocr(self, image_path: Path) -> str:
        """Run the upstream OCR engine and return its text, failing loudly on error."""
        if self.ocr_engine is None:
            raise ValueError(f"ocr_engine est requis pour le mode {self.mode.value}")
        ocr_result = self.ocr_engine.run(image_path)
        # An OCR failure must not be hidden behind an LLM call on empty text:
        # surface it so the document is reported as failed for this pipeline.
        ocr_error = getattr(ocr_result, "error", None)
        if ocr_error:
            raise RuntimeError(f"Erreur OCR ({self.ocr_engine.name}): {ocr_error}")
        return ocr_result.text

    # ------------------------------------------------------------------
    # run() override that injects the pipeline metadata
    # ------------------------------------------------------------------

    def run(self, image_path: str | Path) -> EngineResult:
        """Run the pipeline and return an EngineResult enriched with metadata.

        Any exception raised while running is captured into
        ``EngineResult.error`` (with an empty ``text``).
        """
        image_path = Path(image_path)
        self._last_ocr_text = None
        start = time.perf_counter()

        try:
            text = self._run_ocr(image_path)
            error = None
        except Exception as exc:  # noqa: BLE001
            text = ""
            error = str(exc)

        duration = time.perf_counter() - start

        metadata: dict = {
            "engine_version": self._safe_version(),
            "pipeline_mode": self.mode.value,
            "prompt_file": self.prompt_path,
            "prompt_template": self._prompt_template,
            "llm_model": self.llm_adapter.model,
            "llm_provider": self.llm_adapter.name,
            "pipeline_steps": self._build_steps_info(),
            "is_pipeline": True,
        }
        # Only present when an upstream OCR step actually produced text.
        if self._last_ocr_text is not None:
            metadata["ocr_intermediate"] = self._last_ocr_text

        return EngineResult(
            engine_name=self.name,
            image_path=str(image_path),
            text=text,
            duration_seconds=round(duration, 4),
            error=error,
            metadata=metadata,
        )

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _build_prompt(self, ocr_text: str = "", image_b64: str = "") -> str:
        """Substitute ``{ocr_output}`` and ``{image_b64}`` in the prompt template.

        The substitution is single-pass: ``{image_b64}`` is replaced only in
        the template's own text, then the pieces are joined with the OCR
        text. A literal ``{image_b64}`` occurring inside the OCR output is
        therefore left untouched.
        """
        template_pieces = self._prompt_template.split("{ocr_output}")
        return ocr_text.join(
            piece.replace("{image_b64}", image_b64) for piece in template_pieces
        )

    def _build_steps_info(self) -> list[dict]:
        """Describe the pipeline steps (optional OCR step, then the LLM step)."""
        steps: list[dict] = []
        if self.ocr_engine:
            steps.append({
                "type": "ocr",
                "engine": self.ocr_engine.name,
                "version": self.ocr_engine._safe_version(),
            })
        steps.append({
            "type": "llm",
            "model": self.llm_adapter.model,
            "provider": self.llm_adapter.name,
            "mode": self.mode.value,
            "prompt_file": self.prompt_path,
        })
        return steps
|
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Détection de la sur-normalisation LLM — Classe 10 de la taxonomie des erreurs.
|
| 2 |
+
|
| 3 |
+
La sur-normalisation désigne le cas où le LLM « corrige » à tort des passages
|
| 4 |
+
déjà bien transcrits par l'OCR, en particulier :
|
| 5 |
+
- modernisation de graphies médiévales légitimes (nostre → notre, faict → fait)
|
| 6 |
+
- normalisation de variantes orthographiques historiques authentiques
|
| 7 |
+
- modification de noms propres ou de termes rares sans erreur OCR initiale
|
| 8 |
+
|
| 9 |
+
Mesure :
|
| 10 |
+
score = nombre de mots (OCR correct → LLM modifié) / nombre de mots OCR corrects
|
| 11 |
+
|
| 12 |
+
Un score élevé indique que le prompt doit être affiné pour mieux préserver
|
| 13 |
+
la graphie originale.
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
from typing import Optional
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class OverNormalizationResult:
    """Over-normalization detection outcome for one document."""

    # Number of words the OCR had right.
    total_correct_ocr_words: int
    # Number of those words that the LLM output no longer matches.
    over_normalized_count: int
    # Example passages; each entry: {"gt": str, "ocr": str, "llm": str}
    over_normalized_passages: list[dict] = field(default_factory=list)

    @property
    def score(self) -> float:
        """Over-normalization score between 0 (nothing degraded) and 1 (all degraded)."""
        denominator = self.total_correct_ocr_words
        if denominator != 0:
            return round(self.over_normalized_count / denominator, 4)
        # No correct OCR word: nothing could have been degraded.
        return 0.0

    def as_dict(self) -> dict:
        """Return a plain-dict summary, keeping at most 20 example passages."""
        summary: dict = {"score": self.score}
        summary["total_correct_ocr_words"] = self.total_correct_ocr_words
        summary["over_normalized_count"] = self.over_normalized_count
        summary["over_normalized_passages"] = self.over_normalized_passages[:20]
        return summary
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def detect_over_normalization(
    ground_truth: str,
    ocr_text: str,
    llm_text: str,
    *,
    max_examples: int = 20,
) -> OverNormalizationResult:
    """Detect LLM over-normalization at word level.

    Algorithm (plain positional alignment, suited to short texts): the three
    texts are split on whitespace and compared word by word, up to the
    length of the shortest one. At each position:
      - if the OCR word equals the ground-truth word, the OCR was correct;
      - if, in addition, the LLM word differs from the ground truth, the LLM
        degraded a correct word → over-normalization.

    Parameters
    ----------
    ground_truth:
        Reference transcription.
    ocr_text:
        Raw OCR engine output (before LLM correction).
    llm_text:
        Output after LLM correction.
    max_examples:
        Maximum number of over-normalization examples kept. The count itself
        is not capped.

    Returns
    -------
    OverNormalizationResult
    """
    correct_ocr_total = 0
    degraded_total = 0
    examples: list[dict] = []

    # zip() stops at the shortest of the three word lists.
    aligned = zip(ground_truth.split(), ocr_text.split(), llm_text.split())
    for ref_word, ocr_word, llm_word in aligned:
        if ocr_word != ref_word:
            # The OCR was already wrong here: not an over-normalization case.
            continue
        correct_ocr_total += 1
        if llm_word == ref_word:
            continue
        degraded_total += 1
        if len(examples) < max_examples:
            examples.append({"gt": ref_word, "ocr": ocr_word, "llm": llm_word})

    return OverNormalizationResult(
        total_correct_ocr_words=correct_ocr_total,
        over_normalized_count=degraded_total,
        over_normalized_passages=examples,
    )
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
def aggregate_over_normalization(results: list[Optional[OverNormalizationResult]]) -> dict:
    """Aggregate over-normalization results across a set of documents.

    ``None`` entries are ignored. Counts are summed over the remaining
    results and the score is recomputed from the totals.

    Returns
    -------
    dict
        Always carries the same four keys: ``score``,
        ``total_correct_ocr_words``, ``over_normalized_count`` and
        ``document_count``. ``score`` is ``None`` when there is no usable
        result, and ``0.0`` when no OCR word was correct.
    """
    valid = [r for r in results if r is not None]

    total_correct = sum(r.total_correct_ocr_words for r in valid)
    total_over = sum(r.over_normalized_count for r in valid)

    score: Optional[float]
    if not valid:
        # Nothing to aggregate: the score is undefined rather than zero.
        score = None
    elif total_correct > 0:
        score = round(total_over / total_correct, 4)
    else:
        score = 0.0

    return {
        "score": score,
        "total_correct_ocr_words": total_correct,
        "over_normalized_count": total_over,
        "document_count": len(valid),
    }
|
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en paléographie et en transcription de documents en français médiéval (XIIe–XVe siècle).
|
| 2 |
+
|
| 3 |
+
On te fournit la sortie brute d'un moteur OCR ET l'image originale du document.
|
| 4 |
+
Ta tâche est de corriger les erreurs de transcription en te basant sur :
|
| 5 |
+
- L'image originale pour vérifier visuellement les passages ambigus
|
| 6 |
+
- Le contexte linguistique et grammatical du français médiéval
|
| 7 |
+
- Les confusions visuelles typiques de l'OCR sur documents anciens : rn/m, l/1, u/n, ſ/f, cl/d
|
| 8 |
+
- Les abréviations et ligatures médiévales visibles sur l'image
|
| 9 |
+
|
| 10 |
+
RÈGLES IMPÉRATIVES :
|
| 11 |
+
1. Retourne UNIQUEMENT le texte corrigé — sans commentaire, sans explication, sans balise
|
| 12 |
+
2. Conserve FIDÈLEMENT la graphie originale : ne modernise PAS l'orthographe
|
| 13 |
+
(nostre ≠ notre, faict ≠ fait, maistre ≠ maître, ledit ≠ le dit)
|
| 14 |
+
3. Utilise l'image pour trancher les cas ambigus — pas pour « améliorer » le style
|
| 15 |
+
4. Conserve la ponctuation et la capitalisation d'origine
|
| 16 |
+
5. En cas de passage illisible sur l'image, conserve la forme OCR avec [?]
|
| 17 |
+
|
| 18 |
+
OCR BRUT :
|
| 19 |
+
{ocr_output}
|
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en typographie historique et en transcription d'imprimés anciens (XVe–XVIIIe siècle).
|
| 2 |
+
|
| 3 |
+
On te fournit la sortie brute d'un moteur OCR appliqué à un imprimé ancien.
|
| 4 |
+
Ta tâche est de corriger les erreurs de transcription en te basant sur :
|
| 5 |
+
- Les conventions typographiques de l'imprimerie ancienne
|
| 6 |
+
- L'usage du s long (ſ) en position initiale et médiane (ſon, maiſon, diſcours)
|
| 7 |
+
- Les ligatures typographiques : fi, fl, ff, ffi, ffl, st, ct, ſt
|
| 8 |
+
- Les confusions de fontes : romain/italique, capitales ornées
|
| 9 |
+
- Les caractères spéciaux : & (et), ꝛ (r rotunda), ÿ, j/i, u/v
|
| 10 |
+
|
| 11 |
+
RÈGLES IMPÉRATIVES :
|
| 12 |
+
1. Retourne UNIQUEMENT le texte corrigé — sans commentaire, sans explication, sans balise
|
| 13 |
+
2. Conserve la graphie de l'époque : ne modernise PAS l'orthographe
|
| 14 |
+
(ſon ≠ son seulement si l'OCR a mal transcrit ; conſeil ≠ conseil)
|
| 15 |
+
3. Respecte les réclames (mots répétés en bas de page/colonne) tels quels
|
| 16 |
+
4. Conserve les chiffres romains, foliotation et pagination d'origine
|
| 17 |
+
5. En cas de doute sur un passage, conserve la forme OCR plutôt que d'inventer
|
| 18 |
+
|
| 19 |
+
OCR BRUT :
|
| 20 |
+
{ocr_output}
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en paléographie et en transcription de documents en français médiéval (XIIe–XVe siècle).
|
| 2 |
+
|
| 3 |
+
On te fournit la sortie brute d'un moteur OCR appliqué à un document patrimonial.
|
| 4 |
+
Ta tâche est de corriger les erreurs de transcription en te basant sur :
|
| 5 |
+
- Le contexte linguistique et grammatical du français médiéval
|
| 6 |
+
- Les confusions visuelles typiques de l'OCR sur documents anciens : rn/m, l/1, u/n, ſ/f, cl/d, ri/n, ii/u
|
| 7 |
+
- Les abréviations courantes : ꝑ (per/par), ꝓ (pro), q̃ (que), p̃ (pre), ā (an), m̃ (men)
|
| 8 |
+
- Les ligatures fréquentes : ct, st, fi, fl, ff, œ, æ
|
| 9 |
+
|
| 10 |
+
RÈGLES IMPÉRATIVES :
|
| 11 |
+
1. Retourne UNIQUEMENT le texte corrigé — sans commentaire, sans explication, sans balise
|
| 12 |
+
2. Conserve FIDÈLEMENT la graphie originale : ne modernise PAS l'orthographe
|
| 13 |
+
(nostre ≠ notre, faict ≠ fait, ledit ≠ le dit, maistre ≠ maître)
|
| 14 |
+
3. Conserve la ponctuation et la capitalisation d'origine
|
| 15 |
+
4. En cas de doute sur un passage, conserve la forme OCR plutôt que d'inventer
|
| 16 |
+
|
| 17 |
+
OCR BRUT :
|
| 18 |
+
{ocr_output}
|
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en typographie historique et en transcription d'imprimés anciens (XVe–XVIIIe siècle).
|
| 2 |
+
|
| 3 |
+
On te fournit l'image d'une page d'imprimé ancien (incunable, livre du XVIe–XVIIIe siècle).
|
| 4 |
+
Ta tâche est de transcrire fidèlement le texte imprimé visible sur l'image.
|
| 5 |
+
|
| 6 |
+
RÈGLES IMPÉRATIVES :
|
| 7 |
+
1. Retourne UNIQUEMENT la transcription — sans commentaire, sans titre, sans balise
|
| 8 |
+
2. Conserve les conventions typographiques de l'époque :
|
| 9 |
+
- s long (ſ) en position initiale et médiane : ſon, maiſon, diſcours
|
| 10 |
+
- ligatures typographiques : fi, fl, ff, ffi, ffl, st, ct
|
| 11 |
+
- & pour et, ÿ, j/i interchangeables selon l'époque
|
| 12 |
+
3. Respecte la mise en page : colonnes, titres courants, réclames, foliotation
|
| 13 |
+
4. Conserve la capitalisation d'origine — ne la normalise pas
|
| 14 |
+
5. Signale les passages illisibles par [illisible] plutôt que d'inventer
|
| 15 |
+
6. Transcris les chiffres romains tels quels (iij, xiij, MCCCLx…)
|
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Tu es un expert en paléographie médiévale spécialisé dans la transcription de manuscrits en français médiéval (XIIe–XVe siècle).
|
| 2 |
+
|
| 3 |
+
On te fournit l'image d'un folio ou d'une page de document patrimonial.
|
| 4 |
+
Ta tâche est de transcrire fidèlement le texte visible sur l'image.
|
| 5 |
+
|
| 6 |
+
RÈGLES IMPÉRATIVES :
|
| 7 |
+
1. Retourne UNIQUEMENT la transcription — sans commentaire, sans titre, sans balise
|
| 8 |
+
2. Conserve la graphie médiévale exacte : ne modernise PAS l'orthographe
|
| 9 |
+
(nostre, maistre, faict, ledit, &, ꝑ, ꝓ…)
|
| 10 |
+
3. Respecte les abréviations telles qu'elles apparaissent sur le document
|
| 11 |
+
4. Conserve les sauts de ligne et la structure du texte original
|
| 12 |
+
5. Signale les passages illisibles par [illisible] plutôt que d'inventer
|
| 13 |
+
6. Ne transcris que le texte principal — ignore les annotations marginales tardives
|
| 14 |
+
sauf si elles font partie du texte courant
|
|
@@ -69,7 +69,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 69 |
engines_summary = []
|
| 70 |
for report in benchmark.engine_reports:
|
| 71 |
agg = report.aggregated_metrics
|
| 72 |
-
|
| 73 |
"name": report.engine_name,
|
| 74 |
"version": report.engine_version,
|
| 75 |
"cer": _safe(agg.get("cer", {}).get("mean")),
|
|
@@ -87,7 +87,11 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 87 |
for dr in report.document_results
|
| 88 |
if dr.metrics.error is None
|
| 89 |
],
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Documents (vue galerie + vue détail)
|
| 93 |
# On collecte tous les doc_ids depuis le premier moteur
|
|
@@ -113,7 +117,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 113 |
gt = dr.ground_truth
|
| 114 |
image_path = dr.image_path
|
| 115 |
diff_ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
|
| 116 |
-
|
| 117 |
"engine": engine_name,
|
| 118 |
"hypothesis": dr.hypothesis,
|
| 119 |
"cer": _safe(dr.metrics.cer),
|
|
@@ -121,7 +125,18 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 121 |
"duration": dr.duration_seconds,
|
| 122 |
"error": dr.engine_error,
|
| 123 |
"diff": diff_ops,
|
| 124 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
# CER moyen sur ce document (pour le badge galerie)
|
| 127 |
cer_values = [er["cer"] for er in engine_results if er["error"] is None]
|
|
@@ -502,6 +517,42 @@ tbody tr:hover {{ background: #f8fafc; }}
|
|
| 502 |
}}
|
| 503 |
.chart-canvas-wrap {{ position: relative; height: 280px; }}
|
| 504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 505 |
/* ── Misc ─────────────────────────────────────────────────────────── */
|
| 506 |
.badge {{
|
| 507 |
display: inline-block; padding: .15rem .45rem;
|
|
@@ -570,7 +621,7 @@ footer {{
|
|
| 570 |
<thead>
|
| 571 |
<tr>
|
| 572 |
<th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
|
| 573 |
-
<th data-col="name" class="sortable">
|
| 574 |
<th data-col="cer" class="sortable">CER<i class="sort-icon">↕</i></th>
|
| 575 |
<th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
|
| 576 |
<th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
|
|
@@ -578,6 +629,7 @@ footer {{
|
|
| 578 |
<th>CER médian</th>
|
| 579 |
<th>CER min</th>
|
| 580 |
<th>CER max</th>
|
|
|
|
| 581 |
<th>Docs</th>
|
| 582 |
</tr>
|
| 583 |
</thead>
|
|
@@ -826,11 +878,41 @@ function renderRanking() {{
|
|
| 826 |
const badgeClass = rank === 1 ? 'rank-badge rank-1' : 'rank-badge';
|
| 827 |
const cerC = cerColor(e.cer); const cerB = cerBg(e.cer);
|
| 828 |
const barW = Math.min(100, e.cer * 100 * 3);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 829 |
return `<tr>
|
| 830 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 831 |
<td>
|
| 832 |
<span class="engine-name">${{esc(e.name)}}</span>
|
|
|
|
| 833 |
<span class="engine-version">v${{esc(e.version)}}</span>
|
|
|
|
| 834 |
</td>
|
| 835 |
<td>
|
| 836 |
<span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
|
|
@@ -842,16 +924,20 @@ function renderRanking() {{
|
|
| 842 |
<td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
|
| 843 |
<td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
|
| 844 |
<td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
|
|
|
|
| 845 |
<td><span class="pill">${{e.doc_count}}</span></td>
|
| 846 |
</tr>`;
|
| 847 |
}}).join('');
|
| 848 |
|
| 849 |
// Stats globales
|
|
|
|
| 850 |
const stats = document.getElementById('ranking-stats');
|
| 851 |
stats.innerHTML = `
|
| 852 |
<div class="stat">Corpus <b>${{esc(DATA.meta.corpus_name)}}</b></div>
|
| 853 |
<div class="stat">Documents <b>${{DATA.meta.document_count}}</b></div>
|
| 854 |
-
<div class="stat">
|
|
|
|
|
|
|
| 855 |
`;
|
| 856 |
}}
|
| 857 |
|
|
@@ -920,8 +1006,10 @@ function renderGallery() {{
|
|
| 920 |
|
| 921 |
const badges = doc.engine_results.map(er => {{
|
| 922 |
const c = cerColor(er.cer); const bg = cerBg(er.cer);
|
|
|
|
|
|
|
| 923 |
return `<span class="engine-cer-badge" style="color:${{c}};background:${{bg}}"
|
| 924 |
-
title="${{esc(er.engine)}}">${{esc(
|
| 925 |
}}).join('');
|
| 926 |
|
| 927 |
return `<div class="gallery-card" onclick="openDocument('${{esc(doc.doc_id)}}')">
|
|
@@ -987,16 +1075,53 @@ function loadDocument(docId) {{
|
|
| 987 |
const c = cerColor(er.cer); const bg = cerBg(er.cer);
|
| 988 |
const diffHtml = renderDiff(er.diff);
|
| 989 |
const errBadge = er.error ? `<span class="badge" style="background:#fee2e2;color:#dc2626">Erreur</span>` : '';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 990 |
return `<div class="diff-panel">
|
| 991 |
<div class="diff-panel-header">
|
| 992 |
<span class="diff-panel-title">${{esc(er.engine)}}</span>
|
|
|
|
| 993 |
<span class="diff-panel-metrics">
|
| 994 |
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
| 995 |
<span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
|
|
|
|
| 996 |
${{errBadge}}
|
| 997 |
</span>
|
| 998 |
</div>
|
| 999 |
<div class="diff-panel-body">${{diffHtml || '<em style="color:var(--text-muted)">Aucune sortie</em>'}}</div>
|
|
|
|
| 1000 |
</div>`;
|
| 1001 |
}}).join('');
|
| 1002 |
}}
|
|
|
|
| 69 |
engines_summary = []
|
| 70 |
for report in benchmark.engine_reports:
|
| 71 |
agg = report.aggregated_metrics
|
| 72 |
+
entry: dict = {
|
| 73 |
"name": report.engine_name,
|
| 74 |
"version": report.engine_version,
|
| 75 |
"cer": _safe(agg.get("cer", {}).get("mean")),
|
|
|
|
| 87 |
for dr in report.document_results
|
| 88 |
if dr.metrics.error is None
|
| 89 |
],
|
| 90 |
+
# Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
|
| 91 |
+
"is_pipeline": report.is_pipeline,
|
| 92 |
+
"pipeline_info": report.pipeline_info,
|
| 93 |
+
}
|
| 94 |
+
engines_summary.append(entry)
|
| 95 |
|
| 96 |
# Documents (vue galerie + vue détail)
|
| 97 |
# On collecte tous les doc_ids depuis le premier moteur
|
|
|
|
| 117 |
gt = dr.ground_truth
|
| 118 |
image_path = dr.image_path
|
| 119 |
diff_ops = compute_word_diff(dr.ground_truth, dr.hypothesis)
|
| 120 |
+
er_entry: dict = {
|
| 121 |
"engine": engine_name,
|
| 122 |
"hypothesis": dr.hypothesis,
|
| 123 |
"cer": _safe(dr.metrics.cer),
|
|
|
|
| 125 |
"duration": dr.duration_seconds,
|
| 126 |
"error": dr.engine_error,
|
| 127 |
"diff": diff_ops,
|
| 128 |
+
}
|
| 129 |
+
# Champs spécifiques aux pipelines OCR+LLM
|
| 130 |
+
if dr.ocr_intermediate is not None:
|
| 131 |
+
er_entry["ocr_intermediate"] = dr.ocr_intermediate
|
| 132 |
+
er_entry["ocr_diff"] = compute_word_diff(dr.ground_truth, dr.ocr_intermediate)
|
| 133 |
+
er_entry["llm_correction_diff"] = compute_word_diff(dr.ocr_intermediate, dr.hypothesis)
|
| 134 |
+
if dr.pipeline_metadata:
|
| 135 |
+
on = dr.pipeline_metadata.get("over_normalization")
|
| 136 |
+
if on is not None:
|
| 137 |
+
er_entry["over_normalization"] = on
|
| 138 |
+
er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
|
| 139 |
+
engine_results.append(er_entry)
|
| 140 |
|
| 141 |
# CER moyen sur ce document (pour le badge galerie)
|
| 142 |
cer_values = [er["cer"] for er in engine_results if er["error"] is None]
|
|
|
|
| 517 |
}}
|
| 518 |
.chart-canvas-wrap {{ position: relative; height: 280px; }}
|
| 519 |
|
| 520 |
+
/* ── Pipeline badges ──────────────────────────────────────────────── */
|
| 521 |
+
.pipeline-tag {{
|
| 522 |
+
display: inline-flex; align-items: center; gap: .25rem;
|
| 523 |
+
padding: .12rem .38rem;
|
| 524 |
+
border-radius: 4px; font-size: .67rem; font-weight: 700;
|
| 525 |
+
background: #ede9fe; color: #6d28d9;
|
| 526 |
+
letter-spacing: .02em; vertical-align: middle;
|
| 527 |
+
}}
|
| 528 |
+
.pipeline-tag .pipe-arrow {{ opacity: .7; }}
|
| 529 |
+
.over-norm-badge {{
|
| 530 |
+
display: inline-block; padding: .12rem .38rem;
|
| 531 |
+
border-radius: 4px; font-size: .67rem; font-weight: 700;
|
| 532 |
+
background: #fef3c7; color: #b45309;
|
| 533 |
+
}}
|
| 534 |
+
.over-norm-badge.high {{ background: #fee2e2; color: #b91c1c; }}
|
| 535 |
+
/* Vue triple-diff (pipeline) */
|
| 536 |
+
.triple-diff-wrap {{
|
| 537 |
+
display: grid; grid-template-columns: 1fr 1fr; gap: .5rem;
|
| 538 |
+
margin-top: .5rem;
|
| 539 |
+
}}
|
| 540 |
+
.triple-diff-section {{ background: var(--bg); border-radius: 6px; padding: .5rem; }}
|
| 541 |
+
.triple-diff-section h5 {{
|
| 542 |
+
font-size: .73rem; font-weight: 700; color: var(--text-muted);
|
| 543 |
+
margin-bottom: .35rem; text-transform: uppercase; letter-spacing: .04em;
|
| 544 |
+
}}
|
| 545 |
+
.pipeline-steps {{
|
| 546 |
+
display: flex; align-items: center; gap: .3rem; flex-wrap: wrap;
|
| 547 |
+
margin-top: .25rem;
|
| 548 |
+
}}
|
| 549 |
+
.step-chip {{
|
| 550 |
+
padding: .12rem .4rem; border-radius: 4px; font-size: .68rem; font-weight: 600;
|
| 551 |
+
}}
|
| 552 |
+
.step-chip.ocr {{ background: #e0f2fe; color: #0369a1; }}
|
| 553 |
+
.step-chip.llm {{ background: #ede9fe; color: #6d28d9; }}
|
| 554 |
+
.step-arrow {{ color: var(--text-muted); font-size: .8rem; }}
|
| 555 |
+
|
| 556 |
/* ── Misc ─────────────────────────────────────────────────────────── */
|
| 557 |
.badge {{
|
| 558 |
display: inline-block; padding: .15rem .45rem;
|
|
|
|
| 621 |
<thead>
|
| 622 |
<tr>
|
| 623 |
<th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
|
| 624 |
+
<th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
|
| 625 |
<th data-col="cer" class="sortable">CER<i class="sort-icon">↕</i></th>
|
| 626 |
<th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
|
| 627 |
<th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
|
|
|
|
| 629 |
<th>CER médian</th>
|
| 630 |
<th>CER min</th>
|
| 631 |
<th>CER max</th>
|
| 632 |
+
<th title="Classe 10 — Sur-normalisation LLM : taux de mots corrects dégradés par le LLM">Sur-norm.</th>
|
| 633 |
<th>Docs</th>
|
| 634 |
</tr>
|
| 635 |
</thead>
|
|
|
|
| 878 |
const badgeClass = rank === 1 ? 'rank-badge rank-1' : 'rank-badge';
|
| 879 |
const cerC = cerColor(e.cer); const cerB = cerBg(e.cer);
|
| 880 |
const barW = Math.min(100, e.cer * 100 * 3);
|
| 881 |
+
|
| 882 |
+
// Badge pipeline
|
| 883 |
+
let pipelineBadge = '';
|
| 884 |
+
let pipelineStepsHtml = '';
|
| 885 |
+
if (e.is_pipeline && e.pipeline_info) {{
|
| 886 |
+
const pi = e.pipeline_info;
|
| 887 |
+
const modeLabel = {{text_only:'texte', text_and_image:'image+texte', zero_shot:'zero-shot'}}[pi.pipeline_mode] || pi.pipeline_mode || '';
|
| 888 |
+
pipelineBadge = `<span class="pipeline-tag" title="Pipeline OCR+LLM — mode ${{modeLabel}}">
|
| 889 |
+
⛓ pipeline<span class="pipe-arrow">·${{modeLabel}}</span></span>`;
|
| 890 |
+
if (pi.pipeline_steps) {{
|
| 891 |
+
pipelineStepsHtml = `<div class="pipeline-steps">` +
|
| 892 |
+
pi.pipeline_steps.map(s => s.type === 'ocr'
|
| 893 |
+
? `<span class="step-chip ocr">OCR: ${{esc(s.engine)}}</span>`
|
| 894 |
+
: `<span class="step-chip llm">LLM: ${{esc(s.model)}}</span>`
|
| 895 |
+
).join(`<span class="step-arrow">→</span>`) +
|
| 896 |
+
`</div>`;
|
| 897 |
+
}}
|
| 898 |
+
}}
|
| 899 |
+
|
| 900 |
+
// Sur-normalisation (classe 10)
|
| 901 |
+
let overNormCell = '<td style="color:var(--text-muted)">—</td>';
|
| 902 |
+
if (e.is_pipeline && e.pipeline_info && e.pipeline_info.over_normalization) {{
|
| 903 |
+
const on = e.pipeline_info.over_normalization;
|
| 904 |
+
const onPct = (on.score * 100).toFixed(2);
|
| 905 |
+
const cls = on.score > 0.05 ? 'over-norm-badge high' : 'over-norm-badge';
|
| 906 |
+
overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
|
| 907 |
+
}}
|
| 908 |
+
|
| 909 |
return `<tr>
|
| 910 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 911 |
<td>
|
| 912 |
<span class="engine-name">${{esc(e.name)}}</span>
|
| 913 |
+
${{pipelineBadge}}
|
| 914 |
<span class="engine-version">v${{esc(e.version)}}</span>
|
| 915 |
+
${{pipelineStepsHtml}}
|
| 916 |
</td>
|
| 917 |
<td>
|
| 918 |
<span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
|
|
|
|
| 924 |
<td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
|
| 925 |
<td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
|
| 926 |
<td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
|
| 927 |
+
${{overNormCell}}
|
| 928 |
<td><span class="pill">${{e.doc_count}}</span></td>
|
| 929 |
</tr>`;
|
| 930 |
}}).join('');
|
| 931 |
|
| 932 |
// Stats globales
|
| 933 |
+
const pipelineCount = DATA.engines.filter(e => e.is_pipeline).length;
|
| 934 |
const stats = document.getElementById('ranking-stats');
|
| 935 |
stats.innerHTML = `
|
| 936 |
<div class="stat">Corpus <b>${{esc(DATA.meta.corpus_name)}}</b></div>
|
| 937 |
<div class="stat">Documents <b>${{DATA.meta.document_count}}</b></div>
|
| 938 |
+
<div class="stat">Concurrents <b>${{DATA.engines.length}}</b>
|
| 939 |
+
${{pipelineCount ? `<span class="pipeline-tag" style="margin-left:.3rem">${{pipelineCount}} pipeline${{pipelineCount>1?'s':''}}</span>` : ''}}
|
| 940 |
+
</div>
|
| 941 |
`;
|
| 942 |
}}
|
| 943 |
|
|
|
|
| 1006 |
|
| 1007 |
const badges = doc.engine_results.map(er => {{
|
| 1008 |
const c = cerColor(er.cer); const bg = cerBg(er.cer);
|
| 1009 |
+
const isPipe = er.ocr_intermediate !== undefined;
|
| 1010 |
+
const label = isPipe ? '⛓' + er.engine.slice(0,8) : er.engine.slice(0,8);
|
| 1011 |
return `<span class="engine-cer-badge" style="color:${{c}};background:${{bg}}"
|
| 1012 |
+
title="${{esc(er.engine)}}${{isPipe?' (pipeline)':''}}">${{esc(label)}} ${{pct(er.cer,1)}}</span>`;
|
| 1013 |
}}).join('');
|
| 1014 |
|
| 1015 |
return `<div class="gallery-card" onclick="openDocument('${{esc(doc.doc_id)}}')">
|
|
|
|
| 1075 |
const c = cerColor(er.cer); const bg = cerBg(er.cer);
|
| 1076 |
const diffHtml = renderDiff(er.diff);
|
| 1077 |
const errBadge = er.error ? `<span class="badge" style="background:#fee2e2;color:#dc2626">Erreur</span>` : '';
|
| 1078 |
+
|
| 1079 |
+
// Pipeline badge dans l'en-tête du panneau
|
| 1080 |
+
const isPipeline = er.ocr_intermediate !== undefined;
|
| 1081 |
+
const modeLabel = {{text_only:'texte seul', text_and_image:'image+texte', zero_shot:'zero-shot'}}[er.pipeline_mode] || '';
|
| 1082 |
+
const pipeTagPanel = isPipeline
|
| 1083 |
+
? `<span class="pipeline-tag">⛓ ${{modeLabel || 'pipeline'}}</span>` : '';
|
| 1084 |
+
|
| 1085 |
+
// Sur-normalisation (classe 10)
|
| 1086 |
+
let onBadge = '';
|
| 1087 |
+
if (er.over_normalization) {{
|
| 1088 |
+
const on = er.over_normalization;
|
| 1089 |
+
const onPct = (on.score * 100).toFixed(2);
|
| 1090 |
+
const cls = on.score > 0.05 ? 'over-norm-badge high' : 'over-norm-badge';
|
| 1091 |
+
onBadge = `<span class="${{cls}}" title="Classe 10 — sur-normalisation LLM">Sur-norm. ${{onPct}}%</span>`;
|
| 1092 |
+
}}
|
| 1093 |
+
|
| 1094 |
+
// Triple-diff (vue spécifique pipeline) : OCR brut / Correction LLM
|
| 1095 |
+
let tripleDiffHtml = '';
|
| 1096 |
+
if (isPipeline && er.ocr_intermediate) {{
|
| 1097 |
+
const ocrDiffHtml = renderDiff(er.ocr_diff);
|
| 1098 |
+
const llmDiffHtml = renderDiff(er.llm_correction_diff);
|
| 1099 |
+
tripleDiffHtml = `
|
| 1100 |
+
<div class="triple-diff-wrap">
|
| 1101 |
+
<div class="triple-diff-section">
|
| 1102 |
+
<h5>GT → OCR brut</h5>
|
| 1103 |
+
${{ocrDiffHtml || '<em style="color:var(--text-muted)">—</em>'}}
|
| 1104 |
+
</div>
|
| 1105 |
+
<div class="triple-diff-section">
|
| 1106 |
+
<h5>OCR brut → Correction LLM</h5>
|
| 1107 |
+
${{llmDiffHtml || '<em style="color:var(--text-muted)">—</em>'}}
|
| 1108 |
+
</div>
|
| 1109 |
+
</div>`;
|
| 1110 |
+
}}
|
| 1111 |
+
|
| 1112 |
return `<div class="diff-panel">
|
| 1113 |
<div class="diff-panel-header">
|
| 1114 |
<span class="diff-panel-title">${{esc(er.engine)}}</span>
|
| 1115 |
+
${{pipeTagPanel}}
|
| 1116 |
<span class="diff-panel-metrics">
|
| 1117 |
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
| 1118 |
<span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
|
| 1119 |
+
${{onBadge}}
|
| 1120 |
${{errBadge}}
|
| 1121 |
</span>
|
| 1122 |
</div>
|
| 1123 |
<div class="diff-panel-body">${{diffHtml || '<em style="color:var(--text-muted)">Aucune sortie</em>'}}</div>
|
| 1124 |
+
${{tripleDiffHtml}}
|
| 1125 |
</div>`;
|
| 1126 |
}}).join('');
|
| 1127 |
}}
|
|
The diff for this file is too large to render.
See raw diff
|
|
|
|
@@ -32,7 +32,8 @@ class TestGenerateSampleBenchmark:
|
|
| 32 |
assert isinstance(sample_benchmark, BenchmarkResult)
|
| 33 |
|
| 34 |
def test_correct_engine_count(self, sample_benchmark):
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
def test_correct_doc_count(self, sample_benchmark):
|
| 38 |
assert sample_benchmark.document_count == 3
|
|
@@ -88,7 +89,8 @@ class TestBuildReportData:
|
|
| 88 |
|
| 89 |
def test_engines_count(self, sample_benchmark):
|
| 90 |
data = _build_report_data(sample_benchmark, {})
|
| 91 |
-
|
|
|
|
| 92 |
|
| 93 |
def test_engine_fields(self, sample_benchmark):
|
| 94 |
data = _build_report_data(sample_benchmark, {})
|
|
@@ -219,7 +221,7 @@ class TestReportGenerator:
|
|
| 219 |
data = json.loads(match.group(1))
|
| 220 |
assert "engines" in data
|
| 221 |
assert "documents" in data
|
| 222 |
-
assert len(data["engines"]) == 3
|
| 223 |
|
| 224 |
|
| 225 |
# ---------------------------------------------------------------------------
|
|
|
|
| 32 |
assert isinstance(sample_benchmark, BenchmarkResult)
|
| 33 |
|
| 34 |
def test_correct_engine_count(self, sample_benchmark):
|
| 35 |
+
# 3 moteurs OCR + 1 pipeline tesseract → gpt-4o
|
| 36 |
+
assert len(sample_benchmark.engine_reports) == 4
|
| 37 |
|
| 38 |
def test_correct_doc_count(self, sample_benchmark):
|
| 39 |
assert sample_benchmark.document_count == 3
|
|
|
|
| 89 |
|
| 90 |
def test_engines_count(self, sample_benchmark):
|
| 91 |
data = _build_report_data(sample_benchmark, {})
|
| 92 |
+
# 3 moteurs OCR + 1 pipeline tesseract → gpt-4o
|
| 93 |
+
assert len(data["engines"]) == 4
|
| 94 |
|
| 95 |
def test_engine_fields(self, sample_benchmark):
|
| 96 |
data = _build_report_data(sample_benchmark, {})
|
|
|
|
| 221 |
data = json.loads(match.group(1))
|
| 222 |
assert "engines" in data
|
| 223 |
assert "documents" in data
|
| 224 |
+
assert len(data["engines"]) == 4 # 3 OCR + 1 pipeline
|
| 225 |
|
| 226 |
|
| 227 |
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,441 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 3 — Pipelines OCR+LLM, adaptateurs LLM, bibliothèque de prompts, sur-normalisation.
|
| 2 |
+
|
| 3 |
+
Ces tests couvrent :
|
| 4 |
+
- La détection de sur-normalisation LLM (classe 10)
|
| 5 |
+
- L'OCRLLMPipeline : modes, chargement de prompts, métadonnées
|
| 6 |
+
- Les adaptateurs LLM (instanciation, structure)
|
| 7 |
+
- L'intégration dans les fixtures (tesseract → gpt-4o)
|
| 8 |
+
- La présence des données pipeline dans le rapport HTML
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import re
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
|
| 17 |
+
import pytest
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ---------------------------------------------------------------------------
|
| 21 |
+
# Détection de sur-normalisation (classe 10)
|
| 22 |
+
# ---------------------------------------------------------------------------
|
| 23 |
+
|
| 24 |
+
class TestOverNormalization:
|
| 25 |
+
|
| 26 |
+
def test_no_over_normalization(self):
|
| 27 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 28 |
+
gt = "nostre seigneur le roy"
|
| 29 |
+
ocr = "noltre seigneur le roy" # erreur OCR sur 'nostre'
|
| 30 |
+
llm = "nostre seigneur le roy" # LLM corrige → correct
|
| 31 |
+
result = detect_over_normalization(gt, ocr, llm)
|
| 32 |
+
assert result.score == 0.0
|
| 33 |
+
assert result.over_normalized_count == 0
|
| 34 |
+
|
| 35 |
+
def test_perfect_llm_no_over_norm(self):
|
| 36 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 37 |
+
gt = "nostre seigneur le roy"
|
| 38 |
+
ocr = "nostre seigneur le roy" # OCR correct
|
| 39 |
+
llm = "nostre seigneur le roy" # LLM conserve
|
| 40 |
+
result = detect_over_normalization(gt, ocr, llm)
|
| 41 |
+
assert result.score == 0.0
|
| 42 |
+
assert result.total_correct_ocr_words == 4
|
| 43 |
+
|
| 44 |
+
def test_over_normalization_detected(self):
|
| 45 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 46 |
+
gt = "nostre seigneur le roy"
|
| 47 |
+
ocr = "nostre seigneur le roy" # OCR correct
|
| 48 |
+
llm = "notre seigneur le roy" # LLM modifie 'nostre' → 'notre' : sur-normalisation
|
| 49 |
+
result = detect_over_normalization(gt, ocr, llm)
|
| 50 |
+
assert result.over_normalized_count == 1
|
| 51 |
+
assert result.score > 0.0
|
| 52 |
+
assert len(result.over_normalized_passages) == 1
|
| 53 |
+
passage = result.over_normalized_passages[0]
|
| 54 |
+
assert passage["gt"] == "nostre"
|
| 55 |
+
assert passage["ocr"] == "nostre"
|
| 56 |
+
assert passage["llm"] == "notre"
|
| 57 |
+
|
| 58 |
+
def test_over_normalization_score_formula(self):
|
| 59 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 60 |
+
# 4 mots, OCR correct sur tous, LLM modifie 2 → score = 2/4 = 0.5
|
| 61 |
+
gt = "maistre jehan nostre dame"
|
| 62 |
+
ocr = "maistre jehan nostre dame"
|
| 63 |
+
llm = "maître jehan notre dame"
|
| 64 |
+
result = detect_over_normalization(gt, ocr, llm)
|
| 65 |
+
assert result.total_correct_ocr_words == 4
|
| 66 |
+
assert result.over_normalized_count == 2
|
| 67 |
+
assert result.score == pytest.approx(0.5)
|
| 68 |
+
|
| 69 |
+
def test_as_dict_keys(self):
|
| 70 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 71 |
+
result = detect_over_normalization("foo bar", "foo baz", "foo baz")
|
| 72 |
+
d = result.as_dict()
|
| 73 |
+
assert "score" in d
|
| 74 |
+
assert "total_correct_ocr_words" in d
|
| 75 |
+
assert "over_normalized_count" in d
|
| 76 |
+
assert "over_normalized_passages" in d
|
| 77 |
+
|
| 78 |
+
def test_empty_texts(self):
|
| 79 |
+
from picarones.pipelines.over_normalization import detect_over_normalization
|
| 80 |
+
result = detect_over_normalization("", "", "")
|
| 81 |
+
assert result.score == 0.0
|
| 82 |
+
|
| 83 |
+
def test_aggregate_over_normalization(self):
|
| 84 |
+
from picarones.pipelines.over_normalization import (
|
| 85 |
+
OverNormalizationResult,
|
| 86 |
+
aggregate_over_normalization,
|
| 87 |
+
)
|
| 88 |
+
results = [
|
| 89 |
+
OverNormalizationResult(total_correct_ocr_words=10, over_normalized_count=1),
|
| 90 |
+
OverNormalizationResult(total_correct_ocr_words=10, over_normalized_count=2),
|
| 91 |
+
None,
|
| 92 |
+
]
|
| 93 |
+
agg = aggregate_over_normalization(results)
|
| 94 |
+
assert agg["total_correct_ocr_words"] == 20
|
| 95 |
+
assert agg["over_normalized_count"] == 3
|
| 96 |
+
assert agg["score"] == pytest.approx(0.15)
|
| 97 |
+
assert agg["document_count"] == 2
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# ---------------------------------------------------------------------------
|
| 101 |
+
# Bibliothèque de prompts
|
| 102 |
+
# ---------------------------------------------------------------------------
|
| 103 |
+
|
| 104 |
+
class TestPromptsLibrary:
|
| 105 |
+
|
| 106 |
+
_PROMPTS_DIR = Path(__file__).parent.parent / "picarones" / "prompts"
|
| 107 |
+
|
| 108 |
+
def test_prompts_directory_exists(self):
|
| 109 |
+
assert self._PROMPTS_DIR.is_dir()
|
| 110 |
+
|
| 111 |
+
def test_required_prompt_files_exist(self):
|
| 112 |
+
expected = [
|
| 113 |
+
"correction_medieval_french.txt",
|
| 114 |
+
"correction_imprime_ancien.txt",
|
| 115 |
+
"correction_image_medieval_french.txt",
|
| 116 |
+
"zero_shot_medieval_french.txt",
|
| 117 |
+
"zero_shot_imprime_ancien.txt",
|
| 118 |
+
]
|
| 119 |
+
for fname in expected:
|
| 120 |
+
assert (self._PROMPTS_DIR / fname).exists(), f"Prompt manquant : {fname}"
|
| 121 |
+
|
| 122 |
+
def test_correction_prompt_has_ocr_variable(self):
|
| 123 |
+
text = (self._PROMPTS_DIR / "correction_medieval_french.txt").read_text(encoding="utf-8")
|
| 124 |
+
assert "{ocr_output}" in text
|
| 125 |
+
|
| 126 |
+
def test_image_prompt_has_both_variables(self):
|
| 127 |
+
text = (self._PROMPTS_DIR / "correction_image_medieval_french.txt").read_text(encoding="utf-8")
|
| 128 |
+
assert "{ocr_output}" in text
|
| 129 |
+
|
| 130 |
+
def test_zero_shot_prompt_has_no_ocr_variable(self):
|
| 131 |
+
text = (self._PROMPTS_DIR / "zero_shot_medieval_french.txt").read_text(encoding="utf-8")
|
| 132 |
+
assert "{ocr_output}" not in text
|
| 133 |
+
|
| 134 |
+
def test_prompts_not_empty(self):
|
| 135 |
+
for f in self._PROMPTS_DIR.glob("*.txt"):
|
| 136 |
+
assert len(f.read_text(encoding="utf-8").strip()) > 100, f"Prompt trop court : {f.name}"
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
# ---------------------------------------------------------------------------
|
| 140 |
+
# PipelineMode enum
|
| 141 |
+
# ---------------------------------------------------------------------------
|
| 142 |
+
|
| 143 |
+
class TestPipelineMode:
|
| 144 |
+
|
| 145 |
+
def test_enum_values(self):
|
| 146 |
+
from picarones.pipelines.base import PipelineMode
|
| 147 |
+
assert PipelineMode.TEXT_ONLY.value == "text_only"
|
| 148 |
+
assert PipelineMode.TEXT_AND_IMAGE.value == "text_and_image"
|
| 149 |
+
assert PipelineMode.ZERO_SHOT.value == "zero_shot"
|
| 150 |
+
|
| 151 |
+
def test_from_string(self):
|
| 152 |
+
from picarones.pipelines.base import PipelineMode
|
| 153 |
+
assert PipelineMode("text_only") == PipelineMode.TEXT_ONLY
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
# ---------------------------------------------------------------------------
|
| 157 |
+
# Adaptateurs LLM — structure
|
| 158 |
+
# ---------------------------------------------------------------------------
|
| 159 |
+
|
| 160 |
+
class TestLLMAdapters:
    """Structural checks on the LLM adapters: names, models, config and results."""

    def test_openai_adapter_structure(self):
        """The OpenAI adapter is named "openai" and keeps the requested model."""
        from picarones.llm.openai_adapter import OpenAIAdapter

        adapter = OpenAIAdapter(model="gpt-4o")
        assert adapter.name == "openai"
        assert adapter.model == "gpt-4o"

    def test_anthropic_adapter_structure(self):
        """Built without arguments, the Anthropic adapter reports a "claude" model."""
        from picarones.llm.anthropic_adapter import AnthropicAdapter

        adapter = AnthropicAdapter()
        assert adapter.name == "anthropic"
        assert "claude" in adapter.model.lower()

    def test_mistral_adapter_structure(self):
        """Built without arguments, the Mistral adapter reports a "mistral" model."""
        from picarones.llm.mistral_adapter import MistralAdapter

        adapter = MistralAdapter()
        assert adapter.name == "mistral"
        assert "mistral" in adapter.model.lower()

    def test_ollama_adapter_structure(self):
        """The Ollama adapter is named "ollama" and keeps the requested model."""
        from picarones.llm.ollama_adapter import OllamaAdapter

        adapter = OllamaAdapter(model="llama3")
        assert adapter.name == "ollama"
        assert adapter.model == "llama3"

    def test_ollama_custom_base_url(self):
        """A "base_url" entry in the config is stored on the adapter."""
        from picarones.llm.ollama_adapter import OllamaAdapter

        adapter = OllamaAdapter(config={"base_url": "http://myserver:11434"})
        assert adapter._base_url == "http://myserver:11434"

    def test_llm_result_dataclass(self):
        """LLMResult.success is True without an error and False when one is set."""
        from picarones.llm.base import LLMResult

        r = LLMResult(model_id="gpt-4o", text="bonjour", duration_seconds=1.2)
        assert r.success is True
        r_err = LLMResult(model_id="gpt-4o", text="", duration_seconds=0.1, error="fail")
        assert r_err.success is False

    def test_missing_api_key_raises(self):
        """Calling the OpenAI adapter with no API key raises RuntimeError.

        The error message is expected to mention OPENAI_API_KEY.
        """
        from picarones.llm.openai_adapter import OpenAIAdapter

        adapter = OpenAIAdapter()
        adapter._api_key = None  # simulate a missing key
        with pytest.raises(RuntimeError, match="OPENAI_API_KEY"):
            adapter._call("test prompt")
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
# ---------------------------------------------------------------------------
|
| 208 |
+
# OCRLLMPipeline — prompt loading, name, steps
|
| 209 |
+
# ---------------------------------------------------------------------------
|
| 210 |
+
|
| 211 |
+
class TestOCRLLMPipeline:
    """Prompt loading, naming and step reporting of OCRLLMPipeline with a mock LLM."""

    def _mock_llm(self, response: str = "texte corrigé"):
        """Create a mock LLM adapter that always returns the same response."""
        from picarones.llm.base import BaseLLMAdapter

        class MockLLM(BaseLLMAdapter):
            @property
            def name(self):
                return "mock"

            @property
            def default_model(self):
                return "mock-v1"

            def _call(self, prompt, image_b64=None):
                return response

        return MockLLM()

    def _make_pipeline(self, mode_name, **options):
        """Build an OCRLLMPipeline around the mock LLM.

        ``mode_name`` is the PipelineMode member name (e.g. "TEXT_ONLY");
        any extra keyword arguments are forwarded to the constructor.
        """
        from picarones.pipelines.base import OCRLLMPipeline, PipelineMode

        return OCRLLMPipeline(
            llm_adapter=self._mock_llm(),
            mode=getattr(PipelineMode, mode_name),
            **options,
        )

    def test_load_builtin_prompt(self):
        """The loaded prompt template contains the {ocr_output} placeholder."""
        pipeline = self._make_pipeline(
            "TEXT_ONLY", prompt="correction_medieval_french.txt"
        )
        template = pipeline._prompt_template
        assert "{ocr_output}" in template

    def test_prompt_substitution_text_only(self):
        """Building the prompt replaces the placeholder with the OCR text."""
        pipeline = self._make_pipeline(
            "TEXT_ONLY", prompt="correction_medieval_french.txt"
        )
        rendered = pipeline._build_prompt(ocr_text="mon texte ocr")
        assert "mon texte ocr" in rendered
        assert "{ocr_output}" not in rendered

    def test_auto_name_text_only(self):
        """The generated name mentions both the OCR engine and the LLM model."""
        from picarones.engines.tesseract import TesseractEngine

        pipeline = self._make_pipeline("TEXT_ONLY", ocr_engine=TesseractEngine())
        generated = pipeline.name
        assert "tesseract" in generated.lower()
        assert "mock-v1" in generated

    def test_auto_name_zero_shot(self):
        """In zero-shot mode the generated name contains "zero-shot"."""
        pipeline = self._make_pipeline("ZERO_SHOT")
        assert "zero-shot" in pipeline.name

    def test_custom_name(self):
        """An explicit pipeline_name is used as the pipeline name."""
        pipeline = self._make_pipeline(
            "TEXT_ONLY", pipeline_name="mon_pipeline_custom"
        )
        assert pipeline.name == "mon_pipeline_custom"

    def test_pipeline_steps_without_ocr(self):
        """Without an OCR engine, the steps info holds a single LLM step."""
        pipeline = self._make_pipeline("ZERO_SHOT")
        steps = pipeline._build_steps_info()
        assert len(steps) == 1
        only_step = steps[0]
        assert only_step["type"] == "llm"
        assert only_step["mode"] == "zero_shot"

    def test_pipeline_steps_with_ocr(self):
        """With an OCR engine, the steps info lists the OCR step then the LLM step."""
        from picarones.engines.tesseract import TesseractEngine

        pipeline = self._make_pipeline("TEXT_ONLY", ocr_engine=TesseractEngine())
        steps = pipeline._build_steps_info()
        assert len(steps) == 2
        first, second = steps[0], steps[1]
        assert first["type"] == "ocr"
        assert second["type"] == "llm"

    def test_load_nonexistent_prompt_raises(self):
        """Constructing a pipeline with an unknown prompt raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            self._make_pipeline("TEXT_ONLY", prompt="inexistant_prompt_xyz.txt")

    def test_text_only_requires_ocr_engine(self):
        """Running the OCR step without an ocr_engine raises ValueError."""
        pipeline = self._make_pipeline("TEXT_ONLY")
        missing_image = Path("/nonexistent/image.jpg")
        with pytest.raises(ValueError, match="ocr_engine"):
            pipeline._run_ocr(missing_image)

    def test_is_pipeline_flag(self):
        """A pipeline is an instance of BaseOCREngine."""
        from picarones.engines.base import BaseOCREngine

        pipeline = self._make_pipeline("ZERO_SHOT")
        # Must be usable wherever a BaseOCREngine is expected.
        assert isinstance(pipeline, BaseOCREngine)
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
# ---------------------------------------------------------------------------
|
| 326 |
+
# Intégration fixtures — pipeline tesseract → gpt-4o
|
| 327 |
+
# ---------------------------------------------------------------------------
|
| 328 |
+
|
| 329 |
+
class TestFixturesPipeline:
    """Checks on the "tesseract → gpt-4o" pipeline entry of the sample benchmark."""

    @pytest.fixture(scope="class")
    def benchmark(self):
        """Sample benchmark (3 documents, seed 42), created once per class."""
        from picarones.fixtures import generate_sample_benchmark

        return generate_sample_benchmark(n_docs=3, seed=42)

    @staticmethod
    def _pipeline_report(bench):
        """Return the engine report whose name is "tesseract → gpt-4o"."""
        return next(
            rep for rep in bench.engine_reports
            if rep.engine_name == "tesseract → gpt-4o"
        )

    def test_pipeline_engine_present(self, benchmark):
        """The pipeline appears among the engine report names."""
        engine_names = [rep.engine_name for rep in benchmark.engine_reports]
        assert "tesseract → gpt-4o" in engine_names

    def test_pipeline_report_has_pipeline_info(self, benchmark):
        """The pipeline report is flagged and records its mode and LLM model."""
        pipeline_report = self._pipeline_report(benchmark)
        assert pipeline_report.is_pipeline
        info = pipeline_report.pipeline_info
        assert info.get("pipeline_mode") == "text_and_image"
        assert info.get("llm_model") == "gpt-4o"

    def test_pipeline_documents_have_ocr_intermediate(self, benchmark):
        """Every pipeline document result has a non-empty ocr_intermediate."""
        for dr in self._pipeline_report(benchmark).document_results:
            assert dr.ocr_intermediate is not None, f"ocr_intermediate manquant sur {dr.doc_id}"
            assert len(dr.ocr_intermediate) > 0

    def test_pipeline_documents_have_over_normalization(self, benchmark):
        """Every pipeline document result carries over-normalization metadata."""
        for dr in self._pipeline_report(benchmark).document_results:
            over_norm = dr.pipeline_metadata.get("over_normalization")
            assert over_norm is not None, f"over_normalization manquant sur {dr.doc_id}"
            assert "score" in over_norm
            assert "total_correct_ocr_words" in over_norm

    def test_pipeline_report_has_aggregated_over_normalization(self, benchmark):
        """The report-level over-normalization has a score and covers 3 documents."""
        pipeline_report = self._pipeline_report(benchmark)
        aggregated = pipeline_report.pipeline_info.get("over_normalization")
        assert aggregated is not None
        assert "score" in aggregated
        assert aggregated["document_count"] == 3

    def test_pipeline_pipeline_steps_in_info(self, benchmark):
        """pipeline_info lists two steps: OCR first, then LLM."""
        pipeline_report = self._pipeline_report(benchmark)
        steps = pipeline_report.pipeline_info.get("pipeline_steps", [])
        assert len(steps) == 2
        ocr_step, llm_step = steps[0], steps[1]
        assert ocr_step["type"] == "ocr"
        assert llm_step["type"] == "llm"

    def test_non_pipeline_reports_empty_pipeline_info(self, benchmark):
        """Every other report is unflagged and has an empty pipeline_info."""
        for rep in benchmark.engine_reports:
            if rep.engine_name == "tesseract → gpt-4o":
                continue
            assert not rep.is_pipeline
            assert rep.pipeline_info == {}
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
# ---------------------------------------------------------------------------
|
| 382 |
+
# Intégration rapport HTML — pipeline dans les données JSON
|
| 383 |
+
# ---------------------------------------------------------------------------
|
| 384 |
+
|
| 385 |
+
class TestReportWithPipeline:
|
| 386 |
+
|
| 387 |
+
@pytest.fixture(scope="class")
def report_data(self):
    """Report data built from the sample benchmark (3 documents, seed 42)."""
    from picarones.fixtures import generate_sample_benchmark
    from picarones.report.generator import _build_report_data

    sample = generate_sample_benchmark(n_docs=3, seed=42)
    encoded_images = sample.metadata.get("_images_b64", {})
    return _build_report_data(sample, encoded_images)
|
| 394 |
+
|
| 395 |
+
def test_pipeline_engine_in_data(self, report_data):
    """The pipeline appears among the engine names of the report data."""
    engine_names = [engine["name"] for engine in report_data["engines"]]
    assert "tesseract → gpt-4o" in engine_names
|
| 398 |
+
|
| 399 |
+
def test_pipeline_engine_has_is_pipeline_flag(self, report_data):
    """The pipeline engine entry has is_pipeline set to True."""
    pipeline_entry = next(
        engine for engine in report_data["engines"]
        if engine["name"] == "tesseract → gpt-4o"
    )
    assert pipeline_entry["is_pipeline"] is True
|
| 402 |
+
|
| 403 |
+
def test_non_pipeline_engines_not_flagged(self, report_data):
    """Every engine entry other than the pipeline has is_pipeline set to False."""
    for engine in report_data["engines"]:
        if engine["name"] == "tesseract → gpt-4o":
            continue
        assert engine["is_pipeline"] is False
|
| 407 |
+
|
| 408 |
+
def test_pipeline_has_over_normalization_in_info(self, report_data):
    """The pipeline entry's pipeline_info includes an over_normalization value."""
    pipeline_entry = next(
        engine for engine in report_data["engines"]
        if engine["name"] == "tesseract → gpt-4o"
    )
    info = pipeline_entry.get("pipeline_info", {})
    assert info.get("over_normalization") is not None
|
| 412 |
+
|
| 413 |
+
def test_document_results_have_ocr_intermediate(self, report_data):
    """Each document's pipeline result has the intermediate OCR and both diffs."""
    for document in report_data["documents"]:
        candidates = (
            result for result in document["engine_results"]
            if result["engine"] == "tesseract → gpt-4o"
        )
        pipeline_result = next(candidates, None)
        assert pipeline_result is not None
        for key in ("ocr_intermediate", "ocr_diff", "llm_correction_diff"):
            assert key in pipeline_result
|
| 423 |
+
|
| 424 |
+
def test_document_results_have_over_normalization(self, report_data):
    """Each document's pipeline result has an over_normalization entry."""
    for document in report_data["documents"]:
        candidates = (
            result for result in document["engine_results"]
            if result["engine"] == "tesseract → gpt-4o"
        )
        pipeline_result = next(candidates, None)
        assert pipeline_result is not None
        assert "over_normalization" in pipeline_result
|
| 432 |
+
|
| 433 |
+
def test_html_contains_pipeline_tag(self, tmp_path):
|
| 434 |
+
from picarones.fixtures import generate_sample_benchmark
|
| 435 |
+
from picarones.report.generator import ReportGenerator
|
| 436 |
+
bm = generate_sample_benchmark(n_docs=3, seed=42)
|
| 437 |
+
out = tmp_path / "report.html"
|
| 438 |
+
ReportGenerator(bm).generate(out)
|
| 439 |
+
html = out.read_text(encoding="utf-8")
|
| 440 |
+
assert "pipeline" in html.lower()
|
| 441 |
+
assert "tesseract" in html
|