Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / picarones /pipelines /base.py

Claude

feat: corpus triplet, post-correction LLM et modèles dynamiques avec capacités

63e236b unverified 3 months ago

13.6 kB

	"""Pipeline OCR+LLM — présenté comme un concurrent normal dans les benchmarks.

	Un pipeline compose un moteur OCR et un LLM de correction selon trois modes :

	text_only → OCR brut ──► LLM (texte seul)
	text_and_image → OCR brut + image ──► LLM multimodal
	zero_shot → image ──► LLM (pas d'OCR amont)

	La classe ``OCRLLMPipeline`` étend ``BaseOCREngine`` : un pipeline est
	un concurrent comme un autre dans ``run_benchmark``, avec les mêmes métriques
	CER/WER. Les métadonnées spécifiques (étapes, prompt, OCR intermédiaire) sont
	exposées via ``EngineResult.metadata``.
	"""

	from __future__ import annotations

	import base64
	import logging
	import time
	from enum import Enum
	from pathlib import Path
	from typing import Optional

	from picarones.engines.base import BaseOCREngine, EngineResult
	from picarones.llm.base import BaseLLMAdapter

	# Module-level logger, named after this module's import path.
	logger = logging.getLogger(__name__)


	class PipelineMode(str, Enum):
	    """How the LLM is invoked inside the pipeline.

	    Members are also plain strings, so ``PipelineMode("text_only")`` and
	    comparisons against the raw string value both work.
	    """

	    # The LLM receives only the raw OCR text.
	    TEXT_ONLY = "text_only"

	    # The LLM receives the OCR text AND the image (multimodal call).
	    TEXT_AND_IMAGE = "text_and_image"

	    # The LLM receives only the image — no upstream OCR.
	    ZERO_SHOT = "zero_shot"


	# Directory holding the built-in prompt library.
	_PROMPTS_DIR = Path(__file__).parent.parent / "prompts"


	def _load_prompt(prompt_path: str | Path) -> str:
	    """Load a prompt template from disk or from the built-in library.

	    The path is tried as given first (absolute, or relative to the current
	    working directory), then relative to ``_PROMPTS_DIR``.

	    Parameters
	    ----------
	    prompt_path:
	        Absolute path, relative path, or file name inside the built-in library.

	    Returns
	    -------
	    str
	        The file content, decoded as UTF-8.

	    Raises
	    ------
	    FileNotFoundError
	        If no regular file matches in either location.
	    """
	    requested = Path(prompt_path)
	    # Joining onto an absolute path yields that path unchanged, so the second
	    # candidate only adds a new location when ``requested`` is relative.
	    candidates = (requested, _PROMPTS_DIR / requested)
	    for candidate in candidates:
	        # ``is_file`` rather than ``exists``: a directory that happens to bear
	        # the prompt's name must not shadow the built-in library, nor surface
	        # as an IsADirectoryError when read.
	        if candidate.is_file():
	            return candidate.read_text(encoding="utf-8")
	    raise FileNotFoundError(
	        f"Prompt introuvable : '{prompt_path}'. "
	        f"Bibliothèque disponible dans : {_PROMPTS_DIR}"
	    )


	def _image_to_b64(image_path: Path) -> str:
	    """Return the file's raw bytes as a bare base64 string (no data-URI prefix)."""
	    raw_bytes = image_path.read_bytes()
	    encoded = base64.b64encode(raw_bytes)
	    return encoded.decode("ascii")


	class OCRLLMPipeline(BaseOCREngine):
	    """OCR+LLM pipeline, interchangeable with any OCR engine.

	    Parameters
	    ----------
	    llm_adapter:
	        LLM adapter (OpenAI, Anthropic, Mistral, Ollama…).
	    mode:
	        Correction mode — text_only, text_and_image, or zero_shot.
	    prompt:
	        Path to a prompt ``.txt`` file, or the name of a file from the
	        built-in library (e.g. ``"correction_medieval_french.txt"``).
	        Placeholders available in the file: ``{ocr_output}`` and ``{image_b64}``.
	    ocr_engine:
	        Upstream OCR engine. Required by ``run()`` in text_only and
	        text_and_image modes. Not used in zero_shot mode.
	    pipeline_name:
	        Name shown in the report (e.g. ``"tesseract → gpt-4o"``).
	        Generated automatically when not provided.
	    config:
	        Extra parameters forwarded to the base class.

	    Examples
	    --------
	    >>> from picarones.llm import OpenAIAdapter
	    >>> from picarones.engines.tesseract import TesseractEngine
	    >>> pipeline = OCRLLMPipeline(
	    ...     ocr_engine=TesseractEngine({"lang": "fra"}),
	    ...     llm_adapter=OpenAIAdapter(model="gpt-4o"),
	    ...     mode=PipelineMode.TEXT_AND_IMAGE,
	    ...     prompt="correction_medieval_french.txt",
	    ... )
	    """

	    def __init__(
	        self,
	        llm_adapter: BaseLLMAdapter,
	        mode: PipelineMode | str = PipelineMode.TEXT_ONLY,
	        prompt: str | Path = "correction_medieval_french.txt",
	        ocr_engine: Optional[BaseOCREngine] = None,
	        pipeline_name: Optional[str] = None,
	        config: Optional[dict] = None,
	    ) -> None:
	        super().__init__(config)
	        self.ocr_engine = ocr_engine
	        self.llm_adapter = llm_adapter
	        # Accepts either a PipelineMode member or its string value;
	        # an unknown value raises ValueError here.
	        self.mode = PipelineMode(mode)
	        self.prompt_path = str(prompt)
	        # Loaded eagerly so a missing prompt file fails at construction time.
	        self._prompt_template = _load_prompt(prompt)

	        # Display name used in the report.
	        if pipeline_name:
	            self._name = pipeline_name
	        elif self.mode == PipelineMode.ZERO_SHOT:
	            self._name = f"{llm_adapter.model} (zero-shot)"
	        elif ocr_engine:
	            self._name = f"{ocr_engine.name} → {llm_adapter.model}"
	        else:
	            self._name = f"pipeline → {llm_adapter.model}"

	    # ------------------------------------------------------------------
	    # BaseOCREngine interface
	    # ------------------------------------------------------------------

	    @property
	    def name(self) -> str:
	        """Display name of the pipeline."""
	        return self._name

	    def version(self) -> str:
	        """Return a combined ``ocr=…; llm=…`` version string."""
	        ocr_v = self.ocr_engine._safe_version() if self.ocr_engine else "—"
	        return f"ocr={ocr_v}; llm={self.llm_adapter.model}"

	    def _warn_if_ocr_empty(self, image_path: Path, ocr_text: str) -> None:
	        """Log a warning when the OCR text handed to the LLM is blank."""
	        if not ocr_text.strip():
	            logger.warning(
	                "[%s] texte OCR vide pour '%s' — le LLM recevra {ocr_output} vide.",
	                self._name, image_path.name,
	            )

	    def _run_llm_step(
	        self, image_path: Path, ocr_text: str,
	    ) -> tuple[str, Optional[str]]:
	        """LLM stage of the pipeline (shared by run() and run_with_ocr_text()).

	        Builds the prompt for the current mode, calls the LLM adapter and
	        returns ``(llm_text, ocr_intermediate)``.

	        Returns
	        -------
	        tuple[str, Optional[str]]
	            ``llm_text`` is never ``None`` (a missing text becomes ``""``);
	            ``ocr_intermediate`` is ``None`` in zero_shot mode, otherwise
	            the ``ocr_text`` that was given.

	        Raises
	        ------
	        RuntimeError
	            If the adapter reports ``success`` as false.
	        """
	        if self.mode == PipelineMode.ZERO_SHOT:
	            image_b64 = _image_to_b64(image_path)
	            prompt = self._build_prompt(image_b64=image_b64)
	            logger.info("[Pipeline] appel LLM pour doc %s (zero-shot)", image_path.name)
	            result = self.llm_adapter.complete(prompt, image_b64=image_b64)

	        elif self.mode == PipelineMode.TEXT_ONLY:
	            self._warn_if_ocr_empty(image_path, ocr_text)
	            prompt = self._build_prompt(ocr_text=ocr_text)
	            logger.info(
	                "[Pipeline] appel LLM pour doc %s (text_only, ocr=%d chars)",
	                image_path.name, len(ocr_text),
	            )
	            result = self.llm_adapter.complete(prompt)

	        else:  # TEXT_AND_IMAGE
	            self._warn_if_ocr_empty(image_path, ocr_text)
	            image_b64 = _image_to_b64(image_path)
	            prompt = self._build_prompt(ocr_text=ocr_text, image_b64=image_b64)
	            logger.info(
	                "[Pipeline] appel LLM pour doc %s (text_and_image, ocr=%d chars)",
	                image_path.name, len(ocr_text),
	            )
	            result = self.llm_adapter.complete(prompt, image_b64=image_b64)

	        logger.info("[Pipeline] LLM retourné pour doc %s", image_path.name)

	        if not result.success:
	            raise RuntimeError(f"Erreur LLM ({self.llm_adapter.model}): {result.error}")

	        # Normalise a missing text to "" *before* measuring it: otherwise
	        # len(None) raises and the diagnostic warning below is never reached.
	        llm_text = result.text or ""
	        logger.info(
	            "[Pipeline] %s — OCR: %d chars → LLM: %d chars",
	            image_path.name, len(ocr_text), len(llm_text),
	        )
	        if not llm_text.strip():
	            logger.warning(
	                "[%s] le LLM ('%s') a retourné un texte vide pour '%s'. "
	                "CER sera calculé à 1.0 (100%%). "
	                "Vérifier : (1) le prompt contient-il {ocr_output} ? "
	                "(2) le modèle supporte-t-il ce mode d'appel ? "
	                "(3) la réponse n'est-elle pas tronquée (max_tokens) ?",
	                self._name, self.llm_adapter.model, image_path.name,
	            )
	        else:
	            logger.debug(
	                "[%s] réponse LLM : %d car., extrait : %r",
	                self._name, len(llm_text), llm_text[:120],
	            )

	        ocr_intermediate = ocr_text if self.mode != PipelineMode.ZERO_SHOT else None
	        return llm_text, ocr_intermediate

	    def _run_ocr(self, image_path: Path) -> tuple[str, Optional[str]]:
	        """Internal pipeline logic — run the OCR engine, then the LLM.

	        Returns
	        -------
	        tuple[str, Optional[str]]
	            ``(llm_text, ocr_intermediate)`` — ``ocr_intermediate`` is
	            ``None`` in zero_shot mode.

	        Raises
	        ------
	        ValueError
	            If the mode needs OCR text but no ``ocr_engine`` was configured.
	        """
	        ocr_text = ""
	        if self.mode != PipelineMode.ZERO_SHOT:
	            if self.ocr_engine is None:
	                raise ValueError(
	                    f"ocr_engine est requis pour le mode {self.mode.value} "
	                    "(utilisez run_with_ocr_text() pour la post-correction sans OCR engine)"
	                )
	            ocr_result = self.ocr_engine.run(image_path)
	            ocr_text = ocr_result.text

	        return self._run_llm_step(image_path, ocr_text)

	    # ------------------------------------------------------------------
	    # run() override — injects pipeline metadata
	    # ------------------------------------------------------------------

	    def run(self, image_path: str | Path) -> EngineResult:
	        """Run the pipeline and return an EngineResult enriched with metadata.

	        Any exception raised by the OCR or LLM stage is caught, logged and
	        reported through ``EngineResult.error`` with an empty ``text``.
	        """
	        image_path = Path(image_path)
	        start = time.perf_counter()

	        ocr_intermediate: Optional[str] = None
	        try:
	            text, ocr_intermediate = self._run_ocr(image_path)
	            error = None
	        except Exception as exc:  # noqa: BLE001
	            text = ""
	            error = str(exc)
	            logger.warning(
	                "[%s] erreur pipeline pour '%s' : %s",
	                self._name, image_path.name, exc,
	            )

	        duration = time.perf_counter() - start
	        return self._make_result(image_path, text, error, duration, ocr_intermediate)

	    # ------------------------------------------------------------------
	    # Post-correction with pre-computed OCR
	    # ------------------------------------------------------------------

	    def run_with_ocr_text(
	        self, image_path: str | Path, ocr_text: str,
	    ) -> EngineResult:
	        """Run the pipeline with a pre-supplied OCR text (triplet corpus).

	        Used when the corpus contains ``.ocr.txt`` files: the noisy OCR text
	        is supplied directly, without running an OCR engine.

	        Parameters
	        ----------
	        image_path:
	            Path of the image (read in the image-based modes, ignored in
	            text_only).
	        ocr_text:
	            Pre-computed noisy OCR text. Ignored in zero_shot mode.

	        Returns
	        -------
	        EngineResult
	            Same shape as ``run()``, with ``metadata["ocr_source"] == "corpus"``.
	        """
	        image_path = Path(image_path)
	        start = time.perf_counter()

	        # In zero_shot mode the LLM never sees the OCR text, so it must not be
	        # reported as an intermediate step. Otherwise it is kept even when the
	        # LLM call fails.
	        ocr_intermediate: Optional[str] = (
	            None if self.mode == PipelineMode.ZERO_SHOT else ocr_text
	        )
	        try:
	            text, _ = self._run_llm_step(image_path, ocr_text)
	            error = None
	        except Exception as exc:  # noqa: BLE001
	            text = ""
	            error = str(exc)
	            logger.warning(
	                "[%s] erreur pipeline (post-correction) pour '%s' : %s",
	                self._name, image_path.name, exc,
	            )

	        duration = time.perf_counter() - start
	        return self._make_result(
	            image_path, text, error, duration, ocr_intermediate,
	            extra_metadata={"ocr_source": "corpus"},  # as opposed to "live"
	        )

	    # ------------------------------------------------------------------
	    # Helpers
	    # ------------------------------------------------------------------

	    def _make_result(
	        self,
	        image_path: Path,
	        text: str,
	        error: Optional[str],
	        duration: float,
	        ocr_intermediate: Optional[str],
	        extra_metadata: Optional[dict] = None,
	    ) -> EngineResult:
	        """Assemble the EngineResult and pipeline metadata shared by both entry points."""
	        metadata: dict = {
	            "engine_version": self._safe_version(),
	            "pipeline_mode": self.mode.value,
	            "prompt_file": self.prompt_path,
	            "prompt_template": self._prompt_template,
	            "llm_model": self.llm_adapter.model,
	            "llm_provider": self.llm_adapter.name,
	            "pipeline_steps": self._build_steps_info(),
	            "is_pipeline": True,
	        }
	        if extra_metadata:
	            metadata.update(extra_metadata)
	        if ocr_intermediate is not None:
	            metadata["ocr_intermediate"] = ocr_intermediate

	        return EngineResult(
	            engine_name=self.name,
	            image_path=str(image_path),
	            text=text,
	            duration_seconds=round(duration, 4),
	            error=error,
	            metadata=metadata,
	        )

	    def _build_prompt(self, ocr_text: str = "", image_b64: str = "") -> str:
	        """Substitute ``{ocr_output}`` and ``{image_b64}`` in the prompt template.

	        Only placeholders present in the template itself are replaced: the
	        inserted OCR text is never re-scanned, so a literal ``{image_b64}``
	        occurring in the OCR text is left untouched.
	        """
	        # Split on the OCR placeholder first, fill the image placeholder in the
	        # template fragments, then stitch the fragments back with the OCR text.
	        fragments = self._prompt_template.split("{ocr_output}")
	        return ocr_text.join(
	            fragment.replace("{image_b64}", image_b64) for fragment in fragments
	        )

	    def _build_steps_info(self) -> list[dict]:
	        """Describe the configured pipeline stages (optional OCR, then LLM)."""
	        steps: list[dict] = []
	        if self.ocr_engine:
	            steps.append({
	                "type": "ocr",
	                "engine": self.ocr_engine.name,
	                "version": self.ocr_engine._safe_version(),
	            })
	        steps.append({
	            "type": "llm",
	            "model": self.llm_adapter.model,
	            "provider": self.llm_adapter.name,
	            "mode": self.mode.value,
	            "prompt_file": self.prompt_path,
	        })
	        return steps