maribakulj commited on
Commit
53554ba
·
unverified ·
2 Parent(s): 56d987cab4a245

Merge pull request #3 from maribakulj/claude/setup-picarones-project-FKKns

Browse files
.gitignore CHANGED
@@ -16,3 +16,6 @@ venv/
16
  *.html
17
  results*.json
18
  rapport*.html
 
 
 
 
16
  *.html
17
  results*.json
18
  rapport*.html
19
+ rapports/
20
+ corpus_*/
21
+ corpus/
picarones/cli.py CHANGED
@@ -381,5 +381,170 @@ def demo_cmd(output: str, docs: int, json_output: str | None) -> None:
381
  click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
382
 
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
384
  if __name__ == "__main__":
385
  cli()
 
381
  click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
382
 
383
 
384
+ # ---------------------------------------------------------------------------
385
+ # picarones import (groupe de sous-commandes)
386
+ # ---------------------------------------------------------------------------
387
+
388
@cli.group("import")
def import_group() -> None:
    """Importe un corpus depuis une source distante (IIIF, HuggingFace…)."""
    # NOTE: the docstring above is what click prints as this group's --help
    # text, so it is user-facing output and stays in the CLI's language.
    # The group has no behaviour of its own; sub-commands attach to it.
391
+
392
+
393
@import_group.command("iiif")
@click.argument("manifest_url")
@click.option(
    "--pages", "-p",
    default="all",
    show_default=True,
    help=(
        "Pages à importer. Formats : '1-10', '1,3,5', '1-5,10,15-20', 'all'. "
        "Les numéros sont 1-based (1 = première page du manifeste)."
    ),
)
@click.option(
    "--output", "-o",
    default="./corpus_iiif/",
    show_default=True,
    type=click.Path(resolve_path=True),
    help="Dossier de destination pour les images et les fichiers .gt.txt",
)
@click.option(
    "--max-resolution",
    default=0,
    type=int,
    show_default=True,
    help="Résolution maximale des images téléchargées (largeur en pixels). 0 = max disponible.",
)
@click.option("--no-progress", is_flag=True, default=False, help="Désactive la barre de progression")
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
def import_iiif_cmd(
    manifest_url: str,
    pages: str,
    output: str,
    max_resolution: int,
    no_progress: bool,
    verbose: bool,
) -> None:
    """Importe un corpus depuis un manifeste IIIF (v2 ou v3).

    MANIFEST_URL : URL du manifeste IIIF (Gallica, Bodleian, BL, BSB…)

    Exemples :

    \b
    picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json
    picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json --pages 1-10
    picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json --pages 1,3,5-8 --output ./mon_corpus/

    Les images sont téléchargées dans le dossier de sortie.
    Des fichiers .gt.txt vides (ou remplis si le manifeste contient des annotations
    de transcription) sont créés à côté de chaque image.
    """
    # NOTE: the docstring above is rendered by click as the command's --help
    # text (user-facing output), hence it is kept verbatim in French.
    _setup_logging(verbose)

    # Imported here rather than at module level, so the importer module is
    # only loaded when this sub-command actually runs.
    from picarones.importers.iiif import IIIFImporter

    click.echo(f"Manifeste IIIF : {manifest_url}")

    try:
        importer = IIIFImporter(manifest_url, max_resolution=max_resolution)
        importer.load()

        # Summarise the manifest before downloading anything.
        canvas_total = len(importer.parser.canvases())
        summary = (
            f"Manifeste IIIF v{importer.parser.version} — "
            f"titre : {importer.parser.label} — "
            f"{canvas_total} canvas disponibles"
        )
        click.echo(summary)

        chosen = importer.list_canvases(pages)
        click.echo(f"Pages sélectionnées : {len(chosen)} sur {canvas_total}")

        corpus = importer.import_corpus(
            pages=pages,
            output_dir=output,
            show_progress=not no_progress,
        )
    except (ValueError, RuntimeError) as exc:
        # Only these two exception types are turned into a clean CLI error
        # (exit status 1); anything else propagates.
        click.echo(f"Erreur import IIIF : {exc}", err=True)
        sys.exit(1)

    doc_total = len(corpus)
    click.echo(f"\n{doc_total} documents importés dans : {output}")

    # Count documents whose ground truth is non-blank after import.
    with_transcription = sum(
        bool(doc.ground_truth.strip()) for doc in corpus.documents
    )
    if not with_transcription:
        click.echo(
            "Aucune transcription dans le manifeste — "
            "les fichiers .gt.txt sont vides (à remplir manuellement ou via OCR)."
        )
    else:
        click.echo(
            f"Transcriptions trouvées dans le manifeste : {with_transcription}/{doc_total}"
        )

    # Closing hint: how to benchmark the freshly imported corpus.
    click.echo("\nPour lancer un benchmark sur ce corpus :")
    click.echo(f" picarones run --corpus {output} --engines tesseract")
487
+
488
+
489
+ # ---------------------------------------------------------------------------
490
+ # picarones serve
491
+ # ---------------------------------------------------------------------------
492
+
493
@cli.command("serve")
@click.option(
    "--host",
    default="127.0.0.1",
    show_default=True,
    help="Adresse d'écoute du serveur web",
)
@click.option(
    "--port", "-p",
    default=8000,
    show_default=True,
    type=click.IntRange(1, 65535),
    help="Port d'écoute du serveur web",
)
@click.option("--reload", is_flag=True, default=False, help="Mode rechargement automatique (développement)")
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
def serve_cmd(host: str, port: int, reload: bool, verbose: bool) -> None:
    """Lance l'interface web locale Picarones sur localhost.

    Accessible dans le navigateur à l'adresse : http://HOST:PORT

    \b
    Exemples :
    picarones serve
    picarones serve --port 8080
    picarones serve --host 0.0.0.0 --port 8000
    """
    # NOTE: the docstring above is rendered by click as the command's --help
    # text (user-facing output), hence it is kept verbatim in French.
    _setup_logging(verbose)

    # uvicorn is imported lazily so a missing install yields an actionable
    # message and exit status 1 instead of an import traceback.
    try:
        import uvicorn
    except ImportError:
        install_hint = "\n".join(
            (
                "uvicorn n'est pas installé. Installez-le avec :",
                " pip install uvicorn[standard]",
                "ou :",
                " pip install picarones[web]",
            )
        )
        click.echo(install_hint, err=True)
        sys.exit(1)

    base_url = f"http://{host}:{port}"
    for banner_line in (
        "Picarones — Interface web locale",
        f"Démarrage du serveur sur {base_url}",
        "Appuyez sur Ctrl+C pour arrêter.\n",
    ):
        click.echo(banner_line)

    # --verbose also raises the server's own log level.
    uvicorn.run(
        "picarones.web.app:app",
        host=host,
        port=port,
        reload=reload,
        log_level="debug" if verbose else "info",
    )
547
+
548
+
549
  if __name__ == "__main__":
550
  cli()
picarones/core/char_scores.py ADDED
@@ -0,0 +1,360 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Scores de reconnaissance des ligatures et des diacritiques.
2
+
3
+ Ces métriques sont spécifiques aux documents patrimoniaux (manuscrits, imprimés
4
+ anciens) où ligatures et diacritiques jouent un rôle paléographique essentiel.
5
+
6
+ Ligatures
7
+ ---------
8
+ Caractères encodés comme une séquence unique dans Unicode mais représentant
9
+ deux ou plusieurs glyphes fusionnés : ﬁ (fi), ﬂ (fl), œ, æ, etc.
10
+
11
+ Pour chaque ligature présente dans le GT, on vérifie si l'OCR a produit
12
+ soit le caractère Unicode équivalent, soit la séquence décomposée équivalente.
13
+
14
+ Diacritiques
15
+ -----------
16
+ Accents, cédilles, trémas et autres signes diacritiques. Pour chaque caractère
17
+ accentué dans le GT, on vérifie si l'OCR a conservé le diacritique ou l'a
18
+ remplacé par la lettre de base.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from dataclasses import dataclass, field
24
+ from typing import Optional
25
+
26
+ import unicodedata
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Tables de ligatures (char ligature → séquences équivalentes acceptées)
31
+ # ---------------------------------------------------------------------------
32
+
33
#: Main ligature table.
#: Key = Unicode ligature character; value = list of accepted expanded spellings.
LIGATURE_TABLE: dict[str, list[str]] = {
    # Latin typographic ligatures (Unicode block "Alphabetic Presentation Forms")
    "\uFB00": ["ff"],              # ﬀ
    "\uFB01": ["fi"],              # ﬁ
    "\uFB02": ["fl"],              # ﬂ
    "\uFB03": ["ffi"],             # ﬃ
    "\uFB04": ["ffl"],             # ﬄ
    "\uFB05": ["st", "\u017Ft"],   # ﬅ (long s + t): "st" or "ſt"
    "\uFB06": ["st"],              # ﬆ
    # Digraph letters (Latin-1 Supplement / Latin Extended-A)
    "\u0153": ["oe"],              # œ
    "\u00E6": ["ae"],              # æ
    "\u0152": ["OE"],              # Œ
    "\u00C6": ["AE"],              # Æ
    # Latin / medieval scribal abbreviations (Latin Extended-D)
    "\uA751": ["per", "p\u0332"],  # ꝑ: "per" or p + combining low line
    "\uA753": ["pro"],             # ꝓ
    "\uA757": ["que"],             # ꝗ
    # Germanic
    "\u00DF": ["ss"],              # ß
    "\u1E9E": ["SS"],              # ẞ
}

# Every ligature character, for O(1) membership tests.
_ALL_LIGATURES: frozenset[str] = frozenset(LIGATURE_TABLE)

# Reverse mapping: expanded spelling -> ligature. When two ligatures share a
# spelling ("st" belongs to both U+FB05 and U+FB06) the later table entry wins.
_SEQ_TO_LIGATURE: dict[str, str] = {
    spelling: ligature
    for ligature, spellings in LIGATURE_TABLE.items()
    for spelling in spellings
}
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Table des caractères diacritiques
70
+ # ---------------------------------------------------------------------------
71
+
72
def _build_diacritic_map() -> dict[str, str]:
    """Build the ``accented character -> base letter`` table from Unicode data.

    Every code point in U+00C0..U+024F is canonically decomposed (NFD). When
    the decomposition has more than one code point and starts with an
    alphabetic character different from the original, that first character
    is recorded as the base letter. A short hand-written list is then merged
    on top of the derived entries.

    Returns
    -------
    dict[str, str]
        Mapping from a precomposed character to its base letter.
    """
    # Hand-maintained entries, applied last so they take precedence.
    # ł (U+0142) has no canonical decomposition and is only reachable this
    # way; the other entries restate what the NFD scan already yields.
    manual_entries = {
        "\u0107": "c",  # ć
        "\u0119": "e",  # ę
        "\u0142": "l",  # ł
        "\u0144": "n",  # ń
        "\u015B": "s",  # ś
        "\u017A": "z",  # ź
        "\u017C": "z",  # ż
    }

    mapping: dict[str, str] = {}
    for char in map(chr, range(0x00C0, 0x0250)):
        decomposed = unicodedata.normalize("NFD", char)
        if len(decomposed) <= 1:
            continue  # not decomposable: no diacritic to strip
        base_letter = decomposed[0]
        if base_letter.isalpha() and base_letter != char:
            mapping[char] = base_letter

    mapping.update(manual_entries)
    return mapping
93
+
94
+
95
+ DIACRITIC_MAP: dict[str, str] = _build_diacritic_map()
96
+ _ALL_DIACRITICS: frozenset[str] = frozenset(DIACRITIC_MAP)
97
+
98
+ # Ligatures qui NE sont PAS des diacritiques (pour éviter les doublons)
99
+ _LIGATURE_SET: frozenset[str] = frozenset(LIGATURE_TABLE)
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Résultats structurés
104
+ # ---------------------------------------------------------------------------
105
+
106
@dataclass
class LigatureScore:
    """Ligature recognition score for one (ground truth, OCR output) pair."""

    # Number of ligature characters found in the ground truth.
    total_in_gt: int = 0
    # Number of those ligatures transcribed correctly (same character or an
    # accepted equivalent spelling).
    correctly_recognized: int = 0
    # Recognition rate = correctly_recognized / total_in_gt; meant to be 1.0
    # when total_in_gt is 0 (note that the bare field default is 0.0).
    score: float = 0.0
    # Per-ligature detail, e.g.
    # {"ﬁ": {"gt_count": 5, "ocr_correct": 3, "score": 0.6}}
    per_ligature: dict[str, dict] = field(default_factory=dict)

    def as_dict(self) -> dict:
        """Return a plain-dict view with every float rounded to 4 decimals."""
        detail = {}
        for ligature, stats in self.per_ligature.items():
            detail[ligature] = {
                name: (round(value, 4) if isinstance(value, float) else value)
                for name, value in stats.items()
            }
        return {
            "total_in_gt": self.total_in_gt,
            "correctly_recognized": self.correctly_recognized,
            "score": round(self.score, 4),
            "per_ligature": detail,
        }
129
+
130
+
131
@dataclass
class DiacriticScore:
    """Diacritic preservation score for one (ground truth, OCR output) pair."""

    # Number of accented characters found in the ground truth.
    total_in_gt: int = 0
    # Number of those characters whose diacritic was preserved.
    correctly_recognized: int = 0
    # Preservation rate = correctly_recognized / total_in_gt; meant to be 1.0
    # when total_in_gt is 0 (note that the bare field default is 0.0).
    score: float = 0.0
    # Per-character detail, keyed by the accented character.
    per_diacritic: dict[str, dict] = field(default_factory=dict)

    def as_dict(self) -> dict:
        """Return a plain-dict view with every float rounded to 4 decimals."""
        detail = {}
        for char, stats in self.per_diacritic.items():
            detail[char] = {
                name: (round(value, 4) if isinstance(value, float) else value)
                for name, value in stats.items()
            }
        return {
            "total_in_gt": self.total_in_gt,
            "correctly_recognized": self.correctly_recognized,
            "score": round(self.score, 4),
            "per_diacritic": detail,
        }
154
+
155
+
156
+ # ---------------------------------------------------------------------------
157
+ # Calcul des scores
158
+ # ---------------------------------------------------------------------------
159
+
160
def compute_ligature_score(ground_truth: str, hypothesis: str) -> LigatureScore:
    """Compute how well the OCR output renders the ground-truth ligatures.

    Both texts are NFC-normalised. Every character of the ground truth that
    is a key of ``LIGATURE_TABLE`` counts as one ligature; it is deemed
    recognised when ``_check_char_at_context`` accepts either

    - the very same ligature character (e.g. ``ﬁ`` -> ``ﬁ``), or
    - one of its expanded spellings from the table (e.g. ``ﬁ`` -> ``fi``).

    Both renderings are accepted because some editorial conventions expand
    ligatures.

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        Text produced by the OCR engine.

    Returns
    -------
    LigatureScore
        Global and per-ligature counts and rates. The score is 1.0 when the
        ground truth is empty or contains no ligature.
    """
    if not ground_truth:
        return LigatureScore(score=1.0)

    reference = unicodedata.normalize("NFC", ground_truth)
    candidate = unicodedata.normalize("NFC", hypothesis)

    breakdown: dict[str, dict] = {}
    found = 0
    recognised = 0

    for position, char in enumerate(reference):
        if char not in _ALL_LIGATURES:
            continue
        found += 1

        # The ligature itself comes first, then its expanded spellings.
        accepted = [char, *LIGATURE_TABLE[char]]
        hit = _check_char_at_context(reference, candidate, position, char, accepted)

        stats = breakdown.setdefault(
            char, {"gt_count": 0, "ocr_correct": 0, "score": 0.0}
        )
        stats["gt_count"] += 1
        if hit:
            recognised += 1
            stats["ocr_correct"] += 1

    # Per-ligature rates, once all counts are final.
    for stats in breakdown.values():
        seen = stats["gt_count"]
        stats["score"] = stats["ocr_correct"] / seen if seen > 0 else 1.0

    return LigatureScore(
        total_in_gt=found,
        correctly_recognized=recognised,
        score=recognised / found if found > 0 else 1.0,
        per_ligature=breakdown,
    )
227
+
228
+
229
def compute_diacritic_score(ground_truth: str, hypothesis: str) -> DiacriticScore:
    """Compute how well the OCR output preserves ground-truth diacritics.

    Both texts are NFC-normalised, so precomposed and decomposed (NFD)
    spellings of the same accented letter compare equal. The texts are then
    aligned with ``difflib.SequenceMatcher``. Each ground-truth character that
    is in ``_ALL_DIACRITICS`` and not in ``_LIGATURE_SET`` counts as one
    diacritic; it is preserved only when the aligned hypothesis character is
    exactly the same character.

    Ground-truth positions inside a deleted span, or inside a replaced span
    whose two sides differ in length, have no one-to-one counterpart and are
    counted as lost.

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        Text produced by the OCR engine.

    Returns
    -------
    DiacriticScore
        Global and per-character counts and rates. The score is 1.0 when the
        ground truth is empty or contains no accented character.
    """
    if not ground_truth:
        return DiacriticScore(score=1.0)

    import difflib

    reference = unicodedata.normalize("NFC", ground_truth)
    candidate = unicodedata.normalize("NFC", hypothesis)

    # Ground-truth index -> aligned hypothesis index (None = no counterpart).
    alignment: dict[int, Optional[int]] = {}
    opcodes = difflib.SequenceMatcher(
        None, reference, candidate, autojunk=False
    ).get_opcodes()
    for tag, i1, i2, j1, j2 in opcodes:
        span = i2 - i1
        one_to_one = tag == "equal" or (tag == "replace" and span == (j2 - j1))
        for offset in range(span):
            alignment[i1 + offset] = (j1 + offset) if one_to_one else None

    breakdown: dict[str, dict] = {}
    found = 0
    preserved = 0

    for position, char in enumerate(reference):
        if char not in _ALL_DIACRITICS or char in _LIGATURE_SET:
            continue
        found += 1

        target = alignment.get(position)
        kept = (
            target is not None
            and target < len(candidate)
            and candidate[target] == char
        )

        stats = breakdown.setdefault(
            char, {"gt_count": 0, "ocr_correct": 0, "score": 0.0}
        )
        stats["gt_count"] += 1
        if kept:
            preserved += 1
            stats["ocr_correct"] += 1

    # Per-character rates, once all counts are final.
    for stats in breakdown.values():
        seen = stats["gt_count"]
        stats["score"] = stats["ocr_correct"] / seen if seen > 0 else 1.0

    return DiacriticScore(
        total_in_gt=found,
        correctly_recognized=preserved,
        score=preserved / found if found > 0 else 1.0,
        per_diacritic=breakdown,
    )
305
+
306
+
307
# One-entry memo of the most recent (gt, hyp) alignment. The helper below is
# typically called once per ligature with the same two strings, so this
# avoids re-aligning the whole text on every call while keeping memory bounded.
_ALIGNMENT_MEMO: dict = {}


def _aligned_opcodes(gt: str, hyp: str) -> list:
    """Return the difflib opcodes aligning *gt* to *hyp*, memoising the last pair."""
    import difflib

    key = (gt, hyp)
    opcodes = _ALIGNMENT_MEMO.get(key)
    if opcodes is None:
        opcodes = difflib.SequenceMatcher(
            None, gt, hyp, autojunk=False
        ).get_opcodes()
        _ALIGNMENT_MEMO.clear()  # keep a single entry
        _ALIGNMENT_MEMO[key] = opcodes
    return opcodes


def _check_char_at_context(
    gt: str,
    hyp: str,
    gt_pos: int,
    gt_char: str,
    equivalents: list[str],
) -> bool:
    """Tell whether the hypothesis renders the character at ``gt[gt_pos]``.

    The two texts are aligned with ``difflib.SequenceMatcher`` and only the
    part of *hyp* aligned with ``gt_pos`` is inspected. A plain substring
    test over the whole hypothesis would accept an equivalent found anywhere
    in the text (e.g. the "fi" of "film" for an unrelated ligature).

    Parameters
    ----------
    gt:
        Reference text.
    hyp:
        OCR text.
    gt_pos:
        Index in *gt* of the character being checked.
    gt_char:
        The character at ``gt_pos``; unused, kept for interface compatibility.
    equivalents:
        Accepted renderings (the character itself and/or expanded spellings).

    Returns
    -------
    bool
        True when an accepted rendering is found in the aligned part of
        *hyp*; False when the position was deleted, is out of range, or was
        replaced by something else.
    """
    if not 0 <= gt_pos < len(gt):
        return False

    for tag, i1, i2, j1, j2 in _aligned_opcodes(gt, hyp):
        if not i1 <= gt_pos < i2:
            continue
        if tag == "equal":
            # One-to-one span: look at the exact counterpart character.
            return hyp[j1 + (gt_pos - i1)] in equivalents
        if tag == "replace":
            # Spans may differ in length (one ligature -> several letters),
            # so search the whole replacement span; the match is local to
            # that span rather than position-exact.
            aligned = hyp[j1:j2]
            return any(equiv in aligned for equiv in equivalents)
        return False  # "delete": nothing in hyp corresponds to gt_pos
    return False
321
+
322
+
323
def aggregate_ligature_scores(scores: list[LigatureScore]) -> dict:
    """Aggregate per-document ligature scores over a corpus.

    Counts are summed across documents (micro-average), globally and per
    ligature. The global score is rounded to 4 decimals; per-ligature scores
    are left unrounded. A rate is 1.0 whenever its denominator is 0.

    Returns
    -------
    dict
        Keys ``score``, ``total_in_gt``, ``correctly_recognized`` and
        ``per_ligature``.
    """
    gt_sum = sum(item.total_in_gt for item in scores)
    ok_sum = sum(item.correctly_recognized for item in scores)
    overall = ok_sum / gt_sum if gt_sum > 0 else 1.0

    # Sum the per-ligature counters of every document.
    merged: dict[str, dict] = {}
    for item in scores:
        for ligature, stats in item.per_ligature.items():
            bucket = merged.setdefault(ligature, {"gt_count": 0, "ocr_correct": 0})
            bucket["gt_count"] += stats["gt_count"]
            bucket["ocr_correct"] += stats["ocr_correct"]

    for bucket in merged.values():
        seen = bucket["gt_count"]
        bucket["score"] = bucket["ocr_correct"] / seen if seen > 0 else 1.0

    return {
        "score": round(overall, 4),
        "total_in_gt": gt_sum,
        "correctly_recognized": ok_sum,
        "per_ligature": merged,
    }
349
+
350
+
351
def aggregate_diacritic_scores(scores: list[DiacriticScore]) -> dict:
    """Aggregate per-document diacritic scores over a corpus.

    Counts are summed across documents (micro-average). The score is rounded
    to 4 decimals and is 1.0 when no accented character was seen at all.
    Per-character details are not aggregated.

    Returns
    -------
    dict
        Keys ``score``, ``total_in_gt`` and ``correctly_recognized``.
    """
    gt_sum = 0
    ok_sum = 0
    for item in scores:
        gt_sum += item.total_in_gt
        ok_sum += item.correctly_recognized

    overall = ok_sum / gt_sum if gt_sum > 0 else 1.0
    return {
        "score": round(overall, 4),
        "total_in_gt": gt_sum,
        "correctly_recognized": ok_sum,
    }
picarones/core/confusion.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Matrice de confusion unicode pour l'analyse fine des erreurs OCR.
2
+
3
+ Pour chaque moteur, on calcule quels caractères du GT sont transcrits par
4
+ quels caractères OCR (substitutions). Cette "empreinte d'erreur" est
5
+ caractéristique de chaque moteur ou pipeline.
6
+
7
+ Méthode
8
+ -------
9
+ L'alignement caractère par caractère s'appuie sur les opcodes de
10
+ difflib.SequenceMatcher (appariement de blocs communs : une approximation,
11
+ et non un calcul exact, de l'alignement de Levenshtein), ce qui permet d'identifier les substitutions, insertions et suppressions.
12
+
13
+ La matrice est stockée comme un dict de dict :
14
+ ``{gt_char: {ocr_char: count}}``
15
+
16
+ La valeur spéciale ``"∅"`` (U+2205) représente un caractère vide :
17
+ - ``{"a": {"∅": 3}}`` → 'a' supprimé 3 fois dans l'OCR
18
+ - ``{"∅": {"x": 2}}`` → 'x' inséré 2 fois dans l'OCR (absent du GT)
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import difflib
24
+ from collections import defaultdict
25
+ from dataclasses import dataclass, field
26
+ from typing import Optional
27
+
28
+ # Symbole représentant un caractère absent (insertion / suppression)
29
+ EMPTY_CHAR = "∅"
30
+
31
+ # Caractères non pertinents à ignorer dans la matrice (espaces, sauts de ligne)
32
+ _WHITESPACE = set(" \t\n\r")
33
+
34
+
35
@dataclass
class ConfusionMatrix:
    """Unicode confusion matrix for one (ground truth, OCR output) pair."""

    # Outer key = ground-truth char; inner key = OCR char; value = count.
    # EMPTY_CHAR as outer key marks insertions, as inner key deletions.
    matrix: dict[str, dict[str, int]] = field(default_factory=dict)

    total_substitutions: int = 0
    total_insertions: int = 0
    total_deletions: int = 0

    @property
    def total_errors(self) -> int:
        """Sum of substitutions, insertions and deletions."""
        return sum(
            (self.total_substitutions, self.total_insertions, self.total_deletions)
        )

    def top_confusions(self, n: int = 20) -> list[dict]:
        """Return the *n* most frequent substitutions, most frequent first.

        Insertions, deletions and identical pairs are left out. Each item is
        ``{"gt": ..., "ocr": ..., "count": ...}``.
        """
        ranked = [
            (gt_char, ocr_char, count)
            for gt_char, row in self.matrix.items()
            if gt_char != EMPTY_CHAR  # skip insertions
            for ocr_char, count in row.items()
            if ocr_char != EMPTY_CHAR and gt_char != ocr_char  # skip deletions / matches
        ]
        # Stable sort: pairs with equal counts keep their matrix order.
        ranked.sort(key=lambda triple: triple[2], reverse=True)
        return [
            {"gt": gt_char, "ocr": ocr_char, "count": count}
            for gt_char, ocr_char, count in ranked[:n]
        ]

    def as_compact_dict(self, min_count: int = 1) -> dict:
        """Serialise the matrix, dropping cells whose count is below *min_count*.

        Rows left empty by the filter are omitted. The three totals are
        reported unfiltered.
        """
        kept_rows: dict[str, dict[str, int]] = {}
        for gt_char, row in self.matrix.items():
            kept = {
                ocr_char: count
                for ocr_char, count in row.items()
                if count >= min_count
            }
            if not kept:
                continue
            kept_rows[gt_char] = kept
        return {
            "matrix": kept_rows,
            "total_substitutions": self.total_substitutions,
            "total_insertions": self.total_insertions,
            "total_deletions": self.total_deletions,
        }

    def as_dict(self) -> dict:
        """Serialise the matrix keeping every cell with a count of at least 1."""
        return self.as_compact_dict(min_count=1)
86
+
87
+
88
def build_confusion_matrix(
    ground_truth: str,
    hypothesis: str,
    ignore_whitespace: bool = True,
    ignore_correct: bool = True,
) -> ConfusionMatrix:
    """Build the unicode confusion matrix for one ground-truth / OCR pair.

    The two texts are aligned with ``difflib.SequenceMatcher``. Replaced
    spans are paired character by character by ``_align_segments``. The
    reported totals count exactly the cells written to the matrix, so they
    always agree with its content.

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        Text produced by the OCR engine.
    ignore_whitespace:
        If True, spaces, tabs and line breaks are neither recorded nor counted.
    ignore_correct:
        If True (default), identical pairs (gt_char == ocr_char) are not
        recorded, which keeps the matrix small.

    Returns
    -------
    ConfusionMatrix
    """
    matrix: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    n_subs = n_ins = n_dels = 0

    if not ground_truth and not hypothesis:
        return ConfusionMatrix({}, 0, 0, 0)

    def skipped(char: str) -> bool:
        return ignore_whitespace and char in _WHITESPACE

    matcher = difflib.SequenceMatcher(None, ground_truth, hypothesis, autojunk=False)

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == "equal":
            if not ignore_correct:
                for ch in ground_truth[i1:i2]:
                    if not skipped(ch):
                        matrix[ch][ch] += 1
        elif tag == "replace":
            # The helper reports what it actually recorded; a blanket
            # max(len) estimate would also count skipped whitespace and
            # surplus characters as substitutions.
            subs, ins, dels = _align_segments(
                ground_truth[i1:i2], hypothesis[j1:j2], matrix, ignore_whitespace
            )
            n_subs += subs
            n_ins += ins
            n_dels += dels
        elif tag == "delete":
            for ch in ground_truth[i1:i2]:
                if skipped(ch):
                    continue
                matrix[ch][EMPTY_CHAR] += 1
                n_dels += 1
        elif tag == "insert":
            for ch in hypothesis[j1:j2]:
                if skipped(ch):
                    continue
                matrix[EMPTY_CHAR][ch] += 1
                n_ins += 1

    # Convert the nested defaultdicts into plain dicts.
    return ConfusionMatrix(
        matrix={gt_char: dict(row) for gt_char, row in matrix.items()},
        total_substitutions=n_subs,
        total_insertions=n_ins,
        total_deletions=n_dels,
    )


def _align_segments(
    gt_seg: str,
    oc_seg: str,
    matrix: dict,
    ignore_whitespace: bool,
) -> tuple[int, int, int]:
    """Record the differences between two segments into *matrix*.

    Segments of equal length are paired position by position. Otherwise they
    are first re-aligned with ``difflib.SequenceMatcher``. Inside every
    non-equal span, characters are paired in order as substitutions; the
    surplus on the longer side is recorded as deletions (ground-truth side)
    or insertions (OCR side) instead of being dropped.

    Parameters
    ----------
    gt_seg, oc_seg:
        Ground-truth and OCR segments; either may be empty.
    matrix:
        Nested mapping ``matrix[gt_char][ocr_char] -> count``, updated in
        place. It must create missing rows and cells on access (nested
        defaultdict).
    ignore_whitespace:
        If True, any pair or single character involving whitespace is skipped.

    Returns
    -------
    tuple[int, int, int]
        ``(substitutions, insertions, deletions)`` actually recorded.
    """
    subs = ins = dels = 0

    if len(gt_seg) == len(oc_seg):
        # Same length: treat the whole segment as one positional span.
        spans = [("replace", 0, len(gt_seg), 0, len(oc_seg))]
    else:
        spans = difflib.SequenceMatcher(
            None, gt_seg, oc_seg, autojunk=False
        ).get_opcodes()

    for tag, i1, i2, j1, j2 in spans:
        if tag == "equal":
            continue
        gt_part = gt_seg[i1:i2]
        oc_part = oc_seg[j1:j2]
        paired = min(len(gt_part), len(oc_part))

        for g, o in zip(gt_part, oc_part):
            if ignore_whitespace and (g in _WHITESPACE or o in _WHITESPACE):
                continue
            matrix[g][o] += 1
            subs += 1
        # Surplus ground-truth characters have no counterpart: deletions.
        for g in gt_part[paired:]:
            if ignore_whitespace and g in _WHITESPACE:
                continue
            matrix[g][EMPTY_CHAR] += 1
            dels += 1
        # Surplus OCR characters have no counterpart: insertions.
        for o in oc_part[paired:]:
            if ignore_whitespace and o in _WHITESPACE:
                continue
            matrix[EMPTY_CHAR][o] += 1
            ins += 1

    return subs, ins, dels
209
+
210
+
211
def aggregate_confusion_matrices(matrices: list[ConfusionMatrix]) -> ConfusionMatrix:
    """Merge several confusion matrices into one.

    Cell counts and the three error totals are summed. Typically used to get
    the corpus-level matrix from the per-document ones.
    """
    merged: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    subs_sum = 0
    ins_sum = 0
    dels_sum = 0

    for single in matrices:
        subs_sum += single.total_substitutions
        ins_sum += single.total_insertions
        dels_sum += single.total_deletions
        for gt_char, row in single.matrix.items():
            for ocr_char, count in row.items():
                merged[gt_char][ocr_char] += count

    # Hand back plain dicts rather than the working defaultdicts.
    plain = {gt_char: dict(row) for gt_char, row in merged.items()}
    return ConfusionMatrix(
        matrix=plain,
        total_substitutions=subs_sum,
        total_insertions=ins_sum,
        total_deletions=dels_sum,
    )
233
+
234
+
235
def top_confused_chars(
    matrix: ConfusionMatrix,
    n: int = 15,
    exclude_empty: bool = True,
) -> list[dict]:
    """Return the ground-truth characters that are most often mis-transcribed.

    For every ground-truth character, all cells whose OCR character differs
    from it count as errors; deletions (OCR side = ``EMPTY_CHAR``) are always
    included.

    Parameters
    ----------
    matrix:
        Confusion matrix to analyse.
    n:
        Maximum number of characters returned.
    exclude_empty:
        If True, the ``EMPTY_CHAR`` row (insertions) is skipped. It has no
        effect on how deletions are counted.

    Returns
    -------
    list[dict]
        Sorted by decreasing error count, e.g.
        ``[{"char": "ſ", "total_errors": 47, "top_substitutes": [...]}, ...]``
        where ``top_substitutes`` holds at most the 5 most frequent wrong
        outputs as ``{"ocr": ..., "count": ...}``.
    """
    ranked: list[dict] = []
    for gt_char, ocr_counts in matrix.matrix.items():
        if exclude_empty and gt_char == EMPTY_CHAR:
            continue

        # Every output that differs from the expected character is an error.
        wrong = [(oc, cnt) for oc, cnt in ocr_counts.items() if oc != gt_char]
        error_count = sum(cnt for _, cnt in wrong)
        if error_count <= 0:
            continue

        # Stable sort: equal counts keep their matrix order.
        wrong.sort(key=lambda pair: pair[1], reverse=True)
        ranked.append(
            {
                "char": gt_char,
                "total_errors": error_count,
                "top_substitutes": [
                    {"ocr": oc, "count": cnt} for oc, cnt in wrong[:5]
                ],
            }
        )

    ranked.sort(key=lambda entry: entry["total_errors"], reverse=True)
    return ranked[:n]
picarones/core/image_quality.py ADDED
@@ -0,0 +1,395 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyse automatique de la qualité des images de documents numérisés.
2
+
3
+ Métriques
4
+ ---------
5
+ - **Score de netteté** : variance du laplacien (plus élevé = plus net)
6
+ - **Niveau de bruit** : écart-type des résidus haute-fréquence
7
+ - **Angle de rotation résiduel** : estimé par projection horizontale
8
+ - **Score de contraste** : ratio Michelson entre zones sombres (encre) et claires (fond)
9
+ - **Score de qualité global** : combinaison normalisée des métriques ci-dessus
10
+
11
+ Ces calculs sont réalisés en pur Python + bibliothèques stdlib ou Pillow.
12
+ NumPy est utilisé si disponible (calculs plus rapides), mais les méthodes
13
+ de fallback n'en dépendent pas.
14
+
15
+ Note
16
+ ----
17
+ Pour les images placeholder (fixtures), des valeurs fictives cohérentes
18
+ sont générées via `generate_mock_quality_scores()`.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import math
24
+ import statistics
25
+ from dataclasses import dataclass
26
+ from pathlib import Path
27
+ from typing import Optional
28
+
29
+
30
@dataclass
class ImageQualityResult:
    """Quality metrics computed for one document image."""

    # Sharpness score in [0, 1], derived from the normalised Laplacian variance.
    sharpness_score: float = 0.0
    # Noise level in [0, 1]: 0 = no noise, 1 = very noisy.
    noise_level: float = 0.0
    # Estimated residual rotation in degrees (positive = clockwise).
    rotation_degrees: float = 0.0
    # Contrast score in [0, 1] (Michelson ratio between ink and background).
    contrast_score: float = 0.0
    # Overall quality score in [0, 1], a weighted mix of the other metrics.
    quality_score: float = 0.0
    # Analysis method used: 'pillow', 'numpy', 'mock' (or 'none').
    analysis_method: str = "none"
    # Error message when the analysis failed, otherwise None.
    error: Optional[str] = None

    @property
    def is_good_quality(self) -> bool:
        """True when the overall quality score is at least 0.7."""
        return self.quality_tier == "good"

    @property
    def quality_tier(self) -> str:
        """Quality category: 'good' (>= 0.7), 'medium' (>= 0.4) or 'poor'."""
        score = self.quality_score
        if score >= 0.7:
            return "good"
        if score >= 0.4:
            return "medium"
        return "poor"

    def as_dict(self) -> dict:
        """Serialise the metrics to a plain dict with rounded values.

        The ``error`` key is only present when an error message is set.
        """
        payload = {
            "sharpness_score": round(self.sharpness_score, 4),
            "noise_level": round(self.noise_level, 4),
            "rotation_degrees": round(self.rotation_degrees, 2),
            "contrast_score": round(self.contrast_score, 4),
            "quality_score": round(self.quality_score, 4),
            "quality_tier": self.quality_tier,
            "analysis_method": self.analysis_method,
        }
        if self.error:
            payload["error"] = self.error
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "ImageQualityResult":
        """Rebuild a result from a dict; missing keys fall back to defaults."""
        fallbacks = {
            "sharpness_score": 0.0,
            "noise_level": 0.0,
            "rotation_degrees": 0.0,
            "contrast_score": 0.0,
            "quality_score": 0.0,
            "analysis_method": "none",
            "error": None,
        }
        return cls(**{key: data.get(key, default) for key, default in fallbacks.items()})
94
+
95
+
96
def analyze_image_quality(image_path: str | Path) -> ImageQualityResult:
    """Analyse the quality of a scanned document image.

    Strategy, in order:

    1. Pillow + NumPy (full analysis)
    2. Pillow only (simplified analysis)
    3. Fallback: a result carrying an error message

    Parameters
    ----------
    image_path:
        Path to the image (JPG, PNG, TIFF…).

    Returns
    -------
    ImageQualityResult
        The computed metrics. When the file is missing, Pillow is not
        installed, or the image cannot be read, ``error`` is set and
        ``analysis_method`` is ``"none"``.
    """
    path = Path(image_path)
    if not path.exists():
        return ImageQualityResult(
            error=f"Fichier image introuvable : {image_path}",
            analysis_method="none",
        )

    # Pillow is mandatory for both analysis paths.
    try:
        from PIL import Image
    except ImportError:
        return ImageQualityResult(
            error="Pillow non disponible (pip install Pillow)",
            analysis_method="none",
            quality_score=0.5,  # neutral value
        )

    # NumPy is optional: it only selects the more complete analysis.
    try:
        import numpy as np
    except ImportError:
        np = None

    try:
        if np is not None:
            return _analyze_with_numpy(path, np, Image)
        return _analyze_with_pillow(path, Image)
    except OSError as exc:
        # Unreadable, truncated or unsupported image files (Pillow's
        # UnidentifiedImageError is an OSError) are reported through the
        # ``error`` field instead of aborting the caller.
        return ImageQualityResult(
            error=f"Échec de l'analyse de l'image : {exc}",
            analysis_method="none",
        )
140
+
141
+
142
def _analyze_with_numpy(path: Path, np, Image) -> ImageQualityResult:
    """Run the full analysis using NumPy.

    Parameters
    ----------
    path:
        Image file to analyse.
    np:
        The imported ``numpy`` module.
    Image:
        The imported ``PIL.Image`` module.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied into the array.
    with Image.open(path) as img:
        arr = np.array(img.convert("L"), dtype=np.float32)  # greyscale

    # 1. Sharpness: variance of the Laplacian.
    # Empirical normalisation: variance > 500 = very sharp, < 50 = blurry.
    laplacian = _laplacian_variance_numpy(arr, np)
    sharpness = min(1.0, laplacian / 500.0)

    # 2. Noise level.
    noise = _noise_level_numpy(arr, np)

    # 3. Rotation: estimated skew angle.
    rotation = _estimate_rotation_numpy(arr, np)

    # 4. Contrast: Michelson ratio.
    contrast = _contrast_score_numpy(arr, np)

    # 5. Weighted overall score (uses the magnitude of the rotation only).
    quality = _global_quality_score(sharpness, noise, abs(rotation), contrast)

    return ImageQualityResult(
        sharpness_score=float(sharpness),
        noise_level=float(noise),
        rotation_degrees=float(rotation),
        contrast_score=float(contrast),
        quality_score=float(quality),
        analysis_method="numpy",
    )
172
+
173
+
174
def _analyze_with_pillow(path: Path, Image) -> ImageQualityResult:
    """Run the simplified analysis with Pillow only (no NumPy).

    Rotation is not estimated on this path and is reported as 0.0.

    Parameters
    ----------
    path:
        Image file to analyse.
    Image:
        The imported ``PIL.Image`` module.
    """
    # The context manager closes the underlying file once pixels are copied.
    with Image.open(path) as img:
        pixels = list(img.convert("L").getdata())

    if not pixels:
        return ImageQualityResult(quality_score=0.5, analysis_method="pillow")

    # Contrast: Michelson ratio over the full value range.
    min_val = min(pixels)
    max_val = max(pixels)
    if max_val + min_val > 0:
        contrast = (max_val - min_val) / (max_val + min_val)
    else:
        contrast = 0.0

    # Approximate sharpness: global standard deviation of the pixels.
    try:
        variance = statistics.variance(pixels)
    except statistics.StatisticsError:
        # Raised when there are fewer than two pixels.
        variance = 0.0
    sharpness = min(1.0, math.sqrt(variance) / 128.0)

    # Noise: rough approximation from the first 1000 pixels.
    sample = pixels[:1000]
    noise = min(1.0, statistics.stdev(sample) / 64.0) if len(sample) > 1 else 0.0

    quality = _global_quality_score(sharpness, noise, 0.0, contrast)

    return ImageQualityResult(
        sharpness_score=sharpness,
        noise_level=noise,
        rotation_degrees=0.0,  # not computed without NumPy
        contrast_score=contrast,
        quality_score=quality,
        analysis_method="pillow",
    )
212
+
213
+
214
+ def _laplacian_variance_numpy(arr, np) -> float:
215
+ """Calcule la variance du laplacien (mesure de netteté)."""
216
+ # Filtre laplacien 3x3
217
+ laplacian_kernel = np.array([
218
+ [0, 1, 0],
219
+ [1, -4, 1],
220
+ [0, 1, 0],
221
+ ], dtype=np.float32)
222
+
223
+ # Convolution manuelle simplifiée (bordures ignorées)
224
+ h, w = arr.shape
225
+ if h < 3 or w < 3:
226
+ return float(np.var(arr))
227
+
228
+ # Utiliser une convolution rapide avec slicing
229
+ center = arr[1:-1, 1:-1]
230
+ top = arr[:-2, 1:-1]
231
+ bottom = arr[2:, 1:-1]
232
+ left = arr[1:-1, :-2]
233
+ right = arr[1:-1, 2:]
234
+ lap = top + bottom + left + right - 4 * center
235
+
236
+ return float(np.var(lap))
237
+
238
+
239
+ def _noise_level_numpy(arr, np) -> float:
240
+ """Estime le niveau de bruit par la MAD (Median Absolute Deviation) des gradients."""
241
+ h, w = arr.shape
242
+ if h < 2 or w < 2:
243
+ return 0.0
244
+ # Différences horizontales et verticales
245
+ diff_h = np.abs(arr[:, 1:] - arr[:, :-1])
246
+ diff_v = np.abs(arr[1:, :] - arr[:-1, :])
247
+ noise_std = float(np.median(np.concatenate([diff_h.ravel(), diff_v.ravel()])))
248
+ # Normaliser : 0 = pas de bruit, 1 = très bruité (seuil à ~30)
249
+ return min(1.0, noise_std / 30.0)
250
+
251
+
252
+ def _estimate_rotation_numpy(arr, np) -> float:
253
+ """Estime l'angle de rotation par projection horizontale simplifiée.
254
+
255
+ Retourne l'angle estimé en degrés [-45, 45].
256
+ """
257
+ # Méthode simplifiée : analyse de la variance des projections à différents angles
258
+ # Limiter à quelques angles pour la performance
259
+ h, w = arr.shape
260
+ if h < 20 or w < 20:
261
+ return 0.0
262
+
263
+ # Sous-échantillonnage pour la performance
264
+ step = max(1, h // 100)
265
+ sample = arr[::step, :]
266
+
267
+ best_angle = 0.0
268
+ best_var = -1.0
269
+
270
+ for angle_deg in range(-5, 6): # ±5 degrés, pas de 1°
271
+ angle_rad = math.radians(angle_deg)
272
+ # Projection horizontale après rotation approximative
273
+ # (approximation linéaire rapide)
274
+ offsets = np.round(
275
+ np.arange(sample.shape[0]) * math.tan(angle_rad)
276
+ ).astype(int)
277
+ offsets = np.clip(offsets, 0, w - 1)
278
+
279
+ # Variance des sommes de lignes décalées
280
+ try:
281
+ row_sums = np.array([
282
+ float(np.sum(sample[i, max(0, offsets[i]):min(w, offsets[i]+w)]))
283
+ for i in range(sample.shape[0])
284
+ ])
285
+ var = float(np.var(row_sums))
286
+ if var > best_var:
287
+ best_var = var
288
+ best_angle = float(angle_deg)
289
+ except Exception:
290
+ pass
291
+
292
+ return best_angle
293
+
294
+
295
+ def _contrast_score_numpy(arr, np) -> float:
296
+ """Score de contraste Michelson [0, 1]."""
297
+ p5 = float(np.percentile(arr, 5)) # fond clair
298
+ p95 = float(np.percentile(arr, 95)) # encre sombre
299
+ if p5 + p95 == 0:
300
+ return 0.0
301
+ # Michelson : (Imax - Imin) / (Imax + Imin)
302
+ return float((p95 - p5) / (p95 + p5))
303
+
304
+
305
+ def _global_quality_score(
306
+ sharpness: float,
307
+ noise: float,
308
+ rotation_abs: float,
309
+ contrast: float,
310
+ ) -> float:
311
+ """Calcule le score de qualité global pondéré."""
312
+ # Poids : netteté (40%), contraste (30%), bruit (20%), rotation (10%)
313
+ score = (
314
+ 0.40 * sharpness
315
+ + 0.30 * contrast
316
+ + 0.20 * (1.0 - noise) # moins de bruit = mieux
317
+ + 0.10 * max(0.0, 1.0 - rotation_abs / 10.0) # ±10° max
318
+ )
319
+ return round(min(1.0, max(0.0, score)), 4)
320
+
321
+
322
+ # ---------------------------------------------------------------------------
323
+ # Données fictives pour les fixtures de démo
324
+ # ---------------------------------------------------------------------------
325
+
326
def generate_mock_quality_scores(
    doc_id: str,
    seed: Optional[int] = None,
) -> ImageQualityResult:
    """Generate fictitious but coherent quality metrics for a document.

    Used by the demo fixtures to simulate a realistic spread of image
    qualities (good, medium, degraded).

    Parameters
    ----------
    doc_id:
        Document identifier; when no seed is given it determines the output.
    seed:
        Optional random seed. Any integer, including 0, is honoured.

    Returns
    -------
    ImageQualityResult
        Result whose ``analysis_method`` is ``"mock"``.
    """
    import random
    import zlib

    if seed is None:
        # The built-in hash() of a str is randomised per interpreter run, so
        # it cannot provide reproducible fixtures; CRC32 is stable.
        seed = zlib.crc32(doc_id.encode("utf-8"))
    rng = random.Random(seed)

    # One base quality drives all metrics so that they stay consistent.
    base_quality = 0.3 + rng.random() * 0.6  # 0.3 to 0.9

    sharpness = max(0.1, min(1.0, base_quality + rng.gauss(0, 0.1)))
    noise = max(0.0, min(1.0, (1.0 - base_quality) * 0.8 + rng.gauss(0, 0.05)))
    rotation = rng.gauss(0, 1.5)  # standard deviation of 1.5 degrees
    contrast = max(0.2, min(1.0, base_quality + rng.gauss(0, 0.15)))

    quality = _global_quality_score(sharpness, noise, abs(rotation), contrast)

    return ImageQualityResult(
        sharpness_score=round(sharpness, 4),
        noise_level=round(noise, 4),
        rotation_degrees=round(rotation, 2),
        contrast_score=round(contrast, 4),
        quality_score=round(quality, 4),
        analysis_method="mock",
    )
365
+
366
+
367
def aggregate_image_quality(results: list[ImageQualityResult]) -> dict:
    """Aggregate image-quality metrics over a corpus.

    Returns an empty dict for an empty input, ``{"error": ...}`` when every
    analysis failed, and otherwise the mean scores, the distribution of
    quality tiers, the number of successful analyses and the raw quality
    scores (for a scatter plot). Results carrying an error are ignored.
    """
    if not results:
        return {}

    successful = [res for res in results if res.error is None]
    if not successful:
        return {"error": "Aucune analyse réussie"}

    def _average(attribute: str) -> float:
        values = [getattr(res, attribute) for res in successful]
        return round(statistics.mean(values), 4)

    # Distribution per quality tier.
    distribution = {"good": 0, "medium": 0, "poor": 0}
    for res in successful:
        distribution[res.quality_tier] += 1

    return {
        "mean_quality_score": _average("quality_score"),
        "mean_sharpness": _average("sharpness_score"),
        "mean_noise_level": _average("noise_level"),
        "quality_distribution": distribution,
        "document_count": len(successful),
        "scores": [res.quality_score for res in successful],  # for scatter plot
    }
picarones/core/metrics.py CHANGED
@@ -5,6 +5,8 @@ Métriques implémentées
5
  - CER brut : distance d'édition caractère / longueur GT
6
  - CER normalisé NFC : après normalisation Unicode NFC
7
  - CER sans casse : insensible aux majuscules/minuscules
 
 
8
  - WER brut : word error rate standard
9
  - WER normalisé : après normalisation des espaces
10
  - MER : Match Error Rate (jiwer)
@@ -41,9 +43,6 @@ def _normalize_whitespace(text: str) -> str:
41
  return " ".join(text.split())
42
 
43
 
44
- # Transformations jiwer pour le CER (chaque char devient un "mot")
45
- _CHAR_TRANSFORM = jiwer.transforms.Compose([]) if _JIWER_AVAILABLE else None
46
-
47
  # Transformations jiwer pour le WER (normalisation légère des espaces)
48
  _WER_TRANSFORM = (
49
  jiwer.transforms.Compose(
@@ -62,7 +61,6 @@ def _cer_from_strings(reference: str, hypothesis: str) -> float:
62
  """CER brut : distance d'édition sur les caractères."""
63
  if not reference:
64
  return 0.0 if not hypothesis else 1.0
65
- # jiwer.cer traite chaque caractère comme un token
66
  return jiwer.cer(reference, hypothesis)
67
 
68
 
@@ -84,9 +82,15 @@ class MetricsResult:
84
  reference_length: int
85
  hypothesis_length: int
86
  error: Optional[str] = None
 
 
 
 
 
 
87
 
88
  def as_dict(self) -> dict:
89
- return {
90
  "cer": round(self.cer, 6),
91
  "cer_nfc": round(self.cer_nfc, 6),
92
  "cer_caseless": round(self.cer_caseless, 6),
@@ -98,6 +102,10 @@ class MetricsResult:
98
  "hypothesis_length": self.hypothesis_length,
99
  "error": self.error,
100
  }
 
 
 
 
101
 
102
  @property
103
  def cer_percent(self) -> float:
@@ -108,7 +116,11 @@ class MetricsResult:
108
  return round(self.wer * 100, 2)
109
 
110
 
111
- def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
 
 
 
 
112
  """Calcule l'ensemble des métriques CER/WER pour une paire de textes.
113
 
114
  Parameters
@@ -117,6 +129,10 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
117
  Texte de vérité terrain (ground truth).
118
  hypothesis:
119
  Texte produit par le moteur OCR.
 
 
 
 
120
 
121
  Returns
122
  -------
@@ -151,6 +167,19 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
151
  mer = jiwer.mer(reference, hypothesis)
152
  wil = jiwer.wil(reference, hypothesis)
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  return MetricsResult(
155
  cer=cer_raw,
156
  cer_nfc=cer_nfc,
@@ -161,6 +190,8 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
161
  wil=wil,
162
  reference_length=len(reference),
163
  hypothesis_length=len(hypothesis),
 
 
164
  )
165
 
166
  except Exception as exc: # noqa: BLE001
@@ -208,7 +239,28 @@ def aggregate_metrics(results: list[MetricsResult]) -> dict:
208
  values = [getattr(r, metric) for r in results if r.error is None]
209
  aggregated[metric] = _stats(values)
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  aggregated["document_count"] = len(results)
212
  aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
213
 
214
  return aggregated
 
 
 
 
 
 
 
5
  - CER brut : distance d'édition caractère / longueur GT
6
  - CER normalisé NFC : après normalisation Unicode NFC
7
  - CER sans casse : insensible aux majuscules/minuscules
8
+ - CER diplomatique : après application d'une table de correspondances
9
+ historiques (ſ=s, u=v, i=j…) — configurable
10
  - WER brut : word error rate standard
11
  - WER normalisé : après normalisation des espaces
12
  - MER : Match Error Rate (jiwer)
 
43
  return " ".join(text.split())
44
 
45
 
 
 
 
46
  # Transformations jiwer pour le WER (normalisation légère des espaces)
47
  _WER_TRANSFORM = (
48
  jiwer.transforms.Compose(
 
61
  """CER brut : distance d'édition sur les caractères."""
62
  if not reference:
63
  return 0.0 if not hypothesis else 1.0
 
64
  return jiwer.cer(reference, hypothesis)
65
 
66
 
 
82
  reference_length: int
83
  hypothesis_length: int
84
  error: Optional[str] = None
85
+ cer_diplomatic: Optional[float] = None
86
+ """CER calculé après normalisation diplomatique (ſ=s, u=v, i=j…).
87
+ None si aucun profil diplomatique n'a été fourni à compute_metrics.
88
+ """
89
+ diplomatic_profile_name: Optional[str] = None
90
+ """Nom du profil de normalisation diplomatique utilisé."""
91
 
92
  def as_dict(self) -> dict:
93
+ d = {
94
  "cer": round(self.cer, 6),
95
  "cer_nfc": round(self.cer_nfc, 6),
96
  "cer_caseless": round(self.cer_caseless, 6),
 
102
  "hypothesis_length": self.hypothesis_length,
103
  "error": self.error,
104
  }
105
+ if self.cer_diplomatic is not None:
106
+ d["cer_diplomatic"] = round(self.cer_diplomatic, 6)
107
+ d["diplomatic_profile_name"] = self.diplomatic_profile_name
108
+ return d
109
 
110
  @property
111
  def cer_percent(self) -> float:
 
116
  return round(self.wer * 100, 2)
117
 
118
 
119
+ def compute_metrics(
120
+ reference: str,
121
+ hypothesis: str,
122
+ normalization_profile: "Optional[NormalizationProfile]" = None, # noqa: F821
123
+ ) -> MetricsResult:
124
  """Calcule l'ensemble des métriques CER/WER pour une paire de textes.
125
 
126
  Parameters
 
129
  Texte de vérité terrain (ground truth).
130
  hypothesis:
131
  Texte produit par le moteur OCR.
132
+ normalization_profile:
133
+ Profil de normalisation diplomatique optionnel.
134
+ Si fourni, calcule ``cer_diplomatic`` en plus des métriques standard.
135
+ Si None, utilise le profil medieval_french par défaut.
136
 
137
  Returns
138
  -------
 
167
  mer = jiwer.mer(reference, hypothesis)
168
  wil = jiwer.wil(reference, hypothesis)
169
 
170
+ # CER diplomatique — utilise le profil fourni ou le profil médiéval par défaut
171
+ cer_diplomatic: Optional[float] = None
172
+ diplomatic_profile_name: Optional[str] = None
173
+ try:
174
+ from picarones.core.normalization import DEFAULT_DIPLOMATIC_PROFILE
175
+ profile = normalization_profile or DEFAULT_DIPLOMATIC_PROFILE
176
+ ref_diplo = profile.normalize(reference)
177
+ hyp_diplo = profile.normalize(hypothesis)
178
+ cer_diplomatic = _cer_from_strings(ref_diplo, hyp_diplo)
179
+ diplomatic_profile_name = profile.name
180
+ except Exception: # noqa: BLE001
181
+ pass # CER diplomatique non critique
182
+
183
  return MetricsResult(
184
  cer=cer_raw,
185
  cer_nfc=cer_nfc,
 
190
  wil=wil,
191
  reference_length=len(reference),
192
  hypothesis_length=len(hypothesis),
193
+ cer_diplomatic=cer_diplomatic,
194
+ diplomatic_profile_name=diplomatic_profile_name,
195
  )
196
 
197
  except Exception as exc: # noqa: BLE001
 
239
  values = [getattr(r, metric) for r in results if r.error is None]
240
  aggregated[metric] = _stats(values)
241
 
242
+ # CER diplomatique (optionnel — présent seulement si calculé)
243
+ diplo_values = [
244
+ r.cer_diplomatic for r in results
245
+ if r.error is None and r.cer_diplomatic is not None
246
+ ]
247
+ if diplo_values:
248
+ aggregated["cer_diplomatic"] = _stats(diplo_values)
249
+ # Nom du profil (même pour tous les docs d'un corpus)
250
+ profile_name = next(
251
+ (r.diplomatic_profile_name for r in results if r.diplomatic_profile_name),
252
+ None,
253
+ )
254
+ if profile_name:
255
+ aggregated["cer_diplomatic"]["profile"] = profile_name
256
+
257
  aggregated["document_count"] = len(results)
258
  aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
259
 
260
  return aggregated
261
+
262
+
263
+ # Import paresseux pour éviter les imports circulaires
264
+ from typing import TYPE_CHECKING
265
+ if TYPE_CHECKING:
266
+ from picarones.core.normalization import NormalizationProfile
picarones/core/normalization.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Profils de normalisation unicode pour le calcul du CER diplomatique.
2
+
3
+ La normalisation diplomatique permet de calculer un CER tenant compte des
4
+ équivalences graphiques propres aux documents historiques : ſ=s, u=v, i=j, etc.
5
+
6
+ En appliquant la même table aux deux textes (GT et OCR), on mesure les erreurs
7
+ "substantielles" (transcription erronée) en ignorant les variations graphiques
8
+ codifiées connues.
9
+
10
+ Trois niveaux de normalisation sont disponibles :
11
+
12
+ 1. NFC : normalisation Unicode canonique (décomposition+recomposition)
13
+ 2. caseless : NFC + pliage de casse (casefold)
14
+ 3. diplomatic: NFC + table de correspondances historiques configurables
15
+
16
+ Les profils préconfigurés couvrent les cas d'usage patrimoniaux courants.
17
+ Ils sont également chargeables depuis un fichier YAML.
18
+
19
+ Exemple YAML
20
+ ------------
21
+ name: medieval_custom
22
+ caseless: false
23
+ diplomatic:
24
+ ſ: s
25
+ u: v
26
+ i: j
27
+ y: i
28
+ æ: ae
29
+ œ: oe
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import unicodedata
35
+ from dataclasses import dataclass, field
36
+ from pathlib import Path
37
+ from typing import Optional
38
+
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Tables de correspondances diplomatiques préconfigurées
42
+ # ---------------------------------------------------------------------------
43
+
44
#: Medieval French (12th–15th century)
DIPLOMATIC_FR_MEDIEVAL: dict[str, str] = {
    "ſ": "s",  # long s → s
    "u": "v",  # u/v interchangeable (word-initial position)
    "i": "j",  # i/j interchangeable
    "y": "i",  # vocalic y → i
    "æ": "ae",  # ligature æ
    "œ": "oe",  # ligature œ
    "ꝑ": "per",  # abbreviation per/par
    "ꝓ": "pro",  # abbreviation pro
    "\u0026": "et",  # & → et
}

#: Early modern French / early printed books (16th–18th century)
DIPLOMATIC_FR_EARLY_MODERN: dict[str, str] = {
    "ſ": "s",  # long s
    "æ": "ae",
    "œ": "oe",
    "\u0026": "et",
    "ỹ": "yn",  # y with tilde
}

#: Medieval Latin
DIPLOMATIC_LATIN_MEDIEVAL: dict[str, str] = {
    "ſ": "s",
    "u": "v",
    "i": "j",
    "y": "i",
    "æ": "ae",
    "œ": "oe",
    "ꝑ": "per",
    "ꝓ": "pro",
    "ꝗ": "que",  # barred q → que
    "\u0026": "et",
}

#: Minimal profile — NFC + long s only
DIPLOMATIC_MINIMAL: dict[str, str] = {
    "ſ": "s",
}
84
+
85
+
86
+ # ---------------------------------------------------------------------------
87
+ # Profil de normalisation
88
+ # ---------------------------------------------------------------------------
89
+
90
@dataclass
class NormalizationProfile:
    """Describe a normalisation strategy used to compute the diplomatic CER.

    Parameters
    ----------
    name:
        Human-readable identifier of the profile (e.g. ``"medieval_french"``).
    nfc:
        Apply Unicode NFC normalisation (recommended, enabled by default).
    caseless:
        Apply case folding (``str.casefold``) after NFC.
    diplomatic_table:
        Table of historical graphic equivalences applied to both texts before
        the CER is computed.
    description:
        Short description of the profile (shown in the HTML report).
    """

    name: str
    nfc: bool = True
    caseless: bool = False
    diplomatic_table: dict[str, str] = field(default_factory=dict)
    description: str = ""

    def normalize(self, text: str) -> str:
        """Apply the profile to ``text``: NFC, then case folding, then the table."""
        if self.nfc:
            text = unicodedata.normalize("NFC", text)
        if self.caseless:
            text = text.casefold()
        if self.diplomatic_table:
            text = _apply_diplomatic_table(text, self.diplomatic_table)
        return text

    def as_dict(self) -> dict:
        """Serialise the profile; the result can be fed back to ``from_dict``."""
        return {
            "name": self.name,
            "nfc": self.nfc,
            "caseless": self.caseless,
            "diplomatic_table": self.diplomatic_table,
            "description": self.description,
        }

    @classmethod
    def from_yaml(cls, path: str | Path) -> "NormalizationProfile":
        """Load a profile from a YAML file.

        The file must contain a mapping with the optional keys ``name``
        (defaults to the file stem), ``nfc``, ``caseless``, ``description``
        and ``diplomatic`` (mapping str → str).

        Example
        -------
        .. code-block:: yaml

            name: medieval_custom
            caseless: false
            description: Français médiéval personnalisé
            diplomatic:
              ſ: s
              u: v

        Raises
        ------
        RuntimeError
            If the ``pyyaml`` package is not installed.
        ValueError
            If the file does not contain a YAML mapping (e.g. it is empty).
        """
        try:
            import yaml
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'pyyaml' est requis pour charger les profils YAML. "
                "Installez-le avec : pip install pyyaml"
            ) from exc

        source = Path(path)
        data = yaml.safe_load(source.read_text(encoding="utf-8"))
        if not isinstance(data, dict):
            # An empty file loads as None and a top-level list as a list;
            # neither can describe a profile.
            raise ValueError(
                f"Profil de normalisation invalide : '{source}' doit contenir "
                "un dictionnaire YAML"
            )
        # The file stem is only a fallback; an explicit ``name`` key wins.
        return cls.from_dict({"name": source.stem, **data})

    @classmethod
    def from_dict(cls, data: dict) -> "NormalizationProfile":
        """Build a profile from a dictionary (e.g. an inline YAML section).

        The table is read from the ``diplomatic`` key or, failing that, from
        ``diplomatic_table`` (the key written by ``as_dict``), so that
        ``from_dict(profile.as_dict())`` restores the profile. Keys and values
        are converted to ``str`` and copied into a new dict.
        """
        table = data.get("diplomatic")
        if table is None:
            table = data.get("diplomatic_table")
        return cls(
            name=data.get("name", "custom"),
            nfc=bool(data.get("nfc", True)),
            caseless=bool(data.get("caseless", False)),
            diplomatic_table={str(k): str(v) for k, v in (table or {}).items()},
            description=data.get("description", ""),
        )
179
+
180
+
181
+ # ---------------------------------------------------------------------------
182
+ # Profils préconfigurés
183
+ # ---------------------------------------------------------------------------
184
+
185
def get_builtin_profile(name: str) -> NormalizationProfile:
    """Return a preconfigured profile by identifier.

    Available identifiers
    ---------------------
    - ``"medieval_french"``     : medieval French (ſ=s, u=v, i=j, æ=ae, œ=oe…)
    - ``"early_modern_french"`` : early printed books (ſ=s, œ=oe, æ=ae…)
    - ``"medieval_latin"``      : medieval Latin (ſ=s, u=v, i=j, ꝑ=per, ꝓ=pro…)
    - ``"minimal"``             : NFC + long s only
    - ``"nfc"``                 : NFC only (no diplomatic table)
    - ``"caseless"``            : NFC + case folding

    Raises
    ------
    KeyError
        If the name is not recognised.
    """
    # (identifier, caseless, diplomatic table, description); NFC is always on.
    specs = (
        (
            "medieval_french",
            False,
            DIPLOMATIC_FR_MEDIEVAL,
            "Français médiéval (XIIe–XVe) : ſ=s, u=v, i=j, æ=ae, œ=oe",
        ),
        (
            "early_modern_french",
            False,
            DIPLOMATIC_FR_EARLY_MODERN,
            "Imprimés anciens (XVIe–XVIIIe) : ſ=s, æ=ae, œ=oe",
        ),
        (
            "medieval_latin",
            False,
            DIPLOMATIC_LATIN_MEDIEVAL,
            "Latin médiéval : ſ=s, u=v, i=j, ꝑ=per, ꝓ=pro",
        ),
        ("minimal", False, DIPLOMATIC_MINIMAL, "Minimal : NFC + s long seulement"),
        ("nfc", False, {}, "Normalisation NFC uniquement"),
        ("caseless", True, {}, "NFC + insensible à la casse"),
    )
    profiles = {
        ident: NormalizationProfile(
            name=ident,
            nfc=True,
            caseless=fold_case,
            diplomatic_table=table,
            description=summary,
        )
        for ident, fold_case, table, summary in specs
    }

    if name in profiles:
        return profiles[name]
    raise KeyError(
        f"Profil de normalisation inconnu : '{name}'. "
        f"Disponibles : {', '.join(profiles)}"
    )
252
+
253
+
254
+ # ---------------------------------------------------------------------------
255
+ # Fonctions utilitaires
256
+ # ---------------------------------------------------------------------------
257
+
258
+ def _apply_diplomatic_table(text: str, table: dict[str, str]) -> str:
259
+ """Applique une table de correspondances diplomatiques caractère par caractère.
260
+
261
+ Les clés multi-caractères (ex : ``"ae"`` → ``"æ"``) sont gérées en priorité
262
+ sur les correspondances simples.
263
+ """
264
+ if not table:
265
+ return text
266
+
267
+ # Séparer les clés simples (1 char) des clés multi-chars pour traitement ordonné
268
+ multi_keys = sorted(
269
+ (k for k in table if len(k) > 1), key=len, reverse=True
270
+ )
271
+ simple_table = {k: v for k, v in table.items() if len(k) == 1}
272
+
273
+ result = text
274
+ # Remplacements multi-chars en premier (évite les conflits)
275
+ for key in multi_keys:
276
+ result = result.replace(key, table[key])
277
+
278
+ # Remplacements char par char
279
+ if simple_table:
280
+ result = "".join(simple_table.get(c, c) for c in result)
281
+
282
+ return result
283
+
284
+
285
# Default profile used for the built-in diplomatic CER
DEFAULT_DIPLOMATIC_PROFILE: NormalizationProfile = get_builtin_profile("medieval_french")
picarones/core/results.py CHANGED
@@ -35,6 +35,17 @@ class DocumentResult:
35
  """Sortie OCR brute avant correction LLM (None pour les moteurs OCR seuls)."""
36
  pipeline_metadata: dict = field(default_factory=dict)
37
  """Métadonnées du pipeline : mode, prompt, over-normalization…"""
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  def as_dict(self) -> dict:
40
  d = {
@@ -50,6 +61,16 @@ class DocumentResult:
50
  d["ocr_intermediate"] = self.ocr_intermediate
51
  if self.pipeline_metadata:
52
  d["pipeline_metadata"] = self.pipeline_metadata
 
 
 
 
 
 
 
 
 
 
53
  return d
54
 
55
 
@@ -67,6 +88,17 @@ class EngineReport:
67
  Clés typiques : mode, prompt_file, llm_model, llm_provider, pipeline_steps,
68
  over_normalization (score agrégé, classe 10 de la taxonomie).
69
  """
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def __post_init__(self) -> None:
72
  if not self.aggregated_metrics and self.document_results:
@@ -84,6 +116,20 @@ class EngineReport:
84
  wer_stats = self.aggregated_metrics.get("wer", {})
85
  return wer_stats.get("mean")
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  @property
88
  def is_pipeline(self) -> bool:
89
  """Vrai si ce rapport correspond à un pipeline OCR+LLM."""
@@ -99,6 +145,16 @@ class EngineReport:
99
  }
100
  if self.pipeline_info:
101
  d["pipeline_info"] = self.pipeline_info
 
 
 
 
 
 
 
 
 
 
102
  return d
103
 
104
 
 
35
  """Sortie OCR brute avant correction LLM (None pour les moteurs OCR seuls)."""
36
  pipeline_metadata: dict = field(default_factory=dict)
37
  """Métadonnées du pipeline : mode, prompt, over-normalization…"""
38
+ # Champs Sprint 5 — métriques avancées patrimoniales
39
+ confusion_matrix: Optional[dict] = None
40
+ """Matrice de confusion unicode sérialisée."""
41
+ char_scores: Optional[dict] = None
42
+ """Scores ligatures et diacritiques."""
43
+ taxonomy: Optional[dict] = None
44
+ """Classification taxonomique des erreurs (classes 1-9)."""
45
+ structure: Optional[dict] = None
46
+ """Analyse structurelle (segmentation lignes, ordre lecture)."""
47
+ image_quality: Optional[dict] = None
48
+ """Métriques de qualité image."""
49
 
50
  def as_dict(self) -> dict:
51
  d = {
 
61
  d["ocr_intermediate"] = self.ocr_intermediate
62
  if self.pipeline_metadata:
63
  d["pipeline_metadata"] = self.pipeline_metadata
64
+ if self.confusion_matrix is not None:
65
+ d["confusion_matrix"] = self.confusion_matrix
66
+ if self.char_scores is not None:
67
+ d["char_scores"] = self.char_scores
68
+ if self.taxonomy is not None:
69
+ d["taxonomy"] = self.taxonomy
70
+ if self.structure is not None:
71
+ d["structure"] = self.structure
72
+ if self.image_quality is not None:
73
+ d["image_quality"] = self.image_quality
74
  return d
75
 
76
 
 
88
  Clés typiques : mode, prompt_file, llm_model, llm_provider, pipeline_steps,
89
  over_normalization (score agrégé, classe 10 de la taxonomie).
90
  """
91
+ # Métriques agrégées Sprint 5
92
+ aggregated_confusion: Optional[dict] = None
93
+ """Matrice de confusion unicode agrégée sur le corpus."""
94
+ aggregated_char_scores: Optional[dict] = None
95
+ """Scores ligatures/diacritiques agrégés."""
96
+ aggregated_taxonomy: Optional[dict] = None
97
+ """Distribution taxonomique des erreurs agrégée."""
98
+ aggregated_structure: Optional[dict] = None
99
+ """Métriques structurelles agrégées."""
100
+ aggregated_image_quality: Optional[dict] = None
101
+ """Métriques de qualité image agrégées."""
102
 
103
  def __post_init__(self) -> None:
104
  if not self.aggregated_metrics and self.document_results:
 
116
  wer_stats = self.aggregated_metrics.get("wer", {})
117
  return wer_stats.get("mean")
118
 
119
@property
def ligature_score(self) -> Optional[float]:
    """Aggregated ligature score.

    Returns ``None`` when ``aggregated_char_scores`` is empty/unset or when
    it holds no ``"score"`` entry under ``"ligature"``.
    """
    char_scores = self.aggregated_char_scores
    if not char_scores:
        return None
    ligature_stats = char_scores.get("ligature", {})
    return ligature_stats.get("score")
125
+
126
@property
def diacritic_score(self) -> Optional[float]:
    """Aggregated diacritic score.

    Returns ``None`` when ``aggregated_char_scores`` is empty/unset or when
    it holds no ``"score"`` entry under ``"diacritic"``.
    """
    char_scores = self.aggregated_char_scores
    if not char_scores:
        return None
    diacritic_stats = char_scores.get("diacritic", {})
    return diacritic_stats.get("score")
132
+
133
  @property
134
  def is_pipeline(self) -> bool:
135
  """Vrai si ce rapport correspond à un pipeline OCR+LLM."""
 
145
  }
146
  if self.pipeline_info:
147
  d["pipeline_info"] = self.pipeline_info
148
+ if self.aggregated_confusion is not None:
149
+ d["aggregated_confusion"] = self.aggregated_confusion
150
+ if self.aggregated_char_scores is not None:
151
+ d["aggregated_char_scores"] = self.aggregated_char_scores
152
+ if self.aggregated_taxonomy is not None:
153
+ d["aggregated_taxonomy"] = self.aggregated_taxonomy
154
+ if self.aggregated_structure is not None:
155
+ d["aggregated_structure"] = self.aggregated_structure
156
+ if self.aggregated_image_quality is not None:
157
+ d["aggregated_image_quality"] = self.aggregated_image_quality
158
  return d
159
 
160
 
picarones/core/runner.py CHANGED
@@ -21,6 +21,7 @@ def run_benchmark(
21
  engines: list[BaseOCREngine],
22
  output_json: Optional[str | Path] = None,
23
  show_progress: bool = True,
 
24
  ) -> BenchmarkResult:
25
  """Exécute le benchmark d'un ou plusieurs moteurs/pipelines sur un corpus.
26
 
@@ -62,7 +63,12 @@ def run_benchmark(
62
  disable=not show_progress,
63
  )
64
 
65
- for doc in iterator:
 
 
 
 
 
66
  ocr_result = engine.run(doc.image_path)
67
 
68
  if ocr_result.success:
@@ -97,6 +103,57 @@ def run_benchmark(
97
  )
98
  pipeline_meta["over_normalization"] = over_norm.as_dict()
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  document_results.append(
101
  DocumentResult(
102
  doc_id=doc.doc_id,
@@ -108,18 +165,35 @@ def run_benchmark(
108
  engine_error=ocr_result.error,
109
  ocr_intermediate=ocr_intermediate,
110
  pipeline_metadata=pipeline_meta,
 
 
 
 
 
111
  )
112
  )
113
 
114
  engine_version = engine._safe_version()
115
  pipeline_info = _build_pipeline_info(engine, document_results)
116
 
 
 
 
 
 
 
 
117
  report = EngineReport(
118
  engine_name=engine.name,
119
  engine_version=engine_version,
120
  engine_config=engine.config,
121
  document_results=document_results,
122
  pipeline_info=pipeline_info,
 
 
 
 
 
123
  )
124
  engine_reports.append(report)
125
  logger.info(
@@ -184,3 +258,99 @@ def _build_pipeline_info(engine: BaseOCREngine, doc_results: list[DocumentResult
184
  }
185
 
186
  return info
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  engines: list[BaseOCREngine],
22
  output_json: Optional[str | Path] = None,
23
  show_progress: bool = True,
24
+ progress_callback: Optional[callable] = None,
25
  ) -> BenchmarkResult:
26
  """Exécute le benchmark d'un ou plusieurs moteurs/pipelines sur un corpus.
27
 
 
63
  disable=not show_progress,
64
  )
65
 
66
+ for doc_idx, doc in enumerate(iterator):
67
+ if progress_callback is not None:
68
+ try:
69
+ progress_callback(engine.name, doc_idx, doc.doc_id)
70
+ except Exception:
71
+ pass
72
  ocr_result = engine.run(doc.image_path)
73
 
74
  if ocr_result.success:
 
103
  )
104
  pipeline_meta["over_normalization"] = over_norm.as_dict()
105
 
106
+ # Sprint 5 : métriques avancées patrimoniales
107
+ confusion_data = None
108
+ char_scores_data = None
109
+ taxonomy_data = None
110
+ structure_data = None
111
+ image_quality_data = None
112
+
113
+ if ocr_result.success:
114
+ try:
115
+ from picarones.core.confusion import build_confusion_matrix
116
+ cm = build_confusion_matrix(doc.ground_truth, ocr_result.text)
117
+ confusion_data = cm.as_dict()
118
+ except Exception:
119
+ pass
120
+
121
+ try:
122
+ from picarones.core.char_scores import (
123
+ compute_ligature_score, compute_diacritic_score
124
+ )
125
+ lig = compute_ligature_score(doc.ground_truth, ocr_result.text)
126
+ diac = compute_diacritic_score(doc.ground_truth, ocr_result.text)
127
+ char_scores_data = {
128
+ "ligature": lig.as_dict(),
129
+ "diacritic": diac.as_dict(),
130
+ }
131
+ except Exception:
132
+ pass
133
+
134
+ try:
135
+ from picarones.core.taxonomy import classify_errors
136
+ tax = classify_errors(doc.ground_truth, ocr_result.text)
137
+ taxonomy_data = tax.as_dict()
138
+ except Exception:
139
+ pass
140
+
141
+ try:
142
+ from picarones.core.structure import analyze_structure
143
+ struct = analyze_structure(doc.ground_truth, ocr_result.text)
144
+ structure_data = struct.as_dict()
145
+ except Exception:
146
+ pass
147
+
148
+ # Qualité image (indépendant du succès OCR)
149
+ try:
150
+ from picarones.core.image_quality import analyze_image_quality
151
+ iq = analyze_image_quality(doc.image_path)
152
+ if iq.error is None:
153
+ image_quality_data = iq.as_dict()
154
+ except Exception:
155
+ pass
156
+
157
  document_results.append(
158
  DocumentResult(
159
  doc_id=doc.doc_id,
 
165
  engine_error=ocr_result.error,
166
  ocr_intermediate=ocr_intermediate,
167
  pipeline_metadata=pipeline_meta,
168
+ confusion_matrix=confusion_data,
169
+ char_scores=char_scores_data,
170
+ taxonomy=taxonomy_data,
171
+ structure=structure_data,
172
+ image_quality=image_quality_data,
173
  )
174
  )
175
 
176
  engine_version = engine._safe_version()
177
  pipeline_info = _build_pipeline_info(engine, document_results)
178
 
179
+ # Agrégation Sprint 5
180
+ agg_confusion = _aggregate_confusion(document_results)
181
+ agg_char_scores = _aggregate_char_scores(document_results)
182
+ agg_taxonomy = _aggregate_taxonomy(document_results)
183
+ agg_structure = _aggregate_structure(document_results)
184
+ agg_image_quality = _aggregate_image_quality(document_results)
185
+
186
  report = EngineReport(
187
  engine_name=engine.name,
188
  engine_version=engine_version,
189
  engine_config=engine.config,
190
  document_results=document_results,
191
  pipeline_info=pipeline_info,
192
+ aggregated_confusion=agg_confusion,
193
+ aggregated_char_scores=agg_char_scores,
194
+ aggregated_taxonomy=agg_taxonomy,
195
+ aggregated_structure=agg_structure,
196
+ aggregated_image_quality=agg_image_quality,
197
  )
198
  engine_reports.append(report)
199
  logger.info(
 
258
  }
259
 
260
  return info
261
+
262
+
263
+ # ---------------------------------------------------------------------------
264
+ # Helpers d'agrégation Sprint 5
265
+ # ---------------------------------------------------------------------------
266
+
267
def _aggregate_confusion(doc_results: list) -> Optional[dict]:
    """Aggregate the per-document unicode confusion matrices.

    Each non-``None`` ``confusion_matrix`` dict found on ``doc_results`` is
    turned back into a ``ConfusionMatrix`` and the set is merged with
    ``aggregate_confusion_matrices``.

    Returns
    -------
    Optional[dict]
        The compact form of the merged matrix (``min_count=2``), or ``None``
        when no document carries a matrix or when aggregation fails.
    """
    try:
        from picarones.core.confusion import aggregate_confusion_matrices, ConfusionMatrix
        matrices = [
            ConfusionMatrix(**dr.confusion_matrix)
            for dr in doc_results
            if dr.confusion_matrix is not None
        ]
        if not matrices:
            return None
        agg = aggregate_confusion_matrices(matrices)
        return agg.as_compact_dict(min_count=2)
    except Exception:
        # Best effort: an optional metric must never abort the benchmark,
        # but the failure is logged instead of disappearing silently.
        logger.warning(
            "Agrégation des matrices de confusion impossible", exc_info=True
        )
        return None
282
+
283
+
284
def _aggregate_char_scores(doc_results: list) -> Optional[dict]:
    """Aggregate the per-document ligature and diacritic scores.

    Documents whose ``char_scores`` is ``None`` are skipped; for the others
    the ``"ligature"`` and ``"diacritic"`` entries are rebuilt into
    ``LigatureScore`` / ``DiacriticScore`` objects and aggregated.

    Returns
    -------
    Optional[dict]
        ``{"ligature": ..., "diacritic": ...}`` holding the aggregated
        values, or ``None`` when no document carries scores or when
        aggregation fails.
    """
    try:
        from picarones.core.char_scores import (
            aggregate_ligature_scores, aggregate_diacritic_scores,
            LigatureScore, DiacriticScore,
        )
        lig_scores = [
            LigatureScore(**dr.char_scores["ligature"])
            for dr in doc_results
            if dr.char_scores is not None
        ]
        diac_scores = [
            DiacriticScore(**dr.char_scores["diacritic"])
            for dr in doc_results
            if dr.char_scores is not None
        ]
        # Both lists are built with the same filter, so testing one suffices.
        if not lig_scores:
            return None
        return {
            "ligature": aggregate_ligature_scores(lig_scores),
            "diacritic": aggregate_diacritic_scores(diac_scores),
        }
    except Exception:
        # Best effort: an optional metric must never abort the benchmark,
        # but the failure is logged instead of disappearing silently.
        logger.warning(
            "Agrégation des scores ligatures/diacritiques impossible", exc_info=True
        )
        return None
309
+
310
+
311
def _aggregate_taxonomy(doc_results: list) -> Optional[dict]:
    """Aggregate the per-document error-taxonomy classifications.

    Each non-``None`` ``taxonomy`` dict found on ``doc_results`` is rebuilt
    with ``TaxonomyResult.from_dict`` and the set is passed to
    ``aggregate_taxonomy``.

    Returns
    -------
    Optional[dict]
        The aggregated taxonomy, or ``None`` when no document carries a
        taxonomy or when aggregation fails.
    """
    try:
        from picarones.core.taxonomy import aggregate_taxonomy, TaxonomyResult
        results = [
            TaxonomyResult.from_dict(dr.taxonomy)
            for dr in doc_results
            if dr.taxonomy is not None
        ]
        if not results:
            return None
        return aggregate_taxonomy(results)
    except Exception:
        # Best effort: an optional metric must never abort the benchmark,
        # but the failure is logged instead of disappearing silently.
        logger.warning(
            "Agrégation de la taxonomie des erreurs impossible", exc_info=True
        )
        return None
325
+
326
+
327
def _aggregate_structure(doc_results: list) -> Optional[dict]:
    """Aggregate the per-document structural metrics.

    Each non-``None`` ``structure`` dict found on ``doc_results`` is rebuilt
    with ``StructureResult.from_dict`` and the set is passed to
    ``aggregate_structure``.

    Returns
    -------
    Optional[dict]
        The aggregated structural metrics, or ``None`` when no document
        carries structure data or when aggregation fails.
    """
    try:
        from picarones.core.structure import aggregate_structure, StructureResult
        results = [
            StructureResult.from_dict(dr.structure)
            for dr in doc_results
            if dr.structure is not None
        ]
        if not results:
            return None
        return aggregate_structure(results)
    except Exception:
        # Best effort: an optional metric must never abort the benchmark,
        # but the failure is logged instead of disappearing silently.
        logger.warning(
            "Agrégation des métriques structurelles impossible", exc_info=True
        )
        return None
341
+
342
+
343
def _aggregate_image_quality(doc_results: list) -> Optional[dict]:
    """Aggregate the per-document image-quality metrics.

    Each non-``None`` ``image_quality`` dict found on ``doc_results`` is
    rebuilt with ``ImageQualityResult.from_dict`` and the set is passed to
    ``aggregate_image_quality``.

    Returns
    -------
    Optional[dict]
        The aggregated image-quality metrics, or ``None`` when no document
        carries such data or when aggregation fails.
    """
    try:
        from picarones.core.image_quality import aggregate_image_quality, ImageQualityResult
        results = [
            ImageQualityResult.from_dict(dr.image_quality)
            for dr in doc_results
            if dr.image_quality is not None
        ]
        if not results:
            return None
        return aggregate_image_quality(results)
    except Exception:
        # Best effort: an optional metric must never abort the benchmark,
        # but the failure is logged instead of disappearing silently.
        logger.warning(
            "Agrégation des métriques de qualité image impossible", exc_info=True
        )
        return None
picarones/core/structure.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Analyse structurelle des résultats OCR.
2
+
3
+ Mesures
4
+ -------
5
+ - **Taux de fusion de lignes** : l'OCR produit moins de lignes que le GT
6
+ (plusieurs lignes GT fusionnées en une seule).
7
+ - **Taux de fragmentation** : l'OCR produit plus de lignes que le GT
8
+ (une ligne GT découpée en plusieurs).
9
+ - **Score d'ordre de lecture** : corrélation entre l'ordre des mots GT et OCR,
10
+ approximé par la longueur de la sous-séquence commune la plus longue (LCS).
11
+ - **Taux de conservation des paragraphes** : respect des sauts de paragraphe.
12
+
13
+ Ces métriques sont calculées indépendamment du contenu textuel — elles mesurent
14
+ la fidélité de la mise en page, pas la qualité des caractères.
15
+
16
+ Note : sans bounding boxes disponibles, l'analyse se base uniquement sur les
17
+ sauts de ligne présents dans les textes GT et OCR.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import difflib
23
+ from dataclasses import dataclass
24
+ from typing import Optional
25
+
26
+
27
@dataclass
class StructureResult:
    """Outcome of the structural analysis of one document."""

    # Number of (non-blank) lines in the ground truth.
    gt_line_count: int = 0
    # Number of (non-blank) lines in the OCR output.
    ocr_line_count: int = 0

    # Number of line fusions (ground-truth lines absorbed into another line).
    line_fusion_count: int = 0
    # Number of line fragmentations (ground-truth lines split up).
    line_fragmentation_count: int = 0

    # Reading-order score in [0, 1]; 1 means perfect order.
    reading_order_score: float = 1.0

    # Paragraph-conservation score in [0, 1].
    paragraph_conservation_score: float = 1.0

    @property
    def line_fusion_rate(self) -> float:
        """Fusions divided by ground-truth lines (0.0 when there are none)."""
        if self.gt_line_count > 0:
            return self.line_fusion_count / self.gt_line_count
        return 0.0

    @property
    def line_fragmentation_rate(self) -> float:
        """Fragmentations divided by ground-truth lines (0.0 when there are none)."""
        if self.gt_line_count > 0:
            return self.line_fragmentation_count / self.gt_line_count
        return 0.0

    @property
    def line_accuracy(self) -> float:
        """Line-count accuracy: ``1 - |gt - ocr| / max(gt, ocr)``, floored at 0."""
        gt_count, ocr_count = self.gt_line_count, self.ocr_line_count
        if gt_count == 0 and ocr_count == 0:
            # Two empty documents agree perfectly.
            return 1.0
        gap = abs(gt_count - ocr_count)
        largest = max(gt_count, ocr_count)
        return max(0.0, 1.0 - gap / largest)

    def as_dict(self) -> dict:
        """Serialize raw counts plus derived rates, all floats rounded to 4 places."""
        payload = {
            "gt_line_count": self.gt_line_count,
            "ocr_line_count": self.ocr_line_count,
            "line_fusion_count": self.line_fusion_count,
            "line_fragmentation_count": self.line_fragmentation_count,
        }
        derived = (
            ("line_fusion_rate", self.line_fusion_rate),
            ("line_fragmentation_rate", self.line_fragmentation_rate),
            ("line_accuracy", self.line_accuracy),
            ("reading_order_score", self.reading_order_score),
            ("paragraph_conservation_score", self.paragraph_conservation_score),
        )
        for key, value in derived:
            payload[key] = round(value, 4)
        return payload

    @classmethod
    def from_dict(cls, data: dict) -> "StructureResult":
        """Rebuild an instance from a dict; derived keys are ignored, missing ones defaulted."""
        stored_fields = (
            ("gt_line_count", 0),
            ("ocr_line_count", 0),
            ("line_fusion_count", 0),
            ("line_fragmentation_count", 0),
            ("reading_order_score", 1.0),
            ("paragraph_conservation_score", 1.0),
        )
        return cls(**{name: data.get(name, fallback) for name, fallback in stored_fields})
89
+
90
+
91
def analyze_structure(ground_truth: str, hypothesis: str) -> StructureResult:
    """Compare the layout structure of an OCR output with its ground truth.

    Parameters
    ----------
    ground_truth:
        Reference text, line breaks included.
    hypothesis:
        OCR-produced text, line breaks included.

    Returns
    -------
    StructureResult
        Line counts, fusion/fragmentation counts, reading-order score and
        paragraph-conservation score.
    """
    # Blank and whitespace-only lines do not count as lines.
    reference_lines = [line for line in ground_truth.splitlines() if line.strip()]
    produced_lines = [line for line in hypothesis.splitlines() if line.strip()]

    fusions, fragmentations = _count_line_changes(reference_lines, produced_lines)

    return StructureResult(
        gt_line_count=len(reference_lines),
        ocr_line_count=len(produced_lines),
        line_fusion_count=fusions,
        line_fragmentation_count=fragmentations,
        # The two scores below work on the raw texts, not on the filtered lines.
        reading_order_score=_reading_order_score(ground_truth, hypothesis),
        paragraph_conservation_score=_paragraph_conservation_score(ground_truth, hypothesis),
    )
128
+
129
+
130
+ def _count_line_changes(gt_lines: list[str], ocr_lines: list[str]) -> tuple[int, int]:
131
+ """Compte les fusions et fragmentations de lignes via SequenceMatcher."""
132
+ if not gt_lines or not ocr_lines:
133
+ return 0, 0
134
+
135
+ fusion_count = 0
136
+ frag_count = 0
137
+
138
+ # Aligner les lignes par contenu
139
+ matcher = difflib.SequenceMatcher(
140
+ None,
141
+ [l.strip()[:30] for l in gt_lines], # fingerprint court pour la comparaison
142
+ [l.strip()[:30] for l in ocr_lines],
143
+ autojunk=False,
144
+ )
145
+
146
+ for tag, i1, i2, j1, j2 in matcher.get_opcodes():
147
+ if tag == "replace":
148
+ gt_len = i2 - i1
149
+ ocr_len = j2 - j1
150
+ if ocr_len < gt_len:
151
+ # Moins de lignes OCR → fusions
152
+ fusion_count += gt_len - ocr_len
153
+ elif ocr_len > gt_len:
154
+ # Plus de lignes OCR → fragmentations
155
+ frag_count += ocr_len - gt_len
156
+ elif tag == "delete":
157
+ # Lignes GT supprimées dans l'OCR → lacunes (pas fusion/frag)
158
+ pass
159
+ elif tag == "insert":
160
+ # Lignes insérées par l'OCR
161
+ frag_count += j2 - j1
162
+
163
+ return fusion_count, frag_count
164
+
165
+
166
+ def _reading_order_score(ground_truth: str, hypothesis: str) -> float:
167
+ """Score d'ordre de lecture [0, 1] basé sur la LCS des mots.
168
+
169
+ On calcule la longueur de la sous-séquence commune la plus longue (LCS)
170
+ entre les listes de mots GT et OCR. Un score de 1 signifie que tous les
171
+ mots communs apparaissent dans le même ordre.
172
+ """
173
+ gt_words = ground_truth.split()
174
+ hyp_words = hypothesis.split()
175
+
176
+ if not gt_words or not hyp_words:
177
+ return 1.0
178
+
179
+ # Utiliser SequenceMatcher pour approximer la LCS
180
+ matcher = difflib.SequenceMatcher(None, gt_words, hyp_words, autojunk=False)
181
+ # Ratio est 2 * nb_correspondances / (len_gt + len_ocr)
182
+ # C'est un proxy raisonnable de l'ordre de lecture
183
+ ratio = matcher.ratio()
184
+ return round(ratio, 4)
185
+
186
+
187
+ def _paragraph_conservation_score(ground_truth: str, hypothesis: str) -> float:
188
+ """Score de conservation des paragraphes [0, 1].
189
+
190
+ Compte les sauts de paragraphe (lignes vides) dans le GT et mesure
191
+ le taux de conservation dans l'OCR.
192
+ """
193
+ # Un saut de paragraphe = deux sauts de ligne consécutifs
194
+ gt_paras = [p for p in ground_truth.split("\n\n") if p.strip()]
195
+ ocr_paras = [p for p in hypothesis.split("\n\n") if p.strip()]
196
+
197
+ n_gt_paras = len(gt_paras)
198
+ if n_gt_paras <= 1:
199
+ return 1.0 # pas de paragraphe distinct → score parfait
200
+
201
+ n_ocr_paras = len(ocr_paras)
202
+ delta = abs(n_gt_paras - n_ocr_paras)
203
+ score = max(0.0, 1.0 - delta / n_gt_paras)
204
+ return round(score, 4)
205
+
206
+
207
def aggregate_structure(results: list[StructureResult]) -> dict:
    """Average the structural metrics of a corpus.

    Returns an empty dict for an empty input; otherwise the arithmetic mean
    (rounded to 4 decimals) of each per-document metric, plus
    ``document_count``.
    """
    if not results:
        return {}

    import statistics

    # (output key, attribute read on each result), in output order
    metric_sources = (
        ("mean_line_fusion_rate", "line_fusion_rate"),
        ("mean_line_fragmentation_rate", "line_fragmentation_rate"),
        ("mean_reading_order_score", "reading_order_score"),
        ("mean_paragraph_conservation", "paragraph_conservation_score"),
        ("mean_line_accuracy", "line_accuracy"),
    )

    summary: dict = {}
    for output_key, attribute in metric_sources:
        samples = [getattr(result, attribute) for result in results]
        summary[output_key] = round(statistics.mean(samples), 4)
    summary["document_count"] = len(results)
    return summary
picarones/core/taxonomy.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Taxonomie des erreurs OCR — classification automatique (classes 1 à 9).
2
+
3
+ Chaque erreur identifiée par l'alignement GT↔OCR est catégorisée selon
4
+ la taxonomie Picarones :
5
+
6
+ | Classe | Nom | Description |
7
+ |--------|-------------------|----------------------------------------------------|
8
+ | 1 | visual_confusion | Confusion morphologique (rn/m, l/1, O/0, u/n…) |
9
+ | 2 | diacritic_error | Diacritique absent, incorrect ou ajouté |
10
+ | 3 | case_error | Erreur de casse uniquement (A/a) |
11
+ | 4 | ligature_error | Ligature non résolue ou mal résolue |
12
+ | 5 | abbreviation_error| Abréviation médiévale non développée |
13
+ | 6 | hapax | Mot introuvable dans tout lexique |
14
+ | 7 | segmentation_error| Fusion ou fragmentation de tokens (mots/lignes) |
15
+ | 8 | oov_character | Caractère hors-vocabulaire du moteur |
16
+ | 9 | lacuna | Texte présent dans le GT absent de l'OCR |
17
+ | 10 | over_normalization| Sur-normalisation LLM (voir pipelines/) |
18
+
19
+ Note : la classe 10 est calculée par picarones/pipelines/over_normalization.py.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import difflib
25
+ import unicodedata
26
+ from dataclasses import dataclass, field
27
+ from typing import Optional
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Tables de référence pour la classification
32
+ # ---------------------------------------------------------------------------
33
+
34
#: Well-known OCR visual confusions (morphologically similar characters).
#: Keys are unordered pairs (``frozenset``) of the two confused strings and
#: values are an ``"a/b"`` label. Mirrored pairs such as ("u", "n") and
#: ("n", "u") collapse onto the same key; the label of the later pair wins.
VISUAL_CONFUSIONS: dict[frozenset, str] = {}
_VISUAL_PAIRS: list[tuple[str, str]] = [
    # Lowercase letters
    ("r", "n"), ("rn", "m"), ("l", "1"), ("l", "i"), ("l", "|"),
    ("O", "0"), ("O", "o"), ("u", "n"), ("n", "u"), ("v", "u"),
    ("c", "e"), ("e", "c"), ("a", "o"), ("o", "a"),
    ("f", "ſ"), ("ſ", "f"), ("f", "t"),
    ("h", "li"), ("h", "lı"),
    ("m", "rn"), ("m", "in"),
    ("d", "cl"), ("d", "a"),
    ("q", "g"), ("p", "q"),
    # Uppercase <-> lowercase homographs (class 1, not class 3)
    ("I", "l"), ("I", "1"),
    # Digits
    ("1", "I"), ("1", "l"), ("0", "O"),
    # Punctuation
    (".", ","), (",", "."),
]
# Fill the lookup table at import time from the pair list above.
for _a, _b in _VISUAL_PAIRS:
    VISUAL_CONFUSIONS[frozenset({_a, _b})] = f"{_a}/{_b}"
55
+
56
+ #: Couples de ligatures pour la détection des erreurs de ligatures
57
+ from picarones.core.char_scores import LIGATURE_TABLE, DIACRITIC_MAP # noqa: E402
58
+
59
# Basic repertoire: unaccented Latin letters, digits, whitespace and common
# punctuation. Characters outside this set (and that are not alphabetic) are
# presumed out-of-vocabulary by _is_oov_word.
_LATIN_BASIC = set("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
                   " \t\n.,;:!?-_'\"«»()[]{}/@#%&*+=/\\|<>~^")
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # Résultat structuré
66
+ # ---------------------------------------------------------------------------
67
+
68
@dataclass
class TaxonomyResult:
    """Taxonomic classification of the OCR errors of one document."""

    # Number of errors per class; keys such as 'visual_confusion',
    # 'diacritic_error', ...
    counts: dict[str, int] = field(default_factory=dict)

    # Sample errors per class, each a dict such as
    # {'gt': 'string', 'ocr': 'string', 'position': int}.
    examples: dict[str, list[dict]] = field(default_factory=dict)

    # Total number of classified errors.
    total_errors: int = 0

    @property
    def class_distribution(self) -> dict[str, float]:
        """Share (0-1, rounded to 4 decimals) of each class; empty when no error."""
        denominator = self.total_errors
        if not denominator:
            return {}
        shares: dict[str, float] = {}
        for label, occurrences in self.counts.items():
            shares[label] = round(occurrences / denominator, 4)
        return shares

    def as_dict(self) -> dict:
        """Serialize the result; at most 3 examples per class are exported."""
        distribution = self.class_distribution
        trimmed_examples = {
            label: samples[:3] for label, samples in self.examples.items()
        }
        return {
            "counts": self.counts,
            "total_errors": self.total_errors,
            "class_distribution": distribution,
            "examples": trimmed_examples,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "TaxonomyResult":
        """Rebuild an instance from a dict; missing keys get empty defaults."""
        restored_counts = data.get("counts", {})
        restored_examples = data.get("examples", {})
        restored_total = data.get("total_errors", 0)
        return cls(
            counts=restored_counts,
            examples=restored_examples,
            total_errors=restored_total,
        )
110
+
111
+
112
# Names of the error classes, in taxonomy order (classes 1 to 9).
# Class 10 (over_normalization) is not part of this list.
ERROR_CLASSES = [
    "visual_confusion",
    "diacritic_error",
    "case_error",
    "ligature_error",
    "abbreviation_error",
    "hapax",
    "segmentation_error",
    "oov_character",
    "lacuna",
]
124
+
125
+
126
+ # ---------------------------------------------------------------------------
127
+ # Classification principale
128
+ # ---------------------------------------------------------------------------
129
+
130
def classify_errors(
    ground_truth: str,
    hypothesis: str,
    max_examples: int = 5,
) -> TaxonomyResult:
    """Automatically classify the OCR errors of a GT/OCR text pair.

    Both texts are split on whitespace and aligned word by word with
    ``difflib.SequenceMatcher``:

    - words deleted from the GT are counted as lacunae (class 9);
    - words inserted by the OCR are counted as out-of-vocabulary (class 8)
      when ``_is_oov_word`` flags them, and ignored otherwise;
    - replaced runs of different length count as segmentation errors
      (class 7), one per word of difference;
    - replaced runs of equal length are paired word by word and each pair
      is classified by ``_classify_word_error``.

    Parameters
    ----------
    ground_truth:
        Reference text.
    hypothesis:
        OCR-produced text.
    max_examples:
        Maximum number of examples kept per class.

    Returns
    -------
    TaxonomyResult
    """
    counts: dict[str, int] = {cls: 0 for cls in ERROR_CLASSES}
    examples: dict[str, list[dict]] = {cls: [] for cls in ERROR_CLASSES}
    total = 0

    if not ground_truth and not hypothesis:
        return TaxonomyResult(counts=counts, examples=examples, total_errors=0)

    gt_words = ground_truth.split()
    hyp_words = hypothesis.split()

    word_matcher = difflib.SequenceMatcher(None, gt_words, hyp_words, autojunk=False)
    for tag, i1, i2, j1, j2 in word_matcher.get_opcodes():
        if tag == "delete":
            # GT words missing from the OCR -> lacuna (class 9).
            # Each example records the index of its own word in the GT,
            # not the start of the deleted run.
            for position, word in enumerate(gt_words[i1:i2], start=i1):
                counts["lacuna"] += 1
                total += 1
                if len(examples["lacuna"]) < max_examples:
                    examples["lacuna"].append(
                        {"gt": word, "ocr": "", "position": position}
                    )

        elif tag == "insert":
            # Words added by the OCR -> class 8 when they look out-of-vocabulary
            for word in hyp_words[j1:j2]:
                if _is_oov_word(word):
                    counts["oov_character"] += 1
                    total += 1

        elif tag == "replace":
            gt_seg = gt_words[i1:i2]
            hyp_seg = hyp_words[j1:j2]
            if len(hyp_seg) != len(gt_seg):
                # Different word counts: merged or split tokens (class 7)
                n_seg = abs(len(gt_seg) - len(hyp_seg))
                counts["segmentation_error"] += n_seg
                total += n_seg
                if len(examples["segmentation_error"]) < max_examples:
                    examples["segmentation_error"].append({
                        "gt": " ".join(gt_seg),
                        "ocr": " ".join(hyp_seg),
                        "position": i1,
                    })
            else:
                # Same word count: classify each differing pair
                for gt_w, hyp_w in zip(gt_seg, hyp_seg):
                    if gt_w != hyp_w:
                        _classify_word_error(
                            gt_w, hyp_w, counts, examples, max_examples
                        )
                        total += 1

    return TaxonomyResult(
        counts=counts,
        examples=examples,
        total_errors=total,
    )
211
+
212
+
213
def _classify_word_error(
    gt_word: str,
    hyp_word: str,
    counts: dict[str, int],
    examples: dict[str, list[dict]],
    max_examples: int,
) -> None:
    """Classify the error between two non-identical words.

    Exactly one class is incremented in ``counts`` (and possibly illustrated
    in ``examples``). Checks run in a fixed priority order: case, ligature,
    abbreviation, diacritic, visual confusion, out-of-vocabulary, and
    finally hapax as the residual class.
    """

    def record(label: str) -> None:
        # Count the error and keep the pair as an example while there is room.
        counts[label] += 1
        bucket = examples[label]
        if len(bucket) < max_examples:
            bucket.append({"gt": gt_word, "ocr": hyp_word})

    # Class 3: the words differ by case only (tested on the raw words).
    if gt_word != hyp_word and gt_word.casefold() == hyp_word.casefold():
        record("case_error")
        return

    # The character-level checks compare NFC-normalized forms.
    gt_norm = unicodedata.normalize("NFC", gt_word)
    hyp_norm = unicodedata.normalize("NFC", hyp_word)

    # (class label, predicate, arguments), highest priority first
    ordered_checks = (
        ("ligature_error", _is_ligature_error, (gt_norm, hyp_norm)),          # class 4
        ("abbreviation_error", _is_abbreviation_error, (gt_norm, hyp_norm)),  # class 5
        ("diacritic_error", _is_diacritic_error, (gt_norm, hyp_norm)),        # class 2
        ("visual_confusion", _is_visual_confusion, (gt_norm, hyp_norm)),      # class 1
        ("oov_character", _is_oov_word, (hyp_word,)),                         # class 8
    )
    for label, predicate, arguments in ordered_checks:
        if predicate(*arguments):
            record(label)
            return

    # Class 6: hapax (residual error that fits no other class).
    record("hapax")
269
+
270
+
271
def _is_ligature_error(gt: str, hyp: str) -> bool:
    """Return True when the difference involves a Unicode ligature.

    Two situations qualify, for any ligature of ``LIGATURE_TABLE``:
    the GT holds the ligature while the OCR holds one of its expanded
    sequences and not the ligature; or the GT holds an expanded sequence
    while the OCR holds the ligature.
    """
    for ligature, expansions in LIGATURE_TABLE.items():
        # Ligature in the GT, decomposed by the OCR.
        decomposed = (
            ligature in gt
            and ligature not in hyp
            and any(sequence in hyp for sequence in expansions)
        )
        if decomposed:
            return True
        # Plain sequence in the GT, ligature produced by the OCR.
        if ligature in hyp and any(sequence in gt for sequence in expansions):
            return True
    return False
283
+
284
+
285
+ def _is_abbreviation_error(gt: str, hyp: str) -> bool:
286
+ """Vrai si le GT contient un caractère d'abréviation médiévale."""
287
+ abbreviation_chars = "\uA751\uA753\uA757" # ꝑ ꝓ ꝗ
288
+ return any(c in gt for c in abbreviation_chars)
289
+
290
+
291
+ def _is_diacritic_error(gt: str, hyp: str) -> bool:
292
+ """Vrai si la différence est principalement due à des diacritiques."""
293
+ # Comparer les formes sans diacritiques
294
+ def strip_diacritics(text: str) -> str:
295
+ nfd = unicodedata.normalize("NFD", text)
296
+ return "".join(c for c in nfd if unicodedata.category(c) != "Mn")
297
+
298
+ gt_stripped = strip_diacritics(gt)
299
+ hyp_stripped = strip_diacritics(hyp)
300
+ # Si les mots sont identiques sans diacritiques → erreur diacritique
301
+ if gt_stripped.casefold() == hyp_stripped.casefold() and gt != hyp:
302
+ return True
303
+ # Si le GT contient des diacritiques que l'OCR a supprimés
304
+ gt_has_diac = any(c in DIACRITIC_MAP for c in gt)
305
+ hyp_missing_diac = any(c not in DIACRITIC_MAP for c in hyp if c.isalpha())
306
+ return gt_has_diac and len(gt) == len(hyp) and gt_stripped == hyp_stripped
307
+
308
+
309
def _is_visual_confusion(gt: str, hyp: str) -> bool:
    """Return True when the difference involves a known visual confusion.

    Words whose lengths differ by more than 2 are never considered. For
    each pair of ``VISUAL_CONFUSIONS``, the test succeeds when one member
    occurs in ``gt`` while the other occurs in ``hyp`` and the first one
    does not occur in ``hyp`` (checked in both directions).
    """
    if abs(len(gt) - len(hyp)) > 2:
        return False

    for pair in VISUAL_CONFUSIONS:
        members = tuple(pair)
        if len(members) != 2:
            continue
        first, second = members
        # The pair is unordered, so try both directions.
        for expected, produced in ((first, second), (second, first)):
            if expected in gt and produced in hyp and expected not in hyp:
                return True
    return False
323
+
324
+
325
def _is_oov_word(word: str) -> bool:
    """Return True when ``word`` holds an out-of-vocabulary character.

    A character is out-of-vocabulary when it is neither in ``_LATIN_BASIC``
    nor alphabetic according to ``str.isalpha``; accented letters are
    therefore not flagged.
    """
    for char in word:
        if not (char in _LATIN_BASIC or char.isalpha()):
            return True
    return False
328
+
329
+
330
+ # ---------------------------------------------------------------------------
331
+ # Agrégation
332
+ # ---------------------------------------------------------------------------
333
+
334
def aggregate_taxonomy(results: list[TaxonomyResult]) -> dict:
    """Sum the taxonomy results of a corpus.

    Returns a dict with the summed ``counts`` per class (every class of
    ``ERROR_CLASSES`` is present, even at 0), the summed ``total_errors``
    and the ``class_distribution`` (shares rounded to 4 decimals, all 0.0
    when there is no error).
    """
    merged_counts: dict[str, int] = dict.fromkeys(ERROR_CLASSES, 0)
    grand_total = 0
    for result in results:
        grand_total += result.total_errors
        for label, occurrences in result.counts.items():
            merged_counts[label] = merged_counts.get(label, 0) + occurrences

    if grand_total > 0:
        shares = {
            label: round(occurrences / grand_total, 4)
            for label, occurrences in merged_counts.items()
        }
    else:
        shares = {label: 0.0 for label in merged_counts}

    return {
        "counts": merged_counts,
        "total_errors": grand_total,
        "class_distribution": shares,
    }
picarones/engines/__init__.py CHANGED
@@ -2,8 +2,18 @@
2
 
3
  from picarones.engines.base import BaseOCREngine, EngineResult
4
  from picarones.engines.tesseract import TesseractEngine
 
 
 
5
 
6
- __all__ = ["BaseOCREngine", "EngineResult", "TesseractEngine"]
 
 
 
 
 
 
 
7
 
8
  try:
9
  from picarones.engines.pero_ocr import PeroOCREngine
 
2
 
3
  from picarones.engines.base import BaseOCREngine, EngineResult
4
  from picarones.engines.tesseract import TesseractEngine
5
+ from picarones.engines.mistral_ocr import MistralOCREngine
6
+ from picarones.engines.google_vision import GoogleVisionEngine
7
+ from picarones.engines.azure_doc_intel import AzureDocIntelEngine
8
 
9
+ __all__ = [
10
+ "BaseOCREngine",
11
+ "EngineResult",
12
+ "TesseractEngine",
13
+ "MistralOCREngine",
14
+ "GoogleVisionEngine",
15
+ "AzureDocIntelEngine",
16
+ ]
17
 
18
  try:
19
  from picarones.engines.pero_ocr import PeroOCREngine
picarones/engines/azure_doc_intel.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Adaptateur OCR — Azure Document Intelligence (anciennement Form Recognizer).
2
+
3
+ Utilise l'API Azure Document Intelligence pour la reconnaissance de texte
4
+ dans des documents historiques.
5
+
6
+ Variables d'environnement requises :
7
+ - ``AZURE_DOC_INTEL_KEY`` : clé API Azure
8
+ - ``AZURE_DOC_INTEL_ENDPOINT`` : URL de l'endpoint (ex : https://moninstance.cognitiveservices.azure.com/)
9
+
10
+ Documentation : https://learn.microsoft.com/azure/ai-services/document-intelligence/
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import base64
16
+ import json
17
+ import os
18
+ import time
19
+ import urllib.error
20
+ import urllib.request
21
+ from pathlib import Path
22
+ from typing import Optional
23
+
24
+ from picarones.engines.base import BaseOCREngine
25
+
26
+
27
class AzureDocIntelEngine(BaseOCREngine):
    """OCR engine backed by Azure Document Intelligence.

    The API key is read from the ``AZURE_DOC_INTEL_KEY`` environment variable.
    The service URL is read from ``AZURE_DOC_INTEL_ENDPOINT``; when that
    variable is unset or empty, the ``endpoint`` configuration entry is used.

    Configuration
    -------------
    model_id : str
        Azure model to use. Default: ``"prebuilt-read"`` (generic reading).
        Alternatives: ``"prebuilt-document"``, ``"prebuilt-layout"`` or a
        custom-trained model.
    locale : str
        Locale hint sent with the request (default: ``"fr-FR"``).
    api_version : str
        Azure API version (default: ``"2024-02-29-preview"``).
    endpoint : str
        Fallback service URL, used only when the environment variable is
        missing.
    """

    @property
    def name(self) -> str:
        """Return the engine identifier."""
        return "azure_doc_intel"

    def version(self) -> str:
        """Return the configured Azure API version string."""
        return self.config.get("api_version", "2024-02-29-preview")

    def __init__(self, config: Optional[dict] = None) -> None:
        """Read credentials from the environment and options from ``config``."""
        super().__init__(config)
        self._api_key = os.environ.get("AZURE_DOC_INTEL_KEY")
        # Trailing slashes are stripped so URLs can be built with a plain "/".
        self._endpoint = (
            os.environ.get("AZURE_DOC_INTEL_ENDPOINT", "").rstrip("/")
            or self.config.get("endpoint", "").rstrip("/")
        )
        self._model_id: str = self.config.get("model_id", "prebuilt-read")
        self._locale: str = self.config.get("locale", "fr-FR")
        self._api_version: str = self.config.get("api_version", "2024-02-29-preview")

    def _run_ocr(self, image_path: Path) -> str:
        """Return the text recognised in ``image_path``.

        The Azure SDK is tried first; if importing it fails, the request is
        sent through the REST API instead.

        Raises:
            RuntimeError: if the API key or the endpoint is missing, or if the
                REST call fails.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Azure manquante — définissez la variable d'environnement AZURE_DOC_INTEL_KEY"
            )
        if not self._endpoint:
            raise RuntimeError(
                "Endpoint Azure manquant — définissez la variable d'environnement AZURE_DOC_INTEL_ENDPOINT"
            )

        try:
            return self._run_via_sdk(image_path)
        except ImportError:
            # SDK not installed: fall back to the dependency-free REST path.
            return self._run_via_rest(image_path)

    def _run_via_sdk(self, image_path: Path) -> str:
        """Analyse the image with the ``azure-ai-documentintelligence`` SDK.

        Raises:
            ImportError: if the SDK packages are not installed.
        """
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self._endpoint,
            credential=AzureKeyCredential(self._api_key),
        )
        with open(image_path, "rb") as f:
            poller = client.begin_analyze_document(
                model_id=self._model_id,
                body=f,
                locale=self._locale,
                content_type="application/octet-stream",
            )
            result = poller.result()
        # Both collections are guarded so a result without pages or lines
        # yields an empty string instead of a TypeError.
        return "\n".join(
            line.content
            for page in (result.pages or [])
            for line in (page.lines or [])
        )

    @staticmethod
    def _http_error(exc: urllib.error.HTTPError) -> RuntimeError:
        """Build the RuntimeError reported for a failed Azure HTTP call."""
        # errors="replace" keeps a non-UTF-8 error body from masking the
        # original HTTP failure with a UnicodeDecodeError.
        body = exc.read().decode("utf-8", errors="replace")
        return RuntimeError(f"Azure Document Intelligence erreur {exc.code}: {body}")

    def _run_via_rest(self, image_path: Path) -> str:
        """Analyse the image through the REST API, without the Azure SDK.

        The image is submitted first; the service answers with an
        ``Operation-Location`` URL that is then polled until the analysis
        reports ``succeeded``, ``failed`` or ``canceled``.

        Raises:
            RuntimeError: on an HTTP error (submit or polling), a missing
                ``Operation-Location`` header, a failed/canceled analysis, or
                when the analysis is still running after 30 polls.
        """
        image_bytes = image_path.read_bytes()
        analyze_url = (
            f"{self._endpoint}/documentintelligence/documentModels/"
            f"{self._model_id}:analyze"
            f"?api-version={self._api_version}&locale={self._locale}"
        )

        # Submit the image.
        req = urllib.request.Request(
            analyze_url,
            data=image_bytes,
            headers={
                "Ocp-Apim-Subscription-Key": self._api_key,
                "Content-Type": "application/octet-stream",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                operation_url = resp.headers.get("Operation-Location", "")
        except urllib.error.HTTPError as exc:
            raise self._http_error(exc) from exc

        if not operation_url:
            raise RuntimeError("Azure : pas d'Operation-Location dans la réponse")

        # Poll for the result; the wait grows by 0.5 s on every attempt.
        headers = {"Ocp-Apim-Subscription-Key": self._api_key}
        for attempt in range(30):
            time.sleep(1 + attempt * 0.5)
            poll_req = urllib.request.Request(operation_url, headers=headers)
            try:
                with urllib.request.urlopen(poll_req, timeout=30) as resp:
                    result = json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as exc:
                # Reported the same way as a failed submit request.
                raise self._http_error(exc) from exc
            status = result.get("status", "")
            if status == "succeeded":
                return self._extract_text_from_result(result)
            if status in {"failed", "canceled"}:
                raise RuntimeError(f"Azure Document Intelligence : analyse {status}")
            # Any other status: keep waiting.

        raise RuntimeError("Azure Document Intelligence : timeout — analyse trop longue")

    @staticmethod
    def _extract_text_from_result(result: dict) -> str:
        """Return the non-empty line contents of an Azure JSON result, one per row.

        Missing or null ``analyzeResult``, ``pages`` and ``lines`` entries are
        treated as empty.
        """
        pages = (result.get("analyzeResult") or {}).get("pages") or []
        lines: list[str] = []
        for page in pages:
            for line in page.get("lines") or []:
                content = line.get("content", "")
                if content:
                    lines.append(content)
        return "\n".join(lines)
picarones/engines/google_vision.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Adaptateur OCR — Google Cloud Vision API.
2
+
3
+ Utilise l'API Google Cloud Vision pour la détection de texte dans des
4
+ documents (méthode ``DOCUMENT_TEXT_DETECTION``, optimisée pour les textes
5
+ denses et multilinguistiques).
6
+
7
+ Authentification :
8
+ - Via service account JSON : variable d'environnement
9
+ ``GOOGLE_APPLICATION_CREDENTIALS`` → chemin vers le fichier JSON
10
+ - Via clé API simple : variable d'environnement ``GOOGLE_API_KEY``
11
+
12
+ Le mode service account est recommandé pour la production.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import base64
18
+ import json
19
+ import os
20
+ import urllib.error
21
+ import urllib.request
22
+ from pathlib import Path
23
+ from typing import Optional
24
+
25
+ from picarones.engines.base import BaseOCREngine
26
+
27
+
28
class GoogleVisionEngine(BaseOCREngine):
    """OCR engine backed by the Google Cloud Vision API.

    Authentication is taken from the environment:
    ``GOOGLE_APPLICATION_CREDENTIALS`` selects the SDK path, otherwise
    ``GOOGLE_API_KEY`` selects the REST path.

    Configuration
    -------------
    language_hints : list[str]
        Language suggestions sent with the request (default: ``["fr"]``).
    feature_type : str
        Detection type: ``"DOCUMENT_TEXT_DETECTION"`` (default, for dense
        text) or ``"TEXT_DETECTION"`` (for short text).
    """

    @property
    def name(self) -> str:
        """Return the engine identifier."""
        return "google_vision"

    def version(self) -> str:
        """Return the API version label."""
        return "v1"

    def __init__(self, config: Optional[dict] = None) -> None:
        """Read credentials from the environment and options from ``config``."""
        super().__init__(config)
        self._api_key = os.environ.get("GOOGLE_API_KEY")
        self._credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        self._language_hints: list[str] = self.config.get("language_hints", ["fr"])
        self._feature_type: str = self.config.get("feature_type", "DOCUMENT_TEXT_DETECTION")

    def _run_ocr(self, image_path: Path) -> str:
        """Return the text recognised in ``image_path``.

        Raises:
            RuntimeError: if neither authentication variable is set, or if the
                selected backend reports a failure.
        """
        # The backend is chosen by which credential is configured: a service
        # account selects the SDK, a bare API key selects REST.
        if self._credentials_path:
            return self._run_via_sdk(image_path)
        if self._api_key:
            return self._run_via_rest(image_path)
        raise RuntimeError(
            "Authentification Google Vision manquante. Définissez "
            "GOOGLE_APPLICATION_CREDENTIALS (service account JSON) "
            "ou GOOGLE_API_KEY."
        )

    def _run_via_sdk(self, image_path: Path) -> str:
        """Run the detection through the ``google-cloud-vision`` client.

        Raises:
            RuntimeError: if the package is not installed, or if the API
                response carries an error message.
        """
        try:
            from google.cloud import vision
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'google-cloud-vision' n'est pas installé. "
                "Lancez : pip install google-cloud-vision"
            ) from exc

        client = vision.ImageAnnotatorClient()
        image = vision.Image(content=image_path.read_bytes())
        context = vision.ImageContext(language_hints=self._language_hints)

        document_mode = self._feature_type == "DOCUMENT_TEXT_DETECTION"
        if document_mode:
            response = client.document_text_detection(image=image, image_context=context)
        else:
            response = client.text_detection(image=image, image_context=context)

        # The client reports per-image failures inside the response rather
        # than raising, so surface them like the REST path does instead of
        # returning empty text.
        if response.error.message:
            raise RuntimeError(f"Google Vision API erreur : {response.error.message}")

        if document_mode:
            return response.full_text_annotation.text
        texts = response.text_annotations
        return texts[0].description if texts else ""

    def _run_via_rest(self, image_path: Path) -> str:
        """Run the detection through a direct REST call using the API key.

        Raises:
            RuntimeError: on an HTTP error or when the response contains an
                ``error`` entry.
        """
        image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
        payload = {
            "requests": [
                {
                    "image": {"content": image_b64},
                    "features": [{"type": self._feature_type, "maxResults": 1}],
                    "imageContext": {"languageHints": self._language_hints},
                }
            ]
        }
        url = f"https://vision.googleapis.com/v1/images:annotate?key={self._api_key}"
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            raise RuntimeError(f"Google Vision API erreur {exc.code}: {exc.read().decode()}") from exc

        # A missing or empty "responses" list means no text was returned.
        responses = result.get("responses") or []
        if not responses:
            return ""
        first = responses[0]
        if "error" in first:
            raise RuntimeError(f"Google Vision API erreur : {first['error']}")

        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
            return first.get("fullTextAnnotation", {}).get("text", "")
        texts = first.get("textAnnotations", [])
        return texts[0]["description"] if texts else ""
picarones/engines/mistral_ocr.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Adaptateur OCR — Mistral OCR (API vision Mistral AI).
2
+
3
+ Utilise l'API Mistral pour la reconnaissance de texte sur documents
4
+ patrimoniaux via le modèle multimodal Mistral.
5
+
6
+ Clé API : variable d'environnement ``MISTRAL_API_KEY``.
7
+
8
+ Documentation API : https://docs.mistral.ai/
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import base64
14
+ import os
15
+ from pathlib import Path
16
+ from typing import Optional
17
+
18
+ from picarones.engines.base import BaseOCREngine
19
+
20
+
21
class MistralOCREngine(BaseOCREngine):
    """OCR engine backed by the Mistral AI chat API with a vision model.

    The API key is read from the ``MISTRAL_API_KEY`` environment variable.

    Configuration
    -------------
    model : str
        Mistral model to use (default: ``"pixtral-12b-2409"``).
    prompt : str
        Prompt sent along with the image. Default: a generic transcription
        instruction.
    max_tokens : int
        Output token limit (default: 4096).
    """

    @property
    def name(self) -> str:
        """Return the engine identifier."""
        return "mistral_ocr"

    def version(self) -> str:
        """Return the configured model name."""
        return self.config.get("model", "pixtral-12b-2409")

    def __init__(self, config: Optional[dict] = None) -> None:
        """Read the API key from the environment and options from ``config``."""
        super().__init__(config)
        settings = self.config
        self._api_key = os.environ.get("MISTRAL_API_KEY")
        self._model = settings.get("model", "pixtral-12b-2409")
        default_prompt = (
            "Transcris fidèlement le texte visible sur cette image de document "
            "historique. Retourne uniquement le texte, sans commentaire."
        )
        self._prompt = settings.get("prompt", default_prompt)
        self._max_tokens = int(settings.get("max_tokens", 4096))

    def _run_ocr(self, image_path: Path) -> str:
        """Send the image to the chat endpoint and return the model's answer.

        Raises:
            RuntimeError: if the API key is missing or the ``mistralai``
                package is not installed.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
            )
        try:
            from mistralai import Mistral
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
            ) from exc

        # Build a data URL; unknown extensions are declared as JPEG.
        media_types = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".tif": "image/tiff",
            ".tiff": "image/tiff",
            ".webp": "image/webp",
        }
        extension = image_path.suffix.lower()
        media_type = media_types.get(extension, "image/jpeg")
        encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
        data_url = f"data:{media_type};base64,{encoded}"

        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": self._prompt},
                {"type": "image_url", "image_url": data_url},
            ],
        }
        client = Mistral(api_key=self._api_key)
        completion = client.chat.complete(
            model=self._model,
            messages=[user_message],
            max_tokens=self._max_tokens,
        )
        answer = completion.choices[0].message.content
        return answer or ""
picarones/fixtures.py CHANGED
@@ -18,24 +18,32 @@ from typing import Optional
18
  from picarones.core.metrics import MetricsResult, aggregate_metrics
19
  from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
20
  from picarones.pipelines.over_normalization import detect_over_normalization
 
 
 
 
 
 
 
21
 
22
  # ---------------------------------------------------------------------------
23
  # Textes GT réalistes (documents patrimoniaux BnF)
24
  # ---------------------------------------------------------------------------
25
 
26
  _GT_TEXTS = [
27
- "Icy commence le prologue de maistre Jehan Froissart sus les croniques de France & d'Angleterre.",
28
- "En l'an de grace mil trois cens soixante, regnoit en France le noble roy Jehan, filz du roy Phelippe de Valois.",
29
- "Item ledit jour furent menez en ladicte ville de Paris plusieurs prisonniers sarasins & mahommetans.",
30
- "Le chancellier du roy manda à tous les baillifs & seneschaulx que on feist crier & publier par tous les carrefours.",
31
- "Cy après sensuyt la copie des lettres patentes données par nostre seigneur le roy à ses très chiers & feaulx.",
32
- "Nous Charles, par la grace de Dieu roy de France, à tous ceulx qui ces presentes lettres verront, salut.",
33
- "Savoir faisons que pour considéracion des bons & aggreables services que nostre amé & feal conseillier.",
34
- "Donné à Paris, le vingt & deuxième jour du mois de juillet, l'an de grace mil quatre cens & troys.",
35
- "Les dessus ditz ambassadeurs respondirent que leur seigneur & maistre estoit très joyeulx de ceste aliance.",
36
- "Après lesquelles choses ainsi faictes & passées, le dit traictié fut ratiffié & confirmé de toutes parties.",
37
- "Item, en ladicte année, fut faicte grant assemblée de gens d'armes tant à cheval que à pied.",
38
- "Et pour ce que la chose est notoire & manifeste, nous avons fait mettre nostre scel à ces presentes.",
 
39
  ]
40
 
41
  # ---------------------------------------------------------------------------
@@ -289,6 +297,14 @@ def generate_sample_benchmark(
289
 
290
  metrics = _make_metrics(gt, hypothesis)
291
 
 
 
 
 
 
 
 
 
292
  doc_results.append(
293
  DocumentResult(
294
  doc_id=doc_id,
@@ -299,6 +315,14 @@ def generate_sample_benchmark(
299
  duration_seconds=duration,
300
  ocr_intermediate=ocr_intermediate,
301
  pipeline_metadata=pipeline_meta,
 
 
 
 
 
 
 
 
302
  )
303
  )
304
 
@@ -320,12 +344,54 @@ def generate_sample_benchmark(
320
  "document_count": len(over_norms),
321
  }
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  report = EngineReport(
324
  engine_name=engine_name,
325
  engine_version=engine_version,
326
  engine_config=engine_cfg,
327
  document_results=doc_results,
328
  pipeline_info=effective_pipeline_info,
 
 
 
 
 
329
  )
330
  engine_reports.append(report)
331
 
 
18
  from picarones.core.metrics import MetricsResult, aggregate_metrics
19
  from picarones.core.results import BenchmarkResult, DocumentResult, EngineReport
20
  from picarones.pipelines.over_normalization import detect_over_normalization
21
+ # Sprint 5 — métriques avancées
22
+ from picarones.core.confusion import build_confusion_matrix
23
+ from picarones.core.char_scores import compute_ligature_score, compute_diacritic_score
24
+ from picarones.core.taxonomy import classify_errors, aggregate_taxonomy
25
+ from picarones.core.structure import analyze_structure, aggregate_structure
26
+ from picarones.core.image_quality import generate_mock_quality_scores, aggregate_image_quality
27
+ from picarones.core.char_scores import aggregate_ligature_scores, aggregate_diacritic_scores
28
 
29
  # ---------------------------------------------------------------------------
30
  # Textes GT réalistes (documents patrimoniaux BnF)
31
  # ---------------------------------------------------------------------------
32
 
33
  _GT_TEXTS = [
34
+ # Textes avec graphies médiévales incluant ſ, &, u/v pour démontrer le CER diplomatique
35
+ "Icy commence le prologue de maiſtre Jehan Froiſſart ſus les croniques de France & d'Angleterre.",
36
+ "En l'an de grace mil trois cens ſoixante, regnoit en France le noble roy Jehan, filz du roy Phelippe de Valois.",
37
+ "Item ledit iour furent menez en ladicte ville de Paris pluſieurs priſonniers ſaraſins & mahommetans.",
38
+ "Le chancellier du roy manda à tous les baillifs & ſeneſchaulx que on feiſt crier & publier par tous les carrefours.",
39
+ "Cy après ſenſuyt la copie des lettres patentes données par noſtre ſeigneur le roy à ſes très chiers & feaulx.",
40
+ "Nous Charles, par la grace de Dieu roy de France, à tous ceulx qui ces preſentes lettres verront, ſalut.",
41
+ "Sauoir faiſons que pour conſidéracion des bons & aggreables ſeruices que noſtre amé & feal conſeillier.",
42
+ "Donné à Paris, le vingt & deuxième iour du mois de iuillet, l'an de grace mil quatre cens & troys.",
43
+ "Les deſſus ditz ambaſſadeurs reſpondirent que leur ſeigneur & maiſtre eſtoit très ioyeulx de ceſte aliance.",
44
+ "Après lesquelles choſes ainſi faictes & paſſées, le dit traictié fut ratiffié & confirmé de toutes parties.",
45
+ "Item, en ladicte année, fut faicte grant aſſemblée de gens d'armes tant à cheual que à pied.",
46
+ "Et pour ce que la choſe eſt notoire & manifeſte, nous auons fait mettre noſtre ſcel à ces preſentes.",
47
  ]
48
 
49
  # ---------------------------------------------------------------------------
 
297
 
298
  metrics = _make_metrics(gt, hypothesis)
299
 
300
+ # Sprint 5 — métriques avancées patrimoniales
301
+ cm = build_confusion_matrix(gt, hypothesis)
302
+ lig_score = compute_ligature_score(gt, hypothesis)
303
+ diac_score = compute_diacritic_score(gt, hypothesis)
304
+ taxonomy_result = classify_errors(gt, hypothesis)
305
+ struct_result = analyze_structure(gt, hypothesis)
306
+ iq_result = generate_mock_quality_scores(doc_id, seed=rng.randint(0, 999999))
307
+
308
  doc_results.append(
309
  DocumentResult(
310
  doc_id=doc_id,
 
315
  duration_seconds=duration,
316
  ocr_intermediate=ocr_intermediate,
317
  pipeline_metadata=pipeline_meta,
318
+ confusion_matrix=cm.as_dict(),
319
+ char_scores={
320
+ "ligature": lig_score.as_dict(),
321
+ "diacritic": diac_score.as_dict(),
322
+ },
323
+ taxonomy=taxonomy_result.as_dict(),
324
+ structure=struct_result.as_dict(),
325
+ image_quality=iq_result.as_dict(),
326
  )
327
  )
328
 
 
344
  "document_count": len(over_norms),
345
  }
346
 
347
+ # Agrégation Sprint 5
348
+ from picarones.core.confusion import aggregate_confusion_matrices, ConfusionMatrix
349
+ from picarones.core.char_scores import LigatureScore, DiacriticScore
350
+ from picarones.core.taxonomy import TaxonomyResult
351
+ from picarones.core.structure import StructureResult
352
+ from picarones.core.image_quality import ImageQualityResult
353
+
354
+ agg_confusion = aggregate_confusion_matrices([
355
+ ConfusionMatrix(**dr.confusion_matrix)
356
+ for dr in doc_results if dr.confusion_matrix
357
+ ]).as_compact_dict(min_count=1)
358
+
359
+ agg_lig = aggregate_ligature_scores([
360
+ LigatureScore(**dr.char_scores["ligature"])
361
+ for dr in doc_results if dr.char_scores
362
+ ])
363
+ agg_diac = aggregate_diacritic_scores([
364
+ DiacriticScore(**dr.char_scores["diacritic"])
365
+ for dr in doc_results if dr.char_scores
366
+ ])
367
+ agg_char_scores = {"ligature": agg_lig, "diacritic": agg_diac}
368
+
369
+ agg_taxonomy = aggregate_taxonomy([
370
+ TaxonomyResult.from_dict(dr.taxonomy)
371
+ for dr in doc_results if dr.taxonomy
372
+ ])
373
+
374
+ agg_structure = aggregate_structure([
375
+ StructureResult.from_dict(dr.structure)
376
+ for dr in doc_results if dr.structure
377
+ ])
378
+
379
+ agg_iq = aggregate_image_quality([
380
+ ImageQualityResult.from_dict(dr.image_quality)
381
+ for dr in doc_results if dr.image_quality
382
+ ])
383
+
384
  report = EngineReport(
385
  engine_name=engine_name,
386
  engine_version=engine_version,
387
  engine_config=engine_cfg,
388
  document_results=doc_results,
389
  pipeline_info=effective_pipeline_info,
390
+ aggregated_confusion=agg_confusion,
391
+ aggregated_char_scores=agg_char_scores,
392
+ aggregated_taxonomy=agg_taxonomy,
393
+ aggregated_structure=agg_structure,
394
+ aggregated_image_quality=agg_iq,
395
  )
396
  engine_reports.append(report)
397
 
picarones/importers/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """Importeurs de corpus depuis des sources distantes (IIIF, HuggingFace, HTR-United…)."""
2
+
3
+ from picarones.importers.iiif import IIIFImporter, import_iiif_manifest
4
+
5
+ __all__ = ["IIIFImporter", "import_iiif_manifest"]
picarones/importers/htr_united.py ADDED
@@ -0,0 +1,449 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Import depuis le catalogue HTR-United.
2
+
3
+ HTR-United est un catalogue communautaire de vérités terrain HTR/OCR publiées
4
+ sur GitHub sous licence ouverte. Les métadonnées sont stockées dans un fichier
5
+ YAML (htr-united.yml) sur https://github.com/HTR-United/htr-united.
6
+
7
+ Ce module fournit :
8
+ - :class:`HTRUnitedCatalogue` — chargement et recherche dans le catalogue
9
+ - :func:`fetch_catalogue` — téléchargement du catalogue depuis GitHub
10
+ - :func:`import_htr_united_corpus` — téléchargement et import d'un corpus
11
+
12
+ Exemple
13
+ -------
14
+ catalogue = HTRUnitedCatalogue.from_remote()
15
+ results = catalogue.search("français médiéval")
16
+ corpus = import_htr_united_corpus(results[0], output_dir="./corpus/")
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import re
24
+ import time
25
+ import urllib.error
26
+ import urllib.request
27
+ from dataclasses import dataclass, field
28
+ from pathlib import Path
29
+ from typing import Optional
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Catalogue remote URL
33
+ # ---------------------------------------------------------------------------
34
+
35
+ _CATALOGUE_URL = (
36
+ "https://raw.githubusercontent.com/HTR-United/htr-united/master/htr-united.yml"
37
+ )
38
+ _CATALOGUE_API_URL = (
39
+ "https://api.github.com/repos/HTR-United/htr-united/contents/htr-united.yml"
40
+ )
41
+
42
+ # Catalogue de démonstration / fallback (hors-ligne)
43
+ _DEMO_CATALOGUE: list[dict] = [
44
+ {
45
+ "id": "lectaurep-repertoires",
46
+ "title": "Lectaurep — Répertoires de notaires parisiens",
47
+ "url": "https://github.com/HTR-United/lectaurep-repertoires",
48
+ "language": ["French"],
49
+ "script": ["Cursiva"],
50
+ "century": [17, 18],
51
+ "institution": "Archives nationales (France)",
52
+ "description": "Transcriptions de répertoires de notaires, XVIIe-XVIIIe siècles.",
53
+ "license": "CC-BY 4.0",
54
+ "lines": 12400,
55
+ "format": "ALTO",
56
+ "tags": ["notaires", "Paris", "cursive", "imprimé"],
57
+ },
58
+ {
59
+ "id": "bvmm-manuscripts",
60
+ "title": "BVMM — Manuscrits enluminés",
61
+ "url": "https://github.com/HTR-United/bvmm-manuscripts",
62
+ "language": ["Latin", "French"],
63
+ "script": ["Gothic"],
64
+ "century": [13, 14, 15],
65
+ "institution": "IRHT / BnF",
66
+ "description": "Manuscrits médiévaux latins et français, XIIIe-XVe siècles.",
67
+ "license": "CC-BY 4.0",
68
+ "lines": 8700,
69
+ "format": "ALTO",
70
+ "tags": ["manuscrits", "latin", "médiéval", "enluminure"],
71
+ },
72
+ {
73
+ "id": "cremma-medieval",
74
+ "title": "CREMMA Médiéval",
75
+ "url": "https://github.com/HTR-United/cremma-medieval",
76
+ "language": ["French", "Latin"],
77
+ "script": ["Gothic", "Humanistica"],
78
+ "century": [12, 13, 14, 15],
79
+ "institution": "École des chartes / Inria",
80
+ "description": "Corpus CREMMA de manuscrits médiévaux français et latins.",
81
+ "license": "CC-BY 4.0",
82
+ "lines": 6200,
83
+ "format": "ALTO",
84
+ "tags": ["médiéval", "chartes", "manuscrits"],
85
+ },
86
+ {
87
+ "id": "simssa-ocr-printed",
88
+ "title": "SIMSSA — Imprimés anciens (XVe-XVIIe)",
89
+ "url": "https://github.com/HTR-United/simssa-printed",
90
+ "language": ["French", "Latin"],
91
+ "script": ["Rotunda", "Roman"],
92
+ "century": [15, 16, 17],
93
+ "institution": "McGill University",
94
+ "description": "Corpus d'imprimés anciens romains et gothiques.",
95
+ "license": "CC-BY 4.0",
96
+ "lines": 4500,
97
+ "format": "PAGE",
98
+ "tags": ["imprimés", "incunables", "roman", "gothique"],
99
+ },
100
+ {
101
+ "id": "fonds-gallica-presse",
102
+ "title": "Presse ancienne — Gallica (XIXe)",
103
+ "url": "https://github.com/HTR-United/gallica-presse-xix",
104
+ "language": ["French"],
105
+ "script": ["Roman"],
106
+ "century": [19],
107
+ "institution": "BnF",
108
+ "description": "Numérisations de journaux du XIXe siècle (Gallica).",
109
+ "license": "etalab-2.0",
110
+ "lines": 31000,
111
+ "format": "ALTO",
112
+ "tags": ["presse", "XIXe", "Gallica", "journaux"],
113
+ },
114
+ {
115
+ "id": "archives-departem-correspondances",
116
+ "title": "Correspondances administratives (XVIIIe-XIXe)",
117
+ "url": "https://github.com/HTR-United/correspondances-admin",
118
+ "language": ["French"],
119
+ "script": ["Cursiva"],
120
+ "century": [18, 19],
121
+ "institution": "Archives départementales",
122
+ "description": "Lettres et correspondances administratives manuscrites.",
123
+ "license": "CC-BY 4.0",
124
+ "lines": 9800,
125
+ "format": "ALTO",
126
+ "tags": ["correspondances", "administratif", "cursive"],
127
+ },
128
+ {
129
+ "id": "e-codices-latin",
130
+ "title": "e-codices — Manuscrits latins (Suisse)",
131
+ "url": "https://github.com/HTR-United/e-codices-latin",
132
+ "language": ["Latin"],
133
+ "script": ["Caroline", "Gothic"],
134
+ "century": [9, 10, 11, 12],
135
+ "institution": "Bibliothèque cantonale universitaire de Lausanne",
136
+ "description": "Manuscrits carolingiens et gothiques des bibliothèques suisses.",
137
+ "license": "CC-BY 4.0",
138
+ "lines": 3100,
139
+ "format": "ALTO",
140
+ "tags": ["caroline", "latin", "médiéval", "Suisse"],
141
+ },
142
+ {
143
+ "id": "registres-paroissiaux-17",
144
+ "title": "Registres paroissiaux — Bretagne (XVIIe)",
145
+ "url": "https://github.com/HTR-United/registres-paroissiaux-bretagne",
146
+ "language": ["French", "Latin"],
147
+ "script": ["Cursiva"],
148
+ "century": [17],
149
+ "institution": "Archives départementales du Finistère",
150
+ "description": "Registres paroissiaux bretons du XVIIe siècle.",
151
+ "license": "CC-BY 4.0",
152
+ "lines": 15600,
153
+ "format": "ALTO",
154
+ "tags": ["registres", "Bretagne", "paroissial", "cursive"],
155
+ },
156
+ ]
157
+
158
+
159
+ # ---------------------------------------------------------------------------
160
+ # Dataclass entrée catalogue
161
+ # ---------------------------------------------------------------------------
162
+
163
@dataclass
class HTRUnitedEntry:
    """One entry of the HTR-United catalogue.

    List-valued fields are copied on the way in (``from_dict``) and on the
    way out (``as_dict``), so an entry never shares list objects with the
    dictionaries it is converted from or to.
    """

    id: str
    title: str
    url: str
    language: list[str] = field(default_factory=list)
    script: list[str] = field(default_factory=list)
    century: list[int] = field(default_factory=list)
    institution: str = ""
    description: str = ""
    license: str = ""
    lines: int = 0
    format: str = "ALTO"
    tags: list[str] = field(default_factory=list)

    @staticmethod
    def _detached(value):
        """Return a shallow copy of ``value`` if it is a list, else ``value`` itself."""
        return list(value) if isinstance(value, list) else value

    def as_dict(self) -> dict:
        """Return the entry as a plain dict keyed by field name."""
        return {
            "id": self.id,
            "title": self.title,
            "url": self.url,
            "language": self._detached(self.language),
            "script": self._detached(self.script),
            "century": self._detached(self.century),
            "institution": self.institution,
            "description": self.description,
            "license": self.license,
            "lines": self.lines,
            "format": self.format,
            "tags": self._detached(self.tags),
        }

    @classmethod
    def from_dict(cls, d: dict) -> "HTRUnitedEntry":
        """Build an entry from a dict, applying defaults for missing keys.

        List fields that are missing or null become empty lists; list values
        are copied so later mutation of the entry cannot alter ``d``.
        """

        def list_field(key: str):
            # "or []" maps an explicit null to an empty list as well.
            return cls._detached(d.get(key) or [])

        return cls(
            id=d.get("id", ""),
            title=d.get("title", ""),
            url=d.get("url", ""),
            language=list_field("language"),
            script=list_field("script"),
            century=list_field("century"),
            institution=d.get("institution", ""),
            description=d.get("description", ""),
            license=d.get("license", ""),
            lines=d.get("lines", 0),
            format=d.get("format", "ALTO"),
            tags=list_field("tags"),
        )

    @property
    def century_str(self) -> str:
        """Return the centuries as comma-separated French ordinals in Roman numerals.

        Centuries outside 1-20 fall back to the plain number followed by "e".
        """
        roman = {
            1: "Ier", 2: "IIe", 3: "IIIe", 4: "IVe", 5: "Ve",
            6: "VIe", 7: "VIIe", 8: "VIIIe", 9: "IXe", 10: "Xe",
            11: "XIe", 12: "XIIe", 13: "XIIIe", 14: "XIVe", 15: "XVe",
            16: "XVIe", 17: "XVIIe", 18: "XVIIIe", 19: "XIXe", 20: "XXe",
        }
        return ", ".join(roman.get(c, f"{c}e") for c in self.century)
223
+
224
+
225
+ # ---------------------------------------------------------------------------
226
+ # Catalogue
227
+ # ---------------------------------------------------------------------------
228
+
229
class HTRUnitedCatalogue:
    """Searchable, filterable collection of HTR-United entries."""

    def __init__(self, entries: list[HTRUnitedEntry], source: str = "demo") -> None:
        self.entries = entries
        self.source = source  # "remote" | "demo" | "cache"

    def __len__(self) -> int:
        return len(self.entries)

    @classmethod
    def from_demo(cls) -> "HTRUnitedCatalogue":
        """Build the catalogue from the embedded demonstration data."""
        return cls(
            [HTRUnitedEntry.from_dict(raw) for raw in _DEMO_CATALOGUE],
            source="demo",
        )

    @classmethod
    def from_remote(cls, timeout: int = 10) -> "HTRUnitedCatalogue":
        """Download the catalogue from its remote URL.

        Any failure (network, decoding, parsing) falls back to the
        demonstration catalogue instead of raising.
        """
        try:
            request = urllib.request.Request(
                _CATALOGUE_URL,
                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                payload = response.read().decode("utf-8")
            return cls(_parse_yml_catalogue(payload), source="remote")
        except Exception:
            # Best-effort: every error degrades to the demo data.
            return cls.from_demo()

    def search(
        self,
        query: str = "",
        language: Optional[str] = None,
        script: Optional[str] = None,
        century_min: Optional[int] = None,
        century_max: Optional[int] = None,
    ) -> list[HTRUnitedEntry]:
        """Return the entries matching every supplied filter.

        Text filters are case-insensitive substring matches. With no active
        filter the catalogue's own entry list is returned.
        """
        predicates = []

        if query:
            needle = query.lower()

            def matches_query(entry) -> bool:
                if needle in entry.title.lower():
                    return True
                if needle in entry.description.lower():
                    return True
                if needle in entry.institution.lower():
                    return True
                if any(needle in tag.lower() for tag in entry.tags):
                    return True
                return any(needle in name.lower() for name in entry.language)

            predicates.append(matches_query)

        if language:
            wanted_language = language.lower()
            predicates.append(
                lambda entry: any(wanted_language in name.lower() for name in entry.language)
            )

        if script:
            wanted_script = script.lower()
            predicates.append(
                lambda entry: any(wanted_script in name.lower() for name in entry.script)
            )

        if century_min is not None:
            predicates.append(
                lambda entry: any(value >= century_min for value in entry.century)
            )

        if century_max is not None:
            predicates.append(
                lambda entry: any(value <= century_max for value in entry.century)
            )

        # Apply the filters one after the other, in declaration order.
        matches = self.entries
        for keep in predicates:
            matches = [entry for entry in matches if keep(entry)]
        return matches

    def get_by_id(self, entry_id: str) -> Optional[HTRUnitedEntry]:
        """Return the first entry whose ``id`` equals *entry_id*, else ``None``."""
        return next((entry for entry in self.entries if entry.id == entry_id), None)

    def available_languages(self) -> list[str]:
        """Return the sorted, de-duplicated languages of all entries."""
        return sorted({name for entry in self.entries for name in entry.language})

    def available_scripts(self) -> list[str]:
        """Return the sorted, de-duplicated scripts of all entries."""
        return sorted({name for entry in self.entries for name in entry.script})
340
+
341
+
342
+ # ---------------------------------------------------------------------------
343
+ # Import de corpus
344
+ # ---------------------------------------------------------------------------
345
+
346
def import_htr_united_corpus(
    entry: HTRUnitedEntry,
    output_dir: str | Path,
    max_samples: int = 100,
    show_progress: bool = True,
) -> dict:
    """Import an HTR-United corpus into a local folder.

    The destination folder is created if needed, a ``htr_united_meta.json``
    file describing the entry is written into it, and a download of the
    corpus files is then attempted through ``_try_download_corpus``.

    Returns a dict with the entry id and title, the output directory, the
    number of files reported by the download attempt, and the path of the
    metadata file.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    meta_file = target_dir / "htr_united_meta.json"

    # Persist the provenance metadata before attempting any download.
    metadata = dict(
        source="htr-united",
        entry_id=entry.id,
        title=entry.title,
        url=entry.url,
        language=entry.language,
        script=entry.script,
        century=entry.century,
        institution=entry.institution,
        license=entry.license,
        format=entry.format,
        imported_at=_iso_now(),
    )
    serialized = json.dumps(metadata, ensure_ascii=False, indent=2)
    meta_file.write_text(serialized, encoding="utf-8")

    # Attempt the actual download from the entry's repository.
    imported_count = _try_download_corpus(entry, target_dir, max_samples, show_progress)

    summary = {}
    summary["entry_id"] = entry.id
    summary["title"] = entry.title
    summary["output_dir"] = str(target_dir)
    summary["files_imported"] = imported_count
    summary["metadata_file"] = str(target_dir / "htr_united_meta.json")
    return summary
389
+
390
+
391
def _try_download_corpus(
    entry: HTRUnitedEntry,
    output_path: Path,
    max_samples: int,
    show_progress: bool,
) -> int:
    """Try to download the corpus from GitHub; return the number of files written.

    The repository ZIP archive is requested for the ``main`` branch and, when
    that branch does not exist (HTTP 404), for ``master``. Up to
    *max_samples* ``.xml`` / ``.gt.txt`` members are extracted flat into
    *output_path* (only the base name of each member is kept).

    The function is best-effort: it never raises, and returns ``0`` when the
    URL is not a GitHub repository or the archive cannot be fetched.
    ``show_progress`` is accepted for interface compatibility and is not
    used here.
    """
    import io
    import zipfile

    repo_path = _extract_github_repo(entry.url)
    if not repo_path:
        return 0

    archive = None
    for branch in ("main", "master"):
        zip_url = f"https://github.com/{repo_path}/archive/refs/heads/{branch}.zip"
        try:
            req = urllib.request.Request(
                zip_url,
                headers={"User-Agent": "picarones-htr-united-importer/1.0"},
            )
            with urllib.request.urlopen(req, timeout=30) as resp:
                archive = resp.read()
            break
        except urllib.error.HTTPError as err:
            if err.code == 404:
                # Branch absent: try the next candidate name.
                continue
            return 0
        except Exception:
            return 0
    if archive is None:
        return 0

    written: set[str] = set()
    try:
        with zipfile.ZipFile(io.BytesIO(archive)) as zf:
            # ".xml" already covers ".alto.xml" and ".page.xml".
            gt_files = [
                n for n in zf.namelist()
                if n.endswith((".xml", ".gt.txt"))
            ][:max_samples]
            for fname in gt_files:
                # Keeping only the base name flattens the tree and prevents
                # archive members from escaping output_path.
                dest_name = Path(fname).name
                (output_path / dest_name).write_bytes(zf.read(fname))
                written.add(dest_name)
    except Exception:
        # Best-effort: report what was actually written before the failure.
        pass
    return len(written)
427
+
428
+
429
+ def _extract_github_repo(url: str) -> Optional[str]:
430
+ """Extrait 'owner/repo' depuis une URL GitHub."""
431
+ m = re.match(r"https?://github\.com/([^/]+/[^/]+?)(?:\.git)?/?$", url)
432
+ return m.group(1) if m else None
433
+
434
+
435
def _parse_yml_catalogue(raw: str) -> list[HTRUnitedEntry]:
    """Parse the YAML catalogue text into entries.

    When the YAML library is unavailable, parsing fails, or the document is
    not a list, the embedded demonstration catalogue is returned instead.
    Non-dict items of a parsed list are ignored.
    """
    entries = None
    try:
        import yaml

        document = yaml.safe_load(raw)
        if isinstance(document, list):
            entries = [
                HTRUnitedEntry.from_dict(item)
                for item in document
                if isinstance(item, dict)
            ]
    except Exception:
        # Any failure is treated as "no usable remote data".
        entries = None

    if entries is None:
        entries = [HTRUnitedEntry.from_dict(item) for item in _DEMO_CATALOGUE]
    return entries
445
+
446
+
447
+ def _iso_now() -> str:
448
+ from datetime import datetime, timezone
449
+ return datetime.now(timezone.utc).isoformat(timespec="seconds")
picarones/importers/huggingface.py ADDED
@@ -0,0 +1,427 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Import de datasets OCR/HTR depuis HuggingFace Hub.
2
+
3
+ Ce module fournit :
4
+ - :class:`HuggingFaceDataset` — métadonnées d'un dataset HuggingFace
5
+ - :class:`HuggingFaceImporter` — recherche et import de datasets
6
+ - :meth:`HuggingFaceImporter.search` — recherche par tags dans l'API HuggingFace
7
+ - :meth:`HuggingFaceImporter.import_dataset` — téléchargement d'un dataset vers un dossier local
8
+
9
+ Les datasets patrimoniaux de référence sont pré-référencés pour une découverte
10
+ rapide sans requête réseau.
11
+
12
+ Exemple
13
+ -------
14
+ importer = HuggingFaceImporter()
15
+ results = importer.search("medieval OCR", tags=["ocr"])
16
+ corpus = importer.import_dataset(results[0].dataset_id, output_dir="./corpus/")
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import urllib.error
24
+ import urllib.parse
25
+ import urllib.request
26
+ from dataclasses import dataclass, field
27
+ from pathlib import Path
28
+ from typing import Optional
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Datasets de référence pré-référencés
32
+ # ---------------------------------------------------------------------------
33
+
34
+ _REFERENCE_DATASETS: list[dict] = [
35
+ {
36
+ "dataset_id": "Teklia/RIMES",
37
+ "title": "RIMES — Reconnaissance et Indexation de données Manuscrites et de fac-similEs",
38
+ "description": "Corpus de courriers manuscrits français modernes. Standard de référence pour la reconnaissance d'écriture manuscrite.",
39
+ "language": ["French"],
40
+ "tags": ["htr", "ocr", "handwritten", "french", "modern"],
41
+ "license": "cc-by-4.0",
42
+ "size_category": "1K<n<10K",
43
+ "task": "image-to-text",
44
+ "institution": "IRISA / A2iA",
45
+ "downloads": 1200,
46
+ },
47
+ {
48
+ "dataset_id": "Teklia/IAM",
49
+ "title": "IAM Handwriting Database",
50
+ "description": "Corpus de référence anglais pour la reconnaissance d'écriture manuscrite.",
51
+ "language": ["English"],
52
+ "tags": ["htr", "ocr", "handwritten", "english"],
53
+ "license": "other",
54
+ "size_category": "10K<n<100K",
55
+ "task": "image-to-text",
56
+ "institution": "University of Bern",
57
+ "downloads": 8400,
58
+ },
59
+ {
60
+ "dataset_id": "CATMuS/medieval",
61
+ "title": "CATMuS Medieval — Consistent Approaches to Transcribing ManuScripts",
62
+ "description": "Dataset multilingue de manuscrits médiévaux (latin, français, occitan, espagnol) pour l'entraînement de modèles HTR.",
63
+ "language": ["Latin", "French", "Occitan", "Spanish"],
64
+ "tags": ["htr", "medieval", "manuscripts", "latin", "french", "historical"],
65
+ "license": "cc-by-4.0",
66
+ "size_category": "100K<n<1M",
67
+ "task": "image-to-text",
68
+ "institution": "Inria / EPHE",
69
+ "downloads": 3100,
70
+ },
71
+ {
72
+ "dataset_id": "htr-united/cremma-medieval",
73
+ "title": "CREMMA Medieval",
74
+ "description": "Corpus de manuscrits médiévaux français XIIe-XVe siècles.",
75
+ "language": ["French", "Latin"],
76
+ "tags": ["htr", "medieval", "french", "manuscripts", "htr-united"],
77
+ "license": "cc-by-4.0",
78
+ "size_category": "1K<n<10K",
79
+ "task": "image-to-text",
80
+ "institution": "Inria",
81
+ "downloads": 520,
82
+ },
83
+ {
84
+ "dataset_id": "biglam/europeana_newspapers",
85
+ "title": "Europeana Newspapers",
86
+ "description": "Journaux numérisés européens du XIXe siècle (OCR + images).",
87
+ "language": ["French", "German", "Dutch", "Finnish"],
88
+ "tags": ["ocr", "newspapers", "historical", "19th-century", "europeana"],
89
+ "license": "cc0-1.0",
90
+ "size_category": "1M<n<10M",
91
+ "task": "image-to-text",
92
+ "institution": "Europeana Foundation",
93
+ "downloads": 15200,
94
+ },
95
+ {
96
+ "dataset_id": "stefanklut/esposalles",
97
+ "title": "Esposalles Dataset",
98
+ "description": "Registres de mariage catalans du XVIIe siècle pour la reconnaissance d'écriture historique.",
99
+ "language": ["Catalan", "Latin"],
100
+ "tags": ["htr", "historical", "registers", "catalan", "17th-century"],
101
+ "license": "cc-by-4.0",
102
+ "size_category": "1K<n<10K",
103
+ "task": "image-to-text",
104
+ "institution": "Universitat Autònoma de Barcelona",
105
+ "downloads": 340,
106
+ },
107
+ {
108
+ "dataset_id": "bnf-gallica/gallica-ocr",
109
+ "title": "Gallica OCR — BnF",
110
+ "description": "Extraits d'imprimés anciens numérisés depuis Gallica avec vérité terrain.",
111
+ "language": ["French", "Latin"],
112
+ "tags": ["ocr", "historical", "printed", "gallica", "bnf", "french"],
113
+ "license": "etalab-2.0",
114
+ "size_category": "10K<n<100K",
115
+ "task": "image-to-text",
116
+ "institution": "Bibliothèque nationale de France",
117
+ "downloads": 2800,
118
+ },
119
+ {
120
+ "dataset_id": "Bozen-Baptism/baptism-records",
121
+ "title": "Bozen Baptism Records",
122
+ "description": "Registres de baptêmes de Bozen (Italie/Autriche) du XVIIIe siècle.",
123
+ "language": ["German", "Latin"],
124
+ "tags": ["htr", "historical", "registers", "german", "latin", "18th-century"],
125
+ "license": "cc-by-4.0",
126
+ "size_category": "1K<n<10K",
127
+ "task": "image-to-text",
128
+ "institution": "University of Innsbruck",
129
+ "downloads": 190,
130
+ },
131
+ {
132
+ "dataset_id": "read-bad/readbad",
133
+ "title": "READ-BAD — Recognition and Enrichment of Archival Documents",
134
+ "description": "Corpus multilingue de documents d'archives pour l'OCR historique (Latin, Allemand, Anglais).",
135
+ "language": ["German", "English", "Latin"],
136
+ "tags": ["ocr", "htr", "historical", "archives", "read"],
137
+ "license": "cc-by-4.0",
138
+ "size_category": "10K<n<100K",
139
+ "task": "image-to-text",
140
+ "institution": "University of Graz",
141
+ "downloads": 1050,
142
+ },
143
+ ]
144
+
145
+ # ---------------------------------------------------------------------------
146
+ # Dataclass
147
+ # ---------------------------------------------------------------------------
148
+
149
@dataclass
class HuggingFaceDataset:
    """Metadata describing one HuggingFace dataset."""

    dataset_id: str
    title: str
    description: str = ""
    language: list[str] = field(default_factory=list)
    tags: list[str] = field(default_factory=list)
    license: str = ""
    size_category: str = ""
    task: str = "image-to-text"
    institution: str = ""
    downloads: int = 0
    source: str = "reference"  # "reference" | "api"

    def as_dict(self) -> dict:
        """Return the metadata as a dictionary keyed by field name."""
        return {
            "dataset_id": self.dataset_id,
            "title": self.title,
            "description": self.description,
            "language": self.language,
            "tags": self.tags,
            "license": self.license,
            "size_category": self.size_category,
            "task": self.task,
            "institution": self.institution,
            "downloads": self.downloads,
            "source": self.source,
        }

    @staticmethod
    def _as_list(value: object) -> list:
        """Return *value* as a fresh list (``None`` -> ``[]``, scalar -> ``[scalar]``)."""
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            # Copy so that mutating the dataset never touches the source data.
            return list(value)
        return [value]

    @staticmethod
    def _card_size_category(d: dict) -> str:
        """Return the first ``cardData.size_categories`` value, or ``""``."""
        card = d.get("cardData")
        if not isinstance(card, dict):
            return ""
        categories = card.get("size_categories")
        if isinstance(categories, str):
            return categories
        if isinstance(categories, (list, tuple)) and categories:
            return str(categories[0])
        return ""

    @classmethod
    def from_dict(cls, d: dict) -> "HuggingFaceDataset":
        """Build a dataset from a dictionary.

        Both the internal layout (``dataset_id``, ``size_category``,
        ``downloads``) and the alternate keys ``id``, ``cardData`` and
        ``downloadsAllTime`` are accepted. Missing or null values fall back
        to the field defaults, ``title`` falls back to the dataset id, and
        ``source`` defaults to ``"api"``.
        """
        dataset_id = d.get("dataset_id") or d.get("id") or ""

        size_category = d.get("size_category")
        if size_category is None:
            size_category = cls._card_size_category(d)

        downloads = d.get("downloads")
        if downloads is None:
            downloads = d.get("downloadsAllTime") or 0

        return cls(
            dataset_id=dataset_id,
            title=d.get("title") or dataset_id,
            description=d.get("description") or "",
            language=cls._as_list(d.get("language")),
            tags=cls._as_list(d.get("tags")),
            license=d.get("license") or "",
            size_category=size_category,
            task=d.get("task") or "image-to-text",
            institution=d.get("institution") or "",
            downloads=downloads,
            source=d.get("source") or "api",
        )

    def _replace_source(self, source: str) -> "HuggingFaceDataset":
        """Return a copy of this dataset with ``source`` replaced."""
        from dataclasses import replace

        return replace(self, source=source)

    @property
    def hf_url(self) -> str:
        """URL of the dataset page on huggingface.co."""
        return f"https://huggingface.co/datasets/{self.dataset_id}"
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Importer principal
203
+ # ---------------------------------------------------------------------------
204
+
205
class HuggingFaceImporter:
    """Search and import datasets from the HuggingFace Hub."""

    _API_BASE = "https://huggingface.co/api"

    def __init__(self, token: Optional[str] = None) -> None:
        # An explicit token wins; otherwise fall back to the environment.
        self._token = token or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")

    def _headers(self) -> dict:
        """Return the HTTP headers, with a bearer token when one is set."""
        h = {"User-Agent": "picarones-hf-importer/1.0"}
        if self._token:
            h["Authorization"] = f"Bearer {self._token}"
        return h

    @staticmethod
    def _size_category(item: dict) -> str:
        """Return the first ``cardData.size_categories`` value of *item*, or ``""``."""
        card = item.get("cardData")
        if not isinstance(card, dict):
            return ""
        categories = card.get("size_categories")
        if isinstance(categories, str):
            return categories
        if isinstance(categories, (list, tuple)) and categories:
            return str(categories[0])
        return ""

    def search(
        self,
        query: str = "",
        tags: Optional[list[str]] = None,
        language: Optional[str] = None,
        limit: int = 20,
        use_reference: bool = True,
    ) -> list[HuggingFaceDataset]:
        """Search datasets with optional filters.

        The built-in reference datasets are searched first (when
        *use_reference* is true), then the HuggingFace API. API results whose
        id is already present are dropped, and API failures are ignored. At
        most *limit* results are returned.
        """
        results: list[HuggingFaceDataset] = []

        if use_reference:
            results.extend(self._search_reference(query, tags, language))

        # The API lookup is optional: any failure leaves the reference results.
        try:
            api_results = self._search_api(query, tags, language, limit)
        except Exception:
            api_results = []

        # De-duplicate, giving priority to the reference datasets.
        existing_ids = {r.dataset_id for r in results}
        for ds in api_results:
            if ds.dataset_id not in existing_ids:
                results.append(ds)
                existing_ids.add(ds.dataset_id)

        return results[:limit]

    def _search_reference(
        self,
        query: str,
        tags: Optional[list[str]],
        language: Optional[str],
    ) -> list[HuggingFaceDataset]:
        """Filter the built-in reference datasets (case-insensitive substrings)."""
        from dataclasses import replace

        # Reference entries carry no "source" key, so set it explicitly.
        datasets = [
            replace(HuggingFaceDataset.from_dict(d), source="reference")
            for d in _REFERENCE_DATASETS
        ]

        if query:
            q = query.lower()
            datasets = [
                ds for ds in datasets
                if (q in ds.title.lower()
                    or q in ds.description.lower()
                    or q in ds.dataset_id.lower()
                    or any(q in t.lower() for t in ds.tags)
                    or any(q in name.lower() for name in ds.language))
            ]

        if tags:
            # Every requested tag must match at least one dataset tag.
            wanted = [tag.lower() for tag in tags]
            datasets = [
                ds for ds in datasets
                if all(any(w in dt.lower() for dt in ds.tags) for w in wanted)
            ]

        if language:
            lang_lower = language.lower()
            datasets = [
                ds for ds in datasets
                if any(lang_lower in name.lower() for name in ds.language)
            ]

        return datasets

    def _search_api(
        self,
        query: str,
        tags: Optional[list[str]],
        language: Optional[str],
        limit: int,
    ) -> list[HuggingFaceDataset]:
        """Query the ``/datasets`` endpoint and convert the JSON items.

        Raises whatever ``urllib`` or ``json`` raise; the caller handles it.
        """
        # NOTE(review): parameter names are kept as originally written;
        # confirm them against the Hub API documentation.
        params: dict[str, str] = {
            "task_categories": "image-to-text",
            "limit": str(min(limit, 50)),
            "full": "False",
        }
        if query:
            params["search"] = query
        if language:
            params["language"] = language
        if tags:
            params["tags"] = ",".join(tags)

        url = f"{self._API_BASE}/datasets?" + urllib.parse.urlencode(params)
        req = urllib.request.Request(url, headers=self._headers())
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        if not isinstance(data, list):
            return []

        results = []
        for item in data:
            # Skip malformed items instead of discarding the whole response.
            if not isinstance(item, dict):
                continue
            dataset_id = item.get("id", "")
            if not dataset_id:
                continue
            results.append(HuggingFaceDataset(
                dataset_id=dataset_id,
                title=dataset_id,
                description=item.get("description") or "",
                language=item.get("language") or [],
                tags=item.get("tags") or [],
                license=item.get("license") or "",
                size_category=self._size_category(item),
                task="image-to-text",
                # Same key preference as HuggingFaceDataset.from_dict.
                downloads=item.get("downloads", item.get("downloadsAllTime", 0)) or 0,
                source="api",
            ))
        return results

    def import_dataset(
        self,
        dataset_id: str,
        output_dir: str | Path,
        split: str = "train",
        max_samples: int = 100,
        show_progress: bool = True,
    ) -> dict:
        """Import a HuggingFace dataset into a local folder.

        Creates *output_dir*, writes ``huggingface_meta.json`` into it, then
        delegates the sample download to ``_try_import_with_datasets_lib``.

        Returns a dict with the dataset id, the output directory, the number
        of imported files and the path of the metadata file.
        """
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        meta = {
            "source": "huggingface",
            "dataset_id": dataset_id,
            "split": split,
            "max_samples": max_samples,
            "imported_at": _iso_now(),
        }
        meta_file = output_path / "huggingface_meta.json"
        meta_file.write_text(json.dumps(meta, ensure_ascii=False, indent=2), encoding="utf-8")

        # Import attempt through the `datasets` library, when it is available.
        files_imported = _try_import_with_datasets_lib(
            dataset_id, output_path, split, max_samples, show_progress
        )

        return {
            "dataset_id": dataset_id,
            "output_dir": str(output_path),
            "files_imported": files_imported,
            "metadata_file": str(meta_file),
        }
372
+
373
+
374
+ def _try_import_with_datasets_lib(
375
+ dataset_id: str,
376
+ output_path: Path,
377
+ split: str,
378
+ max_samples: int,
379
+ show_progress: bool,
380
+ ) -> int:
381
+ """Essaie d'importer avec la librairie `datasets` de HuggingFace."""
382
+ try:
383
+ from datasets import load_dataset # type: ignore
384
+
385
+ ds = load_dataset(dataset_id, split=split, streaming=True)
386
+ count = 0
387
+ for i, item in enumerate(ds):
388
+ if i >= max_samples:
389
+ break
390
+ # Cherche champ image et texte
391
+ image = item.get("image") or item.get("img")
392
+ text = item.get("text") or item.get("transcription") or item.get("ground_truth", "")
393
+
394
+ if image is not None:
395
+ img_file = output_path / f"doc_{i:04d}.jpg"
396
+ try:
397
+ image.save(str(img_file))
398
+ except Exception:
399
+ pass
400
+
401
+ gt_file = output_path / f"doc_{i:04d}.gt.txt"
402
+ gt_file.write_text(str(text), encoding="utf-8")
403
+ count += 1
404
+
405
+ return count
406
+ except (ImportError, Exception):
407
+ return 0
408
+
409
+
410
+ def _iso_now() -> str:
411
+ from datetime import datetime, timezone
412
+ return datetime.now(timezone.utc).isoformat(timespec="seconds")
413
+
414
+
415
+ # ---------------------------------------------------------------------------
416
+ # Extension de HuggingFaceDataset (helper privé)
417
+ # ---------------------------------------------------------------------------
418
+
419
def _patch_dataset_replace_source() -> None:
    """Attach a ``_replace_source`` helper method to ``HuggingFaceDataset``."""

    def _replace_source(self, source: str) -> "HuggingFaceDataset":
        # Return a copy of the dataset with only ``source`` changed.
        from dataclasses import replace

        return replace(self, source=source)

    setattr(HuggingFaceDataset, "_replace_source", _replace_source)


# Applied once at import time so the helper exists on the class.
_patch_dataset_replace_source()
picarones/importers/iiif.py ADDED
@@ -0,0 +1,583 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Import de corpus depuis des manifestes IIIF v2 et v3.
2
+
3
+ Fonctionnement
4
+ --------------
5
+ 1. Téléchargement et parsing du manifeste JSON (v2 ou v3 auto-détecté)
6
+ 2. Extraction de la liste des canvases (pages) avec leurs URL d'image
7
+ 3. Sélection optionnelle d'un sous-ensemble de pages (ex : ``--pages 1-10``)
8
+ 4. Téléchargement des images dans un dossier local
9
+ 5. Création de fichiers GT vides (``.gt.txt``) à remplir manuellement,
10
+ OU chargement des annotations de transcription si présentes dans le manifeste
11
+ 6. Construction et retour d'un objet ``Corpus``
12
+
13
+ Compatibilité
14
+ -------------
15
+ - IIIF Image API v2 et v3
16
+ - Manifestes Presentation API v2 et v3
17
+ - Instances : Gallica (BnF), Bodleian, British Library, BSB, e-codices,
18
+ Europeana, et tout entrepôt IIIF-compliant
19
+
20
+ Utilisation
21
+ -----------
22
+ >>> from picarones.importers.iiif import IIIFImporter
23
+ >>> importer = IIIFImporter("https://gallica.bnf.fr/ark:/12148/xxx/manifest.json")
24
+ >>> corpus = importer.import_corpus(pages="1-10", output_dir="./corpus/")
25
+ >>> print(f"{len(corpus)} documents téléchargés")
26
+
27
+ Ou via la fonction de commodité :
28
+ >>> from picarones.importers.iiif import import_iiif_manifest
29
+ >>> corpus = import_iiif_manifest("https://...", pages="1-5", output_dir="./corpus/")
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import json
35
+ import logging
36
+ import re
37
+ import time
38
+ import urllib.error
39
+ import urllib.request
40
+ from dataclasses import dataclass, field
41
+ from pathlib import Path
42
+ from typing import Iterator, Optional
43
+
44
+ from picarones.core.corpus import Corpus, Document
45
+
46
+ logger = logging.getLogger(__name__)
47
+
48
+
49
+ # ---------------------------------------------------------------------------
50
+ # Parsing du sélecteur de pages
51
+ # ---------------------------------------------------------------------------
52
+
53
def parse_page_selector(pages: str, total: int) -> list[int]:
    """Parse a page selector into a list of 0-based indices.

    Accepted formats:
    - ``"1-10"``              -> pages 1 to 10 (1-based, inclusive)
    - ``"1,3,5"``             -> pages 1, 3 and 5
    - ``"1-5,10,15-20"``      -> any combination of the above
    - ``"all"`` / ``""`` / whitespace only -> every page

    Spaces around numbers, commas and the range dash are ignored.

    Parameters
    ----------
    pages:
        Page selector string.
    total:
        Total number of pages in the manifest.

    Returns
    -------
    list[int]
        0-based indices of the selected pages, sorted and de-duplicated.

    Raises
    ------
    ValueError
        If the syntax is invalid or a page number is out of bounds.
    """
    selector = pages.strip().lower() if pages else ""
    if not selector or selector == "all":
        return list(range(total))

    indices: set[int] = set()
    for part in pages.split(","):
        part = part.strip()
        if "-" in part:
            m = re.fullmatch(r"(\d+)\s*-\s*(\d+)", part)
            if not m:
                raise ValueError(f"Sélecteur de pages invalide : '{part}'")
            start, end = int(m.group(1)), int(m.group(2))
            if start < 1 or end > total or start > end:
                raise ValueError(
                    f"Plage {start}-{end} hors bornes (1–{total})"
                )
            indices.update(range(start - 1, end))
        else:
            try:
                n = int(part)
            except ValueError:
                # Report the offending part instead of the raw int() message.
                raise ValueError(f"Sélecteur de pages invalide : '{part}'") from None
            if n < 1 or n > total:
                raise ValueError(f"Page {n} hors bornes (1–{total})")
            indices.add(n - 1)
    return sorted(indices)
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # Données d'un canvas IIIF
105
+ # ---------------------------------------------------------------------------
106
+
107
@dataclass
class IIIFCanvas:
    """A single canvas (page) of an IIIF manifest."""

    index: int                          # 0-based position within the manifest
    label: str                          # human-readable label (e.g. "f. 1r", "Page 1")
    image_url: str                      # URL of the full-resolution image
    width: Optional[int] = None
    height: Optional[int] = None
    transcription: Optional[str] = None # ground-truth text when annotated in the manifest
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # Parseur de manifeste IIIF
121
+ # ---------------------------------------------------------------------------
122
+
123
class IIIFManifestParser:
    """Parse an IIIF Presentation API v2 or v3 manifest."""

    def __init__(self, manifest: dict) -> None:
        self._manifest = manifest
        self._version = self._detect_version()

    def _detect_version(self) -> int:
        """Detect the manifest version (2 or 3).

        Version 3 is reported when the ``@context`` mentions
        ``presentation/3`` or when the top-level ``type`` is ``"Manifest"``;
        anything else is treated as version 2.
        """
        context = self._manifest.get("@context", "")
        if isinstance(context, list):
            # A context list may mix URI strings with JSON-LD objects;
            # only the strings can carry the version marker.
            context = " ".join(c for c in context if isinstance(c, str))
        elif not isinstance(context, str):
            context = ""
        if "presentation/3" in context or self._manifest.get("type") == "Manifest":
            return 3
        return 2

    @property
    def version(self) -> int:
        """Detected Presentation API version (2 or 3)."""
        return self._version

    @property
    def label(self) -> str:
        """Title of the manifest."""
        raw = self._manifest.get("label", "")
        return _extract_label(raw)

    @property
    def attribution(self) -> str:
        """Attribution text: ``attribution`` or, failing that, ``requiredStatement``."""
        if "attribution" in self._manifest:
            raw = self._manifest["attribution"]
        else:
            raw = self._manifest.get("requiredStatement", "")
            # A requiredStatement is a {"label": ..., "value": ...} pair;
            # the text to display is the value, not the label.
            if isinstance(raw, dict) and "value" in raw:
                raw = raw["value"]
        return _extract_label(raw)

    def canvases(self) -> list[IIIFCanvas]:
        """Return the list of canvases of the manifest."""
        if self._version == 3:
            return self._parse_v3_canvases()
        return self._parse_v2_canvases()

    def _parse_v2_canvases(self) -> list[IIIFCanvas]:
        """Read the canvases of the first sequence of a v2 manifest."""
        canvases: list[IIIFCanvas] = []
        sequences = self._manifest.get("sequences", [])
        if not sequences:
            return canvases
        raw_canvases = sequences[0].get("canvases", [])
        for i, canvas in enumerate(raw_canvases):
            label = _extract_label(canvas.get("label", f"canvas_{i+1}"))
            # Main image: images[0].resource.@id, or its image service.
            images = canvas.get("images", [])
            image_url = ""
            if images:
                resource = images[0].get("resource", {})
                image_url = _best_image_url_v2(resource, canvas)

            # Transcription annotations (OA annotations), when present.
            transcription = _extract_v2_transcription(canvas)

            canvases.append(IIIFCanvas(
                index=i,
                label=label,
                image_url=image_url,
                width=canvas.get("width"),
                height=canvas.get("height"),
                transcription=transcription,
            ))
        return canvases

    def _parse_v3_canvases(self) -> list[IIIFCanvas]:
        """Read the canvases listed under ``items`` of a v3 manifest."""
        canvases: list[IIIFCanvas] = []
        items = self._manifest.get("items", [])
        for i, canvas in enumerate(items):
            label = _extract_label(canvas.get("label", f"canvas_{i+1}"))
            image_url = _best_image_url_v3(canvas)
            transcription = _extract_v3_transcription(canvas)
            canvases.append(IIIFCanvas(
                index=i,
                label=label,
                image_url=image_url,
                width=canvas.get("width"),
                height=canvas.get("height"),
                transcription=transcription,
            ))
        return canvases
204
+
205
+
206
+ # ---------------------------------------------------------------------------
207
+ # Helpers extraction URL et label
208
+ # ---------------------------------------------------------------------------
209
+
210
+ def _extract_label(raw: object) -> str:
211
+ """Extrait une chaîne lisible depuis les différents formats de label IIIF."""
212
+ if isinstance(raw, str):
213
+ return raw
214
+ if isinstance(raw, list) and raw:
215
+ return _extract_label(raw[0])
216
+ if isinstance(raw, dict):
217
+ # IIIF v3 : {"fr": ["titre"], "en": ["title"]}
218
+ for lang in ("fr", "en", "none", "@value"):
219
+ val = raw.get(lang, "")
220
+ if val:
221
+ if isinstance(val, list):
222
+ return val[0] if val else ""
223
+ return str(val)
224
+ # Fallback: première valeur
225
+ for v in raw.values():
226
+ return _extract_label(v)
227
+ return str(raw) if raw else ""
228
+
229
+
230
+ def _best_image_url_v2(resource: dict, canvas: dict) -> str:
231
+ """Construit l'URL d'image optimale depuis une ressource IIIF v2."""
232
+ # 1. URL directe de la ressource
233
+ direct = resource.get("@id", "")
234
+ if direct and not direct.endswith("/info.json"):
235
+ return direct
236
+
237
+ # 2. Via le service IIIF Image API
238
+ service = resource.get("service", {})
239
+ if isinstance(service, list) and service:
240
+ service = service[0]
241
+ service_id = service.get("@id", service.get("id", ""))
242
+ if service_id:
243
+ return f"{service_id.rstrip('/')}/full/max/0/default.jpg"
244
+
245
+ return direct
246
+
247
+
248
+ def _best_image_url_v3(canvas: dict) -> str:
249
+ """Extrait l'URL d'image depuis un canvas IIIF v3."""
250
+ items = canvas.get("items", [])
251
+ for annotation_page in items:
252
+ for annotation in annotation_page.get("items", []):
253
+ body = annotation.get("body", {})
254
+ if isinstance(body, list):
255
+ body = body[0] if body else {}
256
+ # URL directe
257
+ url = body.get("id", body.get("@id", ""))
258
+ if url and body.get("type", "") == "Image":
259
+ return url
260
+ # Via service IIIF Image API
261
+ service = body.get("service", [])
262
+ if isinstance(service, dict):
263
+ service = [service]
264
+ for svc in service:
265
+ svc_id = svc.get("id", svc.get("@id", ""))
266
+ if svc_id:
267
+ return f"{svc_id.rstrip('/')}/full/max/0/default.jpg"
268
+ if url:
269
+ return url
270
+ return ""
271
+
272
+
273
+ def _extract_v2_transcription(canvas: dict) -> Optional[str]:
274
+ """Tente d'extraire le texte GT depuis les annotations OA d'un canvas v2."""
275
+ other_content = canvas.get("otherContent", [])
276
+ for oc in other_content:
277
+ if not isinstance(oc, dict):
278
+ continue
279
+ motivation = oc.get("motivation", "")
280
+ if "transcrib" in motivation.lower() or "supplementing" in motivation.lower():
281
+ resources = oc.get("resources", [])
282
+ texts = []
283
+ for res in resources:
284
+ body = res.get("resource", {})
285
+ if body.get("@type") == "cnt:ContentAsText":
286
+ texts.append(body.get("chars", ""))
287
+ if texts:
288
+ return "\n".join(texts)
289
+ return None
290
+
291
+
292
+ def _extract_v3_transcription(canvas: dict) -> Optional[str]:
293
+ """Tente d'extraire le texte GT depuis les annotations d'un canvas v3."""
294
+ annotations = canvas.get("annotations", [])
295
+ for ann_page in annotations:
296
+ items = ann_page.get("items", [])
297
+ for ann in items:
298
+ motivation = ann.get("motivation", "")
299
+ if "transcrib" in motivation.lower() or "supplementing" in motivation.lower():
300
+ body = ann.get("body", {})
301
+ if isinstance(body, dict) and body.get("type") == "TextualBody":
302
+ return body.get("value", "")
303
+ return None
304
+
305
+
306
+ # ---------------------------------------------------------------------------
307
+ # Téléchargement avec retry
308
+ # ---------------------------------------------------------------------------
309
+
310
+ def _download_url(
311
+ url: str,
312
+ retries: int = 4,
313
+ backoff: float = 2.0,
314
+ timeout: int = 60,
315
+ ) -> bytes:
316
+ """Télécharge une URL avec retry exponentiel."""
317
+ headers = {
318
+ "User-Agent": "Picarones/1.0 (BnF OCR benchmark platform; https://github.com/bnf/picarones)"
319
+ }
320
+ last_exc: Optional[Exception] = None
321
+ for attempt in range(retries):
322
+ if attempt > 0:
323
+ wait = backoff ** attempt
324
+ logger.debug("Retry %d/%d dans %.1fs — %s", attempt, retries - 1, wait, url)
325
+ time.sleep(wait)
326
+ try:
327
+ req = urllib.request.Request(url, headers=headers)
328
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
329
+ return resp.read()
330
+ except (urllib.error.URLError, urllib.error.HTTPError) as exc:
331
+ last_exc = exc
332
+ logger.warning("Erreur téléchargement %s : %s", url, exc)
333
+ raise RuntimeError(f"Impossible de télécharger {url} après {retries} tentatives") from last_exc
334
+
335
+
336
def _fetch_manifest(url: str) -> dict:
    """Download a IIIF manifest and parse it as JSON.

    Raises
    ------
    RuntimeError
        Propagated from the download when the URL cannot be fetched.
    ValueError
        When the payload is not decodable JSON or is not a JSON object.
    """
    data = _download_url(url)
    try:
        # json.loads detects the encoding of a bytes payload itself
        # (UTF-8 with or without BOM, UTF-16, UTF-32).
        manifest = json.loads(data)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        raise ValueError(f"Manifeste IIIF invalide (JSON mal formé) : {url}") from exc
    if not isinstance(manifest, dict):
        # Fail here with a clear message instead of crashing later on
        # dict-only accessors.
        raise ValueError(f"Manifeste IIIF invalide (objet JSON attendu) : {url}")
    return manifest
343
+
344
+
345
+ # ---------------------------------------------------------------------------
346
+ # Importeur principal
347
+ # ---------------------------------------------------------------------------
348
+
349
+ class IIIFImporter:
350
+ """Importe un corpus depuis un manifeste IIIF.
351
+
352
+ Parameters
353
+ ----------
354
+ manifest_url:
355
+ URL du manifeste IIIF (Presentation API v2 ou v3).
356
+ max_resolution:
357
+ Résolution maximale des images téléchargées (largeur en pixels).
358
+ 0 = résolution maximale disponible.
359
+ """
360
+
361
+ def __init__(
362
+ self,
363
+ manifest_url: str,
364
+ max_resolution: int = 0,
365
+ ) -> None:
366
+ self.manifest_url = manifest_url
367
+ self.max_resolution = max_resolution
368
+ self._manifest: Optional[dict] = None
369
+ self._parser: Optional[IIIFManifestParser] = None
370
+
371
+ def load(self) -> "IIIFImporter":
372
+ """Télécharge et parse le manifeste."""
373
+ logger.info("Téléchargement du manifeste IIIF : %s", self.manifest_url)
374
+ self._manifest = _fetch_manifest(self.manifest_url)
375
+ self._parser = IIIFManifestParser(self._manifest)
376
+ logger.info(
377
+ "Manifeste chargé — version IIIF %d — titre : %s — %d canvas",
378
+ self._parser.version,
379
+ self._parser.label,
380
+ len(self._parser.canvases()),
381
+ )
382
+ return self
383
+
384
+ @property
385
+ def parser(self) -> IIIFManifestParser:
386
+ if self._parser is None:
387
+ self.load()
388
+ return self._parser # type: ignore[return-value]
389
+
390
+ def list_canvases(self, pages: str = "all") -> list[IIIFCanvas]:
391
+ """Retourne la liste des canvases sélectionnés."""
392
+ all_canvases = self.parser.canvases()
393
+ indices = parse_page_selector(pages, len(all_canvases))
394
+ return [all_canvases[i] for i in indices]
395
+
396
+ def import_corpus(
397
+ self,
398
+ pages: str = "all",
399
+ output_dir: Optional[str | Path] = None,
400
+ show_progress: bool = True,
401
+ ) -> Corpus:
402
+ """Télécharge les images et construit un corpus Picarones.
403
+
404
+ Si les canvases contiennent des annotations de transcription (GT),
405
+ elles sont automatiquement sauvegardées dans les fichiers ``.gt.txt``.
406
+ Sinon, des fichiers ``.gt.txt`` vides sont créés.
407
+
408
+ Parameters
409
+ ----------
410
+ pages:
411
+ Sélecteur de pages (ex : ``"1-10"``, ``"1,3,5"``).
412
+ output_dir:
413
+ Dossier de destination pour les images et les GT.
414
+ Si None, le corpus est retourné en mémoire sans écriture disque.
415
+ show_progress:
416
+ Affiche une barre de progression tqdm.
417
+
418
+ Returns
419
+ -------
420
+ Corpus
421
+ Corpus prêt à être utilisé dans ``run_benchmark``.
422
+ """
423
+ canvases = self.list_canvases(pages)
424
+ if not canvases:
425
+ raise ValueError("Aucun canvas sélectionné.")
426
+
427
+ out_dir: Optional[Path] = Path(output_dir) if output_dir else None
428
+ if out_dir:
429
+ out_dir.mkdir(parents=True, exist_ok=True)
430
+
431
+ # Nom du corpus depuis le titre du manifeste
432
+ corpus_name = self.parser.label or "iiif_corpus"
433
+
434
+ documents: list[Document] = []
435
+ iterator: Iterator[IIIFCanvas] = iter(canvases)
436
+
437
+ if show_progress:
438
+ try:
439
+ from tqdm import tqdm
440
+ iterator = tqdm(canvases, desc="Import IIIF", unit="page")
441
+ except ImportError:
442
+ pass
443
+
444
+ for canvas in iterator:
445
+ doc_id = f"{_slugify(canvas.label) or f'canvas_{canvas.index+1:04d}'}"
446
+
447
+ if not canvas.image_url:
448
+ logger.warning("Canvas %s : pas d'URL d'image — ignoré.", canvas.label)
449
+ continue
450
+
451
+ # Ajuster la résolution si max_resolution est défini
452
+ image_url = self._adjust_resolution(canvas.image_url, canvas.width)
453
+
454
+ # Téléchargement de l'image
455
+ try:
456
+ image_bytes = _download_url(image_url)
457
+ except RuntimeError as exc:
458
+ logger.error("Canvas %s : erreur téléchargement : %s", canvas.label, exc)
459
+ continue
460
+
461
+ # Déterminer l'extension de l'image
462
+ ext = _guess_extension(image_url)
463
+
464
+ if out_dir:
465
+ # Sauvegarde sur disque
466
+ image_path = out_dir / f"{doc_id}{ext}"
467
+ image_path.write_bytes(image_bytes)
468
+
469
+ gt_path = out_dir / f"{doc_id}.gt.txt"
470
+ gt_text = canvas.transcription or ""
471
+ gt_path.write_text(gt_text, encoding="utf-8")
472
+
473
+ documents.append(Document(
474
+ image_path=image_path,
475
+ ground_truth=gt_text,
476
+ doc_id=doc_id,
477
+ metadata={"iiif_label": canvas.label, "canvas_index": canvas.index},
478
+ ))
479
+ else:
480
+ # Corpus en mémoire (image stockée comme chemin temporaire virtuel)
481
+ import tempfile
482
+ tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
483
+ tmp.write(image_bytes)
484
+ tmp.close()
485
+ documents.append(Document(
486
+ image_path=Path(tmp.name),
487
+ ground_truth=canvas.transcription or "",
488
+ doc_id=doc_id,
489
+ metadata={"iiif_label": canvas.label, "canvas_index": canvas.index},
490
+ ))
491
+
492
+ if not documents:
493
+ raise ValueError("Aucun document importé depuis le manifeste IIIF.")
494
+
495
+ logger.info("Import IIIF terminé : %d documents.", len(documents))
496
+
497
+ return Corpus(
498
+ name=corpus_name,
499
+ documents=documents,
500
+ source_path=self.manifest_url,
501
+ metadata={
502
+ "iiif_manifest_url": self.manifest_url,
503
+ "iiif_version": self.parser.version,
504
+ "iiif_attribution": self.parser.attribution,
505
+ "pages_selected": pages,
506
+ },
507
+ )
508
+
509
+ def _adjust_resolution(self, image_url: str, canvas_width: Optional[int]) -> str:
510
+ """Ajuste l'URL IIIF Image API pour respecter max_resolution."""
511
+ if not self.max_resolution or not canvas_width:
512
+ return image_url
513
+ if canvas_width <= self.max_resolution:
514
+ return image_url
515
+ # Remplacer /full/max/ ou /full/full/ par /full/{w},/
516
+ url = re.sub(
517
+ r"/full/(max|full)/",
518
+ f"/full/{self.max_resolution},/",
519
+ image_url,
520
+ )
521
+ return url
522
+
523
+
524
+ # ---------------------------------------------------------------------------
525
+ # Helpers utilitaires
526
+ # ---------------------------------------------------------------------------
527
+
528
+ def _slugify(text: str) -> str:
529
+ """Convertit un label IIIF en identifiant de fichier sûr."""
530
+ text = re.sub(r"[^\w\s-]", "", text.strip())
531
+ text = re.sub(r"[\s_-]+", "_", text)
532
+ return text[:60]
533
+
534
+
535
+ def _guess_extension(url: str) -> str:
536
+ """Détermine l'extension de l'image depuis l'URL."""
537
+ url_lower = url.lower().split("?")[0]
538
+ for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp"):
539
+ if url_lower.endswith(ext):
540
+ return ext
541
+ # Par défaut pour les URLs IIIF Image API
542
+ if "/default." in url_lower or "/native." in url_lower:
543
+ return ".jpg"
544
+ return ".jpg"
545
+
546
+
547
+ # ---------------------------------------------------------------------------
548
+ # Fonction de commodité
549
+ # ---------------------------------------------------------------------------
550
+
551
def import_iiif_manifest(
    manifest_url: str,
    pages: str = "all",
    output_dir: Optional[str | Path] = None,
    max_resolution: int = 0,
    show_progress: bool = True,
) -> Corpus:
    """One-call helper: load a IIIF manifest and import it as a corpus.

    Parameters
    ----------
    manifest_url:
        URL of the IIIF manifest (v2 or v3).
    pages:
        Page selector (e.g. ``"1-10"``, ``"1,3,5"``).  Defaults to ``"all"``.
    output_dir:
        Destination directory, forwarded to ``IIIFImporter.import_corpus``.
    max_resolution:
        Maximum resolution (px).  0 = no limit.
    show_progress:
        Show a progress bar.

    Returns
    -------
    Corpus
    """
    # load() hands back the importer itself, so the calls can be chained.
    return IIIFImporter(manifest_url, max_resolution=max_resolution).load().import_corpus(
        pages=pages,
        output_dir=output_dir,
        show_progress=show_progress,
    )
picarones/report/generator.py CHANGED
@@ -69,6 +69,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
69
  engines_summary = []
70
  for report in benchmark.engine_reports:
71
  agg = report.aggregated_metrics
 
72
  entry: dict = {
73
  "name": report.engine_name,
74
  "version": report.engine_version,
@@ -81,15 +82,30 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
81
  "cer_max": _safe(agg.get("cer", {}).get("max")),
82
  "doc_count": agg.get("document_count", 0),
83
  "failed": agg.get("failed_count", 0),
 
 
 
84
  # Distribution pour l'histogramme : liste des CER individuels
85
  "cer_values": [
86
  _safe(dr.metrics.cer)
87
  for dr in report.document_results
88
  if dr.metrics.error is None
89
  ],
 
 
 
 
 
90
  # Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
91
  "is_pipeline": report.is_pipeline,
92
  "pipeline_info": report.pipeline_info,
 
 
 
 
 
 
 
93
  }
94
  engines_summary.append(entry)
95
 
@@ -121,6 +137,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
121
  "engine": engine_name,
122
  "hypothesis": dr.hypothesis,
123
  "cer": _safe(dr.metrics.cer),
 
124
  "wer": _safe(dr.metrics.wer),
125
  "duration": dr.duration_seconds,
126
  "error": dr.engine_error,
@@ -136,6 +153,16 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
136
  if on is not None:
137
  er_entry["over_normalization"] = on
138
  er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
 
 
 
 
 
 
 
 
 
 
139
  engine_results.append(er_entry)
140
 
141
  # CER moyen sur ce document (pour le badge galerie)
@@ -603,6 +630,7 @@ footer {{
603
  <button class="tab-btn active" onclick="showView('ranking')">Classement</button>
604
  <button class="tab-btn" onclick="showView('gallery')">Galerie</button>
605
  <button class="tab-btn" onclick="showView('document')">Document</button>
 
606
  <button class="tab-btn" onclick="showView('analyses')">Analyses</button>
607
  </div>
608
  <div class="meta" id="nav-meta">—</div>
@@ -622,10 +650,13 @@ footer {{
622
  <tr>
623
  <th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
624
  <th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
625
- <th data-col="cer" class="sortable">CER<i class="sort-icon">↕</i></th>
 
626
  <th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
627
  <th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
628
  <th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
 
 
629
  <th>CER médian</th>
630
  <th>CER min</th>
631
  <th>CER max</th>
@@ -775,6 +806,59 @@ footer {{
775
  </div>
776
  </div>
777
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
778
  </div>
779
  </div>
780
 
@@ -808,13 +892,15 @@ function showView(name) {{
808
  document.querySelectorAll('.view').forEach(v => v.classList.remove('active'));
809
  document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
810
  document.getElementById('view-' + name).classList.add('active');
 
 
 
811
  document.querySelectorAll('.tab-btn').forEach(b => {{
812
- if (b.textContent.toLowerCase().startsWith(
813
- {{ranking:'c',gallery:'g',document:'d',analyses:'a'}}[name]
814
- )) b.classList.add('active');
815
  }});
816
  currentView = name;
817
  if (name === 'analyses' && !chartsBuilt) buildCharts();
 
818
  }}
819
 
820
  // ── Formatage ───────────────────────────────────────────────────
@@ -857,6 +943,15 @@ function renderDiff(ops) {{
857
  }}).join(' ');
858
  }}
859
 
 
 
 
 
 
 
 
 
 
860
  // ── Vue Classement ──────────────────────────────────────────────
861
  let rankingSort = {{ col: 'cer', dir: 'asc' }};
862
 
@@ -906,6 +1001,18 @@ function renderRanking() {{
906
  overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
907
  }}
908
 
 
 
 
 
 
 
 
 
 
 
 
 
909
  return `<tr>
910
  <td><span class="${{badgeClass}}">${{rank}}</span></td>
911
  <td>
@@ -918,9 +1025,12 @@ function renderRanking() {{
918
  <span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
919
  <span class="cer-badge" style="color:${{cerC}};background:${{cerB}}">${{pct(e.cer)}}</span>
920
  </td>
 
921
  <td>${{pct(e.wer)}}</td>
922
  <td>${{pct(e.mer)}}</td>
923
  <td>${{pct(e.wil)}}</td>
 
 
924
  <td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
925
  <td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
926
  <td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
@@ -1109,12 +1219,23 @@ function loadDocument(docId) {{
1109
  </div>`;
1110
  }}
1111
 
 
 
 
 
 
 
 
 
 
 
1112
  return `<div class="diff-panel">
1113
  <div class="diff-panel-header">
1114
  <span class="diff-panel-title">${{esc(er.engine)}}</span>
1115
  ${{pipeTagPanel}}
1116
  <span class="diff-panel-metrics">
1117
  <span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
 
1118
  <span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
1119
  ${{onBadge}}
1120
  ${{errBadge}}
@@ -1187,6 +1308,8 @@ function buildCharts() {{
1187
  buildRadar();
1188
  buildCerPerDoc();
1189
  buildDurationChart();
 
 
1190
  }}
1191
 
1192
  function buildCerHistogram() {{
@@ -1330,6 +1453,315 @@ function buildDurationChart() {{
1330
  }});
1331
  }}
1332
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1333
  // ── Init ────────────────────────────────────────────────────────
1334
  function init() {{
1335
  // Méta nav
 
69
  engines_summary = []
70
  for report in benchmark.engine_reports:
71
  agg = report.aggregated_metrics
72
+ diplo_agg = agg.get("cer_diplomatic", {})
73
  entry: dict = {
74
  "name": report.engine_name,
75
  "version": report.engine_version,
 
82
  "cer_max": _safe(agg.get("cer", {}).get("max")),
83
  "doc_count": agg.get("document_count", 0),
84
  "failed": agg.get("failed_count", 0),
85
+ # CER diplomatique (après normalisation historique : ſ=s, u=v, i=j…)
86
+ "cer_diplomatic": _safe(diplo_agg.get("mean")) if diplo_agg else None,
87
+ "cer_diplomatic_profile": diplo_agg.get("profile"),
88
  # Distribution pour l'histogramme : liste des CER individuels
89
  "cer_values": [
90
  _safe(dr.metrics.cer)
91
  for dr in report.document_results
92
  if dr.metrics.error is None
93
  ],
94
+ "cer_diplomatic_values": [
95
+ _safe(dr.metrics.cer_diplomatic)
96
+ for dr in report.document_results
97
+ if dr.metrics.error is None and dr.metrics.cer_diplomatic is not None
98
+ ],
99
  # Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
100
  "is_pipeline": report.is_pipeline,
101
  "pipeline_info": report.pipeline_info,
102
+ # Sprint 5 — métriques avancées patrimoniales
103
+ "ligature_score": _safe(report.ligature_score) if report.ligature_score is not None else None,
104
+ "diacritic_score": _safe(report.diacritic_score) if report.diacritic_score is not None else None,
105
+ "aggregated_confusion": report.aggregated_confusion,
106
+ "aggregated_taxonomy": report.aggregated_taxonomy,
107
+ "aggregated_structure": report.aggregated_structure,
108
+ "aggregated_image_quality": report.aggregated_image_quality,
109
  }
110
  engines_summary.append(entry)
111
 
 
137
  "engine": engine_name,
138
  "hypothesis": dr.hypothesis,
139
  "cer": _safe(dr.metrics.cer),
140
+ "cer_diplomatic": _safe(dr.metrics.cer_diplomatic) if dr.metrics.cer_diplomatic is not None else None,
141
  "wer": _safe(dr.metrics.wer),
142
  "duration": dr.duration_seconds,
143
  "error": dr.engine_error,
 
153
  if on is not None:
154
  er_entry["over_normalization"] = on
155
  er_entry["pipeline_mode"] = dr.pipeline_metadata.get("pipeline_mode")
156
+ # Sprint 5 — métriques avancées par document
157
+ if dr.char_scores is not None:
158
+ er_entry["ligature_score"] = _safe(dr.char_scores.get("ligature", {}).get("score"))
159
+ er_entry["diacritic_score"] = _safe(dr.char_scores.get("diacritic", {}).get("score"))
160
+ if dr.taxonomy is not None:
161
+ er_entry["taxonomy"] = dr.taxonomy
162
+ if dr.structure is not None:
163
+ er_entry["structure"] = dr.structure
164
+ if dr.image_quality is not None:
165
+ er_entry["image_quality"] = dr.image_quality
166
  engine_results.append(er_entry)
167
 
168
  # CER moyen sur ce document (pour le badge galerie)
 
630
  <button class="tab-btn active" onclick="showView('ranking')">Classement</button>
631
  <button class="tab-btn" onclick="showView('gallery')">Galerie</button>
632
  <button class="tab-btn" onclick="showView('document')">Document</button>
633
+ <button class="tab-btn" onclick="showView('characters')">Caractères</button>
634
  <button class="tab-btn" onclick="showView('analyses')">Analyses</button>
635
  </div>
636
  <div class="meta" id="nav-meta">—</div>
 
650
  <tr>
651
  <th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
652
  <th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
653
+ <th data-col="cer" class="sortable">CER exact<i class="sort-icon">↕</i></th>
654
+ <th data-col="cer_diplomatic" class="sortable" title="CER après normalisation diplomatique (ſ=s, u=v, i=j…) — mesure les erreurs substantielles en ignorant les variantes graphiques codifiées">CER diplo.<i class="sort-icon">↕</i></th>
655
  <th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
656
  <th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
657
  <th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
658
+ <th data-col="ligature_score" class="sortable" title="Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)">Ligatures<i class="sort-icon">↕</i></th>
659
+ <th data-col="diacritic_score" class="sortable" title="Taux de conservation des diacritiques (accents, cédilles, trémas…)">Diacritiques<i class="sort-icon">↕</i></th>
660
  <th>CER médian</th>
661
  <th>CER min</th>
662
  <th>CER max</th>
 
806
  </div>
807
  </div>
808
 
809
+ <div class="chart-card">
810
+ <h3>Qualité image ↔ CER (scatter plot)</h3>
811
+ <div class="chart-canvas-wrap">
812
+ <canvas id="chart-quality-cer"></canvas>
813
+ </div>
814
+ <div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
815
+ Chaque point = un document. Axe X = score qualité image [0–1]. Axe Y = CER. Corrélation négative attendue.
816
+ </div>
817
+ </div>
818
+
819
+ <div class="chart-card" style="grid-column:1/-1">
820
+ <h3>Taxonomie des erreurs par moteur</h3>
821
+ <div class="chart-canvas-wrap" style="max-height:300px">
822
+ <canvas id="chart-taxonomy"></canvas>
823
+ </div>
824
+ <div style="font-size:.72rem;color:var(--text-muted);margin-top:.4rem">
825
+ Distribution des classes d'erreurs (classes 1–9 de la taxonomie Picarones).
826
+ </div>
827
+ </div>
828
+
829
+ </div>
830
+ </div>
831
+
832
+ <!-- ════ Vue 5 : Caractères ════════════════════════════════════════ -->
833
+ <div id="view-characters" class="view">
834
+ <div class="card">
835
+ <h2>Analyse des caractères</h2>
836
+
837
+ <!-- Sélecteur de moteur -->
838
+ <div class="stat-row" style="margin-bottom:1rem">
839
+ <label for="char-engine-select" style="font-weight:600;margin-right:.5rem">Moteur :</label>
840
+ <select id="char-engine-select" onchange="renderCharView()"
841
+ style="padding:.35rem .7rem;border-radius:6px;border:1px solid var(--border)"></select>
842
+ </div>
843
+
844
+ <!-- Scores ligatures / diacritiques -->
845
+ <div class="stat-row" id="char-scores-row" style="gap:1.5rem;margin-bottom:1.5rem"></div>
846
+
847
+ <!-- Matrice de confusion unicode -->
848
+ <h3 style="margin-bottom:.75rem">Matrice de confusion unicode
849
+ <span style="font-size:.75rem;font-weight:400;color:var(--text-muted)">
850
+ — substitutions les plus fréquentes (caractère GT → caractère OCR)
851
+ </span>
852
+ </h3>
853
+ <div id="confusion-heatmap" style="overflow-x:auto;margin-bottom:1.5rem"></div>
854
+
855
+ <!-- Détail ligatures par type -->
856
+ <h3 style="margin-bottom:.75rem">Reconnaissance des ligatures</h3>
857
+ <div id="ligature-detail" style="margin-bottom:1.5rem"></div>
858
+
859
+ <!-- Taxonomie détaillée -->
860
+ <h3 style="margin-bottom:.75rem">Distribution taxonomique des erreurs</h3>
861
+ <div id="taxonomy-detail"></div>
862
  </div>
863
  </div>
864
 
 
892
  document.querySelectorAll('.view').forEach(v => v.classList.remove('active'));
893
  document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
894
  document.getElementById('view-' + name).classList.add('active');
895
+ // Activer le bon onglet nav
896
+ const tabMap = {{ranking:'classement',gallery:'galerie',document:'document',characters:'caract',analyses:'analyses'}};
897
+ const prefix = tabMap[name] || name;
898
  document.querySelectorAll('.tab-btn').forEach(b => {{
899
+ if (b.textContent.toLowerCase().startsWith(prefix.toLowerCase())) b.classList.add('active');
 
 
900
  }});
901
  currentView = name;
902
  if (name === 'analyses' && !chartsBuilt) buildCharts();
903
+ if (name === 'characters' && !charViewBuilt) initCharView();
904
  }}
905
 
906
  // ── Formatage ───────────────────────────────────────────────────
 
943
  }}).join(' ');
944
  }}
945
 
946
+ // ── Score badge (ligatures / diacritiques) ───────────────────────
947
+ function _scoreBadge(v, label) {{
948
+ if (v === null || v === undefined) return '<span style="color:var(--text-muted)">—</span>';
949
+ const pctVal = (v * 100).toFixed(1);
950
+ const color = v >= 0.9 ? '#16a34a' : v >= 0.7 ? '#ca8a04' : '#dc2626';
951
+ const bg = v >= 0.9 ? '#f0fdf4' : v >= 0.7 ? '#fefce8' : '#fef2f2';
952
+ return `<span class="cer-badge" style="color:${{color}};background:${{bg}}" title="${{label}} : ${{pctVal}}%">${{pctVal}}%</span>`;
953
+ }}
954
+
955
  // ── Vue Classement ──────────────────────────────────────────────
956
  let rankingSort = {{ col: 'cer', dir: 'asc' }};
957
 
 
1001
  overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
1002
  }}
1003
 
1004
+ // CER diplomatique
1005
+ let diploCerCell = '<td style="color:var(--text-muted)">—</td>';
1006
+ if (e.cer_diplomatic !== null && e.cer_diplomatic !== undefined) {{
1007
+ const dipC = cerColor(e.cer_diplomatic); const dipB = cerBg(e.cer_diplomatic);
1008
+ const delta = e.cer - e.cer_diplomatic;
1009
+ const deltaStr = delta > 0.001 ? ` <span style="font-size:.65rem;color:#059669">-${{(delta*100).toFixed(1)}}%</span>` : '';
1010
+ const profileHint = e.cer_diplomatic_profile ? ` title="Profil : ${{esc(e.cer_diplomatic_profile)}}"` : '';
1011
+ diploCerCell = `<td${{profileHint}}>
1012
+ <span class="cer-badge" style="color:${{dipC}};background:${{dipB}}">${{pct(e.cer_diplomatic)}}</span>${{deltaStr}}
1013
+ </td>`;
1014
+ }}
1015
+
1016
  return `<tr>
1017
  <td><span class="${{badgeClass}}">${{rank}}</span></td>
1018
  <td>
 
1025
  <span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
1026
  <span class="cer-badge" style="color:${{cerC}};background:${{cerB}}">${{pct(e.cer)}}</span>
1027
  </td>
1028
+ ${{diploCerCell}}
1029
  <td>${{pct(e.wer)}}</td>
1030
  <td>${{pct(e.mer)}}</td>
1031
  <td>${{pct(e.wil)}}</td>
1032
+ <td>${{_scoreBadge(e.ligature_score, 'Ligatures')}}</td>
1033
+ <td>${{_scoreBadge(e.diacritic_score, 'Diacritiques')}}</td>
1034
  <td style="color:var(--text-muted)">${{pct(e.cer_median)}}</td>
1035
  <td style="color:var(--text-muted)">${{pct(e.cer_min)}}</td>
1036
  <td style="color:var(--text-muted)">${{pct(e.cer_max)}}</td>
 
1219
  </div>`;
1220
  }}
1221
 
1222
+ // CER diplomatique par document
1223
+ let diplomaBadge = '';
1224
+ if (er.cer_diplomatic !== null && er.cer_diplomatic !== undefined) {{
1225
+ const dipC = cerColor(er.cer_diplomatic); const dipB = cerBg(er.cer_diplomatic);
1226
+ const delta = er.cer - er.cer_diplomatic;
1227
+ const deltaHint = delta > 0.001 ? ` (−${{(delta*100).toFixed(1)}}% avec normalisation)` : '';
1228
+ diplomaBadge = `<span class="cer-badge" style="color:${{dipC}};background:${{dipB}};opacity:.85"
1229
+ title="CER diplomatique (ſ=s, u=v, i=j…)${{deltaHint}}">diplo. ${{pct(er.cer_diplomatic)}}</span>`;
1230
+ }}
1231
+
1232
  return `<div class="diff-panel">
1233
  <div class="diff-panel-header">
1234
  <span class="diff-panel-title">${{esc(er.engine)}}</span>
1235
  ${{pipeTagPanel}}
1236
  <span class="diff-panel-metrics">
1237
  <span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
1238
+ ${{diplomaBadge}}
1239
  <span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
1240
  ${{onBadge}}
1241
  ${{errBadge}}
 
1308
  buildRadar();
1309
  buildCerPerDoc();
1310
  buildDurationChart();
1311
+ buildQualityCerScatter();
1312
+ buildTaxonomyChart();
1313
  }}
1314
 
1315
  function buildCerHistogram() {{
 
1453
  }});
1454
  }}
1455
 
1456
+ function buildQualityCerScatter() {{
1457
+ const ctx = document.getElementById('chart-quality-cer');
1458
+ if (!ctx) return;
1459
+ // Construire les points : un par document, un dataset par moteur
1460
+ const datasets = DATA.engines.map((e, ei) => {{
1461
+ const points = DATA.documents.flatMap(doc => {{
1462
+ const er = doc.engine_results.find(r => r.engine === e.name);
1463
+ if (!er || er.error || !er.image_quality) return [];
1464
+ return [{{ x: er.image_quality.quality_score, y: er.cer * 100 }}];
1465
+ }});
1466
+ return {{
1467
+ label: e.name, data: points,
1468
+ backgroundColor: engineColor(ei) + 'bb',
1469
+ borderColor: engineColor(ei),
1470
+ borderWidth: 1, pointRadius: 5, pointHoverRadius: 7,
1471
+ }};
1472
+ }}).filter(d => d.data.length > 0);
1473
+
1474
+ if (!datasets.length) {{ ctx.parentElement.innerHTML = '<p style="color:var(--text-muted);padding:1rem">Aucune donnée de qualité image disponible.</p>'; return; }}
1475
+
1476
+ chartInstances['quality-cer'] = new Chart(ctx.getContext('2d'), {{
1477
+ type: 'scatter',
1478
+ data: {{ datasets }},
1479
+ options: {{
1480
+ responsive: true, maintainAspectRatio: false,
1481
+ plugins: {{
1482
+ legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }},
1483
+ tooltip: {{ callbacks: {{
1484
+ label: ctx => `${{ctx.dataset.label}}: qualité=${{ctx.parsed.x.toFixed(2)}}, CER=${{ctx.parsed.y.toFixed(1)}}%`,
1485
+ }} }},
1486
+ }},
1487
+ scales: {{
1488
+ x: {{ min: 0, max: 1, title: {{ display: true, text: 'Score qualité image [0–1]', font: {{ size: 11 }} }} }},
1489
+ y: {{ min: 0, title: {{ display: true, text: 'CER (%)', font: {{ size: 11 }} }} }},
1490
+ }},
1491
+ }},
1492
+ }});
1493
+ }}
1494
+
1495
+ function buildTaxonomyChart() {{
1496
+ const ctx = document.getElementById('chart-taxonomy');
1497
+ if (!ctx) return;
1498
+ const taxLabels = ['Confusion visuelle','Diacritique','Casse','Ligature','Abréviation','Hapax','Segmentation','Hors-vocab.','Lacune'];
1499
+ const taxKeys = ['visual_confusion','diacritic_error','case_error','ligature_error','abbreviation_error','hapax','segmentation_error','oov_character','lacuna'];
1500
+ const taxColors = ['#6366f1','#f59e0b','#ec4899','#14b8a6','#8b5cf6','#64748b','#f97316','#06b6d4','#ef4444'];
1501
+
1502
+ const datasets = DATA.engines.map((e, ei) => {{
1503
+ const tax = e.aggregated_taxonomy;
1504
+ const data = taxKeys.map(k => tax && tax.counts ? (tax.counts[k] || 0) : 0);
1505
+ return {{
1506
+ label: e.name, data,
1507
+ backgroundColor: engineColor(ei) + '99',
1508
+ borderColor: engineColor(ei),
1509
+ borderWidth: 1,
1510
+ }};
1511
+ }});
1512
+
1513
+ chartInstances['taxonomy'] = new Chart(ctx.getContext('2d'), {{
1514
+ type: 'bar',
1515
+ data: {{ labels: taxLabels, datasets }},
1516
+ options: {{
1517
+ responsive: true, maintainAspectRatio: false,
1518
+ plugins: {{ legend: {{ position: 'top', labels: {{ font: {{ size: 11 }} }} }} }},
1519
+ scales: {{
1520
+ x: {{ ticks: {{ font: {{ size: 10 }} }} }},
1521
+ y: {{ title: {{ display: true, text: "Nb d'erreurs", font: {{ size: 11 }} }}, min: 0, ticks: {{ stepSize: 1 }} }},
1522
+ }},
1523
+ }},
1524
+ }});
1525
+ }}
1526
+
1527
+ // ── Vue Caractères ───────────────────────────────────────────────
1528
let charViewBuilt = false;

function initCharView() {{
  // Flag the view as built, refill the engine <select> from DATA.engines,
  // then render the view for whichever option ends up selected.
  charViewBuilt = true;

  const select = document.getElementById('char-engine-select');
  select.innerHTML = '';
  for (const engine of DATA.engines) {{
    const option = document.createElement('option');
    option.value = engine.name;
    option.textContent = engine.name;
    select.appendChild(option);
  }}

  renderCharView();
}}
1542
+
1543
+ function renderCharView() {{
1544
+ const engineName = document.getElementById('char-engine-select').value;
1545
+ const eng = DATA.engines.find(e => e.name === engineName);
1546
+ if (!eng) return;
1547
+
1548
+ // Scores ligatures / diacritiques
1549
+ const scoresRow = document.getElementById('char-scores-row');
1550
+ const ligScore = eng.ligature_score;
1551
+ const diacScore = eng.diacritic_score;
1552
+ scoresRow.innerHTML = `
1553
+ <div class="stat">Ligatures <b>${{_scoreBadge(ligScore, 'Ligatures')}}</b></div>
1554
+ <div class="stat">Diacritiques <b>${{_scoreBadge(diacScore, 'Diacritiques')}}</b></div>
1555
+ ${{eng.aggregated_structure ? `
1556
+ <div class="stat">Précision lignes <b>${{_scoreBadge(eng.aggregated_structure.mean_line_accuracy, 'Précision nb lignes')}}</b></div>
1557
+ <div class="stat">Ordre lecture <b>${{_scoreBadge(eng.aggregated_structure.mean_reading_order_score, 'Score ordre de lecture')}}</b></div>
1558
+ ` : ''}}
1559
+ ${{eng.aggregated_image_quality ? `
1560
+ <div class="stat">Qualité image moy. <b>${{_scoreBadge(eng.aggregated_image_quality.mean_quality_score, 'Qualité image moyenne')}}</b></div>
1561
+ ` : ''}}
1562
+ `;
1563
+
1564
+ // Matrice de confusion heatmap
1565
+ renderConfusionHeatmap(eng);
1566
+
1567
+ // Détail ligatures
1568
+ renderLigatureDetail(eng);
1569
+
1570
+ // Taxonomie détaillée
1571
+ renderTaxonomyDetail(eng);
1572
+ }}
1573
+
1574
+ function renderConfusionHeatmap(eng) {{
1575
+ const container = document.getElementById('confusion-heatmap');
1576
+ const cm = eng.aggregated_confusion;
1577
+ if (!cm || !cm.matrix) {{
1578
+ container.innerHTML = '<p style="color:var(--text-muted)">Aucune donnée de confusion disponible.</p>';
1579
+ return;
1580
+ }}
1581
+
1582
+ // Collecter les top confusions (substitutions uniquement, hors ∅)
1583
+ const pairs = [];
1584
+ for (const [gt, ocrs] of Object.entries(cm.matrix)) {{
1585
+ if (gt === '∅') continue;
1586
+ for (const [ocr, cnt] of Object.entries(ocrs)) {{
1587
+ if (ocr !== gt && ocr !== '∅' && cnt > 0) {{
1588
+ pairs.push({{ gt, ocr, cnt }});
1589
+ }}
1590
+ }}
1591
+ }}
1592
+ pairs.sort((a,b) => b.cnt - a.cnt);
1593
+ const top = pairs.slice(0, 30);
1594
+
1595
+ if (!top.length) {{
1596
+ container.innerHTML = '<p style="color:var(--text-muted)">Aucune substitution détectée.</p>';
1597
+ return;
1598
+ }}
1599
+
1600
+ // Heatmap sous forme de tableau compact
1601
+ const maxCnt = top[0].cnt;
1602
+ const rows = top.map(p => {{
1603
+ const intensity = Math.round((p.cnt / maxCnt) * 200 + 55); // 55–255
1604
+ const bg = `rgb(${{intensity}},50,50)`;
1605
+ const fg = intensity > 150 ? '#fff' : '#222';
1606
+ return `<tr onclick="showConfusionExamples('${{esc(p.gt)}}','${{esc(p.ocr)}}')" style="cursor:pointer" title="GT='${{esc(p.gt)}}' → OCR='${{esc(p.ocr)}}' : ${{p.cnt}} fois">
1607
+ <td style="font-family:monospace;font-size:1.1rem;padding:.3rem .6rem;text-align:center">${{esc(p.gt)}}</td>
1608
+ <td style="padding:.1rem .3rem;color:var(--text-muted)">→</td>
1609
+ <td style="font-family:monospace;font-size:1.1rem;padding:.3rem .6rem;text-align:center">${{esc(p.ocr)}}</td>
1610
+ <td style="padding:.3rem 1rem">
1611
+ <div style="display:flex;align-items:center;gap:.5rem">
1612
+ <div style="width:${{Math.round(p.cnt/maxCnt*120)}}px;height:12px;border-radius:3px;background:${{bg}}"></div>
1613
+ <span style="font-size:.8rem;color:var(--text-muted)">${{p.cnt}}×</span>
1614
+ </div>
1615
+ </td>
1616
+ </tr>`;
1617
+ }}).join('');
1618
+
1619
+ container.innerHTML = `
1620
+ <p style="font-size:.75rem;color:var(--text-muted);margin-bottom:.5rem">
1621
+ Cliquer sur une ligne pour voir les exemples dans la vue Document.
1622
+ Total substitutions : <b>${{cm.total_substitutions}}</b>
1623
+ · Insertions : <b>${{cm.total_insertions}}</b>
1624
+ · Suppressions : <b>${{cm.total_deletions}}</b>
1625
+ </p>
1626
+ <table style="border-collapse:collapse;font-size:.85rem">
1627
+ <thead><tr>
1628
+ <th style="padding:.3rem .6rem;text-align:left">GT</th>
1629
+ <th></th>
1630
+ <th style="padding:.3rem .6rem;text-align:left">OCR</th>
1631
+ <th style="padding:.3rem 1rem;text-align:left">Fréquence</th>
1632
+ </tr></thead>
1633
+ <tbody>${{rows}}</tbody>
1634
+ </table>
1635
+ `;
1636
+ }}
1637
+
1638
function showConfusionExamples(gtChar, ocrChar) {{
  // Switch to the Document view and open the first document whose ground truth
  // contains gtChar while at least one engine hypothesis contains ocrChar.
  showView('document');

  const hasConfusion = doc => {{
    const truth = doc.ground_truth || '';
    const gtPresent = truth.includes(gtChar);
    return doc.engine_results.some(result => {{
      const hypothesis = result.hypothesis || '';
      return gtPresent && hypothesis.includes(ocrChar);
    }});
  }};

  const match = DATA.documents.find(hasConfusion);
  if (match) {{
    loadDocument(match.doc_id);
  }}
}}
1650
+
1651
+ function renderLigatureDetail(eng) {{
1652
+ const container = document.getElementById('ligature-detail');
1653
+ // Agrégation sur tous les documents pour ce moteur
1654
+ const ligData = {{}};
1655
+ DATA.documents.forEach(doc => {{
1656
+ const er = doc.engine_results.find(r => r.engine === eng.name);
1657
+ if (!er || !er.ligature_score) return;
1658
+ // On n'a que le score global par doc; pour le détail, utiliser aggregated_char_scores
1659
+ }});
1660
+
1661
+ const agg = eng.aggregated_char_scores;
1662
+ if (!agg || !agg.ligature || !agg.ligature.per_ligature) {{
1663
+ const overallScore = eng.ligature_score;
1664
+ if (overallScore !== null && overallScore !== undefined) {{
1665
+ container.innerHTML = `<div class="stat">Score global ligatures : ${{_scoreBadge(overallScore, 'Ligatures')}}</div>`;
1666
+ }} else {{
1667
+ container.innerHTML = '<p style="color:var(--text-muted)">Aucune donnée ligature disponible (pas de ligatures dans le corpus).</p>';
1668
+ }}
1669
+ return;
1670
+ }}
1671
+
1672
+ const perLig = agg.ligature.per_ligature;
1673
+ if (!Object.keys(perLig).length) {{
1674
+ container.innerHTML = '<p style="color:var(--text-muted)">Aucune ligature trouvée dans le corpus GT.</p>';
1675
+ return;
1676
+ }}
1677
+
1678
+ const rows = Object.entries(perLig)
1679
+ .sort((a,b) => b[1].gt_count - a[1].gt_count)
1680
+ .map(([lig, d]) => {{
1681
+ const sc = d.score;
1682
+ const color = sc >= 0.9 ? '#16a34a' : sc >= 0.7 ? '#ca8a04' : '#dc2626';
1683
+ const barW = Math.round(sc * 120);
1684
+ return `<tr>
1685
+ <td style="font-family:monospace;font-size:1.2rem;padding:.3rem .6rem">${{esc(lig)}}</td>
1686
+ <td style="padding:.3rem .6rem;font-size:.8rem;color:var(--text-muted)">${{esc(lig.codePointAt(0).toString(16).toUpperCase().padStart(4,'0'))}}</td>
1687
+ <td style="padding:.3rem .6rem">${{d.gt_count}} GT</td>
1688
+ <td style="padding:.3rem .6rem">${{d.ocr_correct}} corrects</td>
1689
+ <td style="padding:.3rem 1rem">
1690
+ <div style="display:flex;align-items:center;gap:.5rem">
1691
+ <div style="width:${{barW}}px;height:10px;border-radius:3px;background:${{color}}"></div>
1692
+ <span style="color:${{color}};font-weight:600">${{(sc*100).toFixed(0)}}%</span>
1693
+ </div>
1694
+ </td>
1695
+ </tr>`;
1696
+ }}).join('');
1697
+
1698
+ container.innerHTML = `
1699
+ <table style="border-collapse:collapse;font-size:.85rem">
1700
+ <thead><tr>
1701
+ <th style="padding:.3rem .6rem;text-align:left">Ligature</th>
1702
+ <th style="padding:.3rem .6rem;text-align:left">Unicode</th>
1703
+ <th style="padding:.3rem .6rem">GT</th>
1704
+ <th style="padding:.3rem .6rem">Corrects</th>
1705
+ <th style="padding:.3rem 1rem;text-align:left">Score</th>
1706
+ </tr></thead>
1707
+ <tbody>${{rows}}</tbody>
1708
+ </table>
1709
+ `;
1710
+ }}
1711
+
1712
+ function renderTaxonomyDetail(eng) {{
1713
+ const container = document.getElementById('taxonomy-detail');
1714
+ const tax = eng.aggregated_taxonomy;
1715
+ if (!tax || !tax.counts) {{
1716
+ container.innerHTML = '<p style="color:var(--text-muted)">Aucune donnée taxonomique disponible.</p>';
1717
+ return;
1718
+ }}
1719
+
1720
+ const classNames = {{
1721
+ visual_confusion: '1 — Confusion visuelle',
1722
+ diacritic_error: '2 — Erreur diacritique',
1723
+ case_error: '3 — Erreur de casse',
1724
+ ligature_error: '4 — Ligature',
1725
+ abbreviation_error: '5 — Abréviation',
1726
+ hapax: '6 — Hapax',
1727
+ segmentation_error: '7 — Segmentation',
1728
+ oov_character: '8 — Hors-vocabulaire',
1729
+ lacuna: '9 — Lacune',
1730
+ }};
1731
+ const total = tax.total_errors || 1;
1732
+ const maxCnt = Math.max(...Object.values(tax.counts));
1733
+
1734
+ const rows = Object.entries(tax.counts)
1735
+ .filter(([, cnt]) => cnt > 0)
1736
+ .sort((a,b) => b[1]-a[1])
1737
+ .map(([cls, cnt]) => {{
1738
+ const pctVal = (cnt / total * 100).toFixed(1);
1739
+ const barW = maxCnt > 0 ? Math.round(cnt/maxCnt * 200) : 0;
1740
+ return `<tr>
1741
+ <td style="padding:.3rem .6rem;font-size:.85rem">${{esc(classNames[cls] || cls)}}</td>
1742
+ <td style="padding:.3rem .6rem;text-align:right;font-variant-numeric:tabular-nums">${{cnt}}</td>
1743
+ <td style="padding:.3rem 1rem">
1744
+ <div style="display:flex;align-items:center;gap:.5rem">
1745
+ <div style="width:${{barW}}px;height:10px;border-radius:3px;background:#6366f1"></div>
1746
+ <span style="color:var(--text-muted);font-size:.8rem">${{pctVal}}%</span>
1747
+ </div>
1748
+ </td>
1749
+ </tr>`;
1750
+ }}).join('');
1751
+
1752
+ container.innerHTML = `
1753
+ <p style="font-size:.75rem;color:var(--text-muted);margin-bottom:.5rem">Total : <b>${{tax.total_errors}}</b> erreurs classifiées.</p>
1754
+ <table style="border-collapse:collapse;font-size:.85rem;min-width:400px">
1755
+ <thead><tr>
1756
+ <th style="padding:.3rem .6rem;text-align:left">Classe</th>
1757
+ <th style="padding:.3rem .6rem;text-align:right">N</th>
1758
+ <th style="padding:.3rem 1rem;text-align:left">Proportion</th>
1759
+ </tr></thead>
1760
+ <tbody>${{rows}}</tbody>
1761
+ </table>
1762
+ `;
1763
+ }}
1764
+
1765
  // ── Init ────────────────────────────────────────────────────────
1766
  function init() {{
1767
  // Méta nav
picarones/web/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """Interface web locale Picarones — FastAPI."""
picarones/web/app.py ADDED
@@ -0,0 +1,1634 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Interface web locale Picarones — application FastAPI.
2
+
3
+ Lance avec :
4
+ picarones serve [--port 8000] [--host 127.0.0.1]
5
+ ou directement :
6
+ uvicorn picarones.web.app:app --reload --port 8000
7
+
8
+ Routes
9
+ ------
10
+ GET / Page principale (SPA)
11
+ GET /api/status Version et état de l'application
12
+ GET /api/engines Statut des moteurs OCR et LLMs disponibles
13
+ GET /api/corpus/browse Parcourir les dossiers du serveur
14
+ GET /api/reports Liste des rapports générés
15
+ GET /api/normalization/profiles Profils de normalisation disponibles
16
+ POST /api/benchmark/start Lancer un benchmark (retourne job_id)
17
+ GET /api/benchmark/{job_id}/stream Stream SSE de progression
18
+ GET /api/benchmark/{job_id}/status Statut courant d'un job
19
+ POST /api/benchmark/{job_id}/cancel Annuler un job
20
+ GET /api/htr-united/catalogue Catalogue HTR-United
21
+ POST /api/htr-united/import Importer un corpus HTR-United
22
+ GET /api/huggingface/search Rechercher des datasets HuggingFace
23
+ POST /api/huggingface/import Importer un dataset HuggingFace
24
+ GET /reports/{filename} Accéder à un rapport HTML généré
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import asyncio
30
+ import json
31
+ import os
32
+ import threading
33
+ import time
34
+ import uuid
35
+ from dataclasses import dataclass, field
36
+ from datetime import datetime, timezone
37
+ from pathlib import Path
38
+ from typing import Any, AsyncIterator, Optional
39
+
40
+ from fastapi import FastAPI, HTTPException, Query
41
+ from fastapi.responses import FileResponse, HTMLResponse, StreamingResponse
42
+ from pydantic import BaseModel
43
+
44
+ from picarones import __version__
45
+
46
+ # ---------------------------------------------------------------------------
47
+ # App initialization
48
+ # ---------------------------------------------------------------------------
49
+
50
# Module-level FastAPI application; the route decorators in this file register on it.
# The interactive API docs are moved under /api/ (docs_url / redoc_url below).
+ app = FastAPI(
51
+ title="Picarones",
52
+ description="Plateforme de comparaison de moteurs OCR/HTR pour documents patrimoniaux",
53
+ version=__version__,
54
+ docs_url="/api/docs",
55
+ redoc_url="/api/redoc",
56
+ )
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Job management
60
+ # ---------------------------------------------------------------------------
61
+
62
@dataclass
class BenchmarkJob:
    """State of one benchmark run plus a fan-out of its progress events.

    Events are appended to ``events`` (full history) and pushed to every
    subscriber queue. ``add_event`` may be called from a worker thread while
    subscribers wait inside the asyncio event loop, so delivery is handed to
    the subscriber's loop instead of touching the queue directly.
    """

    job_id: str
    status: str = "pending"   # pending | running | complete | error | cancelled
    progress: float = 0.0      # 0.0 – 1.0
    current_engine: str = ""
    total_docs: int = 0
    processed_docs: int = 0
    output_path: str = ""
    error: str = ""
    started_at: Optional[str] = None
    finished_at: Optional[str] = None
    events: list[dict] = field(default_factory=list)
    _subscribers: list[asyncio.Queue] = field(default_factory=list)
    # Event loop each subscriber queue belongs to, keyed by id(queue).
    _loops: dict[int, asyncio.AbstractEventLoop] = field(default_factory=dict, repr=False)

    @staticmethod
    def _offer(queue: asyncio.Queue, event: dict) -> None:
        """Put *event* on *queue*, dropping it when the queue is full."""
        try:
            queue.put_nowait(event)
        except asyncio.QueueFull:
            # Slow consumer: drop rather than block the producer; the event
            # is still available in ``events``.
            pass

    def add_event(self, kind: str, data: Any) -> None:
        """Record an event and deliver it to all current subscribers."""
        event = {"kind": kind, "data": data, "ts": _iso_now()}
        self.events.append(event)
        # Iterate over a snapshot: subscribers may come and go concurrently.
        for queue in list(self._subscribers):
            loop = self._loops.get(id(queue))
            if loop is None:
                # Subscribed outside a running loop: deliver synchronously.
                self._offer(queue, event)
                continue
            try:
                # asyncio.Queue is not thread-safe; schedule the put on the
                # loop that owns the queue so its waiting getter is woken.
                loop.call_soon_threadsafe(self._offer, queue, event)
            except RuntimeError:
                # The loop is closed (server shutting down); nothing to notify.
                pass

    def subscribe(self) -> asyncio.Queue:
        """Register and return a new bounded queue receiving future events."""
        queue: asyncio.Queue = asyncio.Queue(maxsize=200)
        try:
            self._loops[id(queue)] = asyncio.get_running_loop()
        except RuntimeError:
            # No running loop (e.g. synchronous caller): direct delivery.
            pass
        self._subscribers.append(queue)
        return queue

    def unsubscribe(self, q: asyncio.Queue) -> None:
        """Remove a queue previously returned by ``subscribe`` (no-op if absent)."""
        self._loops.pop(id(q), None)
        try:
            self._subscribers.remove(q)
        except ValueError:
            pass

    def as_dict(self) -> dict:
        """Return the public, JSON-friendly job state (no events, no queues)."""
        return {
            "job_id": self.job_id,
            "status": self.status,
            "progress": self.progress,
            "current_engine": self.current_engine,
            "total_docs": self.total_docs,
            "processed_docs": self.processed_docs,
            "output_path": self.output_path,
            "error": self.error,
            "started_at": self.started_at,
            "finished_at": self.finished_at,
        }
110
+
111
+
112
# In-memory registry of benchmark jobs keyed by job_id (filled by api_benchmark_start).
# NOTE(review): nothing visible here ever removes entries — confirm whether cleanup exists elsewhere.
+ _JOBS: dict[str, BenchmarkJob] = {}
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Pydantic models
116
+ # ---------------------------------------------------------------------------
117
+
118
# JSON body accepted by POST /api/benchmark/start.
# corpus_path must be an existing directory (checked by api_benchmark_start);
# the other fields are handed to the benchmark worker together with the request.
+ class BenchmarkRequest(BaseModel):
119
+ corpus_path: str
120
+ engines: list[str] = ["tesseract"]
121
+ normalization_profile: str = "nfc"
122
+ output_dir: str = "./rapports/"
123
+ report_name: str = ""
124
+ lang: str = "fra"
125
+
126
# JSON body accepted by POST /api/htr-united/import.
# entry_id is looked up in the HTR-United catalogue; output_dir and max_samples
# are forwarded unchanged to import_htr_united_corpus.
+ class HTRUnitedImportRequest(BaseModel):
127
+ entry_id: str
128
+ output_dir: str = "./corpus/"
129
+ max_samples: int = 100
130
+
131
# JSON body accepted by POST /api/huggingface/import.
# All four fields are forwarded unchanged to HuggingFaceImporter.import_dataset.
+ class HuggingFaceImportRequest(BaseModel):
132
+ dataset_id: str
133
+ output_dir: str = "./corpus/"
134
+ split: str = "train"
135
+ max_samples: int = 100
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # API — status
140
+ # ---------------------------------------------------------------------------
141
+
142
@app.get("/api/status")
async def api_status() -> dict:
    """Return a small health payload: app name, version, status and timestamp."""
    payload = dict(app="Picarones", version=__version__, status="ok")
    payload["timestamp"] = _iso_now()
    return payload
150
+
151
+
152
+ # ---------------------------------------------------------------------------
153
+ # API — engines
154
+ # ---------------------------------------------------------------------------
155
+
156
@app.get("/api/engines")
async def api_engines() -> dict:
    """Report which OCR engines and LLM back-ends are usable on this machine.

    Returns:
        ``{"engines": [...], "llms": [...]}`` where each OCR entry comes from
        ``_check_engine`` and each LLM entry states whether its API key is set
        (cloud providers) or whether a local Ollama server answers.
    """
    # Engine probes import heavy packages and the Ollama probes do blocking
    # HTTP calls; run them in the default executor so the event loop (and any
    # open SSE stream) keeps running meanwhile.
    loop = asyncio.get_running_loop()

    # (engine id, importable module, display label — "" lets _check_engine derive it)
    ocr_specs = [
        ("tesseract", "pytesseract", ""),
        ("pero_ocr", "pero_ocr", "Pero OCR"),
        ("kraken", "kraken", "Kraken"),
        ("calamari", "calamari_ocr", "Calamari"),
    ]
    engines = []
    for engine_id, module_name, label in ocr_specs:
        engines.append(
            await loop.run_in_executor(None, _check_engine, engine_id, module_name, label)
        )

    # (provider id, display label, environment variable holding the API key)
    cloud_specs = [
        ("openai", "OpenAI (GPT-4o, GPT-4o mini)", "OPENAI_API_KEY"),
        ("anthropic", "Anthropic (Claude Sonnet, Haiku)", "ANTHROPIC_API_KEY"),
        ("mistral", "Mistral (Mistral OCR, Pixtral, Large)", "MISTRAL_API_KEY"),
    ]
    llms = []
    for llm_id, label, key_env in cloud_specs:
        has_key = bool(os.environ.get(key_env))
        llms.append({
            "id": llm_id,
            "label": label,
            "type": "llm",
            "available": has_key,
            "key_env": key_env,
            "status": "configured" if has_key else "missing_key",
        })

    # Local Ollama server: list models only when the server answers.
    ollama_available = await loop.run_in_executor(None, _check_ollama)
    if ollama_available:
        ollama_models = await loop.run_in_executor(None, _list_ollama_models)
    else:
        ollama_models = []
    llms.append({
        "id": "ollama",
        "label": "Ollama (Llama 3, Gemma, Phi — local)",
        "type": "llm_local",
        "available": ollama_available,
        "status": "running" if ollama_available else "not_running",
        "models": ollama_models,
        "base_url": "http://localhost:11434",
    })

    return {"engines": engines, "llms": llms}
222
+
223
+
224
+ def _check_engine(engine_id: str, module_name: str, label: str = "") -> dict:
225
+ label = label or engine_id.replace("_", " ").title()
226
+ try:
227
+ __import__(module_name)
228
+ installed = True
229
+ except ImportError:
230
+ installed = False
231
+
232
+ version = ""
233
+ if installed and engine_id == "tesseract":
234
+ try:
235
+ import pytesseract
236
+ version = pytesseract.get_tesseract_version()
237
+ version = str(version)
238
+ except Exception:
239
+ version = "installé"
240
+ elif installed:
241
+ try:
242
+ mod = __import__(module_name)
243
+ version = getattr(mod, "__version__", "installé")
244
+ except Exception:
245
+ version = "installé"
246
+
247
+ return {
248
+ "id": engine_id,
249
+ "label": label,
250
+ "type": "ocr",
251
+ "available": installed,
252
+ "version": version,
253
+ "status": "available" if installed else "not_installed",
254
+ }
255
+
256
+
257
+ def _check_ollama() -> bool:
258
+ import urllib.error, urllib.request
259
+ try:
260
+ with urllib.request.urlopen("http://localhost:11434/api/tags", timeout=2) as r:
261
+ return r.status == 200
262
+ except Exception:
263
+ return False
264
+
265
+
266
+ def _list_ollama_models() -> list[str]:
267
+ import urllib.error, urllib.request
268
+ try:
269
+ with urllib.request.urlopen("http://localhost:11434/api/tags", timeout=2) as r:
270
+ data = json.loads(r.read().decode())
271
+ return [m.get("name", "") for m in data.get("models", [])]
272
+ except Exception:
273
+ return []
274
+
275
+
276
+ # ---------------------------------------------------------------------------
277
+ # API — corpus browse
278
+ # ---------------------------------------------------------------------------
279
+
280
@app.get("/api/corpus/browse")
async def api_corpus_browse(path: str = Query(default=".", description="Chemin à explorer")) -> dict:
    """List the entries of a server-side directory for the corpus picker.

    Each sub-directory is annotated with ``gt_count`` (number of ``*.gt.txt``
    files directly inside it) and ``has_corpus``.

    Raises:
        HTTPException: 404 when *path* is not an existing directory, 403 when
            the directory itself cannot be listed.
    """
    target = Path(path).resolve()
    if not target.is_dir():
        raise HTTPException(status_code=404, detail=f"Dossier non trouvé : {path}")

    try:
        entries = sorted(target.iterdir())
    except PermissionError as exc:
        raise HTTPException(status_code=403, detail=str(exc)) from exc

    items = []
    for entry in entries:
        is_dir = entry.is_dir()
        item: dict[str, Any] = {
            "name": entry.name,
            "path": str(entry),
            "is_dir": is_dir,
        }
        if is_dir:
            try:
                # Ground-truth files are named "<stem>.gt.txt".
                gt_count = sum(1 for f in entry.iterdir() if f.name.endswith(".gt.txt"))
            except OSError:
                # One unreadable sub-directory must not hide its siblings:
                # list it, but without any detected corpus.
                gt_count = 0
            item["gt_count"] = gt_count
            item["has_corpus"] = gt_count > 0
        items.append(item)

    return {
        "current_path": str(target),
        # The filesystem root is its own parent; report "no parent" there.
        "parent_path": str(target.parent) if target.parent != target else None,
        "items": items,
    }
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # API — normalization profiles
312
+ # ---------------------------------------------------------------------------
313
+
314
@app.get("/api/normalization/profiles")
async def api_normalization_profiles() -> dict:
    """Describe the built-in normalization profiles.

    Returns:
        ``{"profiles": [...]}`` with, per profile, its id, name, description
        (falls back to the name), ``caseless`` flag and number of diplomatic
        rules. Profiles that fail to load are skipped and logged.
    """
    import logging

    from picarones.core.normalization import get_builtin_profile

    profile_ids = (
        "nfc",
        "caseless",
        "minimal",
        "medieval_french",
        "early_modern_french",
        "medieval_latin",
    )

    profiles = []
    for pid in profile_ids:
        try:
            profile = get_builtin_profile(pid)
            profiles.append({
                "id": pid,
                "name": profile.name,
                "description": profile.description or profile.name,
                "caseless": profile.caseless,
                "diplomatic_rules": len(profile.diplomatic_table),
            })
        except Exception:
            # Best effort: one broken profile must not empty the list, but
            # the failure should be visible in the server log.
            logging.getLogger(__name__).warning(
                "Normalization profile %r could not be loaded", pid, exc_info=True
            )

    return {"profiles": profiles}
342
+
343
+
344
+ # ---------------------------------------------------------------------------
345
+ # API — reports
346
+ # ---------------------------------------------------------------------------
347
+
348
@app.get("/api/reports")
async def api_reports(reports_dir: str = Query(default=".", description="Dossier rapports")) -> dict:
    """List generated HTML reports.

    Looks in *reports_dir*, the current directory and ``./rapports``; within
    each directory the newest file comes first and a file is listed once.

    Returns:
        ``{"reports": [...]}`` with filename, path, size in KiB, modification
        time (UTC, ISO 8601) and the URL under ``/reports/``.
    """
    from urllib.parse import quote

    search_dirs = [
        Path(reports_dir).resolve(),
        Path(".").resolve(),
        Path("./rapports").resolve(),
    ]
    reports = []
    seen: set[str] = set()

    for directory in search_dirs:
        if not directory.is_dir():
            continue
        found = []
        try:
            for report in directory.glob("*.html"):
                key = str(report)
                if key in seen:
                    continue
                try:
                    stat = report.stat()  # single stat: used for sort, size and date
                except OSError:
                    # File vanished or is unreadable between listing and stat.
                    continue
                seen.add(key)
                found.append((report, stat))
        except OSError:
            # Directory not listable: skip it, keep the other locations.
            continue

        found.sort(key=lambda pair: pair[1].st_mtime, reverse=True)
        for report, stat in found:
            reports.append({
                "filename": report.name,
                "path": str(report),
                "size_kb": round(stat.st_size / 1024, 1),
                "modified": datetime.fromtimestamp(stat.st_mtime, tz=timezone.utc).isoformat(),
                # Percent-encode so names with spaces, '#' or '?' stay valid URLs.
                "url": f"/reports/{quote(report.name)}",
            })

    return {"reports": reports}
372
+
373
+
374
@app.get("/reports/{filename}")
async def serve_report(filename: str) -> FileResponse:
    """Serve a generated HTML report from the current directory or ``./rapports``.

    Raises:
        HTTPException: 404 when no such ``.html`` file exists in either
            directory, or when *filename* is not a plain file name.
    """
    # Only a bare file name is accepted: anything carrying a directory
    # component (or "." / "..") could point outside the two report folders.
    is_plain_name = filename not in ("", ".", "..") and Path(filename).name == filename
    if is_plain_name:
        for directory in (Path("."), Path("./rapports")):
            candidate = directory / filename
            if candidate.suffix == ".html" and candidate.is_file():
                return FileResponse(str(candidate.resolve()), media_type="text/html")
    raise HTTPException(status_code=404, detail=f"Rapport non trouvé : {filename}")
382
+
383
+
384
+ # ---------------------------------------------------------------------------
385
+ # API — HTR-United
386
+ # ---------------------------------------------------------------------------
387
+
388
@app.get("/api/htr-united/catalogue")
async def api_htr_united_catalogue(
    query: str = Query(default="", description="Recherche textuelle"),
    language: str = Query(default="", description="Filtre langue"),
    script: str = Query(default="", description="Filtre type d'écriture"),
) -> dict:
    """Search the HTR-United catalogue obtained from ``HTRUnitedCatalogue.from_demo()``.

    Empty ``language`` / ``script`` filters are passed to the search as ``None``.
    The response also carries the catalogue source and its available
    languages and scripts.
    """
    from picarones.importers.htr_united import HTRUnitedCatalogue

    catalogue = HTRUnitedCatalogue.from_demo()
    language_filter = language if language else None
    script_filter = script if script else None
    matches = catalogue.search(query=query, language=language_filter, script=script_filter)

    response = {
        "source": catalogue.source,
        "total": len(matches),
        "entries": [entry.as_dict() for entry in matches],
    }
    response["available_languages"] = catalogue.available_languages()
    response["available_scripts"] = catalogue.available_scripts()
    return response
409
+
410
+
411
@app.post("/api/htr-united/import")
async def api_htr_united_import(req: HTRUnitedImportRequest) -> dict:
    """Import the catalogue entry named by ``req.entry_id``.

    Returns whatever ``import_htr_united_corpus`` returns.

    Raises:
        HTTPException: 404 when the catalogue has no such entry.
    """
    from picarones.importers.htr_united import HTRUnitedCatalogue, import_htr_united_corpus

    entry = HTRUnitedCatalogue.from_demo().get_by_id(req.entry_id)
    if not entry:
        raise HTTPException(status_code=404, detail=f"Entrée non trouvée : {req.entry_id}")

    return import_htr_united_corpus(
        entry=entry,
        output_dir=req.output_dir,
        max_samples=req.max_samples,
    )
426
+
427
+
428
+ # ---------------------------------------------------------------------------
429
+ # API — HuggingFace
430
+ # ---------------------------------------------------------------------------
431
+
432
@app.get("/api/huggingface/search")
async def api_huggingface_search(
    query: str = Query(default="", description="Requête de recherche"),
    language: str = Query(default="", description="Filtre langue"),
    tags: str = Query(default="", description="Tags séparés par des virgules"),
    limit: int = Query(default=20, ge=1, le=50),
) -> dict:
    """Search HuggingFace datasets through ``HuggingFaceImporter.search``.

    ``tags`` is a comma-separated string; blank items are dropped and an empty
    string means "no tag filter" (``None``). An empty ``language`` is also
    passed as ``None``.
    """
    from picarones.importers.huggingface import HuggingFaceImporter

    if tags:
        tag_list = []
        for raw_tag in tags.split(","):
            cleaned = raw_tag.strip()
            if cleaned:
                tag_list.append(cleaned)
    else:
        tag_list = None

    found = HuggingFaceImporter().search(
        query=query,
        tags=tag_list,
        language=language or None,
        limit=limit,
    )
    datasets = [dataset.as_dict() for dataset in found]
    return {"total": len(found), "datasets": datasets}
453
+
454
+
455
@app.post("/api/huggingface/import")
async def api_huggingface_import(req: HuggingFaceImportRequest) -> dict:
    """Import a HuggingFace dataset; returns the importer's result unchanged."""
    from picarones.importers.huggingface import HuggingFaceImporter

    import_kwargs = {
        "dataset_id": req.dataset_id,
        "output_dir": req.output_dir,
        "split": req.split,
        "max_samples": req.max_samples,
    }
    return HuggingFaceImporter().import_dataset(**import_kwargs)
467
+
468
+
469
+ # ---------------------------------------------------------------------------
470
+ # API — benchmark
471
+ # ---------------------------------------------------------------------------
472
+
473
@app.post("/api/benchmark/start")
async def api_benchmark_start(req: BenchmarkRequest) -> dict:
    """Register a new benchmark job and start it in a background thread.

    Returns:
        ``{"job_id": <uuid4 string>, "status": "pending"}``.

    Raises:
        HTTPException: 400 when ``req.corpus_path`` is not an existing directory.
    """
    corpus_dir = Path(req.corpus_path)
    if not (corpus_dir.exists() and corpus_dir.is_dir()):
        raise HTTPException(status_code=400, detail=f"Corpus non trouvé : {req.corpus_path}")

    new_id = str(uuid.uuid4())
    job = BenchmarkJob(job_id=new_id)
    _JOBS[new_id] = job

    # Daemon thread: the benchmark must not keep the server process alive.
    worker = threading.Thread(target=_run_benchmark_thread, args=(job, req), daemon=True)
    worker.start()

    return {"job_id": new_id, "status": "pending"}
492
+
493
+
494
@app.get("/api/benchmark/{job_id}/status")
async def api_benchmark_status(job_id: str) -> dict:
    """Return the current state of a job as produced by ``BenchmarkJob.as_dict``.

    Raises:
        HTTPException: 404 when no job with this id is registered.
    """
    try:
        job = _JOBS[job_id]
    except KeyError:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")
    return job.as_dict()
500
+
501
+
502
@app.post("/api/benchmark/{job_id}/cancel")
async def api_benchmark_cancel(job_id: str) -> dict:
    """Flag a benchmark job as cancelled.

    Raises an HTTP 404 for an unknown ``job_id``. A job that already ended
    in ``"complete"`` or ``"error"`` is left untouched and reported with an
    explanatory message. Otherwise the status becomes ``"cancelled"`` and a
    ``"cancelled"`` event is recorded.

    The call is idempotent: cancelling an already-cancelled job returns the
    same response without recording a second ``"cancelled"`` event.
    """
    job = _JOBS.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")
    if job.status in ("complete", "error"):
        return {"job_id": job_id, "status": job.status, "message": "Job déjà terminé."}
    # Only emit the event on the first cancellation, so repeated requests
    # (e.g. a double click) do not pile duplicate events into the job log.
    if job.status != "cancelled":
        job.status = "cancelled"
        job.add_event("cancelled", {"message": "Benchmark annulé par l'utilisateur."})
    return {"job_id": job_id, "status": "cancelled"}
512
+
513
+
514
@app.get("/api/benchmark/{job_id}/stream")
async def api_benchmark_stream(job_id: str) -> StreamingResponse:
    """Stream a job's events to the client as Server-Sent Events.

    Raises an HTTP 404 for an unknown ``job_id``. The stream first replays
    the events already recorded on the job, then relays new ones from a
    subscription queue. It ends on a terminal event or once the job reaches
    a terminal status.
    """
    job = _JOBS.get(job_id)
    if not job:
        raise HTTPException(status_code=404, detail=f"Job non trouvé : {job_id}")

    finished_states = ("complete", "error", "cancelled")
    closing_kinds = ("complete", "error", "cancelled", "done")

    async def event_generator() -> AsyncIterator[str]:
        # Replay the backlog from a snapshot so the list can keep growing
        # while we iterate.
        for past in list(job.events):
            yield _sse_format(past["kind"], past["data"])

        if job.status in finished_states:
            yield _sse_format("done", {"status": job.status})
            return

        # NOTE(review): events recorded between the snapshot above and this
        # subscribe() call may not reach this client — confirm against
        # BenchmarkJob.subscribe / add_event.
        live = job.subscribe()
        try:
            streaming = True
            while streaming:
                try:
                    fresh = await asyncio.wait_for(live.get(), timeout=30.0)
                    yield _sse_format(fresh["kind"], fresh["data"])
                    streaming = fresh["kind"] not in closing_kinds
                except asyncio.TimeoutError:
                    # Nothing for 30 s: send an SSE comment as keepalive,
                    # then check whether the job ended in the meantime.
                    yield ": keepalive\n\n"
                    if job.status in finished_states:
                        yield _sse_format("done", {"status": job.status})
                        streaming = False
        finally:
            job.unsubscribe(live)

    sse_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_generator(),
        media_type="text/event-stream",
        headers=sse_headers,
    )
554
+
555
+
556
def _sse_format(event_type: str, data: Any) -> str:
    """Render one Server-Sent Events frame.

    The frame is an ``event:`` line, a ``data:`` line holding ``data``
    serialised as JSON (non-ASCII characters kept as-is), and a blank line.
    """
    body = json.dumps(data, ensure_ascii=False)
    frame_lines = (f"event: {event_type}", f"data: {body}", "", "")
    return "\n".join(frame_lines)
559
+
560
+
561
def _run_benchmark_thread(job: BenchmarkJob, req: BenchmarkRequest) -> None:
    """Run a benchmark in a worker thread and report progress as job events.

    Steps: load the corpus from ``req.corpus_path``, instantiate each engine
    named in ``req.engines`` (engines that fail to load are skipped with a
    ``"warning"`` event), run the benchmark, write the JSON and HTML outputs
    under ``req.output_dir``, then emit a ``"complete"`` event with the
    ranking.

    Any exception is caught: the job moves to ``"error"`` and an ``"error"``
    event carries the message. Cancellation is detected by polling
    ``job.status`` at each checkpoint; a cancelled job gets ``finished_at``
    set and is never flipped to ``"complete"``.
    """
    job.status = "running"
    job.started_at = _iso_now()
    job.add_event("start", {"message": "Démarrage du benchmark…", "corpus": req.corpus_path})

    def _stop_if_cancelled() -> bool:
        # Checkpoint helper: stamp the end time when the job was cancelled.
        if job.status != "cancelled":
            return False
        job.finished_at = _iso_now()
        return True

    try:
        from picarones.core.corpus import load_corpus_from_directory
        from picarones.core.runner import run_benchmark

        # Load the corpus.
        job.add_event("log", {"message": f"Chargement du corpus : {req.corpus_path}"})
        corpus = load_corpus_from_directory(req.corpus_path)
        job.total_docs = len(corpus)
        job.add_event("log", {"message": f"{job.total_docs} documents chargés."})

        if _stop_if_cancelled():
            return

        # Instantiate the engines.
        from picarones.cli import _engine_from_name

        ocr_engines = []
        for engine_name in req.engines:
            try:
                eng = _engine_from_name(engine_name, lang=req.lang, psm=6)
            except Exception as exc:
                # Best effort: one broken engine must not abort the run.
                # (click.BadParameter is an Exception subclass, so it is
                # covered here too.)
                job.add_event("warning", {"message": f"Moteur ignoré '{engine_name}' : {exc}"})
            else:
                ocr_engines.append(eng)
                job.add_event("log", {"message": f"Moteur chargé : {engine_name}"})

        if not ocr_engines:
            raise ValueError("Aucun moteur valide disponible.")

        # Output directory and file names.
        output_dir = Path(req.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        report_name = req.report_name or f"rapport_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
        output_json = str(output_dir / f"{report_name}.json")
        output_html = str(output_dir / f"{report_name}.html")

        # One step per (engine, document) pair.
        total_steps = job.total_docs * len(ocr_engines)
        steps_done = 0

        def _progress_callback(engine_name: str, doc_idx: int, doc_id: str) -> None:
            nonlocal steps_done
            if job.status == "cancelled":
                return
            steps_done += 1
            job.current_engine = engine_name
            job.processed_docs = doc_idx
            # max(..., 1) guards against division by zero on an empty corpus.
            job.progress = steps_done / max(total_steps, 1)
            job.add_event("progress", {
                "engine": engine_name,
                "doc_idx": doc_idx,
                "doc_id": doc_id,
                "progress": job.progress,
                "processed": steps_done,
                "total": total_steps,
            })

        # Run the benchmark.
        result = run_benchmark(
            corpus=corpus,
            engines=ocr_engines,
            output_json=output_json,
            show_progress=False,
            progress_callback=_progress_callback,
        )

        if _stop_if_cancelled():
            return

        # Generate the HTML report.
        job.add_event("log", {"message": "Génération du rapport HTML…"})
        from picarones.report.generator import ReportGenerator
        gen = ReportGenerator(result)
        gen.generate(output_html)

        # A cancel request may have arrived while the report was being
        # written; do not overwrite it with "complete".
        if _stop_if_cancelled():
            return

        job.output_path = output_html
        job.progress = 1.0
        job.status = "complete"
        job.finished_at = _iso_now()

        # Final ranking.
        ranking = result.ranking()
        job.add_event("complete", {
            "message": "Benchmark terminé.",
            "output_html": output_html,
            "output_json": output_json,
            "ranking": ranking,
        })

    except Exception as exc:
        # Top-level boundary of the worker thread: surface the failure on the
        # job instead of letting the thread die silently.
        job.status = "error"
        job.error = str(exc)
        job.finished_at = _iso_now()
        job.add_event("error", {"message": f"Erreur : {exc}"})
666
+
667
+
668
+ # ---------------------------------------------------------------------------
669
+ # Page principale HTML (SPA)
670
+ # ---------------------------------------------------------------------------
671
+
672
@app.get("/", response_class=HTMLResponse)
async def index() -> HTMLResponse:
    """Serve the single-page application built from ``_HTML_TEMPLATE``."""
    page = HTMLResponse(content=_HTML_TEMPLATE)
    return page
675
+
676
+
677
+ # ---------------------------------------------------------------------------
678
+ # Helper
679
+ # ---------------------------------------------------------------------------
680
+
681
def _iso_now() -> str:
    """Return the current UTC time as an ISO-8601 string, to the second."""
    now_utc = datetime.now(tz=timezone.utc)
    return now_utc.isoformat(timespec="seconds")
683
+
684
+
685
+ # ---------------------------------------------------------------------------
686
+ # HTML Template (SPA, French/English, Vanilla JS)
687
+ # ---------------------------------------------------------------------------
688
+
689
+ _HTML_TEMPLATE = r"""<!DOCTYPE html>
690
+ <html lang="fr">
691
+ <head>
692
+ <meta charset="UTF-8">
693
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
694
+ <title>Picarones — OCR Benchmark</title>
695
+ <style>
696
+ :root {
697
+ --bg: #f8f7f4;
698
+ --bg2: #ffffff;
699
+ --border: #d8d5ce;
700
+ --accent: #2d5a9e;
701
+ --accent-hover: #1e4080;
702
+ --success: #2a7a3b;
703
+ --warning: #c17b00;
704
+ --danger: #c0392b;
705
+ --text: #2c2c2c;
706
+ --text-muted: #6b6b6b;
707
+ --radius: 6px;
708
+ --shadow: 0 1px 4px rgba(0,0,0,0.1);
709
+ }
710
+ * { box-sizing: border-box; margin: 0; padding: 0; }
711
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; background: var(--bg); color: var(--text); font-size: 14px; line-height: 1.5; }
712
+ a { color: var(--accent); text-decoration: none; }
713
+ a:hover { text-decoration: underline; }
714
+
715
+ /* Layout */
716
+ #header { background: var(--accent); color: #fff; padding: 0 24px; display: flex; align-items: center; height: 52px; gap: 24px; position: sticky; top: 0; z-index: 100; }
717
+ #header h1 { font-size: 18px; font-weight: 600; letter-spacing: -0.3px; }
718
+ #header span.version { font-size: 11px; opacity: 0.7; margin-left: 4px; }
719
+ #nav { display: flex; gap: 4px; margin-left: auto; }
720
+ .nav-btn { background: transparent; border: 1px solid rgba(255,255,255,0.3); color: #fff; padding: 5px 12px; border-radius: var(--radius); cursor: pointer; font-size: 13px; transition: background 0.15s; }
721
+ .nav-btn:hover, .nav-btn.active { background: rgba(255,255,255,0.18); }
722
+ #lang-btn { margin-left: 12px; font-size: 12px; background: rgba(255,255,255,0.15); border: 1px solid rgba(255,255,255,0.3); color: #fff; padding: 4px 10px; border-radius: var(--radius); cursor: pointer; }
723
+
724
+ #main { max-width: 1100px; margin: 0 auto; padding: 24px 16px; }
725
+ .view { display: none; }
726
+ .view.active { display: block; }
727
+
728
+ /* Cards */
729
+ .card { background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); padding: 20px; margin-bottom: 16px; box-shadow: var(--shadow); }
730
+ .card h2 { font-size: 15px; font-weight: 600; margin-bottom: 14px; padding-bottom: 8px; border-bottom: 1px solid var(--border); color: var(--accent); }
731
+ .card h3 { font-size: 13px; font-weight: 600; margin-bottom: 10px; color: var(--text); }
732
+
733
+ /* Forms */
734
+ .form-row { display: flex; gap: 12px; flex-wrap: wrap; margin-bottom: 12px; align-items: flex-start; }
735
+ .form-group { display: flex; flex-direction: column; gap: 4px; flex: 1; min-width: 160px; }
736
+ label { font-size: 12px; font-weight: 500; color: var(--text-muted); }
737
+ input[type=text], input[type=number], select { padding: 7px 10px; border: 1px solid var(--border); border-radius: var(--radius); font-size: 13px; color: var(--text); background: #fff; width: 100%; }
738
+ input:focus, select:focus { outline: 2px solid var(--accent); outline-offset: -1px; }
739
+ .path-input-row { display: flex; gap: 8px; }
740
+ .path-input-row input { flex: 1; }
741
+ .btn { padding: 7px 16px; border: none; border-radius: var(--radius); cursor: pointer; font-size: 13px; font-weight: 500; transition: background 0.15s; display: inline-flex; align-items: center; gap: 6px; }
742
+ .btn-primary { background: var(--accent); color: #fff; }
743
+ .btn-primary:hover { background: var(--accent-hover); }
744
+ .btn-secondary { background: #e8e5de; color: var(--text); }
745
+ .btn-secondary:hover { background: #d8d5ce; }
746
+ .btn-danger { background: var(--danger); color: #fff; }
747
+ .btn-sm { padding: 4px 10px; font-size: 12px; }
748
+
749
+ /* Checkboxes list */
750
+ .checkbox-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 8px; }
751
+ .checkbox-item { display: flex; align-items: center; gap: 8px; padding: 8px 10px; border: 1px solid var(--border); border-radius: var(--radius); cursor: pointer; transition: border-color 0.1s; }
752
+ .checkbox-item:hover { border-color: var(--accent); }
753
+ .checkbox-item input { cursor: pointer; }
754
+ .checkbox-item.checked { border-color: var(--accent); background: #eef2fc; }
755
+ .engine-status { width: 8px; height: 8px; border-radius: 50%; display: inline-block; flex-shrink: 0; }
756
+ .status-ok { background: var(--success); }
757
+ .status-warn { background: var(--warning); }
758
+ .status-err { background: var(--danger); }
759
+
760
+ /* Progress */
761
+ .progress-bar-outer { height: 10px; background: #e0ddd5; border-radius: 5px; overflow: hidden; margin: 4px 0; }
762
+ .progress-bar-inner { height: 100%; background: var(--accent); border-radius: 5px; transition: width 0.3s; }
763
+ .log-box { background: #1a1a2e; color: #c8d8f8; font-family: monospace; font-size: 12px; padding: 12px; border-radius: var(--radius); max-height: 260px; overflow-y: auto; white-space: pre-wrap; line-height: 1.6; }
764
+ .log-box .log-warn { color: #f0c060; }
765
+ .log-box .log-error { color: #ff6b6b; }
766
+ .log-box .log-success { color: #6bf08a; }
767
+
768
+ /* Tables */
769
+ table { width: 100%; border-collapse: collapse; font-size: 13px; }
770
+ th { text-align: left; padding: 8px 10px; border-bottom: 2px solid var(--border); color: var(--text-muted); font-weight: 600; font-size: 12px; }
771
+ td { padding: 8px 10px; border-bottom: 1px solid var(--border); }
772
+ tr:last-child td { border-bottom: none; }
773
+ tr:hover td { background: #f0ede6; }
774
+ .badge { padding: 2px 7px; border-radius: 10px; font-size: 11px; font-weight: 500; }
775
+ .badge-ok { background: #d4edda; color: var(--success); }
776
+ .badge-warn { background: #fff3cd; color: var(--warning); }
777
+ .badge-err { background: #fde8e8; color: var(--danger); }
778
+
779
+ /* File browser */
780
+ #file-browser { border: 1px solid var(--border); border-radius: var(--radius); max-height: 300px; overflow-y: auto; }
781
+ .fb-item { display: flex; align-items: center; gap: 8px; padding: 8px 12px; cursor: pointer; border-bottom: 1px solid var(--border); }
782
+ .fb-item:last-child { border-bottom: none; }
783
+ .fb-item:hover { background: #f0ede6; }
784
+ .fb-icon { font-size: 16px; flex-shrink: 0; }
785
+ .fb-name { flex: 1; font-size: 13px; }
786
+ .fb-badge { font-size: 11px; color: var(--text-muted); }
787
+ .fb-path { font-size: 12px; color: var(--text-muted); padding: 6px 12px; background: #f4f2ed; border-bottom: 1px solid var(--border); font-family: monospace; }
788
+
789
+ /* Notifications */
790
+ .alert { padding: 10px 14px; border-radius: var(--radius); margin-bottom: 12px; font-size: 13px; }
791
+ .alert-success { background: #d4edda; color: var(--success); border: 1px solid #b8dfc4; }
792
+ .alert-error { background: #fde8e8; color: var(--danger); border: 1px solid #f5c6cb; }
793
+ .alert-info { background: #d0e4f7; color: #1a568c; border: 1px solid #b8d4ef; }
794
+
795
+ /* Dataset cards */
796
+ .ds-grid { display: grid; gap: 10px; }
797
+ .ds-card { border: 1px solid var(--border); border-radius: var(--radius); padding: 12px; background: #fff; }
798
+ .ds-card h4 { font-size: 13px; font-weight: 600; margin-bottom: 4px; }
799
+ .ds-card p { font-size: 12px; color: var(--text-muted); margin-bottom: 6px; }
800
+ .ds-meta { display: flex; gap: 8px; flex-wrap: wrap; }
801
+ .ds-tag { font-size: 11px; background: #eef2fc; color: var(--accent); padding: 2px 7px; border-radius: 10px; }
802
+
803
+ /* Spinner */
804
+ .spinner { display: inline-block; width: 14px; height: 14px; border: 2px solid #ccc; border-top-color: var(--accent); border-radius: 50%; animation: spin 0.7s linear infinite; }
805
+ @keyframes spin { to { transform: rotate(360deg); } }
806
+ </style>
807
+ </head>
808
+ <body>
809
+
810
+ <div id="header">
811
+ <h1 data-i18n="app_title">Picarones <span class="version" id="app-version"></span></h1>
812
+ <nav id="nav">
813
+ <button class="nav-btn active" onclick="showView('benchmark')" data-i18n="nav_benchmark">Benchmark</button>
814
+ <button class="nav-btn" onclick="showView('reports')" data-i18n="nav_reports">Rapports</button>
815
+ <button class="nav-btn" onclick="showView('engines')" data-i18n="nav_engines">Moteurs</button>
816
+ <button class="nav-btn" onclick="showView('import')" data-i18n="nav_import">Import</button>
817
+ </nav>
818
+ <button id="lang-btn" onclick="toggleLang()">EN</button>
819
+ </div>
820
+
821
+ <div id="main">
822
+
823
+ <!-- ===== VUE BENCHMARK ===== -->
824
+ <div id="view-benchmark" class="view active">
825
+
826
+ <div class="card">
827
+ <h2 data-i18n="bench_corpus_title">1. Corpus</h2>
828
+ <div class="form-group">
829
+ <label data-i18n="bench_corpus_label">Chemin vers le dossier corpus (paires image/.gt.txt)</label>
830
+ <div class="path-input-row">
831
+ <input type="text" id="corpus-path" placeholder="./corpus/" value="" />
832
+ <button class="btn btn-secondary btn-sm" onclick="openFileBrowser()" data-i18n="bench_browse">Parcourir</button>
833
+ </div>
834
+ </div>
835
+ <div id="file-browser-container" style="display:none; margin-top:10px;">
836
+ <div class="fb-path" id="fb-current-path">.</div>
837
+ <div id="file-browser"></div>
838
+ </div>
839
+ <div id="corpus-info" style="margin-top:8px; font-size:12px; color: var(--text-muted);"></div>
840
+ </div>
841
+
842
+ <div class="card">
843
+ <h2 data-i18n="bench_engines_title">2. Moteurs et pipelines</h2>
844
+ <div id="engine-checkboxes" class="checkbox-grid">
845
+ <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
846
+ </div>
847
+ </div>
848
+
849
+ <div class="card">
850
+ <h2 data-i18n="bench_options_title">3. Options</h2>
851
+ <div class="form-row">
852
+ <div class="form-group">
853
+ <label data-i18n="bench_norm_label">Profil de normalisation</label>
854
+ <select id="norm-profile">
855
+ <option value="nfc">NFC (standard)</option>
856
+ </select>
857
+ </div>
858
+ <div class="form-group">
859
+ <label data-i18n="bench_lang_label">Langue (Tesseract)</label>
860
+ <input type="text" id="bench-lang" value="fra" placeholder="fra" />
861
+ </div>
862
+ <div class="form-group">
863
+ <label data-i18n="bench_output_label">Dossier de sortie</label>
864
+ <input type="text" id="output-dir" value="./rapports/" />
865
+ </div>
866
+ <div class="form-group">
867
+ <label data-i18n="bench_name_label">Nom du rapport (optionnel)</label>
868
+ <input type="text" id="report-name" placeholder="rapport_2024_01_15" />
869
+ </div>
870
+ </div>
871
+ </div>
872
+
873
+ <div style="display:flex; gap:10px; align-items:center; margin-bottom:16px;">
874
+ <button class="btn btn-primary" id="start-btn" onclick="startBenchmark()" data-i18n="bench_start">▶ Lancer le benchmark</button>
875
+ <button class="btn btn-secondary" id="cancel-btn" style="display:none;" onclick="cancelBenchmark()" data-i18n="bench_cancel">✕ Annuler</button>
876
+ <span id="bench-status-text" style="font-size:12px; color: var(--text-muted);"></span>
877
+ </div>
878
+
879
+ <div id="bench-progress-section" style="display:none;">
880
+ <div class="card">
881
+ <h2 data-i18n="bench_progress_title">Progression</h2>
882
+ <div id="engine-progress-list"></div>
883
+ <div style="margin-top: 12px;">
884
+ <label style="font-size:12px; color: var(--text-muted); display:block; margin-bottom:4px;" data-i18n="bench_log">Journal</label>
885
+ <div class="log-box" id="bench-log"></div>
886
+ </div>
887
+ </div>
888
+ </div>
889
+
890
+ <div id="bench-result-section" style="display:none;">
891
+ <div class="card">
892
+ <h2 data-i18n="bench_result_title">Résultats</h2>
893
+ <div id="bench-ranking-table"></div>
894
+ <div style="margin-top:12px;">
895
+ <a id="bench-report-link" href="#" class="btn btn-primary" target="_blank" data-i18n="bench_open_report">Ouvrir le rapport</a>
896
+ </div>
897
+ </div>
898
+ </div>
899
+ </div>
900
+
901
+ <!-- ===== VUE RAPPORTS ===== -->
902
+ <div id="view-reports" class="view">
903
+ <div class="card">
904
+ <h2 data-i18n="reports_title">Rapports générés</h2>
905
+ <div class="form-row" style="margin-bottom:12px;">
906
+ <div class="form-group" style="max-width:320px;">
907
+ <label data-i18n="reports_dir_label">Dossier de rapports</label>
908
+ <div class="path-input-row">
909
+ <input type="text" id="reports-dir" value="." />
910
+ <button class="btn btn-secondary btn-sm" onclick="loadReports()" data-i18n="reports_refresh">Rafraîchir</button>
911
+ </div>
912
+ </div>
913
+ </div>
914
+ <div id="reports-list">
915
+ <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
916
+ </div>
917
+ </div>
918
+ </div>
919
+
920
+ <!-- ===== VUE MOTEURS ===== -->
921
+ <div id="view-engines" class="view">
922
+ <div class="card">
923
+ <h2 data-i18n="engines_ocr_title">Moteurs OCR</h2>
924
+ <div id="engines-ocr-list">
925
+ <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
926
+ </div>
927
+ </div>
928
+ <div class="card">
929
+ <h2 data-i18n="engines_llm_title">LLMs disponibles</h2>
930
+ <div id="engines-llm-list">
931
+ <div style="color: var(--text-muted); font-size: 12px;" data-i18n="loading">Chargement…</div>
932
+ </div>
933
+ </div>
934
+ </div>
935
+
936
+ <!-- ===== VUE IMPORT ===== -->
937
+ <div id="view-import" class="view">
938
+
939
+ <!-- HTR-United -->
940
+ <div class="card">
941
+ <h2 data-i18n="import_htr_title">Import HTR-United</h2>
942
+ <p style="font-size:12px; color:var(--text-muted); margin-bottom:12px;" data-i18n="import_htr_desc">
943
+ Catalogue communautaire de corpus HTR/OCR pour documents patrimoniaux.
944
+ </p>
945
+ <div class="form-row">
946
+ <div class="form-group" style="flex:2;">
947
+ <label data-i18n="import_search_label">Recherche</label>
948
+ <input type="text" id="htr-search" placeholder="médiéval, latin, manuscrits…" />
949
+ </div>
950
+ <div class="form-group">
951
+ <label data-i18n="import_lang_filter">Langue</label>
952
+ <select id="htr-lang-filter">
953
+ <option value="" data-i18n="all">Toutes</option>
954
+ </select>
955
+ </div>
956
+ <div class="form-group">
957
+ <label data-i18n="import_script_filter">Type d'écriture</label>
958
+ <select id="htr-script-filter">
959
+ <option value="" data-i18n="all">Tous</option>
960
+ </select>
961
+ </div>
962
+ <div class="form-group" style="justify-content: flex-end; padding-top: 18px;">
963
+ <button class="btn btn-primary btn-sm" onclick="searchHTRUnited()" data-i18n="search">Rechercher</button>
964
+ </div>
965
+ </div>
966
+ <div id="htr-results" class="ds-grid"></div>
967
+ </div>
968
+
969
+ <!-- HuggingFace -->
970
+ <div class="card">
971
+ <h2 data-i18n="import_hf_title">Import HuggingFace Datasets</h2>
972
+ <p style="font-size:12px; color:var(--text-muted); margin-bottom:12px;" data-i18n="import_hf_desc">
973
+ Datasets OCR/HTR publics depuis HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).
974
+ </p>
975
+ <div class="form-row">
976
+ <div class="form-group" style="flex:2;">
977
+ <label data-i18n="import_search_label">Recherche</label>
978
+ <input type="text" id="hf-search" placeholder="medieval OCR, IAM, RIMES…" />
979
+ </div>
980
+ <div class="form-group">
981
+ <label data-i18n="import_lang_filter">Langue</label>
982
+ <input type="text" id="hf-lang-filter" placeholder="French, Latin…" />
983
+ </div>
984
+ <div class="form-group">
985
+ <label data-i18n="import_tag_filter">Tags</label>
986
+ <input type="text" id="hf-tags" placeholder="ocr, htr, historical…" />
987
+ </div>
988
+ <div class="form-group" style="justify-content: flex-end; padding-top: 18px;">
989
+ <button class="btn btn-primary btn-sm" onclick="searchHuggingFace()" data-i18n="search">Rechercher</button>
990
+ </div>
991
+ </div>
992
+ <div id="hf-results" class="ds-grid"></div>
993
+ </div>
994
+
995
+ </div><!-- end view-import -->
996
+
997
+ </div><!-- end #main -->
998
+
999
+ <!-- Import modal -->
1000
+ <div id="import-modal" style="display:none; position:fixed; inset:0; background:rgba(0,0,0,0.4); z-index:200; align-items:center; justify-content:center;">
1001
+ <div class="card" style="width: 420px; max-width: 95vw;">
1002
+ <h2 id="import-modal-title" data-i18n="import_modal_title">Importer le corpus</h2>
1003
+ <input type="hidden" id="import-modal-type" />
1004
+ <input type="hidden" id="import-modal-id" />
1005
+ <div class="form-group" style="margin-bottom:12px;">
1006
+ <label data-i18n="import_output_dir">Dossier de destination</label>
1007
+ <input type="text" id="import-modal-output" value="./corpus/" />
1008
+ </div>
1009
+ <div class="form-group" style="margin-bottom:16px;">
1010
+ <label data-i18n="import_max_samples">Nombre max de documents</label>
1011
+ <input type="number" id="import-modal-max" value="100" min="1" max="10000" />
1012
+ </div>
1013
+ <div id="import-modal-status" style="margin-bottom:12px;"></div>
1014
+ <div style="display:flex; gap:8px;">
1015
+ <button class="btn btn-primary" onclick="confirmImport()" data-i18n="import_confirm">Importer</button>
1016
+ <button class="btn btn-secondary" onclick="closeImportModal()" data-i18n="cancel">Annuler</button>
1017
+ </div>
1018
+ </div>
1019
+ </div>
1020
+
1021
+ <script>
1022
+ // ─── i18n ────────────────────────────────────────────────────────────────────
1023
+ const T = {
1024
+ fr: {
1025
+ app_title: "Picarones",
1026
+ nav_benchmark: "Benchmark",
1027
+ nav_reports: "Rapports",
1028
+ nav_engines: "Moteurs",
1029
+ nav_import: "Import",
1030
+ loading: "Chargement…",
1031
+ search: "Rechercher",
1032
+ all: "Tous",
1033
+ cancel: "Annuler",
1034
+ bench_corpus_title: "1. Corpus",
1035
+ bench_corpus_label: "Chemin vers le dossier corpus (paires image / .gt.txt)",
1036
+ bench_browse: "Parcourir",
1037
+ bench_engines_title: "2. Moteurs et pipelines",
1038
+ bench_options_title: "3. Options",
1039
+ bench_norm_label: "Profil de normalisation",
1040
+ bench_lang_label: "Langue (Tesseract)",
1041
+ bench_output_label: "Dossier de sortie",
1042
+ bench_name_label: "Nom du rapport (optionnel)",
1043
+ bench_start: "▶ Lancer le benchmark",
1044
+ bench_cancel: "✕ Annuler",
1045
+ bench_progress_title: "Progression",
1046
+ bench_log: "Journal",
1047
+ bench_result_title: "Résultats",
1048
+ bench_open_report: "Ouvrir le rapport",
1049
+ reports_title: "Rapports générés",
1050
+ reports_dir_label: "Dossier de rapports",
1051
+ reports_refresh: "Rafraîchir",
1052
+ engines_ocr_title: "Moteurs OCR",
1053
+ engines_llm_title: "LLMs disponibles",
1054
+ import_htr_title: "Import HTR-United",
1055
+ import_htr_desc: "Catalogue communautaire de corpus HTR/OCR pour documents patrimoniaux.",
1056
+ import_hf_title: "Import HuggingFace Datasets",
1057
+ import_hf_desc: "Datasets OCR/HTR publics depuis HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).",
1058
+ import_search_label: "Recherche",
1059
+ import_lang_filter: "Langue",
1060
+ import_script_filter: "Type d'écriture",
1061
+ import_tag_filter: "Tags",
1062
+ import_modal_title: "Importer le corpus",
1063
+ import_output_dir: "Dossier de destination",
1064
+ import_max_samples: "Nombre max de documents",
1065
+ import_confirm: "Importer",
1066
+ available: "disponible",
1067
+ not_installed: "non installé",
1068
+ configured: "configuré",
1069
+ missing_key: "clé manquante",
1070
+ running: "actif",
1071
+ not_running: "inactif",
1072
+ no_reports: "Aucun rapport trouvé.",
1073
+ lines: "lignes",
1074
+ centuries: "siècles",
1075
+ },
1076
+ en: {
1077
+ app_title: "Picarones",
1078
+ nav_benchmark: "Benchmark",
1079
+ nav_reports: "Reports",
1080
+ nav_engines: "Engines",
1081
+ nav_import: "Import",
1082
+ loading: "Loading…",
1083
+ search: "Search",
1084
+ all: "All",
1085
+ cancel: "Cancel",
1086
+ bench_corpus_title: "1. Corpus",
1087
+ bench_corpus_label: "Path to corpus directory (image / .gt.txt pairs)",
1088
+ bench_browse: "Browse",
1089
+ bench_engines_title: "2. Engines & pipelines",
1090
+ bench_options_title: "3. Options",
1091
+ bench_norm_label: "Normalization profile",
1092
+ bench_lang_label: "Language (Tesseract)",
1093
+ bench_output_label: "Output directory",
1094
+ bench_name_label: "Report name (optional)",
1095
+ bench_start: "▶ Start benchmark",
1096
+ bench_cancel: "✕ Cancel",
1097
+ bench_progress_title: "Progress",
1098
+ bench_log: "Log",
1099
+ bench_result_title: "Results",
1100
+ bench_open_report: "Open report",
1101
+ reports_title: "Generated reports",
1102
+ reports_dir_label: "Reports directory",
1103
+ reports_refresh: "Refresh",
1104
+ engines_ocr_title: "OCR Engines",
1105
+ engines_llm_title: "Available LLMs",
1106
+ import_htr_title: "Import from HTR-United",
1107
+ import_htr_desc: "Community catalogue of HTR/OCR datasets for heritage documents.",
1108
+ import_hf_title: "Import from HuggingFace Datasets",
1109
+ import_hf_desc: "Public OCR/HTR datasets from HuggingFace Hub (IAM, RIMES, CATMuS, Gallica…).",
1110
+ import_search_label: "Search",
1111
+ import_lang_filter: "Language",
1112
+ import_script_filter: "Script type",
1113
+ import_tag_filter: "Tags",
1114
+ import_modal_title: "Import corpus",
1115
+ import_output_dir: "Output directory",
1116
+ import_max_samples: "Max documents",
1117
+ import_confirm: "Import",
1118
+ available: "available",
1119
+ not_installed: "not installed",
1120
+ configured: "configured",
1121
+ missing_key: "key missing",
1122
+ running: "running",
1123
+ not_running: "not running",
1124
+ no_reports: "No reports found.",
1125
+ lines: "lines",
1126
+ centuries: "centuries",
1127
+ },
1128
+ };
1129
+ let lang = "fr";
1130
+ function t(key) { return (T[lang][key]) || key; }
1131
+ function toggleLang() {
1132
+ lang = lang === "fr" ? "en" : "fr";
1133
+ document.getElementById("lang-btn").textContent = lang === "fr" ? "EN" : "FR";
1134
+ document.querySelectorAll("[data-i18n]").forEach(el => {
1135
+ const k = el.getAttribute("data-i18n");
1136
+ if (T[lang][k]) el.textContent = T[lang][k];
1137
+ });
1138
+ }
1139
+
1140
+ // ─── Navigation ──────────────────────────────────────────────────────────────
1141
+ function showView(name) {
1142
+ document.querySelectorAll(".view").forEach(v => v.classList.remove("active"));
1143
+ document.querySelectorAll(".nav-btn").forEach(b => b.classList.remove("active"));
1144
+ const view = document.getElementById("view-" + name);
1145
+ if (view) view.classList.add("active");
1146
+ const btns = document.querySelectorAll(".nav-btn");
1147
+ const idx = ["benchmark","reports","engines","import"].indexOf(name);
1148
+ if (btns[idx]) btns[idx].classList.add("active");
1149
+
1150
+ if (name === "reports") loadReports();
1151
+ if (name === "engines") loadEngines();
1152
+ if (name === "import") { searchHTRUnited(); searchHuggingFace(); }
1153
+ }
1154
+
1155
+ // ─── Status / version ────────────────────────────────────────────────────────
1156
+ async function loadStatus() {
1157
+ try {
1158
+ const r = await fetch("/api/status");
1159
+ const d = await r.json();
1160
+ document.getElementById("app-version").textContent = "v" + d.version;
1161
+ } catch(e) {}
1162
+ }
1163
+
1164
+ // ─── Engine checkboxes ───────────────────────────────────────────────────────
1165
+ async function loadEngineCheckboxes() {
1166
+ try {
1167
+ const r = await fetch("/api/engines");
1168
+ const d = await r.json();
1169
+ const container = document.getElementById("engine-checkboxes");
1170
+ container.innerHTML = "";
1171
+
1172
+ [...d.engines, ...d.llms].forEach(eng => {
1173
+ const item = document.createElement("label");
1174
+ item.className = "checkbox-item" + (eng.available ? " checked" : "");
1175
+ const dot = `<span class="engine-status ${eng.available ? "status-ok" : "status-err"}"></span>`;
1176
+ const chk = `<input type="checkbox" name="engine" value="${eng.id}" ${eng.available ? "checked" : ""} ${eng.available ? "" : ""}>`;
1177
+ item.innerHTML = `${chk}${dot}<span>${eng.label}</span>`;
1178
+ item.querySelector("input").addEventListener("change", e => {
1179
+ item.classList.toggle("checked", e.target.checked);
1180
+ });
1181
+ container.appendChild(item);
1182
+ });
1183
+
1184
+ // Store all engine data for later
1185
+ window._enginesData = d;
1186
+ } catch(e) {
1187
+ document.getElementById("engine-checkboxes").innerHTML =
1188
+ '<span style="color: var(--danger); font-size:12px;">Erreur chargement moteurs</span>';
1189
+ }
1190
+ }
1191
+
1192
+ // ─── Normalization profiles ──────────────────────────────────────────────────
1193
+ async function loadNormProfiles() {
1194
+ try {
1195
+ const r = await fetch("/api/normalization/profiles");
1196
+ const d = await r.json();
1197
+ const sel = document.getElementById("norm-profile");
1198
+ sel.innerHTML = "";
1199
+ d.profiles.forEach(p => {
1200
+ const opt = document.createElement("option");
1201
+ opt.value = p.id;
1202
+ opt.textContent = `${p.name} — ${p.description}`;
1203
+ if (p.id === "nfc") opt.selected = true;
1204
+ sel.appendChild(opt);
1205
+ });
1206
+ } catch(e) {}
1207
+ }
1208
+
1209
+ // ─── File browser ────────────────────────────────────────────────────────────
1210
+ let _fbVisible = false;
1211
+ function openFileBrowser() {
1212
+ _fbVisible = !_fbVisible;
1213
+ const c = document.getElementById("file-browser-container");
1214
+ c.style.display = _fbVisible ? "block" : "none";
1215
+ if (_fbVisible) browsePath(".");
1216
+ }
1217
// Show the sub-directories of `path` in the file browser panel.
// Clicking a folder flagged `has_corpus` selects it as the corpus path and closes
// the panel; clicking any other folder lists that folder instead.
async function browsePath(path) {
  const fb = document.getElementById("file-browser");

  // Rows and messages are assembled from text nodes so that directory names and
  // error messages are displayed verbatim instead of being parsed as HTML.
  const makeSpan = (className, text) => {
    const span = document.createElement("span");
    span.className = className;
    span.textContent = text;
    return span;
  };
  const makeRow = (icon, name) => {
    const row = document.createElement("div");
    row.className = "fb-item";
    row.appendChild(makeSpan("fb-icon", icon));
    row.appendChild(makeSpan("fb-name", name));
    return row;
  };
  const showMessage = (text, colorVar) => {
    const box = document.createElement("div");
    box.style.cssText = `padding:12px; color: var(${colorVar}); font-size:12px;`;
    box.textContent = text;
    fb.innerHTML = "";
    fb.appendChild(box);
  };

  try {
    const r = await fetch(`/api/corpus/browse?path=${encodeURIComponent(path)}`);
    if (!r.ok) {
      // An error body is not necessarily JSON; fall back to the status code.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    document.getElementById("fb-current-path").textContent = d.current_path;
    fb.innerHTML = "";

    if (d.parent_path) {
      const up = makeRow("⬆", "..");
      up.onclick = () => browsePath(d.parent_path);
      fb.appendChild(up);
    }

    d.items.filter(i => i.is_dir).forEach(item => {
      const el = makeRow("📁", item.name);
      if (item.has_corpus) {
        const badge = makeSpan("fb-badge", `✓ ${item.gt_count} GT`);
        badge.style.color = "var(--success)";
        el.appendChild(badge);
      }
      el.onclick = () => {
        if (item.has_corpus) {
          document.getElementById("corpus-path").value = item.path;
          document.getElementById("corpus-info").textContent = `✓ ${item.gt_count} documents GT trouvés.`;
          _fbVisible = false;
          document.getElementById("file-browser-container").style.display = "none";
        } else {
          browsePath(item.path);
        }
      };
      fb.appendChild(el);
    });

    if (fb.children.length === 0) {
      showMessage("Dossier vide", "--text-muted");
    }
  } catch (e) {
    showMessage(`Erreur : ${e.message}`, "--danger");
  }
}
1256
+
1257
+ // ─── Benchmark ───────────────────────────────────────────────────────────────
1258
// Id of the benchmark job most recently started from this page (null until a job starts).
let _currentJobId = null;
// EventSource currently streaming job events, or null when no stream is open.
let _eventSource = null;

// Read the benchmark form, POST it to /api/benchmark/start, then follow the job
// through _startSSE. Nothing is sent when the corpus path is empty or when no
// engine checkbox is ticked.
async function startBenchmark() {
  const byId = id => document.getElementById(id);
  const say = (fr, en) => (lang === "fr" ? fr : en);

  const corpusPath = byId("corpus-path").value.trim();
  if (!corpusPath) {
    alert(say("Veuillez sélectionner un dossier corpus.", "Please select a corpus directory."));
    return;
  }

  const engines = Array.from(
    document.querySelectorAll("input[name=engine]:checked"),
    box => box.value
  );
  if (engines.length === 0) {
    alert(say("Veuillez sélectionner au moins un moteur.", "Please select at least one engine."));
    return;
  }

  const payload = {
    corpus_path: corpusPath,
    engines,
    normalization_profile: byId("norm-profile").value,
    output_dir: byId("output-dir").value,
    report_name: byId("report-name").value,
    lang: byId("bench-lang").value,
  };

  // Put the UI in its "running" state and wipe the previous run's output.
  byId("start-btn").disabled = true;
  byId("cancel-btn").style.display = "inline-flex";
  byId("bench-progress-section").style.display = "block";
  byId("bench-result-section").style.display = "none";
  byId("bench-log").textContent = "";
  byId("engine-progress-list").innerHTML = "";
  byId("bench-status-text").textContent = say("Démarrage…", "Starting…");

  try {
    const response = await fetch("/api/benchmark/start", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify(payload),
    });
    if (!response.ok) {
      const failure = await response.json();
      throw new Error(failure.detail || "Erreur serveur");
    }
    const job = await response.json();
    _currentJobId = job.job_id;
    _startSSE(_currentJobId, engines);
  } catch (err) {
    // Report the failure in the log and hand the controls back to the user.
    appendLog(`Erreur : ${err.message}`, "error");
    byId("start-btn").disabled = false;
    byId("cancel-btn").style.display = "none";
    byId("bench-status-text").textContent = "";
  }
}
1310
+
1311
// Subscribe to the server-sent event stream of benchmark job `jobId` and mirror
// it in the UI: one progress bar per entry of `engines`, log lines, the final
// ranking, and re-enabling of the controls when the job ends.
function _startSSE(jobId, engines) {
  if (_eventSource) _eventSource.close();

  // One progress row per engine, assembled from DOM nodes so that engine ids
  // are shown as text and never parsed as markup.
  const pl = document.getElementById("engine-progress-list");
  pl.innerHTML = "";
  engines.forEach(eng => {
    const row = document.createElement("div");
    row.id = `eng-progress-${eng}`;
    row.style.cssText = "margin-bottom: 8px;";

    const header = document.createElement("div");
    header.style.cssText = "display:flex; justify-content:space-between; font-size:12px; margin-bottom:3px;";
    const nameEl = document.createElement("span");
    nameEl.textContent = eng;
    const pctEl = document.createElement("span");
    pctEl.id = `eng-pct-${eng}`;
    pctEl.textContent = "0%";
    header.appendChild(nameEl);
    header.appendChild(pctEl);

    const outer = document.createElement("div");
    outer.className = "progress-bar-outer";
    const inner = document.createElement("div");
    inner.className = "progress-bar-inner";
    inner.id = `eng-bar-${eng}`;
    inner.style.width = "0%";
    outer.appendChild(inner);

    row.appendChild(header);
    row.appendChild(outer);
    pl.appendChild(row);
  });

  _eventSource = new EventSource(`/api/benchmark/${jobId}/stream`);

  _eventSource.addEventListener("start", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message, "success");
    document.getElementById("bench-status-text").textContent = lang === "fr" ? "En cours…" : "Running…";
  });

  _eventSource.addEventListener("log", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message);
  });

  _eventSource.addEventListener("warning", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message, "warn");
  });

  _eventSource.addEventListener("progress", e => {
    const d = JSON.parse(e.data);
    const pct = Math.round(d.progress * 100);
    document.getElementById("bench-status-text").textContent =
      `${pct}% — ${d.engine} (${d.processed}/${d.total})`;
    // Only engines that were given a row above have a bar to update.
    if (engines.includes(d.engine)) {
      const bar = document.getElementById(`eng-bar-${d.engine}`);
      const pctEl = document.getElementById(`eng-pct-${d.engine}`);
      if (bar && pctEl) {
        bar.style.width = pct + "%";
        pctEl.textContent = pct + "%";
      }
    }
  });

  _eventSource.addEventListener("complete", e => {
    const d = JSON.parse(e.data);
    appendLog(d.message, "success");
    _showResults(d);
    _finishBenchmark();
  });

  // "error" names both a server-sent event (with a JSON payload) and the native
  // EventSource failure event, which has no `data`. Only the former is parsed;
  // parsing the latter would throw inside the listener.
  _eventSource.addEventListener("error", e => {
    if (typeof e.data === "string") {
      let message = e.data;
      try {
        message = JSON.parse(e.data).message;
      } catch (parseError) {
        // Not JSON: log the raw payload instead.
      }
      appendLog(message, "error");
    }
    _finishBenchmark();
  });

  _eventSource.addEventListener("cancelled", e => {
    appendLog(lang === "fr" ? "Benchmark annulé." : "Benchmark cancelled.", "warn");
    _finishBenchmark();
  });

  _eventSource.addEventListener("done", e => {
    _finishBenchmark();
  });

  // Connection-level failure: stop following the job rather than letting the
  // EventSource keep reconnecting.
  _eventSource.onerror = () => {
    if (_currentJobId) {
      _finishBenchmark();
    }
  };
}
1387
+
1388
// Reveal the result section for a finished benchmark: point the report link at
// the generated HTML file and render the engine ranking table.
// `data` is the payload of the "complete" event; `output_html` and `ranking`
// are both optional.
function _showResults(data) {
  // Minimal HTML escaping for values interpolated into the table markup.
  const esc = value => String(value == null ? "" : value).replace(/[&<>"']/g, ch => (
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]
  ));
  // Ratio -> percentage with two decimals, or "N/A" when the metric is missing.
  const pct = ratio => (ratio != null ? (ratio * 100).toFixed(2) + "%" : "N/A");

  const section = document.getElementById("bench-result-section");
  section.style.display = "block";

  if (data.output_html) {
    const link = document.getElementById("bench-report-link");
    // Keep only the file name and encode it so characters such as "#", "?" or
    // "%" cannot alter the URL.
    const fileName = data.output_html.split("/").pop();
    link.href = `/reports/${encodeURIComponent(fileName)}`;
  }

  if (data.ranking) {
    const rows = data.ranking.map((row, i) =>
      `<tr><td>${i + 1}</td><td>${esc(row.engine)}</td><td>${pct(row.mean_cer)}</td>` +
      `<td>${pct(row.mean_wer)}</td><td>${esc(row.total_docs)}</td></tr>`
    ).join("");
    document.getElementById("bench-ranking-table").innerHTML =
      `<table><thead><tr><th>#</th><th>${lang === "fr" ? "Moteur" : "Engine"}</th>` +
      `<th>CER</th><th>WER</th><th>Docs</th></tr></thead><tbody>${rows}</tbody></table>`;
  }
}
1406
+
1407
// Close the event stream, if one is open, and put the benchmark controls back
// in their idle state (start enabled, cancel hidden, status text cleared).
function _finishBenchmark() {
  const stream = _eventSource;
  if (stream) {
    stream.close();
    _eventSource = null;
  }
  const startBtn = document.getElementById("start-btn");
  const cancelBtn = document.getElementById("cancel-btn");
  const statusText = document.getElementById("bench-status-text");
  startBtn.disabled = false;
  cancelBtn.style.display = "none";
  statusText.textContent = "";
}
1413
+
1414
// Ask the server to cancel the current benchmark job; does nothing when no job
// id is known. A network failure or an HTTP error status is written to the
// benchmark log instead of surfacing as an unhandled promise rejection.
async function cancelBenchmark() {
  if (!_currentJobId) return;
  try {
    const r = await fetch(`/api/benchmark/${_currentJobId}/cancel`, {method: "POST"});
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
  } catch (e) {
    appendLog(`Erreur : ${e.message}`, "error");
  }
}
1418
+
1419
// Append one line to the benchmark log and keep the log scrolled to the bottom.
// `cls` may be "error", "warn" or "success" to pick the matching CSS class;
// any other value leaves the line unstyled.
function appendLog(msg, cls) {
  const classByKind = new Map([
    ["error", "log-error"],
    ["warn", "log-warn"],
    ["success", "log-success"],
  ]);
  const entry = document.createElement("div");
  if (classByKind.has(cls)) {
    entry.className = classByKind.get(cls);
  }
  entry.textContent = msg;

  const logBox = document.getElementById("bench-log");
  logBox.appendChild(entry);
  logBox.scrollTop = logBox.scrollHeight;
}
1429
+
1430
+ // ─── Reports ─────────────────────────────────────────────────────────────────
1431
// List the reports found in the directory typed in "reports-dir" ("." when the
// field is empty) as a table with an "open" link per report.
async function loadReports() {
  // Minimal HTML escaping for server-provided values interpolated into markup.
  const esc = value => String(value == null ? "" : value).replace(/[&<>"']/g, ch => (
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]
  ));
  // `html` must already be safe markup (translations or escaped text).
  const note = (colorVar, html) =>
    `<div style="color: var(${colorVar}); font-size:12px;">${html}</div>`;

  const dir = document.getElementById("reports-dir").value || ".";
  const container = document.getElementById("reports-list");
  container.innerHTML = note("--text-muted", t("loading"));
  try {
    const r = await fetch(`/api/reports?reports_dir=${encodeURIComponent(dir)}`);
    if (!r.ok) {
      // An error body is not necessarily JSON; fall back to the status code.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    if (d.reports.length === 0) {
      container.innerHTML = note("--text-muted", t("no_reports"));
      return;
    }

    const fr = lang === "fr";
    const rows = d.reports.map(rep => {
      const date = new Date(rep.modified).toLocaleString(fr ? "fr-FR" : "en-US");
      return `<tr><td>${esc(rep.filename)}</td><td>${esc(rep.size_kb)} Ko</td><td>${esc(date)}</td>` +
        `<td><a href="${esc(rep.url)}" target="_blank" rel="noopener" class="btn btn-primary btn-sm">` +
        `${fr ? "Ouvrir" : "Open"}</a></td></tr>`;
    }).join("");
    container.innerHTML =
      `<table><thead><tr><th>${fr ? "Fichier" : "File"}</th><th>${fr ? "Taille" : "Size"}</th>` +
      `<th>${fr ? "Modifié" : "Modified"}</th><th></th></tr></thead><tbody>${rows}</tbody></table>`;
  } catch (e) {
    container.innerHTML = note("--danger", `Erreur : ${esc(e.message)}`);
  }
}
1454
+
1455
+ // ─── Engines status ──────────────────────────────────────────────────────────
1456
// Render the status tables of the OCR engines and of the LLM back-ends from
// /api/engines. On failure the error is shown in the OCR list.
async function loadEngines() {
  // Minimal HTML escaping for server-provided values interpolated into markup.
  const esc = value => String(value == null ? "" : value).replace(/[&<>"']/g, ch => (
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]
  ));

  try {
    const r = await fetch("/api/engines");
    if (!r.ok) {
      // An error body is not necessarily JSON; fall back to the status code.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    const fr = lang === "fr";

    // OCR engines
    const ocrRows = d.engines.map(e => {
      const cls = e.available ? "badge-ok" : "badge-err";
      const lbl = e.available ? t("available") : t("not_installed");
      return `<tr><td><code>${esc(e.id)}</code></td><td>${esc(e.label)}</td><td>${esc(e.version || "—")}</td>` +
        `<td><span class="badge ${cls}">${lbl}</span></td></tr>`;
    }).join("");
    document.getElementById("engines-ocr-list").innerHTML =
      `<table><thead><tr><th>ID</th><th>${fr ? "Nom" : "Name"}</th><th>Version</th><th>Statut</th></tr></thead>` +
      `<tbody>${ocrRows}</tbody></table>`;

    // LLMs: any status other than the three known ones is shown as "missing_key".
    const knownStatuses = ["configured", "running", "not_running"];
    const llmRows = d.llms.map(e => {
      const cls = e.available ? "badge-ok" : "badge-warn";
      const statusKey = knownStatuses.includes(e.status) ? e.status : "missing_key";
      const lbl = t(statusKey);
      // The model list, when present, takes precedence over the key variable name.
      let detail = "";
      if (e.key_env) detail = `<code style="font-size:11px;">${esc(e.key_env)}</code>`;
      if (e.models && e.models.length > 0) detail = esc(e.models.slice(0, 3).join(", "));
      return `<tr><td><code>${esc(e.id)}</code></td><td>${esc(e.label)}</td>` +
        `<td><span class="badge ${cls}">${lbl}</span></td><td>${detail}</td></tr>`;
    }).join("");
    document.getElementById("engines-llm-list").innerHTML =
      `<table><thead><tr><th>ID</th><th>${fr ? "Nom" : "Name"}</th><th>Statut</th>` +
      `<th>${fr ? "Détail" : "Detail"}</th></tr></thead><tbody>${llmRows}</tbody></table>`;
  } catch (e) {
    document.getElementById("engines-ocr-list").innerHTML =
      `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(e.message)}</div>`;
  }
}
1494
+
1495
+ // ─── HTR-United ──────────────────────────────────────────────────────────────
1496
// Fill the HTR-United language and script filters from the catalogue endpoint.
// Each <select> starts with an "all" entry whose value is empty.
// Loading stays best-effort, but a failure is reported on the console.
async function initHTRFilters() {
  // Reset `select` to the "all" entry, then add one option per value. Options
  // are created as DOM nodes so values are never parsed as HTML and the select
  // is not re-parsed on every addition.
  const fill = (select, values) => {
    select.innerHTML = `<option value="">${t("all")}</option>`;
    (values || []).forEach(value => {
      const opt = document.createElement("option");
      opt.value = value;
      opt.textContent = value;
      select.appendChild(opt);
    });
  };

  try {
    const r = await fetch("/api/htr-united/catalogue");
    if (!r.ok) throw new Error(`HTTP ${r.status}`);
    const d = await r.json();
    fill(document.getElementById("htr-lang-filter"), d.available_languages);
    fill(document.getElementById("htr-script-filter"), d.available_scripts);
  } catch (e) {
    console.warn("initHTRFilters:", e);
  }
}
1512
+
1513
// Query the HTR-United catalogue with the search box and the two filters, and
// render one card per entry with an "import" button that opens the import modal.
async function searchHTRUnited() {
  // Catalogue fields are external data: escape them before putting them in markup.
  const esc = value => String(value == null ? "" : value).replace(/[&<>"']/g, ch => (
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]
  ));

  const q = document.getElementById("htr-search").value;
  const lang2 = document.getElementById("htr-lang-filter").value;
  const script = document.getElementById("htr-script-filter").value;
  const container = document.getElementById("htr-results");
  container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${t("loading")}</div>`;
  try {
    const url = `/api/htr-united/catalogue?query=${encodeURIComponent(q)}&language=${encodeURIComponent(lang2)}&script=${encodeURIComponent(script)}`;
    const r = await fetch(url);
    if (!r.ok) {
      // An error body is not necessarily JSON; fall back to the status code.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    if (d.entries.length === 0) {
      container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${lang==="fr"?"Aucun résultat.":"No results."}</div>`;
      return;
    }

    container.innerHTML = d.entries.map((e, index) => {
      const tags = [...(e.language || []), ...(e.script || [])]
        .map(s => `<span class="ds-tag">${esc(s)}</span>`).join("");
      const lineCount = Number(e.lines || 0).toLocaleString();
      return `<div class="ds-card">
        <div style="display:flex; justify-content:space-between; align-items:flex-start;">
          <h4>${esc(e.title)}</h4>
          <button class="btn btn-primary btn-sm" data-entry-index="${index}">
            ${lang==="fr"?"Importer":"Import"}
          </button>
        </div>
        <p>${esc(e.description)}</p>
        <p style="color: var(--text-muted);">${esc(e.institution)} — ${lineCount} ${t("lines")} — ${esc(e.format)}</p>
        <div class="ds-meta">${tags}</div>
      </div>`;
    }).join("");

    // Bind the buttons in code rather than through inline onclick attributes,
    // so ids and titles are passed as values and never spliced into script text.
    container.querySelectorAll("button[data-entry-index]").forEach(btn => {
      const entry = d.entries[Number(btn.dataset.entryIndex)];
      btn.addEventListener("click", () => openImportModal("htr", entry.id, entry.title));
    });
  } catch (e) {
    container.innerHTML = `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(e.message)}</div>`;
  }
}
1545
+
1546
// Search HuggingFace datasets with the query, language and tags fields, and
// render one card per dataset with an "import" button that opens the import modal.
async function searchHuggingFace() {
  // Dataset metadata is external data: escape it before putting it in markup.
  const esc = value => String(value == null ? "" : value).replace(/[&<>"']/g, ch => (
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]
  ));

  const q = document.getElementById("hf-search").value;
  const langFilter = document.getElementById("hf-lang-filter").value;
  const tags = document.getElementById("hf-tags").value;
  const container = document.getElementById("hf-results");
  container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${t("loading")}</div>`;
  try {
    const url = `/api/huggingface/search?query=${encodeURIComponent(q)}&language=${encodeURIComponent(langFilter)}&tags=${encodeURIComponent(tags)}`;
    const r = await fetch(url);
    if (!r.ok) {
      // An error body is not necessarily JSON; fall back to the status code.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    if (d.datasets.length === 0) {
      container.innerHTML = `<div style="color: var(--text-muted); font-size:12px;">${lang==="fr"?"Aucun résultat.":"No results."}</div>`;
      return;
    }

    // The download label follows the UI language like the other labels here.
    const downloadsLabel = lang === "fr" ? "téléchargements" : "downloads";
    container.innerHTML = d.datasets.map((ds, index) => {
      const tags2 = (ds.tags || []).slice(0, 5)
        .map(s => `<span class="ds-tag">${esc(s)}</span>`).join("");
      const downloads = ds.downloads ? `— ${ds.downloads.toLocaleString()} ${downloadsLabel}` : "";
      return `<div class="ds-card">
        <div style="display:flex; justify-content:space-between; align-items:flex-start;">
          <h4>${esc(ds.title)}</h4>
          <button class="btn btn-primary btn-sm" data-dataset-index="${index}">
            ${lang==="fr"?"Importer":"Import"}
          </button>
        </div>
        <p>${esc(ds.description)}</p>
        <p style="color: var(--text-muted);">${esc(ds.institution || ds.dataset_id)} ${esc(downloads)}</p>
        <div class="ds-meta">${tags2}</div>
      </div>`;
    }).join("");

    // Bind the buttons in code rather than through inline onclick attributes,
    // so ids and titles are passed as values and never spliced into script text.
    container.querySelectorAll("button[data-dataset-index]").forEach(btn => {
      const ds = d.datasets[Number(btn.dataset.datasetIndex)];
      btn.addEventListener("click", () => openImportModal("hf", ds.dataset_id, ds.title));
    });
  } catch (e) {
    container.innerHTML = `<div style="color: var(--danger); font-size:12px;">Erreur : ${esc(e.message)}</div>`;
  }
}
1578
+
1579
+ // ─── Import modal ─────────────────────────────────────────────────────────────
1580
// Open the import dialog for one dataset: remember its source type and id in
// the hidden fields, show its title, and clear any previous status message.
function openImportModal(type, id, title) {
  const byId = elementId => document.getElementById(elementId);
  byId("import-modal-type").value = type;
  byId("import-modal-id").value = id;
  byId("import-modal-title").textContent = `${t("import_modal_title")} : ${title}`;
  byId("import-modal-status").innerHTML = "";
  byId("import-modal").style.display = "flex";
}
1587
// Hide the import dialog.
function closeImportModal() {
  const modal = document.getElementById("import-modal");
  modal.style.display = "none";
}
1590
// Run the import selected in the modal: POST to the HTR-United endpoint when
// the stored type is "htr", otherwise to the HuggingFace endpoint, then show
// the outcome in the modal and copy the output directory into the corpus path.
async function confirmImport() {
  // Server-provided text is escaped before being placed in the status markup.
  const esc = value => String(value == null ? "" : value).replace(/[&<>"']/g, ch => (
    {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"}[ch]
  ));
  const byId = elementId => document.getElementById(elementId);

  const type = byId("import-modal-type").value;
  const id = byId("import-modal-id").value;
  const outputDir = byId("import-modal-output").value;
  // Explicit radix; an empty or non-numeric field gives NaN, which
  // JSON.stringify serializes as null.
  const maxSamples = parseInt(byId("import-modal-max").value, 10);
  const statusDiv = byId("import-modal-status");
  statusDiv.innerHTML = `<div class="alert alert-info"><span class="spinner"></span> ${lang==="fr"?"Import en cours…":"Importing…"}</div>`;

  try {
    let url, body;
    if (type === "htr") {
      url = "/api/htr-united/import";
      body = {entry_id: id, output_dir: outputDir, max_samples: maxSamples};
    } else {
      url = "/api/huggingface/import";
      body = {dataset_id: id, output_dir: outputDir, max_samples: maxSamples};
    }
    const r = await fetch(url, {method:"POST", headers:{"Content-Type":"application/json"}, body: JSON.stringify(body)});
    if (!r.ok) {
      // Check the status before parsing: an error body is not necessarily JSON.
      const err = await r.json().catch(() => ({}));
      throw new Error(err.detail || `HTTP ${r.status}`);
    }
    const d = await r.json();
    const count = Number(d.files_imported || 0);
    const msg = lang === "fr"
      ? `✓ Import terminé. ${count} fichiers dans <code>${esc(d.output_dir)}</code>`
      : `✓ Import done. ${count} files in <code>${esc(d.output_dir)}</code>`;
    statusDiv.innerHTML = `<div class="alert alert-success">${msg}</div>`;
    // Suggest the imported directory as corpus path, when the server returned one.
    if (d.output_dir) {
      byId("corpus-path").value = d.output_dir;
    }
  } catch (e) {
    statusDiv.innerHTML = `<div class="alert alert-error">Erreur : ${esc(e.message)}</div>`;
  }
}
1620
+
1621
+ // ─── Init ────────────────────────────────────────────────────────────────────
1622
// Page start-up: load the initial data, then make a click on the modal
// backdrop (the modal element itself, not its content) close the import dialog.
document.addEventListener("DOMContentLoaded", () => {
  loadStatus();
  loadEngineCheckboxes();
  loadNormProfiles();
  initHTRFilters();

  const modal = document.getElementById("import-modal");
  modal.addEventListener("click", event => {
    if (event.target !== document.getElementById("import-modal")) return;
    closeImportModal();
  });
});
1632
+ </script>
1633
+ </body>
1634
+ </html>"""
pyproject.toml CHANGED
@@ -29,8 +29,10 @@ dependencies = [
29
  ]
30
 
31
  [project.optional-dependencies]
32
- dev = ["pytest>=7.4.0", "pytest-cov>=4.1.0"]
33
  pero = ["pero-ocr>=0.1.0"]
 
 
34
 
35
  [project.scripts]
36
  picarones = "picarones.cli:cli"
 
29
  ]
30
 
31
  [project.optional-dependencies]
32
+ dev = ["pytest>=7.4.0", "pytest-cov>=4.1.0", "httpx>=0.27.0"]
33
  pero = ["pero-ocr>=0.1.0"]
34
+ web = ["fastapi>=0.111.0", "uvicorn[standard]>=0.29.0", "httpx>=0.27.0"]
35
+ hf = ["datasets>=2.19.0"]
36
 
37
  [project.scripts]
38
  picarones = "picarones.cli:cli"
rapport_demo.html CHANGED
The diff for this file is too large to render. See raw diff
 
tests/test_sprint4_normalization_iiif.py ADDED
@@ -0,0 +1,834 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests Sprint 4 : normalisation diplomatique, import IIIF, adaptateurs API OCR."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ import pytest
8
+
9
+ from picarones.core.normalization import (
10
+ NormalizationProfile,
11
+ DIPLOMATIC_FR_MEDIEVAL,
12
+ DIPLOMATIC_FR_EARLY_MODERN,
13
+ DIPLOMATIC_LATIN_MEDIEVAL,
14
+ DIPLOMATIC_MINIMAL,
15
+ DEFAULT_DIPLOMATIC_PROFILE,
16
+ _apply_diplomatic_table,
17
+ get_builtin_profile,
18
+ )
19
+ from picarones.core.metrics import compute_metrics, aggregate_metrics, MetricsResult
20
+ from picarones.importers.iiif import (
21
+ IIIFManifestParser,
22
+ IIIFCanvas,
23
+ parse_page_selector,
24
+ _extract_label,
25
+ _best_image_url_v2,
26
+ _best_image_url_v3,
27
+ _guess_extension,
28
+ _slugify,
29
+ )
30
+
31
+
32
+ # ===========================================================================
33
+ # Tests NormalizationProfile
34
+ # ===========================================================================
35
+
36
class TestNormalizationProfile:
    """Tests for ``NormalizationProfile``: defaults, ``normalize`` and (de)serialisation."""

    def test_default_nfc_only(self):
        default_profile = NormalizationProfile(name="test")
        assert default_profile.nfc is True
        assert default_profile.caseless is False
        assert default_profile.diplomatic_table == {}

    def test_normalize_nfc(self):
        nfc_profile = NormalizationProfile(name="nfc_only")
        # "e" + combining acute accent must come out as the precomposed "é".
        combining_e_acute = "e\u0301"
        assert nfc_profile.normalize(combining_e_acute) == "\u00e9"

    def test_normalize_caseless(self):
        folding_profile = NormalizationProfile(name="caseless", caseless=True)
        assert folding_profile.normalize("Bonjour MONDE") == "bonjour monde"

    def test_normalize_diplomatic_table(self):
        table_profile = NormalizationProfile(
            name="test",
            diplomatic_table={"ſ": "s", "u": "v"},
        )
        # "maiſon" only contains ſ (→ s); "uers" only contains u (→ v).
        expectations = {"maiſon": "maison", "uers": "vers"}
        for raw, expected in expectations.items():
            assert table_profile.normalize(raw) == expected

    def test_normalize_order_nfc_then_caseless_then_diplomatic(self):
        """Order of application: NFC → caseless → diplomatic table."""
        combined = NormalizationProfile(
            name="combined",
            caseless=True,
            diplomatic_table={"ſ": "s"},
        )
        assert combined.normalize("Maiſon") == "maison"

    def test_as_dict(self):
        serialised = NormalizationProfile(
            name="medieval_french",
            nfc=True,
            caseless=False,
            diplomatic_table={"ſ": "s"},
            description="Test",
        ).as_dict()
        assert serialised["name"] == "medieval_french"
        assert serialised["diplomatic_table"] == {"ſ": "s"}
        assert serialised["caseless"] is False

    def test_from_dict(self):
        loaded = NormalizationProfile.from_dict(
            {
                "name": "custom",
                "caseless": True,
                "diplomatic": {"ſ": "s", "u": "v"},
                "description": "Custom profile",
            }
        )
        assert loaded.name == "custom"
        assert loaded.caseless is True
        assert loaded.diplomatic_table == {"ſ": "s", "u": "v"}

    def test_from_dict_defaults(self):
        fallback = NormalizationProfile.from_dict({})
        assert fallback.name == "custom"
        assert fallback.nfc is True
        assert fallback.caseless is False

    def test_from_yaml(self, tmp_path):
        yaml_file = tmp_path / "profile.yaml"
        yaml_file.write_text(
            "name: my_profile\ncaseless: false\ndiplomatic:\n \u017f: s\n u: v\n",
            encoding="utf-8",
        )
        try:
            loaded = NormalizationProfile.from_yaml(yaml_file)
        except RuntimeError as exc:
            # A RuntimeError mentioning "pyyaml" is treated as "dependency missing".
            if "pyyaml" not in str(exc):
                raise
            pytest.skip("pyyaml non installé")
        else:
            assert loaded.name == "my_profile"
            assert loaded.diplomatic_table == {"\u017f": "s", "u": "v"}
117
+
118
+
119
class TestApplyDiplomaticTable:
    """Tests for the ``_apply_diplomatic_table`` substitution helper."""

    def test_simple_substitutions(self):
        mapping = {"ſ": "s", "u": "v"}
        # "maiſon" only contains ſ (→ s); "uers" only contains u (→ v).
        for raw, expected in (("maiſon", "maison"), ("uers", "vers")):
            assert _apply_diplomatic_table(raw, mapping) == expected

    def test_multi_char_key_priority(self):
        """Multi-character keys are applied before single-character ones."""
        mapping = {"ae": "X", "a": "Y"}
        # "ae" must become "X" as a whole, not "Ye".
        assert _apply_diplomatic_table("aeb", mapping) == "Xb"

    def test_ampersand_to_et(self):
        assert _apply_diplomatic_table("noir & blanc", {"&": "et"}) == "noir et blanc"

    def test_empty_table(self):
        no_rules = {}
        assert _apply_diplomatic_table("hello", no_rules) == "hello"

    def test_empty_text(self):
        assert _apply_diplomatic_table("", {"a": "b"}) == ""
144
+
145
+
146
class TestGetBuiltinProfile:
    """Tests for the profiles returned by ``get_builtin_profile``."""

    def test_medieval_french(self):
        medieval = get_builtin_profile("medieval_french")
        assert medieval.name == "medieval_french"
        table = medieval.diplomatic_table
        assert "ſ" in table
        assert table["ſ"] == "s"

    def test_early_modern_french(self):
        table = get_builtin_profile("early_modern_french").diplomatic_table
        assert "ſ" in table

    def test_medieval_latin(self):
        table = get_builtin_profile("medieval_latin").diplomatic_table
        assert "ꝑ" in table

    def test_minimal(self):
        table = get_builtin_profile("minimal").diplomatic_table
        assert "ſ" in table
        assert "u" not in table

    def test_nfc(self):
        nfc_profile = get_builtin_profile("nfc")
        assert nfc_profile.nfc is True
        assert nfc_profile.diplomatic_table == {}

    def test_caseless(self):
        assert get_builtin_profile("caseless").caseless is True

    def test_unknown_raises_key_error(self):
        unknown_name = "inexistant"
        with pytest.raises(KeyError, match="inexistant"):
            get_builtin_profile(unknown_name)

    def test_default_profile_is_medieval_french(self):
        assert DEFAULT_DIPLOMATIC_PROFILE.name == "medieval_french"
182
+
183
+
184
+ # ===========================================================================
185
+ # Tests CER diplomatique dans compute_metrics
186
+ # ===========================================================================
187
+
188
class TestDiplomaticCER:
    """Tests for the diplomatic CER exposed by ``compute_metrics`` and ``aggregate_metrics``."""

    def test_cer_diplomatic_computed_by_default(self):
        """Without an explicit profile, the diplomatic CER uses ``medieval_french``."""
        result = compute_metrics("maiſon", "maison")
        assert result.cer_diplomatic is not None
        assert result.diplomatic_profile_name == "medieval_french"

    def test_cer_diplomatic_lower_than_exact_for_long_s(self):
        """The diplomatic CER ignores a ſ/s difference that the raw CER counts.

        "maiſon" and "maison" differ only by ſ vs s, so the raw CER is above 0
        while the diplomatic CER is 0.0.
        """
        result = compute_metrics("maiſon", "maison")
        # Raw CER: ſ and s are different characters.
        assert result.cer > 0.0
        # Diplomatic CER: 0 under the default (medieval) profile.
        assert result.cer_diplomatic == pytest.approx(0.0)

    def test_cer_diplomatic_in_as_dict(self):
        d = compute_metrics("maiſon", "maison").as_dict()
        assert "cer_diplomatic" in d
        assert "diplomatic_profile_name" in d

    def test_cer_diplomatic_with_custom_profile(self):
        # NormalizationProfile comes from the module-level import.
        profile = NormalizationProfile(
            name="test_profile",
            diplomatic_table={"ſ": "s"},
        )
        result = compute_metrics("maiſon", "maison", normalization_profile=profile)
        assert result.cer_diplomatic == pytest.approx(0.0)
        assert result.diplomatic_profile_name == "test_profile"

    def test_cer_diplomatic_not_in_as_dict_when_none(self):
        """A result whose diplomatic CER is ``None`` omits the key from ``as_dict``."""
        result = MetricsResult(
            cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
            wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
            reference_length=10, hypothesis_length=10,
            cer_diplomatic=None, diplomatic_profile_name=None,
        )
        d = result.as_dict()
        assert "cer_diplomatic" not in d

    def test_aggregate_metrics_includes_diplomatic_cer(self):
        """``aggregate_metrics`` aggregates ``cer_diplomatic`` when it is available."""
        results = [
            MetricsResult(
                cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
                wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
                reference_length=10, hypothesis_length=10,
                cer_diplomatic=0.05, diplomatic_profile_name="medieval_french",
            ),
            MetricsResult(
                cer=0.2, cer_nfc=0.2, cer_caseless=0.2,
                wer=0.2, wer_normalized=0.2, mer=0.2, wil=0.2,
                reference_length=10, hypothesis_length=10,
                cer_diplomatic=0.10, diplomatic_profile_name="medieval_french",
            ),
        ]
        agg = aggregate_metrics(results)
        assert "cer_diplomatic" in agg
        # Mean of 0.05 and 0.10.
        assert agg["cer_diplomatic"]["mean"] == pytest.approx(0.075)
        assert agg["cer_diplomatic"].get("profile") == "medieval_french"
255
+
256
+
257
+ # ===========================================================================
258
+ # Tests parse_page_selector
259
+ # ===========================================================================
260
+
261
class TestParsePageSelector:
    """Tests for ``parse_page_selector``: 1-based selectors in, 0-based indices out."""

    def test_all(self):
        every_index = list(range(10))
        assert parse_page_selector("all", 10) == every_index

    def test_empty_string(self):
        every_index = list(range(5))
        assert parse_page_selector("", 5) == every_index

    def test_single_page(self):
        # Page 3 is index 2 (0-based).
        assert parse_page_selector("3", 10) == [2]

    def test_range(self):
        selected = parse_page_selector("1-5", 10)
        assert selected == [0, 1, 2, 3, 4]

    def test_comma_list(self):
        selected = parse_page_selector("1,3,5", 10)
        assert selected == [0, 2, 4]

    def test_combined(self):
        selected = parse_page_selector("1-3,5,8-9", 10)
        assert selected == [0, 1, 2, 4, 7, 8]

    def test_deduplication(self):
        selected = parse_page_selector("1,1,2", 5)
        assert selected == [0, 1]

    def test_sorted_output(self):
        selected = parse_page_selector("5,1,3", 10)
        assert selected == [0, 2, 4]

    def test_page_out_of_range_raises(self):
        with pytest.raises(ValueError):
            parse_page_selector("15", 10)

    def test_range_out_of_bounds_raises(self):
        with pytest.raises(ValueError):
            parse_page_selector("1-15", 10)

    def test_invalid_syntax_raises(self):
        # The exact exception type is deliberately not pinned here: any
        # Exception subclass (ValueError included) satisfies the check.
        with pytest.raises(Exception):
            parse_page_selector("abc", 10)

    def test_last_page(self):
        assert parse_page_selector("10", 10) == [9]

    def test_first_page(self):
        assert parse_page_selector("1", 10) == [0]
307
+
308
+
309
+ # ===========================================================================
310
+ # Tests IIIFManifestParser — IIIF v2
311
+ # ===========================================================================
312
+
313
def _make_v2_manifest(num_canvases: int = 3, with_service: bool = False) -> dict:
    """Build a minimal IIIF Presentation v2 manifest for tests.

    Args:
        num_canvases: Number of canvases placed in the single sequence.
        with_service: When true, each image resource carries a ``service``
            entry (and no direct ``@id``); otherwise it carries a direct
            ``@id`` pointing at a ``.jpg`` file.

    Returns:
        A dict shaped like a v2 manifest with one sequence.
    """

    def _image_resource(page: int) -> dict:
        # Either a service-backed resource or a plain image URL, never both.
        if with_service:
            return {
                "@type": "dctypes:Image",
                "service": {"@id": f"https://example.com/iiif/img{page}"},
            }
        return {
            "@type": "dctypes:Image",
            "@id": f"https://example.com/images/img{page}.jpg",
        }

    def _canvas(page: int) -> dict:
        canvas_uri = f"https://example.com/canvas/{page}"
        return {
            "@id": canvas_uri,
            "@type": "sc:Canvas",
            "label": f"f. {page}r",
            "width": 2000,
            "height": 3000,
            "images": [
                {
                    "@type": "oa:Annotation",
                    "motivation": "sc:painting",
                    "resource": _image_resource(page),
                    "on": canvas_uri,
                }
            ],
        }

    # Pages are numbered from 1 in every generated URL and label.
    sequence = {
        "@type": "sc:Sequence",
        "canvases": [_canvas(page) for page in range(1, num_canvases + 1)],
    }
    return {
        "@context": "http://iiif.io/api/presentation/2/context.json",
        "@type": "sc:Manifest",
        "@id": "https://example.com/manifest.json",
        "label": "Manuscript de test",
        "sequences": [sequence],
    }
355
+
356
+
357
def _make_v3_manifest(num_canvases: int = 3) -> dict:
    """Build a minimal IIIF Presentation v3 manifest for tests.

    Args:
        num_canvases: Number of canvases placed in the manifest ``items``.

    Returns:
        A dict shaped like a v3 manifest; each canvas holds one annotation
        page with one painting annotation whose body is a JPEG image.
    """

    def _canvas(page: int) -> dict:
        canvas_uri = f"https://example.com/canvas/{page}"
        painting = {
            "id": f"{canvas_uri}/ap/a",
            "type": "Annotation",
            "motivation": "painting",
            "body": {
                "id": f"https://example.com/images/{page}/full/max/0/default.jpg",
                "type": "Image",
                "format": "image/jpeg",
            },
            "target": canvas_uri,
        }
        annotation_page = {
            "id": f"{canvas_uri}/ap",
            "type": "AnnotationPage",
            "items": [painting],
        }
        return {
            "id": canvas_uri,
            "type": "Canvas",
            "label": {"fr": [f"Page {page}"]},
            "width": 1500,
            "height": 2200,
            "items": [annotation_page],
        }

    # Pages are numbered from 1 in every generated URL and label.
    return {
        "@context": "http://iiif.io/api/presentation/3/context.json",
        "id": "https://example.com/manifest.json",
        "type": "Manifest",
        "label": {"fr": ["Manuscrit v3 de test"]},
        "items": [_canvas(page) for page in range(1, num_canvases + 1)],
    }
394
+
395
+
396
class TestIIIFManifestParserV2:
    """Tests for IIIFManifestParser fed with IIIF v2 manifests."""

    def test_version_detection(self):
        """A manifest built by _make_v2_manifest is detected as version 2."""
        v2_parser = IIIFManifestParser(_make_v2_manifest())
        assert v2_parser.version == 2

    def test_canvases_count(self):
        """One canvas object is returned per canvas in the sequence."""
        found = IIIFManifestParser(_make_v2_manifest(5)).canvases()
        assert len(found) == 5

    def test_canvas_label(self):
        """Plain-string canvas labels are exposed unchanged."""
        found = IIIFManifestParser(_make_v2_manifest()).canvases()
        assert found[0].label == "f. 1r"
        assert found[1].label == "f. 2r"

    def test_canvas_image_url_direct(self):
        """Without an image service, the resource @id is the image URL."""
        found = IIIFManifestParser(_make_v2_manifest()).canvases()
        assert found[0].image_url == "https://example.com/images/img1.jpg"

    def test_canvas_image_url_via_service(self):
        """With an image service, the URL contains the full/max request path."""
        service_manifest = _make_v2_manifest(with_service=True)
        found = IIIFManifestParser(service_manifest).canvases()
        assert "/full/max/0/default.jpg" in found[0].image_url

    def test_canvas_dimensions(self):
        """Canvas width and height are carried over from the manifest."""
        first_canvas = IIIFManifestParser(_make_v2_manifest()).canvases()[0]
        assert first_canvas.width == 2000
        assert first_canvas.height == 3000

    def test_canvas_index(self):
        """Each canvas records its 0-based position in the sequence."""
        found = IIIFManifestParser(_make_v2_manifest(3)).canvases()
        for position, canvas in enumerate(found):
            assert canvas.index == position

    def test_label(self):
        """The manifest-level label is exposed on the parser."""
        v2_parser = IIIFManifestParser(_make_v2_manifest())
        assert v2_parser.label == "Manuscript de test"

    def test_empty_sequences(self):
        """A manifest with no sequences yields an empty canvas list."""
        empty_manifest = {
            "@context": "http://iiif.io/api/presentation/2/context.json",
            "@type": "sc:Manifest",
            "label": "Empty",
            "sequences": [],
        }
        assert IIIFManifestParser(empty_manifest).canvases() == []
448
+
449
+
450
class TestIIIFManifestParserV3:
    """Tests for IIIFManifestParser fed with IIIF v3 manifests."""

    def test_version_detection(self):
        """A manifest built by _make_v3_manifest is detected as version 3."""
        v3_parser = IIIFManifestParser(_make_v3_manifest())
        assert v3_parser.version == 3

    def test_canvases_count(self):
        """One canvas object is returned per item of the manifest."""
        found = IIIFManifestParser(_make_v3_manifest(4)).canvases()
        assert len(found) == 4

    def test_canvas_label_from_language_map(self):
        """A language-map label ({'fr': [...]}) is flattened to text."""
        found = IIIFManifestParser(_make_v3_manifest()).canvases()
        assert "Page 1" in found[0].label

    def test_canvas_image_url(self):
        """The painting annotation body provides the image URL."""
        found = IIIFManifestParser(_make_v3_manifest()).canvases()
        assert "default.jpg" in found[0].image_url

    def test_manifest_label_language_map(self):
        """The manifest label is extracted from its language map."""
        lowered = IIIFManifestParser(_make_v3_manifest()).label.lower()
        assert "v3" in lowered or "test" in lowered

    def test_type_manifest_triggers_v3(self):
        """A manifest with type == 'Manifest' is detected as v3."""
        bare_manifest = {"type": "Manifest", "items": []}
        assert IIIFManifestParser(bare_manifest).version == 3
480
+
481
+
482
class TestExtractLabel:
    """Tests for _extract_label across the label shapes used by IIIF v2/v3."""

    def test_string(self):
        """A plain string is returned as is."""
        label = _extract_label("Page 1")
        assert label == "Page 1"

    def test_list(self):
        """For a list of strings, the first entry is used."""
        label = _extract_label(["Page 1", "Page 2"])
        assert label == "Page 1"

    def test_dict_fr(self):
        """A language map with a 'fr' key yields its first value."""
        label = _extract_label({"fr": ["Folio 1r"]})
        assert label == "Folio 1r"

    def test_dict_en(self):
        """A language map with an 'en' key yields its first value."""
        label = _extract_label({"en": ["Folio 1r"]})
        assert label == "Folio 1r"

    def test_dict_none_key(self):
        """A language map with the 'none' key yields its first value."""
        label = _extract_label({"none": ["Label"]})
        assert label == "Label"

    def test_empty_string(self):
        """An empty string stays empty."""
        label = _extract_label("")
        assert label == ""

    def test_none_value(self):
        """None is tolerated and still produces a string."""
        label = _extract_label(None)
        assert isinstance(label, str)
505
+
506
+
507
class TestBestImageUrlV2:
    """Tests for _best_image_url_v2(resource, canvas) on v2 image resources."""

    def test_direct_id(self):
        """Without a service, the resource @id is returned."""
        image_resource = {"@id": "https://example.com/img.jpg"}
        assert _best_image_url_v2(image_resource, {}) == "https://example.com/img.jpg"

    def test_service_id(self):
        """A service @id takes precedence and is expanded to a full/max URL."""
        image_resource = {
            "@id": "https://example.com/info.json",
            "service": {"@id": "https://example.com/iiif/img1"},
        }
        resolved = _best_image_url_v2(image_resource, {})
        assert resolved == "https://example.com/iiif/img1/full/max/0/default.jpg"

    def test_service_list(self):
        """A service given as a list is handled like a single service."""
        image_resource = {
            "service": [
                {"@id": "https://example.com/iiif/img2"},
            ]
        }
        resolved = _best_image_url_v2(image_resource, {})
        assert resolved == "https://example.com/iiif/img2/full/max/0/default.jpg"
530
+
531
+
532
class TestBestImageUrlV3:
    """Tests for _best_image_url_v3(canvas) on v3 canvases."""

    def test_direct_body_image(self):
        """An annotation body with an id yields that id."""
        painting = {
            "type": "Annotation",
            "motivation": "painting",
            "body": {
                "id": "https://example.com/img.jpg",
                "type": "Image",
            },
        }
        v3_canvas = {
            "items": [
                {
                    "type": "AnnotationPage",
                    "items": [painting],
                }
            ]
        }
        assert _best_image_url_v3(v3_canvas) == "https://example.com/img.jpg"

    def test_body_via_service(self):
        """With an empty body id, the service id is expanded to full/max."""
        service_body = {
            "type": "Image",
            "id": "",
            "service": [{"id": "https://example.com/iiif/3/img1"}],
        }
        v3_canvas = {"items": [{"items": [{"body": service_body}]}]}
        resolved = _best_image_url_v3(v3_canvas)
        assert "/full/max/0/default.jpg" in resolved

    def test_empty_canvas(self):
        """A canvas without items yields an empty string."""
        assert _best_image_url_v3({}) == ""
577
+
578
+
579
class TestGuessExtension:
    """Tests for _guess_extension(url)."""

    def test_jpg(self):
        """A .jpg URL maps to '.jpg'."""
        extension = _guess_extension("https://example.com/img.jpg")
        assert extension == ".jpg"

    def test_png(self):
        """A .png URL maps to '.png'."""
        extension = _guess_extension("https://example.com/img.png")
        assert extension == ".png"

    def test_tiff(self):
        """A .tiff URL maps to '.tiff'."""
        extension = _guess_extension("https://example.com/img.tiff")
        assert extension == ".tiff"

    def test_iiif_default(self):
        """A standard IIIF image URL ends in /default.jpg, hence '.jpg'."""
        iiif_url = "https://example.com/iiif/img/full/max/0/default.jpg"
        assert _guess_extension(iiif_url) == ".jpg"

    def test_unknown_defaults_to_jpg(self):
        """A URL without a recognisable extension falls back to '.jpg'."""
        extension = _guess_extension("https://example.com/resource/123")
        assert extension == ".jpg"
597
+
598
+
599
class TestSlugify:
    """Tests for _slugify(label)."""

    def test_simple(self):
        """A space is replaced by an underscore."""
        slug = _slugify("Page 1")
        assert slug == "Page_1"

    def test_special_chars_removed(self):
        """Slashes and dots never survive in the slug."""
        slug = _slugify("f. 1r (recto)")
        assert "/" not in slug
        assert "." not in slug

    def test_max_length(self):
        """The slug is capped at 60 characters."""
        slug = _slugify("x" * 100)
        assert len(slug) <= 60

    def test_empty(self):
        """An empty label gives an empty slug."""
        slug = _slugify("")
        assert slug == ""
615
+
616
+
617
+ # ===========================================================================
618
+ # Tests structure des nouveaux moteurs OCR (sans appel réseau)
619
+ # ===========================================================================
620
+
621
class TestMistralOCREngine:
    """Structural tests for MistralOCREngine (no network call is made).

    The engine is imported inside each test so that a missing module fails
    only the test that needs it.
    """

    def test_import(self):
        """The engine class can be imported from its module."""
        from picarones.engines.mistral_ocr import MistralOCREngine
        assert MistralOCREngine is not None

    def test_name(self):
        """The engine identifies itself as 'mistral_ocr'."""
        from picarones.engines.mistral_ocr import MistralOCREngine
        assert MistralOCREngine().name == "mistral_ocr"

    def test_version_default_model(self):
        """With no config, the reported version mentions 'pixtral'."""
        from picarones.engines.mistral_ocr import MistralOCREngine
        reported = MistralOCREngine().version()
        assert "pixtral" in reported

    def test_version_custom_model(self):
        """A configured model name is reported verbatim as the version."""
        from picarones.engines.mistral_ocr import MistralOCREngine
        configured = MistralOCREngine({"model": "pixtral-large-latest"})
        assert configured.version() == "pixtral-large-latest"

    def test_missing_api_key_raises(self, monkeypatch, tmp_path):
        """Running OCR without MISTRAL_API_KEY raises a RuntimeError naming it."""
        from picarones.engines.mistral_ocr import MistralOCREngine
        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        engine = MistralOCREngine()
        # Dummy image file: only the first bytes of a JPEG header.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine is re-exported by the picarones.engines package."""
        from picarones.engines import MistralOCREngine
        assert MistralOCREngine is not None
655
+
656
+
657
class TestGoogleVisionEngine:
    """Structural tests for GoogleVisionEngine (no network call is made)."""

    def test_import(self):
        """The engine class can be imported from its module."""
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine is not None

    def test_name(self):
        """The engine identifies itself as 'google_vision'."""
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine().name == "google_vision"

    def test_version(self):
        """The reported version is 'v1'."""
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine().version() == "v1"

    def test_missing_credentials_raises(self, monkeypatch, tmp_path):
        """Running OCR with neither credential variable set raises RuntimeError."""
        from picarones.engines.google_vision import GoogleVisionEngine
        for variable in ("GOOGLE_APPLICATION_CREDENTIALS", "GOOGLE_API_KEY"):
            monkeypatch.delenv(variable, raising=False)
        engine = GoogleVisionEngine()
        # Dummy image file: only the first bytes of a JPEG header.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine is re-exported by the picarones.engines package."""
        from picarones.engines import GoogleVisionEngine
        assert GoogleVisionEngine is not None
686
+
687
+
688
class TestAzureDocIntelEngine:
    """Structural tests for AzureDocIntelEngine (no network call is made)."""

    def test_import(self):
        """The engine class can be imported from its module."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        assert AzureDocIntelEngine is not None

    def test_name(self):
        """The engine identifies itself as 'azure_doc_intel'."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        assert AzureDocIntelEngine().name == "azure_doc_intel"

    def test_missing_key_raises(self, monkeypatch, tmp_path):
        """Running OCR with neither key nor endpoint set raises RuntimeError."""
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        for variable in ("AZURE_DOC_INTEL_KEY", "AZURE_DOC_INTEL_ENDPOINT"):
            monkeypatch.delenv(variable, raising=False)
        engine = AzureDocIntelEngine()
        # Dummy image file: only the first bytes of a JPEG header.
        fake_image = tmp_path / "test.jpg"
        fake_image.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError):
            engine._run_ocr(fake_image)

    def test_exported_from_engines(self):
        """The engine is re-exported by the picarones.engines package."""
        from picarones.engines import AzureDocIntelEngine
        assert AzureDocIntelEngine is not None
712
+
713
+
714
+ # ===========================================================================
715
+ # Tests CLI — commande import iiif
716
+ # ===========================================================================
717
+
718
class TestCLIImportIIIF:
    """Tests for the `import iiif` CLI command (help and argument checks only)."""

    @staticmethod
    def _invoke(arguments):
        """Run the picarones CLI in-process with *arguments* and return the result."""
        from picarones.cli import cli
        from click.testing import CliRunner
        return CliRunner().invoke(cli, arguments)

    def test_import_group_exists(self):
        """`import --help` exits successfully."""
        outcome = self._invoke(["import", "--help"])
        assert outcome.exit_code == 0

    def test_import_iiif_command_exists(self):
        """`import iiif --help` succeeds and names the manifest URL argument."""
        outcome = self._invoke(["import", "iiif", "--help"])
        assert outcome.exit_code == 0
        assert "manifest_url" in outcome.output.lower() or "MANIFEST_URL" in outcome.output

    def test_import_iiif_options(self):
        """The help text advertises the --pages and --output options."""
        help_text = self._invoke(["import", "iiif", "--help"]).output
        assert "--pages" in help_text
        assert "--output" in help_text

    def test_import_iiif_requires_url(self):
        """Without a URL, the command must report an error."""
        outcome = self._invoke(["import", "iiif"])
        assert outcome.exit_code != 0
750
+
751
+
752
+ # ===========================================================================
753
+ # Tests fixtures Sprint 4 (CER diplomatique dans la démo)
754
+ # ===========================================================================
755
+
756
class TestFixturesDiplomaticCER:
    """Tests that the demo fixtures exercise the diplomatic CER."""

    def test_gt_texts_contain_medieval_graphies(self):
        """The demo ground-truth texts must contain medieval graphies."""
        from picarones.fixtures import _GT_TEXTS
        joined_gt = " ".join(_GT_TEXTS)
        # The ground truth must contain at least one of ſ, &, æ or œ.
        medieval_markers = ["ſ", "&", "æ", "œ"]
        has_medieval_chars = any(marker in joined_gt for marker in medieval_markers)
        assert has_medieval_chars, "Les GT de démo doivent inclure des graphies médiévales pour illustrer le CER diplomatique"

    def test_benchmark_results_have_diplomatic_cer(self):
        """The sample benchmark results must include the diplomatic CER."""
        from picarones.fixtures import generate_sample_benchmark
        benchmark = generate_sample_benchmark()
        for engine_report in benchmark.engine_reports:
            # One error-free document per engine is enough for this check.
            # NOTE(review): the nesting of the original `break` was rebuilt
            # from a whitespace-stripped source — confirm it sat inside the
            # `error is None` branch.
            first_valid = next(
                (
                    doc_result
                    for doc_result in engine_report.document_results
                    if doc_result.metrics.error is None
                ),
                None,
            )
            if first_valid is None:
                continue
            # The diplomatic CER must have been computed.
            assert first_valid.metrics.cer_diplomatic is not None, (
                f"CER diplomatique manquant pour {engine_report.engine_name}"
            )

    def test_diplomatic_cer_lower_for_medieval_graphies(self):
        """For a text containing ſ, the diplomatic CER must be <= the exact CER."""
        ground_truth = "maiſon & jardin"    # GT with medieval graphies
        ocr_output = "maison et jardin"     # OCR with modernised graphies
        measured = compute_metrics(ground_truth, ocr_output)
        assert measured.cer_diplomatic is not None
        # The diplomatic CER must be lower than or equal to the exact CER.
        assert measured.cer_diplomatic <= measured.cer
788
+
789
+
790
+ # ===========================================================================
791
+ # Tests rapport HTML Sprint 4 (CER diplomatique affiché)
792
+ # ===========================================================================
793
+
794
class TestReportDiplomaticCER:
    """Tests that the HTML report exposes the diplomatic CER."""

    @staticmethod
    def _render_sample_report(tmp_path):
        """Generate the sample benchmark report under *tmp_path* and return its HTML."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        benchmark = generate_sample_benchmark()
        destination = tmp_path / "report_test.html"
        ReportGenerator(benchmark).generate(destination)
        return destination.read_text(encoding="utf-8")

    def test_report_data_has_cer_diplomatic(self):
        """_build_report_data must expose cer_diplomatic for every engine entry."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data

        report_data = _build_report_data(generate_sample_benchmark(), images_b64={})

        # Every engine entry must carry the cer_diplomatic key (value may be None).
        assert "engines" in report_data
        for engine_data in report_data["engines"]:
            assert "cer_diplomatic" in engine_data, (
                f"cer_diplomatic manquant dans {engine_data.get('name', '?')}"
            )

    def test_html_contains_cer_diplo_column(self, tmp_path):
        """The generated HTML must contain the diplomatic CER column."""
        html = self._render_sample_report(tmp_path)
        assert "diplo" in html.lower() or "diplomatique" in html.lower(), (
            "Le rapport HTML doit mentionner le CER diplomatique"
        )

    def test_html_contains_medieval_graphie_indicator(self, tmp_path):
        """The report must mention the medieval graphies (ſ=s or u=v)."""
        html = self._render_sample_report(tmp_path)
        # The tooltip or the legend must mention the diplomatic equivalences.
        assert "ſ=s" in html or "u=v" in html or "diplomatique" in html.lower()
tests/test_sprint5_advanced_metrics.py ADDED
@@ -0,0 +1,876 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests Sprint 5 : métriques avancées patrimoniales.
2
+
3
+ Couvre :
4
+ - Matrice de confusion unicode (confusion.py)
5
+ - Scores ligatures et diacritiques (char_scores.py)
6
+ - Taxonomie des erreurs classes 1-9 (taxonomy.py)
7
+ - Analyse structurelle (structure.py)
8
+ - Qualité image (image_quality.py)
9
+ - Intégration dans les fixtures et le rapport HTML
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import pytest
15
+
16
+ # ===========================================================================
17
+ # Tests ConfusionMatrix
18
+ # ===========================================================================
19
+
20
+ from picarones.core.confusion import (
21
+ ConfusionMatrix,
22
+ EMPTY_CHAR,
23
+ build_confusion_matrix,
24
+ aggregate_confusion_matrices,
25
+ top_confused_chars,
26
+ )
27
+
28
+
29
class TestBuildConfusionMatrix:
    """Tests for build_confusion_matrix(ground_truth, ocr_text)."""

    def test_identical_texts(self):
        """Identical texts produce no substitution, insertion or deletion."""
        confusion = build_confusion_matrix("abc", "abc")
        assert confusion.total_substitutions == 0
        assert confusion.total_insertions == 0
        assert confusion.total_deletions == 0

    def test_empty_texts(self):
        """Two empty texts produce no error."""
        confusion = build_confusion_matrix("", "")
        assert confusion.total_errors == 0

    def test_simple_substitution(self):
        """Replacing 'b' by 'x' is recorded under matrix['b']['x']."""
        confusion = build_confusion_matrix("abc", "axc")
        assert "b" in confusion.matrix
        assert "x" in confusion.matrix["b"]
        assert confusion.matrix["b"]["x"] >= 1

    def test_deletion_recorded(self):
        """A deleted 'b' is recorded as 'b' -> EMPTY_CHAR."""
        confusion = build_confusion_matrix("abc", "ac")
        assert "b" in confusion.matrix
        assert EMPTY_CHAR in confusion.matrix["b"]

    def test_insertion_recorded(self):
        """An inserted 'b' is recorded as EMPTY_CHAR -> 'b'."""
        confusion = build_confusion_matrix("ac", "abc")
        assert EMPTY_CHAR in confusion.matrix
        assert "b" in confusion.matrix[EMPTY_CHAR]

    def test_no_whitespace_recorded_by_default(self):
        """Spaces must not appear as a ground-truth key of the matrix."""
        confusion = build_confusion_matrix("a b", "a x")
        assert " " not in confusion.matrix

    def test_as_dict_structure(self):
        """as_dict exposes the matrix and the three error totals."""
        exported = build_confusion_matrix("hello", "hallo").as_dict()
        for expected_key in (
            "matrix",
            "total_substitutions",
            "total_insertions",
            "total_deletions",
        ):
            assert expected_key in exported

    def test_top_confusions(self):
        """The most frequent confusion is listed first, with its count."""
        confusion = build_confusion_matrix("eeee", "aaaa")
        ranking = confusion.top_confusions(n=5)
        assert len(ranking) >= 1
        leader = ranking[0]
        assert leader["gt"] == "e"
        assert leader["ocr"] == "a"
        assert leader["count"] == 4

    def test_medieval_chars_tracked(self):
        """The long s (ſ) confused with 'f' is tracked like any character."""
        confusion = build_confusion_matrix("maiſon", "maifon")
        assert "ſ" in confusion.matrix
        assert "f" in confusion.matrix["ſ"]

    def test_as_compact_dict_filters_low_count(self):
        """With min_count=2, entries seen only once are filtered out."""
        confusion = build_confusion_matrix("aab", "axb")
        compact = confusion.as_compact_dict(min_count=2)
        # The single 'a' -> 'x' substitution must not survive the filter.
        for row in compact["matrix"].values():
            for occurrences in row.values():
                assert occurrences >= 2
97
+
98
+
99
class TestAggregateConfusionMatrices:
    """Tests for aggregate_confusion_matrices(list_of_matrices)."""

    def test_empty_list(self):
        """Aggregating nothing gives a matrix without errors."""
        merged = aggregate_confusion_matrices([])
        assert merged.total_errors == 0

    def test_single_matrix(self):
        """Aggregating one matrix reproduces its content."""
        only = build_confusion_matrix("abc", "axc")
        merged = aggregate_confusion_matrices([only])
        assert merged.matrix == only.matrix

    def test_counts_sum(self):
        """The same confusion seen in two matrices is counted twice."""
        first = build_confusion_matrix("abc", "axc")
        second = build_confusion_matrix("abc", "axc")
        merged = aggregate_confusion_matrices([first, second])
        b_to_x = merged.matrix.get("b", {}).get("x", 0)
        assert b_to_x >= 2

    def test_total_errors_sum(self):
        """The aggregated error total is at least the sum of the parts."""
        first = build_confusion_matrix("abc", "axc")
        second = build_confusion_matrix("def", "dxf")
        merged = aggregate_confusion_matrices([first, second])
        assert merged.total_errors >= first.total_errors + second.total_errors
122
+
123
+
124
class TestTopConfusedChars:
    """Tests for top_confused_chars(matrix, ...)."""

    def test_returns_list(self):
        """The ranking is returned as a list."""
        confusion = build_confusion_matrix("aaabbb", "aaaxxx")
        ranking = top_confused_chars(confusion, n=5)
        assert isinstance(ranking, list)

    def test_sorted_by_errors_desc(self):
        """Entries are ordered by decreasing total_errors."""
        three_errors = build_confusion_matrix("bbb", "xxx")  # 3 occurrences
        one_error = build_confusion_matrix("a", "y")         # 1 occurrence
        merged = aggregate_confusion_matrices([three_errors, one_error])
        ranking = top_confused_chars(merged, n=10)
        # The ordering is only checked when at least two entries come back.
        if len(ranking) >= 2:
            assert ranking[0]["total_errors"] >= ranking[1]["total_errors"]

    def test_excludes_empty_char(self):
        """With exclude_empty=True, EMPTY_CHAR never appears as a ranked char."""
        confusion = build_confusion_matrix("abc", "ac")  # 'b' deleted
        ranking = top_confused_chars(confusion, exclude_empty=True)
        assert all(entry["char"] != EMPTY_CHAR for entry in ranking)
144
+
145
+
146
+ # ===========================================================================
147
+ # Tests LigatureScore
148
+ # ===========================================================================
149
+
150
+ from picarones.core.char_scores import (
151
+ LIGATURE_TABLE,
152
+ DIACRITIC_MAP,
153
+ LigatureScore,
154
+ DiacriticScore,
155
+ compute_ligature_score,
156
+ compute_diacritic_score,
157
+ aggregate_ligature_scores,
158
+ aggregate_diacritic_scores,
159
+ _ALL_LIGATURES,
160
+ _ALL_DIACRITICS,
161
+ )
162
+
163
+
164
class TestLigatureTable:
    """Tests on the content and shape of LIGATURE_TABLE."""

    def test_fi_ligature_present(self):
        """U+FB01 (LATIN SMALL LIGATURE FI) is a known ligature."""
        assert "\uFB01" in LIGATURE_TABLE

    def test_fl_ligature_present(self):
        """U+FB02 (LATIN SMALL LIGATURE FL) is a known ligature."""
        assert "\uFB02" in LIGATURE_TABLE

    def test_oe_ligature_present(self):
        """U+0153 (LATIN SMALL LIGATURE OE) is a known ligature."""
        assert "\u0153" in LIGATURE_TABLE

    def test_ae_ligature_present(self):
        """U+00E6 (LATIN SMALL LETTER AE) is a known ligature."""
        assert "\u00E6" in LIGATURE_TABLE

    def test_ff_ligature_present(self):
        """U+FB00 (LATIN SMALL LIGATURE FF) is a known ligature."""
        assert "\uFB00" in LIGATURE_TABLE

    def test_equivalents_are_lists(self):
        """Every ligature maps to a non-empty list of equivalents."""
        for equivalents in LIGATURE_TABLE.values():
            assert isinstance(equivalents, list)
            assert len(equivalents) >= 1
185
+
186
+
187
class TestComputeLigatureScore:
    """Tests for compute_ligature_score(ground_truth, ocr_text).

    Ligatures are written as explicit ``\\u`` escapes so the ground-truth
    strings keep their ligature code points even if the file goes through a
    tool that normalises Unicode compatibility characters to ASCII.
    """

    def test_no_ligatures_in_gt(self):
        """Without any ligature in the ground truth, the score is 1.0."""
        result = compute_ligature_score("bonjour monde", "bonjour monde")
        assert result.score == pytest.approx(1.0)
        assert result.total_in_gt == 0

    def test_ligature_correctly_recognized(self):
        """A GT U+FB01 ligature read by the OCR as plain 'fi' counts as recognised."""
        result = compute_ligature_score("\uFB01n", "fin")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_ligature_unicode_to_unicode(self):
        """GT and OCR both carrying the U+FB01 ligature score 1.0."""
        result = compute_ligature_score("\uFB01n", "\uFB01n")
        assert result.score == pytest.approx(1.0)

    def test_oe_ligature(self):
        """GT U+0153 (oe ligature) read as 'oe' counts as recognised."""
        result = compute_ligature_score("\u0153uvre", "oeuvre")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_ae_ligature(self):
        """GT U+00E6 (ae ligature) read as 'ae' counts as recognised."""
        result = compute_ligature_score("\u00E6ther", "aether")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_as_dict_structure(self):
        """as_dict exposes the totals, the score and the per-ligature detail."""
        exported = compute_ligature_score("\uFB01n", "fin").as_dict()
        for expected_key in (
            "total_in_gt",
            "correctly_recognized",
            "score",
            "per_ligature",
        ):
            assert expected_key in exported

    def test_empty_texts(self):
        """Two empty texts score 1.0 with no ligature counted."""
        result = compute_ligature_score("", "")
        assert result.score == pytest.approx(1.0)
        assert result.total_in_gt == 0
227
+
228
+
229
class TestComputeDiacriticScore:
    """Tests for compute_diacritic_score(ground_truth, ocr_text)."""

    def test_no_diacritics(self):
        """Without any diacritic in the ground truth, the score is 1.0."""
        outcome = compute_diacritic_score("bonjour", "bonjour")
        assert outcome.score == pytest.approx(1.0)
        assert outcome.total_in_gt == 0

    def test_accent_preserved(self):
        """When every accent is kept, all diacritics count as recognised."""
        outcome = compute_diacritic_score("été", "été")
        assert outcome.score == pytest.approx(1.0)
        assert outcome.correctly_recognized == outcome.total_in_gt

    def test_accent_lost(self):
        """Reading 'é' as 'e' is a lost diacritic and lowers the score."""
        outcome = compute_diacritic_score("étude", "etude")
        assert outcome.total_in_gt >= 1
        assert outcome.correctly_recognized < outcome.total_in_gt
        assert outcome.score < 1.0

    def test_cedille_tracked(self):
        """The cedilla is tracked like the accents."""
        outcome = compute_diacritic_score("façon", "facon")
        assert outcome.total_in_gt >= 1
        assert outcome.score < 1.0

    def test_empty_texts(self):
        """Two empty texts score 1.0."""
        outcome = compute_diacritic_score("", "")
        assert outcome.score == pytest.approx(1.0)

    def test_as_dict_structure(self):
        """as_dict exposes the totals and the score."""
        exported = compute_diacritic_score("été", "ete").as_dict()
        for expected_key in ("total_in_gt", "correctly_recognized", "score"):
            assert expected_key in exported
263
+
264
+
265
class TestAggregateLigatureScores:
    """Tests for aggregate_ligature_scores(list_of_scores)."""

    def test_empty_list(self):
        """Aggregating nothing gives a perfect score over zero ligatures."""
        summary = aggregate_ligature_scores([])
        assert summary["score"] == pytest.approx(1.0)
        assert summary["total_in_gt"] == 0

    def test_aggregation(self):
        """Totals are summed and the score is recomputed from them (5/6)."""
        partial = LigatureScore(total_in_gt=4, correctly_recognized=3, score=0.75)
        perfect = LigatureScore(total_in_gt=2, correctly_recognized=2, score=1.0)
        summary = aggregate_ligature_scores([partial, perfect])
        assert summary["total_in_gt"] == 6
        assert summary["correctly_recognized"] == 5
        assert summary["score"] == pytest.approx(5/6, abs=1e-4)
279
+
280
+
281
class TestAggregateDiacriticScores:
    """Tests for aggregate_diacritic_scores(list_of_scores)."""

    def test_aggregation(self):
        """Ground-truth and recognised totals are summed across scores."""
        partial = DiacriticScore(total_in_gt=10, correctly_recognized=8, score=0.8)
        perfect = DiacriticScore(total_in_gt=5, correctly_recognized=5, score=1.0)
        summary = aggregate_diacritic_scores([partial, perfect])
        assert summary["total_in_gt"] == 15
        assert summary["correctly_recognized"] == 13
289
+
290
+
291
+ # ===========================================================================
292
+ # Tests TaxonomyResult
293
+ # ===========================================================================
294
+
295
+ from picarones.core.taxonomy import (
296
+ TaxonomyResult,
297
+ ERROR_CLASSES,
298
+ classify_errors,
299
+ aggregate_taxonomy,
300
+ VISUAL_CONFUSIONS,
301
+ )
302
+
303
+
304
class TestErrorClasses:
    """Tests on the ERROR_CLASSES taxonomy definition."""

    def test_nine_classes(self):
        """The taxonomy defines exactly nine error classes."""
        assert len(ERROR_CLASSES) == 9

    def test_class_names(self):
        """The expected class names are all part of the taxonomy."""
        for expected_name in (
            "visual_confusion",
            "diacritic_error",
            "case_error",
            "ligature_error",
            "lacuna",
        ):
            assert expected_name in ERROR_CLASSES
316
+
317
class TestClassifyErrors:
    """Tests for classify_errors(ground_truth, ocr_text)."""

    def test_identical_texts(self):
        """Identical texts contain no error."""
        result = classify_errors("bonjour monde", "bonjour monde")
        assert result.total_errors == 0

    def test_empty_texts(self):
        """Two empty texts contain no error."""
        result = classify_errors("", "")
        assert result.total_errors == 0

    def test_case_error_detected(self):
        """A change of letter case is counted as a case_error."""
        result = classify_errors("Bonjour Monde", "bonjour monde")
        assert result.counts["case_error"] >= 1

    def test_diacritic_error_detected(self):
        """A lost accent is counted as a diacritic_error."""
        result = classify_errors("été chez nous", "ete chez nous")
        assert result.counts["diacritic_error"] >= 1

    def test_lacuna_detected(self):
        """Missing words at the end of the OCR output are counted as a lacuna."""
        result = classify_errors("le chat dort paisiblement", "le chat")
        assert result.counts["lacuna"] >= 1

    def test_segmentation_detected(self):
        """'hello world' merged into 'helloworld' must be classified without failing.

        The merge may also be classified as a hapax, so only the presence of
        a non-negative segmentation_error count is checked.
        NOTE(review): ``>= 0`` presumably always holds for a count, making
        this a smoke test — tighten once the expected class is settled.
        """
        result = classify_errors("hello world test", "helloworld test")
        assert result.counts["segmentation_error"] >= 0

    def test_ligature_error_detected(self):
        """A GT U+FB01 ligature read as plain 'fi' must be classified without failing.

        The ligature is written as an explicit escape so the ground truth
        really differs from the OCR text; with both sides in plain ASCII the
        test would compare two identical strings.
        NOTE(review): ``>= 0`` presumably always holds for a count, making
        this a smoke test — 'fi' is expected to be an accepted equivalent of
        the ligature, not an error.
        """
        result = classify_errors("\uFB01n de siècle", "fin de siècle")
        assert result.total_errors >= 0

    def test_as_dict_structure(self):
        """as_dict exposes counts, total, distribution and examples."""
        exported = classify_errors("test erreur ici", "test erreur là").as_dict()
        for expected_key in (
            "counts",
            "total_errors",
            "class_distribution",
            "examples",
        ):
            assert expected_key in exported

    def test_from_dict_roundtrip(self):
        """as_dict followed by from_dict preserves totals and counts."""
        result = classify_errors("bonjour monde", "Bonjour monde")
        restored = TaxonomyResult.from_dict(result.as_dict())
        assert restored.total_errors == result.total_errors
        assert restored.counts == result.counts

    def test_class_distribution_sums_to_one(self):
        """A non-empty class distribution sums to 1."""
        result = classify_errors("abc def ghi", "xyz uvw rst")
        distribution = result.class_distribution
        # An empty distribution (no classified error) is not checked.
        if distribution:
            assert abs(sum(distribution.values()) - 1.0) < 1e-6

    def test_all_classes_in_counts(self):
        """Every taxonomy class has an entry in counts, even when zero."""
        result = classify_errors("test", "teSt")
        for error_class in ERROR_CLASSES:
            assert error_class in result.counts
376
+
377
+
378
class TestAggregateTaxonomy:
    """Aggregation of per-document TaxonomyResult objects."""

    @staticmethod
    def _taxonomy(visual_confusion, diacritic_error):
        """Build a TaxonomyResult with two non-zero classes; all others are 0."""
        counts = {
            "visual_confusion": visual_confusion,
            "diacritic_error": diacritic_error,
        }
        # Fill in the remaining classes after the two explicit ones.
        for class_name in ERROR_CLASSES:
            if class_name not in ["visual_confusion", "diacritic_error"]:
                counts[class_name] = 0
        return TaxonomyResult(
            counts=counts,
            total_errors=visual_confusion + diacritic_error,
        )

    def test_empty(self):
        """Aggregating nothing reports zero errors."""
        aggregated = aggregate_taxonomy([])
        assert aggregated["total_errors"] == 0

    def test_sums_counts(self):
        """Per-class counts and the grand total are summed across documents."""
        first = self._taxonomy(2, 1)
        second = self._taxonomy(1, 3)
        aggregated = aggregate_taxonomy([first, second])
        assert aggregated["counts"]["visual_confusion"] == 3
        assert aggregated["counts"]["diacritic_error"] == 4
        assert aggregated["total_errors"] == 7
397
+
398
+
399
+ # ===========================================================================
400
+ # Tests StructureResult
401
+ # ===========================================================================
402
+
403
+ from picarones.core.structure import (
404
+ StructureResult,
405
+ analyze_structure,
406
+ aggregate_structure,
407
+ )
408
+
409
+
410
class TestAnalyzeStructure:
    """Behaviour of ``analyze_structure(ground_truth, ocr_output)``."""

    def test_identical_single_line(self):
        """One identical line: one line on each side, no fusion or fragmentation."""
        analysis = analyze_structure("ligne unique", "ligne unique")
        assert analysis.gt_line_count == 1
        assert analysis.ocr_line_count == 1
        assert analysis.line_fusion_count == 0
        assert analysis.line_fragmentation_count == 0

    def test_empty_texts(self):
        """Empty inputs count zero lines on both sides."""
        analysis = analyze_structure("", "")
        assert analysis.gt_line_count == 0
        assert analysis.ocr_line_count == 0

    def test_multiline_equal(self):
        """Three identical lines are counted as three on each side."""
        reference = "ligne 1\nligne 2\nligne 3"
        analysis = analyze_structure(reference, reference)
        assert analysis.gt_line_count == 3
        assert analysis.ocr_line_count == 3

    def test_line_fusion_detected(self):
        """Merging two lines into one lowers the OCR line count."""
        reference = "ligne 1\nligne 2\nligne 3"
        hypothesis = "ligne 1 ligne 2\nligne 3"  # two lines fused into one
        analysis = analyze_structure(reference, hypothesis)
        # The OCR output has fewer lines than the ground truth.
        assert analysis.ocr_line_count < analysis.gt_line_count

    def test_reading_order_score_perfect(self):
        """Identical text gets a near-perfect reading-order score."""
        sentence = "le chat dort ici"
        analysis = analyze_structure(sentence, sentence)
        assert analysis.reading_order_score > 0.9

    def test_reading_order_score_low_for_scrambled(self):
        """Reversed word order scores below 1.0."""
        reference = "le chat dort paisiblement sur le canapé"
        hypothesis = "canapé sur le paisiblement dort chat le"
        analysis = analyze_structure(reference, hypothesis)
        assert analysis.reading_order_score < 1.0

    def test_line_accuracy_perfect(self):
        """Matching line structure gives a line accuracy of 1.0."""
        reference = "ligne 1\nligne 2"
        hypothesis = "ligne 1\nligne 2"
        analysis = analyze_structure(reference, hypothesis)
        assert analysis.line_accuracy == pytest.approx(1.0)

    def test_line_accuracy_degraded(self):
        """Losing three of four lines gives a line accuracy below 1.0."""
        reference = "ligne 1\nligne 2\nligne 3\nligne 4"
        hypothesis = "ligne 1"
        analysis = analyze_structure(reference, hypothesis)
        assert analysis.line_accuracy < 1.0

    def test_as_dict_structure(self):
        """The serialised form exposes every expected metric key."""
        serialised = analyze_structure(
            "ligne 1\nligne 2", "ligne 1\nligne 2"
        ).as_dict()
        expected_keys = (
            "gt_line_count",
            "ocr_line_count",
            "line_fusion_count",
            "line_fragmentation_count",
            "reading_order_score",
            "paragraph_conservation_score",
            "line_accuracy",
        )
        for metric in expected_keys:
            assert metric in serialised

    def test_from_dict_roundtrip(self):
        """as_dict() followed by from_dict() preserves both line counts."""
        analysis = analyze_structure("a\nb\nc", "a\nb")
        rebuilt = StructureResult.from_dict(analysis.as_dict())
        assert rebuilt.gt_line_count == analysis.gt_line_count
        assert rebuilt.ocr_line_count == analysis.ocr_line_count

    def test_line_fusion_rate_property(self):
        """2 fusions over 10 ground-truth lines gives a rate of 0.2."""
        analysis = StructureResult(
            gt_line_count=10, ocr_line_count=8, line_fusion_count=2
        )
        assert analysis.line_fusion_rate == pytest.approx(0.2)

    def test_line_fragmentation_rate_property(self):
        """3 fragmentations over 5 ground-truth lines gives a rate of 0.6."""
        analysis = StructureResult(
            gt_line_count=5, ocr_line_count=8, line_fragmentation_count=3
        )
        assert analysis.line_fragmentation_rate == pytest.approx(0.6)
483
+
484
+
485
class TestAggregateStructure:
    """Aggregation of per-document StructureResult objects."""

    def test_empty(self):
        """Aggregating nothing returns an empty dict."""
        assert aggregate_structure([]) == {}

    def test_single_result(self):
        """A single document passes its scores through and is counted once."""
        only = StructureResult(
            gt_line_count=5,
            ocr_line_count=5,
            reading_order_score=0.9,
            paragraph_conservation_score=1.0,
        )
        summary = aggregate_structure([only])
        assert summary["mean_reading_order_score"] == pytest.approx(0.9)
        assert summary["document_count"] == 1

    def test_mean_fusion_rate(self):
        """The mean fusion rate is the average of the per-document rates."""
        documents = [
            StructureResult(gt_line_count=10, ocr_line_count=8, line_fusion_count=2),
            StructureResult(gt_line_count=10, ocr_line_count=6, line_fusion_count=4),
        ]
        summary = aggregate_structure(documents)
        # Fusion rates are 0.2 and 0.4, so the mean is 0.3.
        assert summary["mean_line_fusion_rate"] == pytest.approx(0.3, rel=1e-3)
506
+
507
+
508
+ # ===========================================================================
509
+ # Tests ImageQualityResult
510
+ # ===========================================================================
511
+
512
+ from picarones.core.image_quality import (
513
+ ImageQualityResult,
514
+ generate_mock_quality_scores,
515
+ aggregate_image_quality,
516
+ _global_quality_score,
517
+ )
518
+
519
+
520
class TestImageQualityResult:
    """Quality tiers and (de)serialisation of ImageQualityResult."""

    def test_quality_tier_good(self):
        """A score of 0.8 is in the "good" tier and flagged as good quality."""
        quality = ImageQualityResult(quality_score=0.8)
        assert quality.quality_tier == "good"
        assert quality.is_good_quality is True

    def test_quality_tier_medium(self):
        """A score of 0.55 is in the "medium" tier and not flagged as good."""
        quality = ImageQualityResult(quality_score=0.55)
        assert quality.quality_tier == "medium"
        assert quality.is_good_quality is False

    def test_quality_tier_poor(self):
        """A score of 0.2 is in the "poor" tier."""
        quality = ImageQualityResult(quality_score=0.2)
        assert quality.quality_tier == "poor"

    def test_as_dict_structure(self):
        """The serialised form exposes all metric keys plus the derived tier."""
        serialised = ImageQualityResult(
            sharpness_score=0.8,
            noise_level=0.1,
            rotation_degrees=0.5,
            contrast_score=0.9,
            quality_score=0.75,
            analysis_method="mock",
        ).as_dict()
        expected_keys = (
            "sharpness_score",
            "noise_level",
            "rotation_degrees",
            "contrast_score",
            "quality_score",
            "quality_tier",
            "analysis_method",
        )
        for field_name in expected_keys:
            assert field_name in serialised

    def test_from_dict_roundtrip(self):
        """as_dict() followed by from_dict() preserves scores and method."""
        original = ImageQualityResult(
            sharpness_score=0.7,
            noise_level=0.2,
            rotation_degrees=1.0,
            contrast_score=0.8,
            quality_score=0.65,
            analysis_method="pillow",
        )
        rebuilt = ImageQualityResult.from_dict(original.as_dict())
        assert rebuilt.sharpness_score == pytest.approx(original.sharpness_score, rel=1e-3)
        assert rebuilt.quality_score == pytest.approx(original.quality_score, rel=1e-3)
        assert rebuilt.analysis_method == original.analysis_method

    def test_from_dict_ignores_quality_tier(self):
        """from_dict() accepts a payload that also carries "quality_tier"."""
        # quality_tier is a property, not an init parameter, so from_dict
        # must ignore it.
        payload = {
            "sharpness_score": 0.5,
            "noise_level": 0.3,
            "rotation_degrees": 0.0,
            "contrast_score": 0.6,
            "quality_score": 0.5,
            "analysis_method": "mock",
            "quality_tier": "medium",  # must be ignored
        }
        rebuilt = ImageQualityResult.from_dict(payload)
        assert rebuilt.quality_score == pytest.approx(0.5)
570
+
571
+
572
class TestGenerateMockQualityScores:
    """Behaviour of the mock image-quality generator."""

    def test_returns_image_quality_result(self):
        """The generator returns an ImageQualityResult instance."""
        mock_quality = generate_mock_quality_scores("folio_001")
        assert isinstance(mock_quality, ImageQualityResult)

    def test_scores_in_range(self):
        """The four normalised scores all lie within [0, 1]."""
        mock_quality = generate_mock_quality_scores("folio_001", seed=42)
        assert 0.0 <= mock_quality.quality_score <= 1.0
        assert 0.0 <= mock_quality.sharpness_score <= 1.0
        assert 0.0 <= mock_quality.noise_level <= 1.0
        assert 0.0 <= mock_quality.contrast_score <= 1.0

    def test_reproducible_with_seed(self):
        """Same document id and seed give the same quality score."""
        first = generate_mock_quality_scores("folio_001", seed=42)
        second = generate_mock_quality_scores("folio_001", seed=42)
        assert first.quality_score == second.quality_score

    def test_analysis_method_mock(self):
        """Generated results are labelled with the "mock" analysis method."""
        mock_quality = generate_mock_quality_scores("folio_001")
        assert mock_quality.analysis_method == "mock"

    def test_no_error(self):
        """Generated results carry no error."""
        mock_quality = generate_mock_quality_scores("folio_001")
        assert mock_quality.error is None
597
+
598
+
599
class TestGlobalQualityScore:
    """Boundary behaviour of the combined quality score helper."""

    def test_perfect_input(self):
        """Best-case metrics combine to a score of 1.0."""
        combined = _global_quality_score(
            sharpness=1.0, noise=0.0, rotation_abs=0.0, contrast=1.0
        )
        assert combined == pytest.approx(1.0)

    def test_worst_input(self):
        """Worst-case metrics combine to a score of 0.0."""
        combined = _global_quality_score(
            sharpness=0.0, noise=1.0, rotation_abs=10.0, contrast=0.0
        )
        assert combined == pytest.approx(0.0)

    def test_medium_input(self):
        """Mid-range metrics combine to a score strictly between 0 and 1."""
        combined = _global_quality_score(
            sharpness=0.5, noise=0.5, rotation_abs=0.0, contrast=0.5
        )
        assert 0.0 < combined < 1.0
612
+
613
+
614
class TestAggregateImageQuality:
    """Aggregation of per-document ImageQualityResult objects."""

    def test_empty_list(self):
        """Aggregating nothing returns an empty dict."""
        assert aggregate_image_quality([]) == {}

    def test_single_result(self):
        """A single document passes its score through and is counted once."""
        only = ImageQualityResult(quality_score=0.75, analysis_method="mock")
        summary = aggregate_image_quality([only])
        assert summary["mean_quality_score"] == pytest.approx(0.75)
        assert summary["document_count"] == 1

    def test_tier_distribution(self):
        """One document per tier gives a count of 1 in each tier."""
        # Scores map to the good, medium and poor tiers respectively.
        documents = [
            ImageQualityResult(quality_score=score, analysis_method="mock")
            for score in (0.8, 0.5, 0.2)
        ]
        distribution = aggregate_image_quality(documents)["quality_distribution"]
        assert distribution["good"] == 1
        assert distribution["medium"] == 1
        assert distribution["poor"] == 1

    def test_scores_list_present(self):
        """The aggregate exposes a "scores" list with one entry per document."""
        documents = [ImageQualityResult(quality_score=0.6, analysis_method="mock")]
        summary = aggregate_image_quality(documents)
        assert "scores" in summary
        assert len(summary["scores"]) == 1

    def test_errors_excluded(self):
        """Results carrying an error are left out of the document count."""
        valid = ImageQualityResult(quality_score=0.8, analysis_method="mock")
        failed = ImageQualityResult(
            quality_score=0.0, analysis_method="none", error="file not found"
        )
        summary = aggregate_image_quality([valid, failed])
        # Only the result without an error is counted.
        assert summary["document_count"] == 1
650
+
651
+
652
+ # ===========================================================================
653
+ # Tests d'intégration Sprint 5 (fixtures + rapport)
654
+ # ===========================================================================
655
+
656
class TestFixturesSprint5:
    """The sample benchmark fixture carries all Sprint 5 analysis data."""

    @staticmethod
    def _benchmark():
        """Generate a fresh sample benchmark (imported lazily, as in each test)."""
        from picarones.fixtures import generate_sample_benchmark
        return generate_sample_benchmark()

    def test_doc_result_has_confusion_matrix(self):
        """The first document of every engine has a confusion matrix."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            # Index the first document directly, like the sibling tests do:
            # the previous loop-with-break passed silently when an engine
            # had no document results at all.
            dr = er.document_results[0]
            assert dr.confusion_matrix is not None, (
                f"confusion_matrix manquante pour {er.engine_name}/{dr.doc_id}"
            )

    def test_doc_result_has_char_scores(self):
        """The first document of every engine has ligature and diacritic scores."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            dr = er.document_results[0]
            assert dr.char_scores is not None
            assert "ligature" in dr.char_scores
            assert "diacritic" in dr.char_scores

    def test_doc_result_has_taxonomy(self):
        """The first document of every engine has taxonomy counts and a total."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            dr = er.document_results[0]
            assert dr.taxonomy is not None
            assert "counts" in dr.taxonomy
            assert "total_errors" in dr.taxonomy

    def test_doc_result_has_structure(self):
        """The first document of every engine has structure data."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            dr = er.document_results[0]
            assert dr.structure is not None
            assert "gt_line_count" in dr.structure

    def test_doc_result_has_image_quality(self):
        """The first document of every engine has image-quality data."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            dr = er.document_results[0]
            assert dr.image_quality is not None
            assert "quality_score" in dr.image_quality

    def test_engine_report_has_aggregated_confusion(self):
        """Every engine report has an aggregated confusion matrix."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            assert er.aggregated_confusion is not None
            assert "matrix" in er.aggregated_confusion

    def test_engine_report_has_aggregated_char_scores(self):
        """Every engine report has aggregated ligature and diacritic scores."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            assert er.aggregated_char_scores is not None
            assert "ligature" in er.aggregated_char_scores
            assert "diacritic" in er.aggregated_char_scores

    def test_engine_report_ligature_score_property(self):
        """Every engine's ligature_score is set and lies within [0, 1]."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            score = er.ligature_score
            assert score is not None
            assert 0.0 <= score <= 1.0

    def test_engine_report_diacritic_score_property(self):
        """Every engine's diacritic_score is set and lies within [0, 1]."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            score = er.diacritic_score
            assert score is not None
            assert 0.0 <= score <= 1.0

    def test_engine_report_has_aggregated_taxonomy(self):
        """Every engine report has an aggregated taxonomy with a total."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            assert er.aggregated_taxonomy is not None
            assert "total_errors" in er.aggregated_taxonomy

    def test_engine_report_has_aggregated_structure(self):
        """Every engine report has aggregated structure data."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            assert er.aggregated_structure is not None
            assert "mean_reading_order_score" in er.aggregated_structure

    def test_engine_report_has_aggregated_image_quality(self):
        """Every engine report has aggregated image-quality data."""
        bm = self._benchmark()
        for er in bm.engine_reports:
            assert er.aggregated_image_quality is not None
            assert "mean_quality_score" in er.aggregated_image_quality

    def test_bad_engine_has_more_errors(self):
        """The "ancien_moteur" engine has more taxonomy errors than "pero_ocr"."""
        bm = self._benchmark()
        pero = next(er for er in bm.engine_reports if er.engine_name == "pero_ocr")
        bad = next(er for er in bm.engine_reports if er.engine_name == "ancien_moteur")
        assert bad.aggregated_taxonomy["total_errors"] > pero.aggregated_taxonomy["total_errors"]
761
+
762
+
763
class TestReportSprint5:
    """Sprint 5 data appears in the report payload, the HTML and the JSON export."""

    @staticmethod
    def _report_data():
        """Build the report data dict for a fresh sample benchmark."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data
        return _build_report_data(generate_sample_benchmark(), {})

    @staticmethod
    def _render_html(tmp_path):
        """Generate the HTML report into ``tmp_path`` and return its text."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator
        out = tmp_path / "report.html"
        ReportGenerator(generate_sample_benchmark()).generate(out)
        return out.read_text(encoding="utf-8")

    def test_report_data_has_ligature_score(self):
        """Every engine entry carries a ligature score."""
        data = self._report_data()
        for eng in data["engines"]:
            assert "ligature_score" in eng, f"ligature_score manquant pour {eng['name']}"

    def test_report_data_has_diacritic_score(self):
        """Every engine entry carries a diacritic score."""
        data = self._report_data()
        for eng in data["engines"]:
            assert "diacritic_score" in eng

    def test_report_data_has_aggregated_taxonomy(self):
        """Every engine entry carries an aggregated taxonomy."""
        data = self._report_data()
        for eng in data["engines"]:
            assert "aggregated_taxonomy" in eng

    def test_report_data_has_aggregated_image_quality(self):
        """Every engine entry carries aggregated image-quality data."""
        data = self._report_data()
        for eng in data["engines"]:
            assert "aggregated_image_quality" in eng

    def test_html_has_characters_tab(self, tmp_path):
        """The HTML report contains the "Caractères" label."""
        assert "Caractères" in self._render_html(tmp_path)

    def test_html_has_ligatures_column(self, tmp_path):
        """The HTML report contains the "Ligatures" label."""
        assert "Ligatures" in self._render_html(tmp_path)

    def test_html_has_diacritiques_column(self, tmp_path):
        """The HTML report contains the "Diacritiques" label."""
        assert "Diacritiques" in self._render_html(tmp_path)

    def test_html_has_scatter_plot(self, tmp_path):
        """The HTML report contains the quality/CER chart id."""
        assert "chart-quality-cer" in self._render_html(tmp_path)

    def test_html_has_taxonomy_chart(self, tmp_path):
        """The HTML report contains the taxonomy chart id."""
        assert "chart-taxonomy" in self._render_html(tmp_path)

    def test_html_has_confusion_heatmap(self, tmp_path):
        """The HTML report contains the confusion heatmap id or its caption."""
        html = self._render_html(tmp_path)
        assert "confusion-heatmap" in html or "matrice de confusion" in html.lower()

    def test_doc_results_have_image_quality_in_report(self):
        """At least one engine result of the first document has image_quality."""
        data = self._report_data()
        doc = data["documents"][0]
        has_iq = any("image_quality" in er for er in doc["engine_results"])
        assert has_iq, "Aucun document result n'a de données image_quality"

    def test_json_export_contains_sprint5_data(self, tmp_path):
        """The JSON export carries Sprint 5 keys at engine and document level."""
        from picarones.fixtures import generate_sample_benchmark
        import json
        bm = generate_sample_benchmark()
        out = tmp_path / "results.json"
        bm.to_json(out)
        # Read with an explicit encoding, like the HTML reads in this class,
        # so the test does not depend on the platform's default locale.
        data = json.loads(out.read_text(encoding="utf-8"))
        # Check the engine-level aggregates.
        er = data["engine_reports"][0]
        assert "aggregated_taxonomy" in er
        assert "aggregated_char_scores" in er
        # Check the per-document results.
        dr = er["document_results"][0]
        assert "taxonomy" in dr
        assert "char_scores" in dr
        assert "structure" in dr
tests/test_sprint6_web_interface.py ADDED
@@ -0,0 +1,982 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests Sprint 6 — Interface web FastAPI, import HTR-United, HuggingFace, serve CLI.
2
+
3
+ Classes de tests
4
+ ----------------
5
+ TestHTRUnitedEntry (8 tests) — dataclass, as_dict, from_dict, century_str
6
+ TestHTRUnitedCatalogue (10 tests) — from_demo, search, get_by_id, available_languages/scripts
7
+ TestHTRUnitedSearch (8 tests) — recherche textuelle, filtre langue, script, siècle
8
+ TestHTRUnitedImport (4 tests) — import_htr_united_corpus crée les fichiers meta
9
+ TestHuggingFaceDataset (7 tests) — dataclass, as_dict, from_dict, hf_url
10
+ TestHuggingFaceImporter (10 tests) — search référence, filtres, import
11
+ TestHuggingFaceReferenceData (4 tests) — datasets de référence pré-intégrés
12
+ TestNormalizationProfiles (8 tests) — profils disponibles via API route
13
+ TestFastAPIStatus (3 tests) — GET /api/status
14
+ TestFastAPIEngines (8 tests) — GET /api/engines
15
+ TestFastAPICorpusBrowse (6 tests) — GET /api/corpus/browse
16
+ TestFastAPIReports (5 tests) — GET /api/reports
17
+ TestFastAPIHTRUnited (7 tests) — GET /api/htr-united/catalogue + POST import
18
+ TestFastAPIHuggingFace (6 tests) — GET /api/huggingface/search + POST import
19
+ TestFastAPIBenchmark (8 tests) — POST start, GET status, GET stream, POST cancel
20
+ TestFastAPIHTML (5 tests) — GET / retourne HTML valide
21
+ TestFastAPIReportServe (4 tests) — GET /reports/{filename}
22
+ TestCLIServeCommand (5 tests) — commande picarones serve enregistrée
23
+ TestRunnerProgressCallback (5 tests) — progress_callback injecté dans run_benchmark
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import json
29
+ import os
30
+ import tempfile
31
+ import threading
32
+ import time
33
+ from pathlib import Path
34
+ from unittest.mock import MagicMock, patch
35
+
36
+ import pytest
37
+ from click.testing import CliRunner
38
+ from fastapi.testclient import TestClient
39
+
40
+ # ---------------------------------------------------------------------------
41
+ # Fixtures
42
+ # ---------------------------------------------------------------------------
43
+
44
@pytest.fixture
def tmp_corpus(tmp_path):
    """Create a minimal two-document corpus in ``tmp_path`` and return the path.

    Each document is a 100x50 grey JPEG plus a ``.gt.txt`` ground-truth file.
    """
    from PIL import Image

    for index in range(2):
        stem = f"doc_{index:02d}"
        page = Image.new("RGB", (100, 50), color=(200, 200, 200))
        page.save(tmp_path / f"{stem}.jpg")
        transcript = tmp_path / f"{stem}.gt.txt"
        transcript.write_text(f"Texte vérité terrain {index}", encoding="utf-8")
    return tmp_path
53
+
54
+
55
@pytest.fixture
def client():
    """Return a FastAPI TestClient bound to the Picarones web application."""
    from picarones.web.app import app as web_app

    test_client = TestClient(web_app)
    return test_client
59
+
60
+
61
@pytest.fixture
def htr_catalogue():
    """Return the HTR-United catalogue built by ``HTRUnitedCatalogue.from_demo()``."""
    from picarones.importers import htr_united

    demo_catalogue = htr_united.HTRUnitedCatalogue.from_demo()
    return demo_catalogue
65
+
66
+
67
@pytest.fixture
def hf_importer():
    """Return a HuggingFaceImporter built with its default arguments."""
    from picarones.importers import huggingface

    importer = huggingface.HuggingFaceImporter()
    return importer
71
+
72
+
73
+ # ===========================================================================
74
+ # TestHTRUnitedEntry
75
+ # ===========================================================================
76
+
77
class TestHTRUnitedEntry:
    """Construction, defaults and (de)serialisation of HTRUnitedEntry."""

    def test_from_dict_basic(self):
        """from_dict() maps a fully populated dict onto the entry fields."""
        from picarones.importers.htr_united import HTRUnitedEntry
        payload = {
            "id": "test-corpus",
            "title": "Test Corpus",
            "url": "https://github.com/test/corpus",
            "language": ["French"],
            "script": ["Gothic"],
            "century": [14, 15],
            "institution": "BnF",
            "description": "Un corpus de test.",
            "license": "CC-BY 4.0",
            "lines": 5000,
            "format": "ALTO",
            "tags": ["test", "médiéval"],
        }
        entry = HTRUnitedEntry.from_dict(payload)
        assert entry.id == "test-corpus"
        assert entry.title == "Test Corpus"
        assert entry.language == ["French"]
        assert entry.lines == 5000

    def test_as_dict_roundtrip(self):
        """from_dict() followed by as_dict() preserves id, lines and format."""
        from picarones.importers.htr_united import HTRUnitedEntry
        payload = {
            "id": "rtrip",
            "title": "Round Trip",
            "url": "https://github.com/a/b",
            "language": ["Latin"],
            "script": ["Caroline"],
            "century": [9],
            "institution": "IRHT",
            "description": "Test.",
            "license": "CC0",
            "lines": 1000,
            "format": "PAGE",
            "tags": [],
        }
        serialised = HTRUnitedEntry.from_dict(payload).as_dict()
        assert serialised["id"] == "rtrip"
        assert serialised["lines"] == 1000
        assert serialised["format"] == "PAGE"

    def test_century_str_roman(self):
        """century_str renders each century as a roman-numeral ordinal."""
        from picarones.importers.htr_united import HTRUnitedEntry
        entry = HTRUnitedEntry(id="x", title="x", url="x", century=[12, 14])
        rendered = entry.century_str
        assert "XIIe" in rendered
        assert "XIVe" in rendered

    def test_century_str_single(self):
        """century_str handles a single century."""
        from picarones.importers.htr_united import HTRUnitedEntry
        entry = HTRUnitedEntry(id="x", title="x", url="x", century=[19])
        assert "XIXe" in entry.century_str

    def test_default_fields(self):
        """Optional fields default to empty lists, 0 lines and ALTO format."""
        from picarones.importers.htr_united import HTRUnitedEntry
        entry = HTRUnitedEntry(id="minimal", title="Min", url="http://x")
        assert entry.language == []
        assert entry.lines == 0
        assert entry.format == "ALTO"
        assert entry.tags == []

    def test_from_dict_missing_fields(self):
        """from_dict() tolerates a sparse dict and fills in defaults."""
        from picarones.importers.htr_united import HTRUnitedEntry
        entry = HTRUnitedEntry.from_dict({"id": "sparse", "title": "Sparse"})
        assert entry.id == "sparse"
        assert entry.institution == ""
        assert entry.lines == 0

    def test_as_dict_has_all_keys(self):
        """as_dict() exposes every documented key."""
        from picarones.importers.htr_united import HTRUnitedEntry
        serialised = HTRUnitedEntry(id="k", title="K", url="http://k").as_dict()
        expected_keys = (
            "id", "title", "url", "language", "script", "century",
            "institution", "description", "license", "lines", "format", "tags",
        )
        for key in expected_keys:
            assert key in serialised, f"Missing key: {key}"

    def test_url_preserved(self):
        """The URL passed to the constructor is stored unchanged."""
        from picarones.importers.htr_united import HTRUnitedEntry
        repository_url = "https://github.com/HTR-United/cremma-medieval"
        entry = HTRUnitedEntry(id="c", title="CREMMA", url=repository_url)
        assert entry.url == repository_url
147
+
148
+
149
+ # ===========================================================================
150
+ # TestHTRUnitedCatalogue
151
+ # ===========================================================================
152
+
153
class TestHTRUnitedCatalogue:
    """Content and lookup helpers of the demo HTR-United catalogue."""

    def test_from_demo_length(self, htr_catalogue):
        """The demo catalogue has at least six entries."""
        assert len(htr_catalogue) >= 6

    def test_from_demo_source(self, htr_catalogue):
        """The demo catalogue reports "demo" as its source."""
        assert htr_catalogue.source == "demo"

    def test_all_entries_have_id(self, htr_catalogue):
        """Every entry has a non-empty id."""
        for entry in htr_catalogue.entries:
            assert entry.id, f"Entry missing id: {entry}"

    def test_all_entries_have_title(self, htr_catalogue):
        """Every entry has a non-empty title."""
        for entry in htr_catalogue.entries:
            assert entry.title

    def test_get_by_id_found(self, htr_catalogue):
        """get_by_id() returns the entry whose id matches."""
        wanted_id = htr_catalogue.entries[0].id
        match = htr_catalogue.get_by_id(wanted_id)
        assert match is not None
        assert match.id == wanted_id

    def test_get_by_id_not_found(self, htr_catalogue):
        """get_by_id() returns None for an unknown id."""
        assert htr_catalogue.get_by_id("nonexistent-corpus-xyz") is None

    def test_available_languages_non_empty(self, htr_catalogue):
        """available_languages() returns a non-empty list."""
        languages = htr_catalogue.available_languages()
        assert len(languages) > 0
        assert isinstance(languages, list)

    def test_available_languages_sorted(self, htr_catalogue):
        """available_languages() returns its values in sorted order."""
        languages = htr_catalogue.available_languages()
        assert languages == sorted(languages)

    def test_available_scripts_non_empty(self, htr_catalogue):
        """available_scripts() returns at least one script."""
        assert len(htr_catalogue.available_scripts()) > 0

    def test_len(self, htr_catalogue):
        """len(catalogue) equals the number of entries."""
        assert len(htr_catalogue) == len(htr_catalogue.entries)
194
+
195
+
196
+ # ===========================================================================
197
+ # TestHTRUnitedSearch
198
+ # ===========================================================================
199
+
200
class TestHTRUnitedSearch:
    """Text search and filters of ``HTRUnitedCatalogue.search()``."""

    def test_search_empty_returns_all(self, htr_catalogue):
        """search() without arguments returns every entry."""
        hits = htr_catalogue.search()
        assert len(hits) == len(htr_catalogue.entries)

    def test_search_by_query(self, htr_catalogue):
        """Every hit for "médiéval" has it in its title, description or tags."""
        hits = htr_catalogue.search(query="médiéval")
        assert len(hits) > 0
        for entry in hits:
            haystack = (entry.title + entry.description + " ".join(entry.tags)).lower()
            assert "médiéval" in haystack

    def test_search_by_language(self, htr_catalogue):
        """Every hit for language "French" lists a French language."""
        hits = htr_catalogue.search(language="French")
        assert len(hits) > 0
        for entry in hits:
            assert any("french" in language.lower() for language in entry.language)

    def test_search_by_language_latin(self, htr_catalogue):
        """The demo catalogue has at least one Latin corpus."""
        hits = htr_catalogue.search(language="Latin")
        assert len(hits) > 0

    def test_search_by_script(self, htr_catalogue):
        """The demo catalogue has at least one Gothic-script corpus."""
        hits = htr_catalogue.search(script="Gothic")
        assert len(hits) > 0

    def test_search_no_results(self, htr_catalogue):
        """A nonsense query returns an empty list."""
        hits = htr_catalogue.search(query="xyzzy_corpus_inexistant_42")
        assert hits == []

    def test_search_combined_filters(self, htr_catalogue):
        """Combining filters returns a list without raising."""
        hits = htr_catalogue.search(query="", language="French", script="Cursiva")
        assert isinstance(hits, list)

    def test_search_century_min(self, htr_catalogue):
        """Every hit for century_min=18 has a century of 18 or later."""
        hits = htr_catalogue.search(century_min=18)
        for entry in hits:
            assert any(century >= 18 for century in entry.century)
240
+
241
+
242
+ # ===========================================================================
243
+ # TestHTRUnitedImport
244
+ # ===========================================================================
245
+
246
class TestHTRUnitedImport:
    """Checks what ``import_htr_united_corpus`` returns and leaves on disk."""

    def test_import_creates_meta_file(self, tmp_path, htr_catalogue):
        """The path reported under ``metadata_file`` exists after an import."""
        from picarones.importers.htr_united import import_htr_united_corpus
        first_entry = htr_catalogue.entries[0]
        outcome = import_htr_united_corpus(first_entry, tmp_path, max_samples=5)
        assert Path(outcome["metadata_file"]).exists()

    def test_import_meta_content(self, tmp_path, htr_catalogue):
        """The metadata file is JSON recording the source and the entry id."""
        from picarones.importers.htr_united import import_htr_united_corpus
        first_entry = htr_catalogue.entries[0]
        outcome = import_htr_united_corpus(first_entry, tmp_path, max_samples=5)
        metadata_path = Path(outcome["metadata_file"])
        metadata = json.loads(metadata_path.read_text())
        assert metadata["source"] == "htr-united"
        assert metadata["entry_id"] == first_entry.id

    def test_import_returns_dict_keys(self, tmp_path, htr_catalogue):
        """The returned mapping exposes all the expected keys."""
        from picarones.importers.htr_united import import_htr_united_corpus
        first_entry = htr_catalogue.entries[0]
        outcome = import_htr_united_corpus(first_entry, tmp_path, max_samples=5)
        expected_keys = [
            "entry_id", "title", "output_dir", "files_imported", "metadata_file",
        ]
        for key in expected_keys:
            assert key in outcome, f"Missing key: {key}"

    def test_import_creates_output_dir(self, tmp_path, htr_catalogue):
        """A nested output directory that does not exist yet is created."""
        from picarones.importers.htr_united import import_htr_united_corpus
        first_entry = htr_catalogue.entries[0]
        target_dir = tmp_path / "new_subdir" / "corpus"
        import_htr_united_corpus(first_entry, target_dir, max_samples=5)
        assert target_dir.exists()
276
+
277
+
278
+ # ===========================================================================
279
+ # TestHuggingFaceDataset
280
+ # ===========================================================================
281
+
282
class TestHuggingFaceDataset:
    """Checks construction, serialisation and helpers of ``HuggingFaceDataset``."""

    def test_from_dict_basic(self):
        """``from_dict`` copies id, language and download count from the mapping."""
        from picarones.importers.huggingface import HuggingFaceDataset
        payload = {
            "dataset_id": "test/dataset",
            "title": "Test Dataset",
            "description": "A test dataset.",
            "language": ["French"],
            "tags": ["ocr", "french"],
            "license": "cc-by-4.0",
            "institution": "Test Lab",
            "downloads": 500,
        }
        dataset = HuggingFaceDataset.from_dict(payload)
        assert dataset.dataset_id == "test/dataset"
        assert dataset.language == ["French"]
        assert dataset.downloads == 500

    def test_as_dict_roundtrip(self):
        """``as_dict`` reports the id and language given to the constructor."""
        from picarones.importers.huggingface import HuggingFaceDataset
        dataset = HuggingFaceDataset(
            dataset_id="a/b",
            title="AB",
            description="desc",
            language=["Latin"],
            tags=["htr"],
        )
        exported = dataset.as_dict()
        assert exported["dataset_id"] == "a/b"
        assert exported["language"] == ["Latin"]

    def test_hf_url(self):
        """``hf_url`` is the Hugging Face datasets URL for the dataset id."""
        from picarones.importers.huggingface import HuggingFaceDataset
        dataset = HuggingFaceDataset(dataset_id="CATMuS/medieval", title="CATMuS")
        expected_url = "https://huggingface.co/datasets/CATMuS/medieval"
        assert dataset.hf_url == expected_url

    def test_as_dict_has_all_keys(self):
        """``as_dict`` exposes every expected key, even for a minimal dataset."""
        from picarones.importers.huggingface import HuggingFaceDataset
        exported = HuggingFaceDataset(dataset_id="x/y", title="XY").as_dict()
        expected_keys = [
            "dataset_id", "title", "description", "language", "tags",
            "license", "size_category", "task", "institution", "downloads",
            "source",
        ]
        for key in expected_keys:
            assert key in exported, f"Missing: {key}"

    def test_default_source(self):
        """``source`` defaults to ``"reference"`` when not supplied."""
        from picarones.importers.huggingface import HuggingFaceDataset
        dataset = HuggingFaceDataset(dataset_id="x/y", title="XY")
        assert dataset.source == "reference"

    def test_from_dict_uses_id_as_fallback_title(self):
        """Without a ``title`` key, ``from_dict`` uses the dataset id as title."""
        from picarones.importers.huggingface import HuggingFaceDataset
        dataset = HuggingFaceDataset.from_dict({"dataset_id": "owner/repo"})
        assert dataset.title == "owner/repo"

    def test_replace_source_helper(self):
        """``_replace_source`` returns a changed copy and leaves the original alone."""
        from picarones.importers.huggingface import HuggingFaceDataset
        original = HuggingFaceDataset(dataset_id="x/y", title="XY", source="reference")
        changed = original._replace_source("api")
        assert changed.source == "api"
        # The instance the helper was called on keeps its own source.
        assert original.source == "reference"
336
+
337
+
338
+ # ===========================================================================
339
+ # TestHuggingFaceImporter
340
+ # ===========================================================================
341
+
342
class TestHuggingFaceImporter:
    """Checks ``search()`` and ``import_dataset()`` on the importer fixture."""

    def test_search_returns_list(self, hf_importer):
        """A search without criteria returns a non-empty list."""
        found = hf_importer.search()
        assert isinstance(found, list)
        assert len(found) > 0

    def test_search_reference_datasets(self, hf_importer):
        """With ``use_reference=True`` at least five datasets come back."""
        found = hf_importer.search(use_reference=True)
        assert len(found) >= 5

    def test_search_query_filter(self, hf_importer):
        """Searching for "RIMES" returns a dataset mentioning it in title or id."""
        found = hf_importer.search(query="RIMES", use_reference=True)
        assert len(found) >= 1
        assert any(
            "RIMES" in dataset.title or "rimes" in dataset.dataset_id.lower()
            for dataset in found
        )

    def test_search_language_filter(self, hf_importer):
        """Filtering on "French" only returns datasets listing a French language."""
        found = hf_importer.search(language="French", use_reference=True)
        assert len(found) > 0
        for dataset in found:
            assert any("french" in name.lower() for name in dataset.language)

    def test_search_tag_filter(self, hf_importer):
        """Filtering by tags returns a list."""
        found = hf_importer.search(tags=["historical"], use_reference=True)
        assert isinstance(found, list)

    def test_search_limit(self, hf_importer):
        """``limit`` caps the number of returned datasets."""
        max_results = 3
        found = hf_importer.search(limit=max_results)
        assert len(found) <= max_results

    def test_search_no_api_fallback(self, hf_importer):
        """Reference datasets are available even without network access."""
        found = hf_importer.search(query="medieval", use_reference=True)
        assert len(found) >= 1

    def test_import_creates_meta(self, tmp_path, hf_importer):
        """The path reported under ``metadata_file`` exists after an import."""
        outcome = hf_importer.import_dataset(
            "CATMuS/medieval", output_dir=tmp_path, max_samples=5
        )
        assert Path(outcome["metadata_file"]).exists()

    def test_import_meta_content(self, tmp_path, hf_importer):
        """The metadata file is JSON recording the dataset id and the source."""
        outcome = hf_importer.import_dataset(
            "CATMuS/medieval", output_dir=tmp_path, max_samples=5
        )
        metadata_path = Path(outcome["metadata_file"])
        metadata = json.loads(metadata_path.read_text())
        assert metadata["dataset_id"] == "CATMuS/medieval"
        assert metadata["source"] == "huggingface"

    def test_import_returns_dict_keys(self, tmp_path, hf_importer):
        """The returned mapping exposes all the expected keys."""
        outcome = hf_importer.import_dataset("x/y", output_dir=tmp_path, max_samples=5)
        expected_keys = ["dataset_id", "output_dir", "files_imported", "metadata_file"]
        for key in expected_keys:
            assert key in outcome
391
+
392
+
393
+ # ===========================================================================
394
+ # TestHuggingFaceReferenceData
395
+ # ===========================================================================
396
+
397
class TestHuggingFaceReferenceData:
    """Sanity checks on the module-level ``_REFERENCE_DATASETS`` records."""

    def test_reference_datasets_loaded(self):
        """At least five reference datasets are defined."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS
        reference_count = len(_REFERENCE_DATASETS)
        assert reference_count >= 5

    def test_catmus_present(self):
        """At least one reference dataset id mentions CATMuS (any casing)."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS
        dataset_ids = [record["dataset_id"] for record in _REFERENCE_DATASETS]
        assert any(
            "CATMuS" in dataset_id or "catmus" in dataset_id.lower()
            for dataset_id in dataset_ids
        )

    def test_all_have_required_fields(self):
        """Each record defines ``dataset_id``, ``title`` and ``language``."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS
        required_fields = ("dataset_id", "title", "language")
        for record in _REFERENCE_DATASETS:
            for field_name in required_fields:
                assert field_name in record

    def test_all_are_image_to_text(self):
        """Each record's task is ``image-to-text``, or is left unspecified."""
        from picarones.importers.huggingface import _REFERENCE_DATASETS
        expected_task = "image-to-text"
        for record in _REFERENCE_DATASETS:
            # A missing "task" key is treated as the expected task.
            assert record.get("task", "image-to-text") == expected_task
419
+
420
+
421
+ # ===========================================================================
422
+ # TestNormalizationProfiles
423
+ # ===========================================================================
424
+
425
class TestNormalizationProfiles:
    """Checks the payload served by ``GET /api/normalization/profiles``."""

    @staticmethod
    def _profile_list(client):
        """Fetch the endpoint and return the list under its ``profiles`` key."""
        return client.get("/api/normalization/profiles").json()["profiles"]

    @classmethod
    def _profile_ids(cls, client):
        """Return the ``id`` of every served profile."""
        return [profile["id"] for profile in cls._profile_list(client)]

    @classmethod
    def _profiles_by_id(cls, client):
        """Return the served profiles indexed by their ``id``."""
        return {profile["id"]: profile for profile in cls._profile_list(client)}

    def test_api_returns_profiles(self, client):
        """The endpoint answers 200 with at least four profiles."""
        response = client.get("/api/normalization/profiles")
        assert response.status_code == 200
        payload = response.json()
        assert "profiles" in payload
        assert len(payload["profiles"]) >= 4

    def test_nfc_profile_present(self, client):
        """A profile with id ``nfc`` is served."""
        assert "nfc" in self._profile_ids(client)

    def test_medieval_french_present(self, client):
        """A profile with id ``medieval_french`` is served."""
        assert "medieval_french" in self._profile_ids(client)

    def test_profiles_have_required_fields(self, client):
        """Every profile exposes the five expected keys."""
        required_fields = ("id", "name", "description", "caseless", "diplomatic_rules")
        for profile in self._profile_list(client):
            for field_name in required_fields:
                assert field_name in profile

    def test_caseless_profile(self, client):
        """The ``caseless`` profile exists and has its ``caseless`` flag set to True."""
        by_id = self._profiles_by_id(client)
        assert "caseless" in by_id
        assert by_id["caseless"]["caseless"] is True

    def test_medieval_french_has_diplomatic_rules(self, client):
        """``medieval_french`` reports a positive ``diplomatic_rules`` value."""
        by_id = self._profiles_by_id(client)
        assert by_id["medieval_french"]["diplomatic_rules"] > 0

    def test_nfc_no_diplomatic_rules(self, client):
        """``nfc`` reports a ``diplomatic_rules`` value of zero."""
        by_id = self._profiles_by_id(client)
        assert by_id["nfc"]["diplomatic_rules"] == 0

    def test_early_modern_french_present(self, client):
        """A profile with id ``early_modern_french`` is served."""
        assert "early_modern_french" in self._profile_ids(client)
473
+
474
+
475
+ # ===========================================================================
476
+ # TestFastAPIStatus
477
+ # ===========================================================================
478
+
479
class TestFastAPIStatus:
    """Checks the payload served by ``GET /api/status``."""

    def test_status_200(self, client):
        """The endpoint answers with HTTP 200."""
        assert client.get("/api/status").status_code == 200

    def test_status_has_version(self, client):
        """The payload carries a non-empty ``version``."""
        payload = client.get("/api/status").json()
        assert "version" in payload
        assert payload["version"]

    def test_status_ok(self, client):
        """The payload's ``status`` field is ``"ok"``."""
        payload = client.get("/api/status").json()
        assert payload["status"] == "ok"
494
+
495
+
496
+ # ===========================================================================
497
+ # TestFastAPIEngines
498
+ # ===========================================================================
499
+
500
class TestFastAPIEngines:
    """Checks the payload served by ``GET /api/engines``."""

    @staticmethod
    def _payload(client):
        """Fetch the endpoint and return its decoded JSON body."""
        return client.get("/api/engines").json()

    def test_engines_200(self, client):
        """The endpoint answers with HTTP 200."""
        assert client.get("/api/engines").status_code == 200

    def test_engines_has_engines_key(self, client):
        """The payload has an ``engines`` key."""
        assert "engines" in self._payload(client)

    def test_engines_has_llms_key(self, client):
        """The payload has an ``llms`` key."""
        assert "llms" in self._payload(client)

    def test_engines_list_not_empty(self, client):
        """At least one engine is listed."""
        engines = self._payload(client)["engines"]
        assert len(engines) > 0

    def test_llms_list_not_empty(self, client):
        """At least one LLM is listed."""
        llms = self._payload(client)["llms"]
        assert len(llms) > 0

    def test_tesseract_in_engines(self, client):
        """An engine with id ``tesseract`` is listed."""
        engine_ids = [engine["id"] for engine in self._payload(client)["engines"]]
        assert "tesseract" in engine_ids

    def test_ollama_in_llms(self, client):
        """An LLM with id ``ollama`` is listed."""
        llm_ids = [llm["id"] for llm in self._payload(client)["llms"]]
        assert "ollama" in llm_ids

    def test_engine_has_required_fields(self, client):
        """Every engine exposes ``id``, ``label``, ``available`` and ``status``."""
        required_fields = ("id", "label", "available", "status")
        for engine in self._payload(client)["engines"]:
            for field_name in required_fields:
                assert field_name in engine
539
+
540
+
541
+ # ===========================================================================
542
+ # TestFastAPICorpusBrowse
543
+ # ===========================================================================
544
+
545
class TestFastAPICorpusBrowse:
    """Checks ``GET /api/corpus/browse``.

    Filesystem paths are sent through ``params=`` so the test client
    URL-encodes them. Interpolating a path straight into the query string
    breaks as soon as it contains characters such as spaces, ``&``, ``#``
    or ``+``.
    """

    _ENDPOINT = "/api/corpus/browse"

    def test_browse_current_dir(self, client):
        """Browsing ``.`` answers with HTTP 200."""
        r = client.get(self._ENDPOINT, params={"path": "."})
        assert r.status_code == 200

    def test_browse_has_required_keys(self, client):
        """The payload exposes ``current_path`` and ``items``."""
        d = client.get(self._ENDPOINT, params={"path": "."}).json()
        assert "current_path" in d
        assert "items" in d

    def test_browse_items_are_dirs(self, client, tmp_path):
        """Browsing an empty directory answers 200 with an empty item list."""
        r = client.get(self._ENDPOINT, params={"path": str(tmp_path)})
        assert r.status_code == 200
        assert r.json()["items"] == []

    def test_browse_with_corpus(self, client, tmp_corpus):
        """The corpus directory is listed when browsing its parent."""
        r = client.get(self._ENDPOINT, params={"path": str(tmp_corpus.parent)})
        assert r.status_code == 200
        items = r.json()["items"]
        assert any(i["name"] == tmp_corpus.name for i in items)

    def test_browse_404_for_nonexistent(self, client):
        """Browsing a path that does not exist answers with HTTP 404."""
        r = client.get(self._ENDPOINT, params={"path": "/nonexistent/path/xyz"})
        assert r.status_code == 404

    def test_browse_corpus_gt_count(self, client, tmp_corpus):
        """The corpus directory entry reports a ``gt_count`` of at least 2."""
        r = client.get(self._ENDPOINT, params={"path": str(tmp_corpus.parent)})
        items = {i["name"]: i for i in r.json()["items"] if i["is_dir"]}
        # Assert presence instead of guarding with ``if``: a missing entry
        # used to make this test pass without checking anything.
        assert tmp_corpus.name in items
        assert items[tmp_corpus.name]["gt_count"] >= 2
577
+
578
+
579
+ # ===========================================================================
580
+ # TestFastAPIReports
581
+ # ===========================================================================
582
+
583
class TestFastAPIReports:
    """Checks ``GET /api/reports``.

    The ``reports_dir`` value is sent through ``params=`` so the test client
    URL-encodes the temporary path instead of pasting it raw into the URL.
    """

    _ENDPOINT = "/api/reports"

    def test_reports_200(self, client):
        """The endpoint answers with HTTP 200."""
        r = client.get(self._ENDPOINT)
        assert r.status_code == 200

    def test_reports_has_reports_key(self, client):
        """The payload has a ``reports`` key."""
        r = client.get(self._ENDPOINT)
        assert "reports" in r.json()

    def test_reports_returns_list(self, client):
        """The ``reports`` value is a list."""
        r = client.get(self._ENDPOINT)
        assert isinstance(r.json()["reports"], list)

    def test_reports_finds_existing_html(self, client, tmp_path):
        """An HTML file placed in ``reports_dir`` is listed by file name."""
        # Create a dummy HTML report.
        html_file = tmp_path / "test_rapport.html"
        html_file.write_text("<html><body>Test rapport</body></html>")
        r = client.get(self._ENDPOINT, params={"reports_dir": str(tmp_path)})
        reports = r.json()["reports"]
        assert any(rep["filename"] == "test_rapport.html" for rep in reports)

    def test_report_entry_has_fields(self, client, tmp_path):
        """A listed report exposes file name, path, size, mtime and URL."""
        html_file = tmp_path / "my_report.html"
        html_file.write_text("<html></html>")
        r = client.get(self._ENDPOINT, params={"reports_dir": str(tmp_path)})
        # Use a default so a missing report fails with a readable assertion
        # rather than a bare StopIteration.
        rep = next(
            (rep for rep in r.json()["reports"] if rep["filename"] == "my_report.html"),
            None,
        )
        assert rep is not None, "my_report.html not listed"
        for k in ["filename", "path", "size_kb", "modified", "url"]:
            assert k in rep
615
+
616
+
617
+ # ===========================================================================
618
+ # TestFastAPIHTRUnited
619
+ # ===========================================================================
620
+
621
class TestFastAPIHTRUnited:
    """Checks the ``/api/htr-united`` catalogue and import endpoints."""

    def test_catalogue_200(self, client):
        """The catalogue endpoint answers with HTTP 200."""
        assert client.get("/api/htr-united/catalogue").status_code == 200

    def test_catalogue_has_entries(self, client):
        """The catalogue payload lists at least four entries."""
        payload = client.get("/api/htr-united/catalogue").json()
        assert "entries" in payload
        assert len(payload["entries"]) >= 4

    def test_catalogue_has_filters(self, client):
        """The payload advertises the available languages and scripts."""
        payload = client.get("/api/htr-united/catalogue").json()
        assert "available_languages" in payload
        assert "available_scripts" in payload

    def test_catalogue_search_query(self, client):
        """A free-text query answers 200 and reports a non-negative total."""
        response = client.get("/api/htr-united/catalogue?query=médiéval")
        assert response.status_code == 200
        payload = response.json()
        # Zero is acceptable: no match is not an error.
        assert payload["total"] >= 0

    def test_catalogue_search_language(self, client):
        """Filtering on "French" only returns entries listing a French language."""
        response = client.get("/api/htr-united/catalogue?language=French")
        assert response.status_code == 200
        payload = response.json()
        for entry in payload["entries"]:
            assert any("french" in name.lower() for name in entry["language"])

    def test_import_valid_entry(self, client, tmp_path):
        """Importing the first catalogue entry answers 200 and echoes ``entry_id``."""
        # Take the id of the first entry from the catalogue.
        catalogue = client.get("/api/htr-united/catalogue").json()
        first_id = catalogue["entries"][0]["id"]
        body = {
            "entry_id": first_id,
            "output_dir": str(tmp_path),
            "max_samples": 5,
        }
        response = client.post("/api/htr-united/import", json=body)
        assert response.status_code == 200
        assert "entry_id" in response.json()

    def test_import_invalid_entry(self, client, tmp_path):
        """Importing an unknown entry id answers with HTTP 404."""
        body = {
            "entry_id": "this-does-not-exist-xyz",
            "output_dir": str(tmp_path),
            "max_samples": 5,
        }
        response = client.post("/api/htr-united/import", json=body)
        assert response.status_code == 404
671
+
672
+
673
+ # ===========================================================================
674
+ # TestFastAPIHuggingFace
675
+ # ===========================================================================
676
+
677
class TestFastAPIHuggingFace:
    """Checks the ``/api/huggingface`` search and import endpoints."""

    def test_search_200(self, client):
        """The search endpoint answers with HTTP 200."""
        assert client.get("/api/huggingface/search").status_code == 200

    def test_search_has_datasets(self, client):
        """The search payload has a ``datasets`` key and a total of at least one."""
        payload = client.get("/api/huggingface/search").json()
        assert "datasets" in payload
        assert payload["total"] >= 1

    def test_search_with_query(self, client):
        """A free-text query answers 200 with a list under ``datasets``."""
        response = client.get("/api/huggingface/search?query=RIMES")
        assert response.status_code == 200
        payload = response.json()
        assert isinstance(payload["datasets"], list)

    def test_search_with_language(self, client):
        """A language filter answers with HTTP 200."""
        response = client.get("/api/huggingface/search?language=French")
        assert response.status_code == 200

    def test_import_creates_meta(self, client, tmp_path):
        """After an import, the reported ``metadata_file`` exists on disk."""
        body = {
            "dataset_id": "CATMuS/medieval",
            "output_dir": str(tmp_path),
            "split": "train",
            "max_samples": 5,
        }
        response = client.post("/api/huggingface/import", json=body)
        assert response.status_code == 200
        payload = response.json()
        assert Path(payload["metadata_file"]).exists()

    def test_import_returns_keys(self, client, tmp_path):
        """The import response exposes all the expected keys."""
        body = {
            "dataset_id": "test/dataset",
            "output_dir": str(tmp_path),
        }
        response = client.post("/api/huggingface/import", json=body)
        assert response.status_code == 200
        expected_keys = ["dataset_id", "output_dir", "files_imported", "metadata_file"]
        for key in expected_keys:
            assert key in response.json()
718
+
719
+
720
+ # ===========================================================================
721
+ # TestFastAPIBenchmark
722
+ # ===========================================================================
723
+
724
class TestFastAPIBenchmark:
    """Checks the ``/api/benchmark`` start, status, cancel and stream routes."""

    @staticmethod
    def _start(client, corpus_path):
        """POST a benchmark start request for *corpus_path* with the tesseract engine."""
        body = {
            "corpus_path": corpus_path,
            "engines": ["tesseract"],
        }
        return client.post("/api/benchmark/start", json=body)

    def test_start_missing_corpus(self, client):
        """Starting on a corpus path that does not exist answers with HTTP 400."""
        response = self._start(client, "/nonexistent/path/xyz")
        assert response.status_code == 400

    def test_start_valid_corpus(self, client, tmp_corpus):
        """Starting on a valid corpus answers 200 with a job id and an initial status."""
        response = self._start(client, str(tmp_corpus))
        assert response.status_code == 200
        payload = response.json()
        assert "job_id" in payload
        assert payload["status"] in ("pending", "running")

    def test_status_nonexistent_job(self, client):
        """Asking the status of an unknown job answers with HTTP 404."""
        response = client.get("/api/benchmark/nonexistent-job-id/status")
        assert response.status_code == 404

    def test_status_valid_job(self, client, tmp_corpus):
        """The status of a started job echoes its id and reports status and progress."""
        job_id = self._start(client, str(tmp_corpus)).json()["job_id"]
        status_response = client.get(f"/api/benchmark/{job_id}/status")
        assert status_response.status_code == 200
        payload = status_response.json()
        assert payload["job_id"] == job_id
        assert "status" in payload
        assert "progress" in payload

    def test_cancel_nonexistent_job(self, client):
        """Cancelling an unknown job answers with HTTP 404."""
        response = client.post("/api/benchmark/nonexistent-id/cancel")
        assert response.status_code == 404

    def test_cancel_valid_job(self, client, tmp_corpus):
        """Cancelling a started job answers with HTTP 200."""
        job_id = self._start(client, str(tmp_corpus)).json()["job_id"]
        cancel_response = client.post(f"/api/benchmark/{job_id}/cancel")
        assert cancel_response.status_code == 200

    def test_job_status_fields(self, client, tmp_corpus):
        """The status payload of a started job exposes all the expected keys."""
        job_id = self._start(client, str(tmp_corpus)).json()["job_id"]
        payload = client.get(f"/api/benchmark/{job_id}/status").json()
        expected_keys = [
            "job_id", "status", "progress", "total_docs", "processed_docs",
            "output_path",
        ]
        for key in expected_keys:
            assert key in payload, f"Missing key: {key}"

    def test_stream_nonexistent_job(self, client):
        """Streaming an unknown job answers with HTTP 404."""
        response = client.get("/api/benchmark/nonexistent-id/stream")
        assert response.status_code == 404
787
+
788
+
789
+ # ===========================================================================
790
+ # TestFastAPIHTML
791
+ # ===========================================================================
792
+
793
class TestFastAPIHTML:
    """Checks the HTML page served at ``/``."""

    def test_root_200(self, client):
        """The root page answers with HTTP 200."""
        assert client.get("/").status_code == 200

    def test_root_is_html(self, client):
        """The root page is served with an HTML content type."""
        content_type = client.get("/").headers["content-type"]
        assert "text/html" in content_type

    def test_html_has_picarones_title(self, client):
        """The page text mentions "Picarones"."""
        page = client.get("/").text
        assert "Picarones" in page

    def test_html_has_nav_sections(self, client):
        """The page mentions each of the four navigation sections (any casing)."""
        page_lower = client.get("/").text.lower()
        for section in ["benchmark", "reports", "engines", "import"]:
            assert section in page_lower

    def test_html_has_french_content(self, client):
        """The page contains the French word "Moteurs" in some casing."""
        page = client.get("/").text
        assert "Moteurs" in page or "moteurs" in page.lower()
815
+
816
+
817
+ # ===========================================================================
818
+ # TestFastAPIReportServe
819
+ # ===========================================================================
820
+
821
class TestFastAPIReportServe:
    """Checks the ``/reports/<filename>`` route.

    Tests that need a report on disk write it into ``tmp_path`` and switch
    the working directory there with ``monkeypatch.chdir``, which restores
    the previous directory at teardown. This replaces a hand-rolled
    ``os.chdir`` / ``try`` / ``finally`` and puts the previously unused
    ``monkeypatch`` fixture to work.
    """

    def test_serve_nonexistent_report(self, client):
        """Requesting a report that does not exist answers with HTTP 404."""
        r = client.get("/reports/nonexistent_report.html")
        assert r.status_code == 404

    def test_serve_existing_report(self, client, tmp_path, monkeypatch):
        """An HTML report in the current working directory is served with 200."""
        # Presumably the route resolves file names against the working
        # directory; the report is therefore created there.
        monkeypatch.chdir(tmp_path)
        html_file = tmp_path / "test_serve.html"
        html_file.write_text("<html><body>Test</body></html>")
        r = client.get("/reports/test_serve.html")
        assert r.status_code == 200

    def test_serve_non_html_rejected(self, client):
        """Requesting a ``.py`` file answers with HTTP 404."""
        r = client.get("/reports/malicious.py")
        assert r.status_code == 404

    def test_serve_report_content_type(self, client, tmp_path, monkeypatch):
        """A served report carries an HTML content type."""
        monkeypatch.chdir(tmp_path)
        html_file = tmp_path / "report_ct.html"
        html_file.write_text("<html><body>Content</body></html>")
        r = client.get("/reports/report_ct.html")
        # Assert the status instead of guarding on it: with the guard, a
        # non-200 answer made the test pass without checking anything.
        assert r.status_code == 200
        assert "html" in r.headers.get("content-type", "").lower()
857
+
858
+
859
+ # ===========================================================================
860
+ # TestCLIServeCommand
861
+ # ===========================================================================
862
+
863
class TestCLIServeCommand:
    """Checks that the ``serve`` CLI command is registered and documents itself."""

    @staticmethod
    def _serve_help():
        """Invoke ``serve --help`` through a ``CliRunner`` and return the result."""
        from picarones.cli import cli
        return CliRunner().invoke(cli, ["serve", "--help"])

    def test_serve_command_registered(self):
        """``serve`` appears among the CLI group's commands."""
        from picarones.cli import cli
        registered = cli.commands
        assert "serve" in registered

    def test_serve_help_text(self):
        """The help exits with 0 and mentions "serve" or "localhost"."""
        outcome = self._serve_help()
        assert outcome.exit_code == 0
        help_lower = outcome.output.lower()
        assert "serve" in help_lower or "localhost" in help_lower

    def test_serve_default_port_in_help(self):
        """The help text shows the port 8000."""
        assert "8000" in self._serve_help().output

    def test_serve_help_has_port_option(self):
        """The help text documents the ``--port`` option."""
        assert "--port" in self._serve_help().output

    def test_serve_missing_uvicorn_exits_gracefully(self):
        """The ``serve`` command can be invoked for its help without error."""
        # With uvicorn installed, running the command would start the server.
        # Only check that the command exists and is invocable (not that it
        # starts the server), so this goes through --help.
        outcome = self._serve_help()
        assert outcome.exit_code == 0
897
+
898
+
899
+ # ===========================================================================
900
+ # TestRunnerProgressCallback
901
+ # ===========================================================================
902
+
903
+ class TestRunnerProgressCallback:
904
+
905
def test_callback_signature_accepted(self):
    """run_benchmark accepts a ``progress_callback`` parameter."""
    import inspect
    from picarones.core.runner import run_benchmark
    parameters = inspect.signature(run_benchmark).parameters
    assert "progress_callback" in parameters
911
+
912
def test_callback_is_optional(self):
    """``progress_callback`` is optional: its default value is ``None``."""
    import inspect
    from picarones.core.runner import run_benchmark
    parameters = inspect.signature(run_benchmark).parameters
    callback_param = parameters["progress_callback"]
    assert callback_param.default is None
919
+
920
def test_callback_called_with_mock_engine(self, tmp_corpus):
    """The progress callback is invoked once per document of the corpus."""
    from picarones.core.corpus import load_corpus_from_directory
    from picarones.core.runner import run_benchmark
    # Only BaseOCREngine is needed; the unused EngineResult import was dropped.
    from picarones.engines.base import BaseOCREngine

    class MockEngine(BaseOCREngine):
        """Minimal engine returning a fixed transcription."""

        @property
        def name(self):
            return "mock"

        @property
        def version(self):
            return "0.0.1"

        def _run_ocr(self, image_path):
            return "texte mock"

    corpus = load_corpus_from_directory(str(tmp_corpus))
    calls = []

    def my_callback(engine_name, doc_idx, doc_id):
        # Record each invocation so they can be counted afterwards.
        calls.append((engine_name, doc_idx, doc_id))

    run_benchmark(corpus, [MockEngine()], progress_callback=my_callback)
    assert len(calls) == len(corpus), f"Expected {len(corpus)} calls, got {len(calls)}"
940
+
941
def test_callback_receives_engine_name(self, tmp_corpus):
    """The callback is handed the engine's name on every call."""
    from picarones.core.corpus import load_corpus_from_directory
    from picarones.core.runner import run_benchmark
    from picarones.engines.base import BaseOCREngine

    class MockEngine(BaseOCREngine):
        @property
        def name(self):
            return "test_engine_name"

        @property
        def version(self):
            return "0.0.1"

        def _run_ocr(self, image_path):
            return "texte"

    seen_names = []

    def my_callback(engine_name, doc_idx, doc_id):
        # Only the engine name matters for this check.
        seen_names.append(engine_name)

    corpus = load_corpus_from_directory(str(tmp_corpus))
    run_benchmark(corpus, [MockEngine()], progress_callback=my_callback)
    for seen in seen_names:
        assert seen == "test_engine_name"
961
+
962
def test_callback_exception_does_not_crash(self, tmp_corpus):
    """An exception raised inside the callback does not abort the benchmark."""
    from picarones.core.corpus import load_corpus_from_directory
    from picarones.core.runner import run_benchmark
    from picarones.engines.base import BaseOCREngine

    class MockEngine(BaseOCREngine):
        @property
        def name(self):
            return "mock"

        @property
        def version(self):
            return "0.0.1"

        def _run_ocr(self, image_path):
            return "texte"

    def bad_callback(engine_name, doc_idx, doc_id):
        raise RuntimeError("Callback error!")

    corpus = load_corpus_from_directory(str(tmp_corpus))
    engines = [MockEngine()]

    # Must not raise.
    outcome = run_benchmark(corpus, engines, progress_callback=bad_callback)
    assert outcome is not None