"""Labels i18n pour le rapport HTML et l'interface Picarones.
Langues supportées
------------------
- ``"fr"`` : français (défaut)
- ``"en"`` : anglais patrimonial (heritage English)
"""
from __future__ import annotations
# French labels (default language of the report).
_FR_LABELS: dict[str, str] = {
    # ── HTML meta ──────────────────────────────────────────────────────
    "html_lang": "fr",
    "date_locale": "fr-FR",
    # ── Navigation ─────────────────────────────────────────────────────
    "nav_report": "rapport OCR",
    "tab_ranking": "Classement",
    "tab_gallery": "Galerie",
    "tab_document": "Document",
    "tab_characters": "Caractères",
    "tab_analyses": "Analyses",
    "btn_present": "⊞ Présentation",
    # ── Ranking table ──────────────────────────────────────────────────
    "h_ranking": "Classement des moteurs",
    "col_rank": "#",
    "col_engine": "Concurrent",
    "col_cer": "CER exact",
    "col_cer_diplo": "CER diplo.",
    "col_cer_diplo_title": "CER après normalisation diplomatique (ſ=s, u=v, i=j…) — mesure les erreurs substantielles en ignorant les variantes graphiques codifiées",
    "col_wer": "WER",
    "col_mer": "MER",
    "col_wil": "WIL",
    "col_ligatures": "Ligatures",
    "col_ligatures_title": "Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)",
    "col_diacritics": "Diacritiques",
    "col_diacritics_title": "Taux de conservation des diacritiques (accents, cédilles, trémas…)",
    "col_gini": "Gini",
    "col_gini_title": "Coefficient de Gini des erreurs CER par ligne — 0 = erreurs uniformes, 1 = erreurs concentrées. Un bon moteur a CER bas ET Gini bas.",
    "col_anchor": "Ancrage",
    "col_anchor_title": "Score d'ancrage : proportion des trigrammes de la sortie trouvant un ancrage dans le GT — faible score = hallucinations probables (LLM/VLM)",
    "col_cer_median": "CER médian",
    "col_cer_min": "CER min",
    "col_cer_max": "CER max",
    "col_overnorm": "Sur-norm.",
    "col_overnorm_title": "Classe 10 — Sur-normalisation LLM : taux de mots corrects dégradés par le LLM",
    "col_docs": "Docs",
    # ── Gallery ────────────────────────────────────────────────────────
    "h_gallery": "Galerie des documents",
    "gallery_sort_label": "Trier par :",
    "gallery_sort_id": "Identifiant",
    "gallery_sort_cer": "CER moyen",
    "gallery_sort_difficulty": "Difficulté",
    "gallery_sort_best": "Meilleur moteur",
    "gallery_filter_cer_label": "Filtrer CER >",
    "gallery_filter_engine_label": "Moteur :",
    "gallery_filter_all": "Tous",
    "gallery_empty": "Aucun document ne correspond aux filtres.",
    # ── Document view ──────────────────────────────────────────────────
    "doc_sidebar_header": "Documents",
    "doc_title_default": "Sélectionner un document",
    "h_image": "Image originale",
    "h_gt": "Vérité terrain (GT)",
    "h_diff": "Sorties OCR — diff par moteur",
    "h_line_metrics": "Distribution des erreurs par ligne",
    "h_hallucination": "Analyse des hallucinations",
    # ── Analyses ───────────────────────────────────────────────────────
    "h_characters": "Analyse des caractères",
    "char_engine_label": "Moteur :",
    "h_cer_dist": "Distribution du CER par moteur",
    "h_radar": "Profil des moteurs (radar)",
    "radar_note": "Axe radar : CER, WER, MER, WIL — valeurs inversées (plus c'est haut, meilleur est le moteur).",
    "h_cer_doc": "CER par document (tous moteurs)",
    "h_duration": "Temps d'exécution moyen (secondes/document)",
    "h_quality_cer": "Qualité image ↔ CER (scatter plot)",
    "quality_cer_note": "Chaque point = un document. Axe X = score qualité image [0–1]. Axe Y = CER. Corrélation négative attendue.",
    "h_taxonomy": "Taxonomie des erreurs par moteur",
    "taxonomy_note": "Distribution des classes d'erreurs (classes 1–9 de la taxonomie Picarones).",
    "h_reliability": "Courbes de fiabilité",
    "reliability_note": "Pour les X% documents les plus faciles (triés par CER croissant), quel est le CER moyen cumulé ? Une courbe basse = moteur performant même sur les documents faciles.",
    "h_bootstrap": "Intervalles de confiance à 95 % (bootstrap)",
    "bootstrap_note": "IC à 95% sur le CER moyen par moteur (1000 itérations bootstrap).",
    "h_venn": "Erreurs communes / exclusives (Venn)",
    "venn_note": "Intersection des ensembles d'erreurs entre les 2 ou 3 premiers concurrents. Erreurs communes = segments partagés.",
    "h_pairwise": "Tests de Wilcoxon — comparaisons par paires",
    "pairwise_note": "Test signé-rangé de Wilcoxon (non-paramétrique). Seuil α = 0.05.",
    "h_clusters": "Clustering des patterns d'erreurs",
    "h_gini_cer": "Gini vs CER moyen",
    "gini_cer_ideal": "— idéal : bas-gauche",
    "gini_cer_note": "Axe X = CER moyen, Axe Y = coefficient de Gini. Un moteur idéal a CER bas ET Gini bas (erreurs rares et uniformes).",
    "h_ratio_anchor": "Ratio longueur vs ancrage",
    "ratio_anchor_subtitle": "— hallucinations VLM",
    "ratio_anchor_note": "Axe X = score d'ancrage trigrammes [0–1]. Axe Y = ratio longueur sortie/GT. Zone ⚠️ : ancrage < 0.5 ou ratio > 1.2 → hallucinations probables.",
    "h_correlation": "Matrice de corrélation entre métriques",
    "corr_engine_label": "Moteur :",
    "corr_note": "Coefficient de Pearson entre les métriques CER, WER, qualité image, ligatures, diacritiques. Vert = corrélation positive, Rouge = corrélation négative.",
    # ── Footer ─────────────────────────────────────────────────────────
    "footer_generated": "Rapport généré le",
    "footer_by": "par Picarones",
    # ── Dynamic strings consumed by the JavaScript side ────────────────
    "heatmap_start": "Début",
    "heatmap_mid": "Milieu",
    "heatmap_end": "Fin",
    "heatmap_title": "CARTE THERMIQUE (position)",
    "percentile_title": "PERCENTILES CER",
    "lines": "lignes",
    "no_line_metrics": "Aucune métrique de ligne disponible.",
    "no_hall_metrics": "Aucune métrique d'hallucination disponible.",
    "no_hall_blocks": "Aucun bloc halluciné détecté.",
    "hall_detected": "⚠️ Hallucinations détectées",
    "hall_ok": "✓ Ancrage satisfaisant",
    "hall_blocks_title": "Blocs sans ancrage dans le GT :",
    "hall_block_label": "Bloc halluciné",
    "hall_more_blocks": "bloc(s) supplémentaire(s)",
    "no_gini": "Données Gini non disponibles.",
    "no_scatter": "Données non disponibles.",
    "total_errors": "Total :",
    "errors_classified": "erreurs classifiées.",
    "class_col": "Classe",
    "proportion_col": "Proportion",
    "taxonomy_engine_label": "Moteur :",
}

# English labels; exposes exactly the same keys as the French table.
_EN_LABELS: dict[str, str] = {
    # ── HTML meta ──────────────────────────────────────────────────────
    "html_lang": "en",
    "date_locale": "en-GB",
    # ── Navigation ─────────────────────────────────────────────────────
    "nav_report": "OCR report",
    "tab_ranking": "Ranking",
    "tab_gallery": "Gallery",
    "tab_document": "Document",
    "tab_characters": "Characters",
    "tab_analyses": "Analyses",
    "btn_present": "⊞ Presentation",
    # ── Ranking table ──────────────────────────────────────────────────
    "h_ranking": "Engine Ranking",
    "col_rank": "#",
    "col_engine": "Engine",
    "col_cer": "Exact CER",
    "col_cer_diplo": "Diplo. CER",
    "col_cer_diplo_title": "CER after diplomatic normalisation (ſ=s, u=v, i=j…) — measures substantial errors ignoring codified graphical variants",
    "col_wer": "WER",
    "col_mer": "MER",
    "col_wil": "WIL",
    "col_ligatures": "Ligatures",
    "col_ligatures_title": "Ligature recognition rate (fi, fl, œ, æ, ff…)",
    "col_diacritics": "Diacritics",
    "col_diacritics_title": "Diacritic preservation rate (accents, cedillas, umlauts…)",
    "col_gini": "Gini",
    "col_gini_title": "Gini coefficient of per-line CER errors — 0 = uniform errors, 1 = concentrated errors. A good engine has low CER AND low Gini.",
    "col_anchor": "Anchor",
    "col_anchor_title": "Anchor score: proportion of output trigrams found in the GT — low score = probable hallucinations (LLM/VLM)",
    "col_cer_median": "Median CER",
    "col_cer_min": "Min CER",
    "col_cer_max": "Max CER",
    "col_overnorm": "Over-norm.",
    "col_overnorm_title": "Class 10 — LLM over-normalisation: rate of correct words degraded by the LLM",
    "col_docs": "Docs",
    # ── Gallery ────────────────────────────────────────────────────────
    "h_gallery": "Document Gallery",
    "gallery_sort_label": "Sort by:",
    "gallery_sort_id": "Identifier",
    "gallery_sort_cer": "Mean CER",
    "gallery_sort_difficulty": "Difficulty",
    "gallery_sort_best": "Best engine",
    "gallery_filter_cer_label": "Filter CER >",
    "gallery_filter_engine_label": "Engine:",
    "gallery_filter_all": "All",
    "gallery_empty": "No documents match the filters.",
    # ── Document view ──────────────────────────────────────────────────
    "doc_sidebar_header": "Documents",
    "doc_title_default": "Select a document",
    "h_image": "Original Image",
    "h_gt": "Ground Truth (GT)",
    "h_diff": "OCR Output — diff by engine",
    "h_line_metrics": "Error Distribution by Line",
    "h_hallucination": "Hallucination Analysis",
    # ── Analyses ───────────────────────────────────────────────────────
    "h_characters": "Character Analysis",
    "char_engine_label": "Engine:",
    "h_cer_dist": "CER Distribution by Engine",
    "h_radar": "Engine Profile (radar)",
    "radar_note": "Radar axes: CER, WER, MER, WIL — inverted values (higher = better engine).",
    "h_cer_doc": "CER by Document (all engines)",
    "h_duration": "Average Execution Time (seconds/document)",
    "h_quality_cer": "Image Quality ↔ CER (scatter plot)",
    "quality_cer_note": "Each point = one document. X-axis = image quality score [0–1]. Y-axis = CER. Negative correlation expected.",
    "h_taxonomy": "Error Taxonomy by Engine",
    "taxonomy_note": "Distribution of error classes (classes 1–9 of the Picarones taxonomy).",
    "h_reliability": "Reliability Curves",
    "reliability_note": "For the X% easiest documents (sorted by ascending CER), what is the cumulative mean CER? A low curve = engine performing well even on easy documents.",
    "h_bootstrap": "95% Bootstrap Confidence Intervals",
    "bootstrap_note": "95% CI on mean CER per engine (1000 bootstrap iterations).",
    "h_venn": "Shared / Exclusive Errors (Venn)",
    "venn_note": "Intersection of error sets between the 2 or 3 top engines. Shared errors = overlapping segments.",
    "h_pairwise": "Wilcoxon Tests — pairwise comparisons",
    "pairwise_note": "Wilcoxon signed-rank test (non-parametric). Threshold α = 0.05.",
    "h_clusters": "Frequent Error Clusters",
    "h_gini_cer": "Gini vs Mean CER",
    "gini_cer_ideal": "— ideal: bottom-left",
    "gini_cer_note": "X-axis = mean CER, Y-axis = Gini coefficient. An ideal engine has low CER AND low Gini (rare, uniform errors).",
    "h_ratio_anchor": "Length Ratio vs Anchor Score",
    "ratio_anchor_subtitle": "— VLM hallucinations",
    "ratio_anchor_note": "X-axis = trigram anchor score [0–1]. Y-axis = output/GT length ratio. ⚠️ Zone: anchor < 0.5 or ratio > 1.2 → probable hallucinations.",
    "h_correlation": "Metric Correlation Matrix",
    "corr_engine_label": "Engine:",
    "corr_note": "Pearson coefficient between CER, WER, image quality, ligatures, diacritics. Green = positive correlation, Red = negative.",
    # ── Footer ─────────────────────────────────────────────────────────
    "footer_generated": "Report generated on",
    "footer_by": "by Picarones",
    # ── Dynamic strings consumed by the JavaScript side ────────────────
    "heatmap_start": "Start",
    "heatmap_mid": "Middle",
    "heatmap_end": "End",
    "heatmap_title": "HEATMAP (position)",
    "percentile_title": "CER PERCENTILES",
    "lines": "lines",
    "no_line_metrics": "No line metrics available.",
    "no_hall_metrics": "No hallucination metrics available.",
    "no_hall_blocks": "No hallucinated blocks detected.",
    "hall_detected": "⚠️ Hallucinations detected",
    "hall_ok": "✓ Satisfactory anchoring",
    "hall_blocks_title": "Blocks with no anchor in GT:",
    "hall_block_label": "Hallucinated block",
    "hall_more_blocks": "additional block(s)",
    "no_gini": "Gini data not available.",
    "no_scatter": "Data not available.",
    "total_errors": "Total:",
    "errors_classified": "classified errors.",
    "class_col": "Class",
    "proportion_col": "Proportion",
    "taxonomy_engine_label": "Engine:",
}

# Public lookup table: language code -> {label key -> translated text}.
# Insertion order matters: "fr" comes first, then "en".
TRANSLATIONS: dict[str, dict[str, str]] = {
    "fr": _FR_LABELS,
    "en": _EN_LABELS,
}
def get_labels(lang: str = "fr") -> dict[str, str]:
    """Return the label dictionary for the requested language.

    The lookup is tolerant of common locale spellings. It tries, in order:
    the code exactly as given, the normalised code (stripped, lower-cased,
    ``_`` replaced by ``-``), then the primary language subtag (the part
    before the first ``-``). ``"EN"``, ``"en-GB"`` and ``"en_US"`` therefore
    all resolve to the ``"en"`` table.

    Parameters
    ----------
    lang:
        Language code, e.g. ``"fr"`` (default) or ``"en"``.

    Returns
    -------
    dict
        Translated labels. Always valid: falls back to the ``"fr"`` table
        when the language is unknown or ``lang`` is not a string. The
        returned object is the table stored in ``TRANSLATIONS`` itself, not
        a copy, so callers should treat it as read-only.
    """
    fallback = TRANSLATIONS["fr"]
    if not isinstance(lang, str):
        # ``None`` or any other non-string value: keep the documented default.
        return fallback

    # Fast path, identical to the historical exact-match behaviour.
    if lang in TRANSLATIONS:
        return TRANSLATIONS[lang]

    normalized = lang.strip().lower().replace("_", "-")
    primary = normalized.partition("-")[0]
    for candidate in (normalized, primary):
        if candidate in TRANSLATIONS:
            return TRANSLATIONS[candidate]
    return fallback
# Language codes with a translation table, in declaration order.
# Iterating a dict yields its keys, so no explicit ``.keys()`` call is needed.
SUPPORTED_LANGS: list[str] = list(TRANSLATIONS)