Picarones / picarones /i18n.py
Claude
Sprint 11 : internationalisation complète — support anglais patrimonial
ce0bff3 unverified
Raw
History Blame
15.7 kB
"""i18n labels for the Picarones HTML report and user interface.

Supported languages
-------------------
- ``"fr"``: French (default)
- ``"en"``: heritage English (English tailored to heritage documents)
"""
from __future__ import annotations
# Label catalogue: language code -> {label key -> user-facing text}.
# Both languages are expected to expose exactly the same set of keys.
# All text is stored as real Unicode characters (é, —, ſ, œ, α, ✓, →, …);
# mis-decoded byte sequences must never be committed here because these
# strings are emitted verbatim to the user.
TRANSLATIONS: dict[str, dict[str, str]] = {
    "fr": {
        # ── HTML meta ──────────────────────────────────────────────────────
        "html_lang": "fr",
        "date_locale": "fr-FR",
        # ── Navigation ─────────────────────────────────────────────────────
        "nav_report": "rapport OCR",
        "tab_ranking": "Classement",
        "tab_gallery": "Galerie",
        "tab_document": "Document",
        "tab_characters": "Caractères",
        "tab_analyses": "Analyses",
        "btn_present": "⊞ Présentation",
        # ── Ranking ────────────────────────────────────────────────────────
        "h_ranking": "Classement des moteurs",
        "col_rank": "#",
        "col_engine": "Concurrent",
        "col_cer": "CER exact",
        "col_cer_diplo": "CER diplo.",
        "col_cer_diplo_title": "CER après normalisation diplomatique (ſ=s, u=v, i=j…) — mesure les erreurs substantielles en ignorant les variantes graphiques codifiées",
        "col_wer": "WER",
        "col_mer": "MER",
        "col_wil": "WIL",
        "col_ligatures": "Ligatures",
        "col_ligatures_title": "Taux de reconnaissance des ligatures (fi, fl, œ, æ, ff…)",
        "col_diacritics": "Diacritiques",
        "col_diacritics_title": "Taux de conservation des diacritiques (accents, cédilles, trémas…)",
        "col_gini": "Gini",
        "col_gini_title": "Coefficient de Gini des erreurs CER par ligne — 0 = erreurs uniformes, 1 = erreurs concentrées. Un bon moteur a CER bas ET Gini bas.",
        "col_anchor": "Ancrage",
        "col_anchor_title": "Score d'ancrage : proportion des trigrammes de la sortie trouvant un ancrage dans le GT — faible score = hallucinations probables (LLM/VLM)",
        "col_cer_median": "CER médian",
        "col_cer_min": "CER min",
        "col_cer_max": "CER max",
        "col_overnorm": "Sur-norm.",
        "col_overnorm_title": "Classe 10 — Sur-normalisation LLM : taux de mots corrects dégradés par le LLM",
        "col_docs": "Docs",
        # ── Gallery ────────────────────────────────────────────────────────
        "h_gallery": "Galerie des documents",
        "gallery_sort_label": "Trier par :",
        "gallery_sort_id": "Identifiant",
        "gallery_sort_cer": "CER moyen",
        "gallery_sort_difficulty": "Difficulté",
        "gallery_sort_best": "Meilleur moteur",
        "gallery_filter_cer_label": "Filtrer CER >",
        "gallery_filter_engine_label": "Moteur :",
        "gallery_filter_all": "Tous",
        "gallery_empty": "Aucun document ne correspond aux filtres.",
        # ── Document ───────────────────────────────────────────────────────
        "doc_sidebar_header": "Documents",
        "doc_title_default": "Sélectionner un document",
        "h_image": "Image originale",
        "h_gt": "Vérité terrain (GT)",
        "h_diff": "Sorties OCR — diff par moteur",
        "h_line_metrics": "Distribution des erreurs par ligne",
        "h_hallucination": "Analyse des hallucinations",
        # ── Analyses ───────────────────────────────────────────────────────
        "h_characters": "Analyse des caractères",
        "char_engine_label": "Moteur :",
        "h_cer_dist": "Distribution du CER par moteur",
        "h_radar": "Profil des moteurs (radar)",
        "radar_note": "Axe radar : CER, WER, MER, WIL — valeurs inversées (plus c'est haut, meilleur est le moteur).",
        "h_cer_doc": "CER par document (tous moteurs)",
        "h_duration": "Temps d'exécution moyen (secondes/document)",
        "h_quality_cer": "Qualité image ↔ CER (scatter plot)",
        "quality_cer_note": "Chaque point = un document. Axe X = score qualité image [0–1]. Axe Y = CER. Corrélation négative attendue.",
        "h_taxonomy": "Taxonomie des erreurs par moteur",
        "taxonomy_note": "Distribution des classes d'erreurs (classes 1–9 de la taxonomie Picarones).",
        "h_reliability": "Courbes de fiabilité",
        "reliability_note": "Pour les X% documents les plus faciles (triés par CER croissant), quel est le CER moyen cumulé ? Une courbe basse = moteur performant même sur les documents faciles.",
        "h_bootstrap": "Intervalles de confiance à 95 % (bootstrap)",
        "bootstrap_note": "IC à 95% sur le CER moyen par moteur (1000 itérations bootstrap).",
        "h_venn": "Erreurs communes / exclusives (Venn)",
        "venn_note": "Intersection des ensembles d'erreurs entre les 2 ou 3 premiers concurrents. Erreurs communes = segments partagés.",
        "h_pairwise": "Tests de Wilcoxon — comparaisons par paires",
        "pairwise_note": "Test signé-rangé de Wilcoxon (non-paramétrique). Seuil α = 0.05.",
        "h_clusters": "Clustering des patterns d'erreurs",
        "h_gini_cer": "Gini vs CER moyen",
        "gini_cer_ideal": "— idéal : bas-gauche",
        "gini_cer_note": "Axe X = CER moyen, Axe Y = coefficient de Gini. Un moteur idéal a CER bas ET Gini bas (erreurs rares et uniformes).",
        "h_ratio_anchor": "Ratio longueur vs ancrage",
        "ratio_anchor_subtitle": "— hallucinations VLM",
        "ratio_anchor_note": "Axe X = score d'ancrage trigrammes [0–1]. Axe Y = ratio longueur sortie/GT. Zone ⚠️ : ancrage < 0.5 ou ratio > 1.2 → hallucinations probables.",
        "h_correlation": "Matrice de corrélation entre métriques",
        "corr_engine_label": "Moteur :",
        "corr_note": "Coefficient de Pearson entre les métriques CER, WER, qualité image, ligatures, diacritiques. Vert = corrélation positive, Rouge = corrélation négative.",
        # ── Footer ─────────────────────────────────────────────────────────
        "footer_generated": "Rapport généré le",
        "footer_by": "par Picarones",
        # ── Dynamic JS strings ─────────────────────────────────────────────
        "heatmap_start": "Début",
        "heatmap_mid": "Milieu",
        "heatmap_end": "Fin",
        "heatmap_title": "CARTE THERMIQUE (position)",
        "percentile_title": "PERCENTILES CER",
        "lines": "lignes",
        "no_line_metrics": "Aucune métrique de ligne disponible.",
        "no_hall_metrics": "Aucune métrique d'hallucination disponible.",
        "no_hall_blocks": "Aucun bloc halluciné détecté.",
        "hall_detected": "⚠️ Hallucinations détectées",
        "hall_ok": "✓ Ancrage satisfaisant",
        "hall_blocks_title": "Blocs sans ancrage dans le GT :",
        "hall_block_label": "Bloc halluciné",
        "hall_more_blocks": "bloc(s) supplémentaire(s)",
        "no_gini": "Données Gini non disponibles.",
        "no_scatter": "Données non disponibles.",
        "total_errors": "Total :",
        "errors_classified": "erreurs classifiées.",
        "class_col": "Classe",
        "proportion_col": "Proportion",
        "taxonomy_engine_label": "Moteur :",
    },
    "en": {
        # ── HTML meta ──────────────────────────────────────────────────────
        "html_lang": "en",
        "date_locale": "en-GB",
        # ── Navigation ─────────────────────────────────────────────────────
        "nav_report": "OCR report",
        "tab_ranking": "Ranking",
        "tab_gallery": "Gallery",
        "tab_document": "Document",
        "tab_characters": "Characters",
        "tab_analyses": "Analyses",
        "btn_present": "⊞ Presentation",
        # ── Ranking ────────────────────────────────────────────────────────
        "h_ranking": "Engine Ranking",
        "col_rank": "#",
        "col_engine": "Engine",
        "col_cer": "Exact CER",
        "col_cer_diplo": "Diplo. CER",
        "col_cer_diplo_title": "CER after diplomatic normalisation (ſ=s, u=v, i=j…) — measures substantial errors ignoring codified graphical variants",
        "col_wer": "WER",
        "col_mer": "MER",
        "col_wil": "WIL",
        "col_ligatures": "Ligatures",
        "col_ligatures_title": "Ligature recognition rate (fi, fl, œ, æ, ff…)",
        "col_diacritics": "Diacritics",
        "col_diacritics_title": "Diacritic preservation rate (accents, cedillas, umlauts…)",
        "col_gini": "Gini",
        "col_gini_title": "Gini coefficient of per-line CER errors — 0 = uniform errors, 1 = concentrated errors. A good engine has low CER AND low Gini.",
        "col_anchor": "Anchor",
        "col_anchor_title": "Anchor score: proportion of output trigrams found in the GT — low score = probable hallucinations (LLM/VLM)",
        "col_cer_median": "Median CER",
        "col_cer_min": "Min CER",
        "col_cer_max": "Max CER",
        "col_overnorm": "Over-norm.",
        "col_overnorm_title": "Class 10 — LLM over-normalisation: rate of correct words degraded by the LLM",
        "col_docs": "Docs",
        # ── Gallery ────────────────────────────────────────────────────────
        "h_gallery": "Document Gallery",
        "gallery_sort_label": "Sort by:",
        "gallery_sort_id": "Identifier",
        "gallery_sort_cer": "Mean CER",
        "gallery_sort_difficulty": "Difficulty",
        "gallery_sort_best": "Best engine",
        "gallery_filter_cer_label": "Filter CER >",
        "gallery_filter_engine_label": "Engine:",
        "gallery_filter_all": "All",
        "gallery_empty": "No documents match the filters.",
        # ── Document ───────────────────────────────────────────────────────
        "doc_sidebar_header": "Documents",
        "doc_title_default": "Select a document",
        "h_image": "Original Image",
        "h_gt": "Ground Truth (GT)",
        "h_diff": "OCR Output — diff by engine",
        "h_line_metrics": "Error Distribution by Line",
        "h_hallucination": "Hallucination Analysis",
        # ── Analyses ───────────────────────────────────────────────────────
        "h_characters": "Character Analysis",
        "char_engine_label": "Engine:",
        "h_cer_dist": "CER Distribution by Engine",
        "h_radar": "Engine Profile (radar)",
        "radar_note": "Radar axes: CER, WER, MER, WIL — inverted values (higher = better engine).",
        "h_cer_doc": "CER by Document (all engines)",
        "h_duration": "Average Execution Time (seconds/document)",
        "h_quality_cer": "Image Quality ↔ CER (scatter plot)",
        "quality_cer_note": "Each point = one document. X-axis = image quality score [0–1]. Y-axis = CER. Negative correlation expected.",
        "h_taxonomy": "Error Taxonomy by Engine",
        "taxonomy_note": "Distribution of error classes (classes 1–9 of the Picarones taxonomy).",
        "h_reliability": "Reliability Curves",
        "reliability_note": "For the X% easiest documents (sorted by ascending CER), what is the cumulative mean CER? A low curve = engine performing well even on easy documents.",
        "h_bootstrap": "95% Bootstrap Confidence Intervals",
        "bootstrap_note": "95% CI on mean CER per engine (1000 bootstrap iterations).",
        "h_venn": "Shared / Exclusive Errors (Venn)",
        "venn_note": "Intersection of error sets between the 2 or 3 top engines. Shared errors = overlapping segments.",
        "h_pairwise": "Wilcoxon Tests — pairwise comparisons",
        "pairwise_note": "Wilcoxon signed-rank test (non-parametric). Threshold α = 0.05.",
        "h_clusters": "Frequent Error Clusters",
        "h_gini_cer": "Gini vs Mean CER",
        "gini_cer_ideal": "— ideal: bottom-left",
        "gini_cer_note": "X-axis = mean CER, Y-axis = Gini coefficient. An ideal engine has low CER AND low Gini (rare, uniform errors).",
        "h_ratio_anchor": "Length Ratio vs Anchor Score",
        "ratio_anchor_subtitle": "— VLM hallucinations",
        "ratio_anchor_note": "X-axis = trigram anchor score [0–1]. Y-axis = output/GT length ratio. ⚠️ Zone: anchor < 0.5 or ratio > 1.2 → probable hallucinations.",
        "h_correlation": "Metric Correlation Matrix",
        "corr_engine_label": "Engine:",
        "corr_note": "Pearson coefficient between CER, WER, image quality, ligatures, diacritics. Green = positive correlation, Red = negative.",
        # ── Footer ─────────────────────────────────────────────────────────
        "footer_generated": "Report generated on",
        "footer_by": "by Picarones",
        # ── Dynamic JS strings ─────────────────────────────────────────────
        "heatmap_start": "Start",
        "heatmap_mid": "Middle",
        "heatmap_end": "End",
        "heatmap_title": "HEATMAP (position)",
        "percentile_title": "CER PERCENTILES",
        "lines": "lines",
        "no_line_metrics": "No line metrics available.",
        "no_hall_metrics": "No hallucination metrics available.",
        "no_hall_blocks": "No hallucinated blocks detected.",
        "hall_detected": "⚠️ Hallucinations detected",
        "hall_ok": "✓ Satisfactory anchoring",
        "hall_blocks_title": "Blocks with no anchor in GT:",
        "hall_block_label": "Hallucinated block",
        "hall_more_blocks": "additional block(s)",
        "no_gini": "Gini data not available.",
        "no_scatter": "Data not available.",
        "total_errors": "Total:",
        "errors_classified": "classified errors.",
        "class_col": "Class",
        "proportion_col": "Proportion",
        "taxonomy_engine_label": "Engine:",
    },
}
def get_labels(lang: str = "fr") -> dict[str, str]:
    """Return the label dictionary for the requested language.

    Parameters
    ----------
    lang:
        Language code: ``"fr"`` (default) or ``"en"``. An exact match is
        tried first. If that fails, the code is lower-cased and reduced to
        its primary subtag, so ``"EN"``, ``"en-GB"`` and ``"en_US"`` all
        resolve to ``"en"`` instead of silently falling back to French.

    Returns
    -------
    dict
        Translated labels. Always valid: falls back to ``"fr"`` when the
        language is unknown. The returned object is the entry stored in
        ``TRANSLATIONS`` itself, not a copy.
    """
    labels = TRANSLATIONS.get(lang)
    if labels is None and isinstance(lang, str):
        # Accept locale-style tags ("en-GB", "en_US") and any letter case by
        # keeping only the primary language subtag.
        primary = lang.strip().lower().replace("_", "-").split("-", 1)[0]
        labels = TRANSLATIONS.get(primary)
    return labels if labels is not None else TRANSLATIONS["fr"]
# Language codes that have an entry in TRANSLATIONS, in declaration order.
SUPPORTED_LANGS: list[str] = [*TRANSLATIONS]