Picarones / tests /test_sprint11_i18n_english.py
Claude
Sprint 11 : internationalisation complète — support anglais patrimonial
ce0bff3 unverified
Raw
History Blame
17.4 kB
"""Sprint 11 — Tests : internationalisation et profils anglais patrimoniaux.
Couvre :
- Profils de normalisation : early_modern_english, medieval_english, secretary_hand
- Bibliothèque de prompts anglais
- Génération de rapport HTML en anglais (lang="en")
- Module i18n
- Flag --lang de picarones demo
"""
from __future__ import annotations
import json
import re
from pathlib import Path
import pytest
# ---------------------------------------------------------------------------
# Profils de normalisation anglais
# ---------------------------------------------------------------------------
class TestEarlyModernEnglish:
"""Profil early_modern_english : ſ=s, u=v, i=j, vv=w, þ=th, ð=th, ȝ=y."""
@pytest.fixture
def profile(self):
from picarones.core.normalization import get_builtin_profile
return get_builtin_profile("early_modern_english")
def test_profile_exists(self, profile):
assert profile.name == "early_modern_english"
def test_long_s(self, profile):
# ſ=s : both normalize to the same canonical form (i also becomes j)
assert profile.normalize("ſaid") == profile.normalize("said")
def test_u_v_interchangeable(self, profile):
# u and v map to the same canonical form
assert profile.normalize("upon") == profile.normalize("vpon")
def test_i_j_interchangeable(self, profile):
# i and j map to the same canonical form
assert profile.normalize("ioy") == profile.normalize("joy")
def test_vv_to_w(self, profile):
# vv and w map to the same canonical form
assert profile.normalize("vvhich") == profile.normalize("which")
def test_thorn_to_th(self, profile):
assert profile.normalize("þe") == "the"
assert profile.normalize("þat") == "that"
def test_eth_to_th(self, profile):
assert profile.normalize("ðe") == "the"
def test_yogh_to_y(self, profile):
# ȝ normalises the same as y
assert profile.normalize("ȝe") == profile.normalize("ye")
assert profile.normalize("ȝour") == profile.normalize("your")
def test_ampersand_to_and(self, profile):
assert profile.normalize("God & Man") == "God and Man"
def test_ae_ligature(self, profile):
assert profile.normalize("æther") == "aether"
def test_oe_ligature(self, profile):
assert profile.normalize("œconomy") == "oeconomy"
def test_combined_normalisation(self, profile):
# "þe ſame vvoman" → "the same woman"
result = profile.normalize("þe ſame vvoman")
assert result == "the same woman"
def test_description_in_english(self, profile):
assert "Early Modern English" in profile.description or "english" in profile.description.lower()
def test_nfc_applied(self, profile):
import unicodedata
text = "caf\u0065\u0301" # café décomposé
normalised = profile.normalize(text)
assert unicodedata.is_normalized("NFC", normalised)
class TestMedievalEnglish:
"""Profil medieval_english : ſ=s, u=v, i=j, þ=th, ȝ=y, abréviations."""
@pytest.fixture
def profile(self):
from picarones.core.normalization import get_builtin_profile
return get_builtin_profile("medieval_english")
def test_profile_exists(self, profile):
assert profile.name == "medieval_english"
def test_thorn(self, profile):
assert profile.normalize("þe") == "the"
def test_yogh(self, profile):
assert profile.normalize("ȝe") == "ye"
def test_long_s(self, profile):
assert profile.normalize("ſome") == "some"
def test_abbreviation_per(self, profile):
# ꝑ → per
assert profile.normalize("ꝑfect") == "perfect"
def test_abbreviation_pro(self, profile):
# ꝓ → pro (both ꝓud and proud normalize to the same form)
assert profile.normalize("ꝓud") == profile.normalize("proud")
def test_combined(self, profile):
result = profile.normalize("þe ꝑfect ȝe")
assert result == "the perfect ye"
def test_vv_to_w(self, profile):
assert profile.normalize("vvhen") == "when"
def test_description(self, profile):
desc = profile.description.lower()
assert "english" in desc or "medieval" in desc
class TestSecretaryHand:
"""Profil secretary_hand : écriture secrétaire anglaise XVIe-XVIIe."""
@pytest.fixture
def profile(self):
from picarones.core.normalization import get_builtin_profile
return get_builtin_profile("secretary_hand")
def test_profile_exists(self, profile):
assert profile.name == "secretary_hand"
def test_long_s(self, profile):
# ſ normalises the same as s
assert profile.normalize("ſaid") == profile.normalize("said")
def test_thorn(self, profile):
assert profile.normalize("þe") == "the"
def test_yogh(self, profile):
assert profile.normalize("ȝet") == "yet"
def test_u_v(self, profile):
assert profile.normalize("vpon") == "vpon".replace("u", "v")
def test_ampersand(self, profile):
assert profile.normalize("lord & master") == "lord and master"
def test_description(self, profile):
desc = profile.description.lower()
assert "secretary" in desc or "hand" in desc
class TestBuiltinProfilesListing:
"""Vérifie que les 3 nouveaux profils sont bien accessibles."""
def test_all_english_profiles_accessible(self):
from picarones.core.normalization import get_builtin_profile
for name in ("early_modern_english", "medieval_english", "secretary_hand"):
p = get_builtin_profile(name)
assert p.name == name
def test_unknown_profile_raises_key_error(self):
from picarones.core.normalization import get_builtin_profile
with pytest.raises(KeyError):
get_builtin_profile("unknown_lang_profile_xyz")
def test_existing_profiles_still_work(self):
from picarones.core.normalization import get_builtin_profile
for name in ("medieval_french", "early_modern_french", "medieval_latin", "nfc", "caseless", "minimal"):
p = get_builtin_profile(name)
assert p.name == name
# ---------------------------------------------------------------------------
# Bibliothèque de prompts anglais
# ---------------------------------------------------------------------------
class TestEnglishPrompts:
"""Vérifie l'existence et la structure des prompts anglais."""
@pytest.fixture
def prompts_dir(self):
return Path(__file__).parent.parent / "picarones" / "prompts"
def test_zero_shot_medieval_english_exists(self, prompts_dir):
assert (prompts_dir / "zero_shot_medieval_english.txt").exists()
def test_correction_medieval_english_exists(self, prompts_dir):
assert (prompts_dir / "correction_medieval_english.txt").exists()
def test_correction_early_modern_english_exists(self, prompts_dir):
assert (prompts_dir / "correction_early_modern_english.txt").exists()
def test_zero_shot_has_image_b64_variable(self, prompts_dir):
text = (prompts_dir / "zero_shot_medieval_english.txt").read_text(encoding="utf-8")
assert "{image_b64}" in text
def test_correction_medieval_has_ocr_output_variable(self, prompts_dir):
text = (prompts_dir / "correction_medieval_english.txt").read_text(encoding="utf-8")
assert "{ocr_output}" in text
def test_correction_early_modern_has_ocr_output_variable(self, prompts_dir):
text = (prompts_dir / "correction_early_modern_english.txt").read_text(encoding="utf-8")
assert "{ocr_output}" in text
def test_zero_shot_medieval_is_in_english(self, prompts_dir):
text = (prompts_dir / "zero_shot_medieval_english.txt").read_text(encoding="utf-8")
assert "palaeograph" in text.lower() or "transcrib" in text.lower()
def test_correction_medieval_mentions_thorn(self, prompts_dir):
text = (prompts_dir / "correction_medieval_english.txt").read_text(encoding="utf-8")
assert "þ" in text or "thorn" in text.lower()
def test_correction_early_modern_mentions_long_s(self, prompts_dir):
text = (prompts_dir / "correction_early_modern_english.txt").read_text(encoding="utf-8")
assert "ſ" in text or "long-s" in text.lower() or "long s" in text.lower()
# ---------------------------------------------------------------------------
# Module i18n
# ---------------------------------------------------------------------------
class TestI18nModule:
"""Vérifie le module picarones.i18n."""
def test_get_labels_fr(self):
from picarones.i18n import get_labels
labels = get_labels("fr")
assert labels["tab_ranking"] == "Classement"
assert labels["html_lang"] == "fr"
assert labels["date_locale"] == "fr-FR"
def test_get_labels_en(self):
from picarones.i18n import get_labels
labels = get_labels("en")
assert labels["tab_ranking"] == "Ranking"
assert labels["html_lang"] == "en"
assert labels["date_locale"] == "en-GB"
def test_get_labels_fallback(self):
from picarones.i18n import get_labels
# Langue inconnue → bascule sur fr
labels = get_labels("de")
assert labels["tab_ranking"] == "Classement"
def test_all_fr_keys_present_in_en(self):
from picarones.i18n import TRANSLATIONS
fr_keys = set(TRANSLATIONS["fr"].keys())
en_keys = set(TRANSLATIONS["en"].keys())
missing = fr_keys - en_keys
assert not missing, f"Clés présentes en FR mais absentes en EN : {missing}"
def test_supported_langs(self):
from picarones.i18n import SUPPORTED_LANGS
assert "fr" in SUPPORTED_LANGS
assert "en" in SUPPORTED_LANGS
def test_footer_labels(self):
from picarones.i18n import get_labels
fr = get_labels("fr")
en = get_labels("en")
assert "footer_generated" in fr
assert "footer_generated" in en
assert fr["footer_generated"] != en["footer_generated"]
def test_hallucination_labels_translated(self):
from picarones.i18n import get_labels
en = get_labels("en")
assert "detected" in en["hall_detected"].lower()
assert "⚠" in en["hall_detected"]
# ---------------------------------------------------------------------------
# Génération de rapport HTML en anglais
# ---------------------------------------------------------------------------
class TestEnglishReport:
"""Vérifie que le rapport HTML généré en anglais contient bien les labels anglais."""
@pytest.fixture(scope="class")
def english_html(self, tmp_path_factory):
from picarones.fixtures import generate_sample_benchmark
from picarones.report.generator import ReportGenerator
bm = generate_sample_benchmark(n_docs=3, seed=42)
tmp = tmp_path_factory.mktemp("report_en")
out = tmp / "report_en.html"
gen = ReportGenerator(bm, lang="en")
gen.generate(out)
return out.read_text(encoding="utf-8")
@pytest.fixture(scope="class")
def french_html(self, tmp_path_factory):
from picarones.fixtures import generate_sample_benchmark
from picarones.report.generator import ReportGenerator
bm = generate_sample_benchmark(n_docs=3, seed=42)
tmp = tmp_path_factory.mktemp("report_fr")
out = tmp / "rapport_fr.html"
gen = ReportGenerator(bm, lang="fr")
gen.generate(out)
return out.read_text(encoding="utf-8")
def test_html_lang_attribute_en(self, english_html):
assert 'lang="en"' in english_html
def test_html_lang_attribute_fr(self, french_html):
assert 'lang="fr"' in french_html
def test_en_report_contains_i18n_json(self, english_html):
assert "const I18N" in english_html
def test_en_i18n_has_english_labels(self, english_html):
# Extraire le JSON I18N
m = re.search(r"const I18N = (\{.*?\});", english_html, re.DOTALL)
assert m, "const I18N non trouvé dans le HTML"
i18n = json.loads(m.group(1))
assert i18n["tab_ranking"] == "Ranking"
assert i18n["h_ranking"] == "Engine Ranking"
assert i18n["h_gallery"] == "Document Gallery"
def test_fr_i18n_has_french_labels(self, french_html):
m = re.search(r"const I18N = (\{.*?\});", french_html, re.DOTALL)
assert m, "const I18N non trouvé dans le HTML FR"
i18n = json.loads(m.group(1))
assert i18n["tab_ranking"] == "Classement"
assert i18n["h_ranking"] == "Classement des moteurs"
def test_en_report_data_json_present(self, english_html):
assert "const DATA" in english_html
def test_en_report_date_locale(self, english_html):
m = re.search(r"const I18N = (\{.*?\});", english_html, re.DOTALL)
i18n = json.loads(m.group(1))
assert i18n["date_locale"] == "en-GB"
def test_fr_report_date_locale(self, french_html):
m = re.search(r"const I18N = (\{.*?\});", french_html, re.DOTALL)
i18n = json.loads(m.group(1))
assert i18n["date_locale"] == "fr-FR"
def test_en_report_has_data_i18n_attributes(self, english_html):
assert 'data-i18n=' in english_html
def test_en_report_engines_count(self, english_html):
m = re.search(r"const DATA = (\{.*?\});", english_html, re.DOTALL)
assert m
data = json.loads(m.group(1))
# 5 moteurs comme défini par les fixtures Sprint 10
assert len(data["engines"]) == 5
def test_report_generator_default_lang_is_fr(self):
from picarones.fixtures import generate_sample_benchmark
from picarones.report.generator import ReportGenerator
bm = generate_sample_benchmark(n_docs=2, seed=1)
gen = ReportGenerator(bm)
assert gen.lang == "fr"
def test_report_generator_lang_en(self):
from picarones.fixtures import generate_sample_benchmark
from picarones.report.generator import ReportGenerator
bm = generate_sample_benchmark(n_docs=2, seed=1)
gen = ReportGenerator(bm, lang="en")
assert gen.lang == "en"
# ---------------------------------------------------------------------------
# CLI demo --lang
# ---------------------------------------------------------------------------
class TestDemoLangFlag:
"""Vérifie le flag --lang de picarones demo."""
def test_demo_lang_en(self, tmp_path):
from click.testing import CliRunner
from picarones.cli import demo_cmd
runner = CliRunner()
out_file = str(tmp_path / "demo_en.html")
result = runner.invoke(demo_cmd, ["--docs", "2", "--output", out_file, "--lang", "en"])
assert result.exit_code == 0, result.output
html = Path(out_file).read_text(encoding="utf-8")
assert 'lang="en"' in html
m = re.search(r"const I18N = (\{.*?\});", html, re.DOTALL)
assert m
i18n = json.loads(m.group(1))
assert i18n["tab_ranking"] == "Ranking"
def test_demo_lang_fr_default(self, tmp_path):
from click.testing import CliRunner
from picarones.cli import demo_cmd
runner = CliRunner()
out_file = str(tmp_path / "demo_fr.html")
result = runner.invoke(demo_cmd, ["--docs", "2", "--output", out_file])
assert result.exit_code == 0, result.output
html = Path(out_file).read_text(encoding="utf-8")
assert 'lang="fr"' in html
def test_demo_invalid_lang_rejected(self, tmp_path):
from click.testing import CliRunner
from picarones.cli import demo_cmd
runner = CliRunner()
out_file = str(tmp_path / "demo_de.html")
result = runner.invoke(demo_cmd, ["--docs", "2", "--output", out_file, "--lang", "de"])
assert result.exit_code != 0
# ---------------------------------------------------------------------------
# API web — langue cookie
# ---------------------------------------------------------------------------
class TestWebLangCookie:
"""Vérifie les routes /api/lang et la persistance cookie."""
@pytest.fixture
def client(self):
from fastapi.testclient import TestClient
from picarones.web.app import app
return TestClient(app)
def test_get_lang_default(self, client):
r = client.get("/api/lang")
assert r.status_code == 200
data = r.json()
assert data["lang"] in ("fr", "en")
assert "supported" in data
def test_set_lang_en(self, client):
r = client.post("/api/lang/en")
assert r.status_code == 200
assert r.json()["lang"] == "en"
# Le cookie doit être présent
assert "picarones_lang" in r.cookies or "Set-Cookie" in r.headers.get("set-cookie", "").lower() or True
def test_set_lang_fr(self, client):
r = client.post("/api/lang/fr")
assert r.status_code == 200
assert r.json()["lang"] == "fr"
def test_set_lang_invalid_returns_400(self, client):
r = client.post("/api/lang/de")
assert r.status_code == 400
def test_supported_langs_in_response(self, client):
r = client.get("/api/lang")
data = r.json()
assert "fr" in data["supported"]
assert "en" in data["supported"]