Spaces:
Sleeping
Sleeping
"""Sprint 11 — Tests: internationalisation and heritage English profiles.

Covers:
- Normalisation profiles: early_modern_english, medieval_english, secretary_hand
- English prompt library
- HTML report generation in English (lang="en")
- i18n module
- --lang flag of ``picarones demo``
"""
| from __future__ import annotations | |
| import json | |
| import re | |
| from pathlib import Path | |
| import pytest | |
# ---------------------------------------------------------------------------
# English normalisation profiles
# ---------------------------------------------------------------------------
class TestEarlyModernEnglish:
    """Tests for the ``early_modern_english`` normalisation profile.

    Equivalences exercised below: ſ=s, u=v, i=j, vv=w, þ=th, ð=th, ȝ=y.
    """

    @pytest.fixture
    def profile(self):
        """Return the built-in ``early_modern_english`` profile."""
        # Imported lazily so a broken package import fails the tests that
        # need it rather than the collection of the whole module.
        from picarones.core.normalization import get_builtin_profile

        return get_builtin_profile("early_modern_english")

    def test_profile_exists(self, profile):
        assert profile.name == "early_modern_english"

    def test_long_s(self, profile):
        # ſ=s: both spellings normalise to the same canonical form
        # (i also becomes j, hence the comparison of two normalised values).
        assert profile.normalize("ſaid") == profile.normalize("said")

    def test_u_v_interchangeable(self, profile):
        # u and v map to the same canonical form.
        assert profile.normalize("upon") == profile.normalize("vpon")

    def test_i_j_interchangeable(self, profile):
        # i and j map to the same canonical form.
        assert profile.normalize("ioy") == profile.normalize("joy")

    def test_vv_to_w(self, profile):
        # vv and w map to the same canonical form.
        assert profile.normalize("vvhich") == profile.normalize("which")

    def test_thorn_to_th(self, profile):
        assert profile.normalize("þe") == "the"
        assert profile.normalize("þat") == "that"

    def test_eth_to_th(self, profile):
        assert profile.normalize("ðe") == "the"

    def test_yogh_to_y(self, profile):
        # ȝ normalises the same as y.
        assert profile.normalize("ȝe") == profile.normalize("ye")
        assert profile.normalize("ȝour") == profile.normalize("your")

    def test_ampersand_to_and(self, profile):
        assert profile.normalize("God & Man") == "God and Man"

    def test_ae_ligature(self, profile):
        assert profile.normalize("æther") == "aether"

    def test_oe_ligature(self, profile):
        assert profile.normalize("œconomy") == "oeconomy"

    def test_combined_normalisation(self, profile):
        # "þe ſame vvoman" → "the same woman"
        result = profile.normalize("þe ſame vvoman")
        assert result == "the same woman"

    def test_description_in_english(self, profile):
        assert "Early Modern English" in profile.description or "english" in profile.description.lower()

    def test_nfc_applied(self, profile):
        import unicodedata

        text = "caf\u0065\u0301"  # "café" in decomposed form (e + combining acute)
        normalised = profile.normalize(text)
        assert unicodedata.is_normalized("NFC", normalised)
class TestMedievalEnglish:
    """Tests for the ``medieval_english`` normalisation profile.

    Equivalences exercised below: ſ=s, vv=w, þ=th, ȝ=y, plus the scribal
    abbreviations ꝑ (per) and ꝓ (pro).
    """

    @pytest.fixture
    def profile(self):
        """Return the built-in ``medieval_english`` profile."""
        from picarones.core.normalization import get_builtin_profile

        return get_builtin_profile("medieval_english")

    def test_profile_exists(self, profile):
        assert profile.name == "medieval_english"

    def test_thorn(self, profile):
        assert profile.normalize("þe") == "the"

    def test_yogh(self, profile):
        assert profile.normalize("ȝe") == "ye"

    def test_long_s(self, profile):
        assert profile.normalize("ſome") == "some"

    def test_abbreviation_per(self, profile):
        # ꝑ → per
        assert profile.normalize("ꝑfect") == "perfect"

    def test_abbreviation_pro(self, profile):
        # ꝓ → pro (both ꝓud and proud normalise to the same form)
        assert profile.normalize("ꝓud") == profile.normalize("proud")

    def test_combined(self, profile):
        result = profile.normalize("þe ꝑfect ȝe")
        assert result == "the perfect ye"

    def test_vv_to_w(self, profile):
        assert profile.normalize("vvhen") == "when"

    def test_description(self, profile):
        desc = profile.description.lower()
        assert "english" in desc or "medieval" in desc
class TestSecretaryHand:
    """Tests for the ``secretary_hand`` profile (English secretary hand, 16th-17th c.)."""

    @pytest.fixture
    def profile(self):
        """Return the built-in ``secretary_hand`` profile."""
        from picarones.core.normalization import get_builtin_profile

        return get_builtin_profile("secretary_hand")

    def test_profile_exists(self, profile):
        assert profile.name == "secretary_hand"

    def test_long_s(self, profile):
        # ſ normalises the same as s.
        assert profile.normalize("ſaid") == profile.normalize("said")

    def test_thorn(self, profile):
        assert profile.normalize("þe") == "the"

    def test_yogh(self, profile):
        assert profile.normalize("ȝet") == "yet"

    def test_u_v(self, profile):
        # The expected value was previously spelled `"vpon".replace("u", "v")`,
        # which is a no-op because "vpon" contains no "u"; the literal below
        # is exactly what that expression evaluated to.
        assert profile.normalize("vpon") == "vpon"

    def test_ampersand(self, profile):
        assert profile.normalize("lord & master") == "lord and master"

    def test_description(self, profile):
        desc = profile.description.lower()
        assert "secretary" in desc or "hand" in desc
class TestBuiltinProfilesListing:
    """Check that the three English profiles are reachable alongside the older ones."""

    def test_all_english_profiles_accessible(self):
        from picarones.core.normalization import get_builtin_profile

        english_profiles = ("early_modern_english", "medieval_english", "secretary_hand")
        for profile_name in english_profiles:
            assert get_builtin_profile(profile_name).name == profile_name

    def test_unknown_profile_raises_key_error(self):
        from picarones.core.normalization import get_builtin_profile

        with pytest.raises(KeyError):
            get_builtin_profile("unknown_lang_profile_xyz")

    def test_existing_profiles_still_work(self):
        from picarones.core.normalization import get_builtin_profile

        # Profiles that are looked up here in addition to the English ones.
        earlier_profiles = (
            "medieval_french",
            "early_modern_french",
            "medieval_latin",
            "nfc",
            "caseless",
            "minimal",
        )
        for profile_name in earlier_profiles:
            assert get_builtin_profile(profile_name).name == profile_name
# ---------------------------------------------------------------------------
# English prompt library
# ---------------------------------------------------------------------------
class TestEnglishPrompts:
    """Check that the English prompt files exist and have the expected structure."""

    @pytest.fixture
    def prompts_dir(self):
        """Return the ``picarones/prompts`` directory, resolved relative to this file."""
        return Path(__file__).parent.parent / "picarones" / "prompts"

    @staticmethod
    def _read_prompt(prompts_dir, filename):
        """Return the UTF-8 text of ``filename`` inside ``prompts_dir``."""
        return (prompts_dir / filename).read_text(encoding="utf-8")

    def test_zero_shot_medieval_english_exists(self, prompts_dir):
        assert (prompts_dir / "zero_shot_medieval_english.txt").exists()

    def test_correction_medieval_english_exists(self, prompts_dir):
        assert (prompts_dir / "correction_medieval_english.txt").exists()

    def test_correction_early_modern_english_exists(self, prompts_dir):
        assert (prompts_dir / "correction_early_modern_english.txt").exists()

    def test_zero_shot_has_image_b64_variable(self, prompts_dir):
        text = self._read_prompt(prompts_dir, "zero_shot_medieval_english.txt")
        assert "{image_b64}" in text

    def test_correction_medieval_has_ocr_output_variable(self, prompts_dir):
        text = self._read_prompt(prompts_dir, "correction_medieval_english.txt")
        assert "{ocr_output}" in text

    def test_correction_early_modern_has_ocr_output_variable(self, prompts_dir):
        text = self._read_prompt(prompts_dir, "correction_early_modern_english.txt")
        assert "{ocr_output}" in text

    def test_zero_shot_medieval_is_in_english(self, prompts_dir):
        # English-only word stems are used as the language check.
        text = self._read_prompt(prompts_dir, "zero_shot_medieval_english.txt")
        assert "palaeograph" in text.lower() or "transcrib" in text.lower()

    def test_correction_medieval_mentions_thorn(self, prompts_dir):
        text = self._read_prompt(prompts_dir, "correction_medieval_english.txt")
        assert "þ" in text or "thorn" in text.lower()

    def test_correction_early_modern_mentions_long_s(self, prompts_dir):
        text = self._read_prompt(prompts_dir, "correction_early_modern_english.txt")
        assert "ſ" in text or "long-s" in text.lower() or "long s" in text.lower()
# ---------------------------------------------------------------------------
# i18n module
# ---------------------------------------------------------------------------
class TestI18nModule:
    """Tests for the ``picarones.i18n`` module."""

    def test_get_labels_fr(self):
        from picarones.i18n import get_labels

        fr_labels = get_labels("fr")
        assert fr_labels["tab_ranking"] == "Classement"
        assert fr_labels["html_lang"] == "fr"
        assert fr_labels["date_locale"] == "fr-FR"

    def test_get_labels_en(self):
        from picarones.i18n import get_labels

        en_labels = get_labels("en")
        assert en_labels["tab_ranking"] == "Ranking"
        assert en_labels["html_lang"] == "en"
        assert en_labels["date_locale"] == "en-GB"

    def test_get_labels_fallback(self):
        from picarones.i18n import get_labels

        # Unknown language → falls back to French.
        assert get_labels("de")["tab_ranking"] == "Classement"

    def test_all_fr_keys_present_in_en(self):
        from picarones.i18n import TRANSLATIONS

        missing = set(TRANSLATIONS["fr"].keys()) - set(TRANSLATIONS["en"].keys())
        assert not missing, f"Clés présentes en FR mais absentes en EN : {missing}"

    def test_supported_langs(self):
        from picarones.i18n import SUPPORTED_LANGS

        assert "fr" in SUPPORTED_LANGS
        assert "en" in SUPPORTED_LANGS

    def test_footer_labels(self):
        from picarones.i18n import get_labels

        fr_labels = get_labels("fr")
        en_labels = get_labels("en")
        assert "footer_generated" in fr_labels
        assert "footer_generated" in en_labels
        # The two languages must not share the same footer text.
        assert fr_labels["footer_generated"] != en_labels["footer_generated"]

    def test_hallucination_labels_translated(self):
        from picarones.i18n import get_labels

        hall_label = get_labels("en")["hall_detected"]
        assert "detected" in hall_label.lower()
        assert "⚠" in hall_label
# ---------------------------------------------------------------------------
# HTML report generation in English
# ---------------------------------------------------------------------------
class TestEnglishReport:
    """Check that the generated HTML report carries the labels of the requested language."""

    @staticmethod
    def _render_report(tmp_path_factory, lang, dirname, filename):
        """Generate a 3-document sample report in ``lang`` and return its HTML text."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        bm = generate_sample_benchmark(n_docs=3, seed=42)
        out = tmp_path_factory.mktemp(dirname) / filename
        ReportGenerator(bm, lang=lang).generate(out)
        return out.read_text(encoding="utf-8")

    @staticmethod
    def _extract_js_object(html, const_name, suffix=""):
        """Parse the JSON object assigned to ``const <const_name> = {...}`` in ``html``.

        The object is decoded with ``JSONDecoder.raw_decode`` starting at the
        opening brace, so a ``};`` sequence inside a string value cannot
        truncate the payload the way a non-greedy regex would.

        Fails the calling test with a readable message when the constant is
        absent, instead of raising ``AttributeError`` on a ``None`` match.
        """
        # The lookahead keeps the match end on the "{" that raw_decode needs.
        match = re.search(rf"const {re.escape(const_name)} = (?=\{{)", html)
        assert match, f"const {const_name} non trouvé dans le HTML{suffix}"
        payload, _end = json.JSONDecoder().raw_decode(html, match.end())
        return payload

    # Class scope: rendering is the expensive step and the HTML is read-only,
    # so one report per language is shared by every test of the class.
    @pytest.fixture(scope="class")
    def english_html(self, tmp_path_factory):
        """HTML text of a sample report generated with ``lang="en"``."""
        return self._render_report(tmp_path_factory, "en", "report_en", "report_en.html")

    @pytest.fixture(scope="class")
    def french_html(self, tmp_path_factory):
        """HTML text of a sample report generated with ``lang="fr"``."""
        return self._render_report(tmp_path_factory, "fr", "report_fr", "rapport_fr.html")

    def test_html_lang_attribute_en(self, english_html):
        assert 'lang="en"' in english_html

    def test_html_lang_attribute_fr(self, french_html):
        assert 'lang="fr"' in french_html

    def test_en_report_contains_i18n_json(self, english_html):
        assert "const I18N" in english_html

    def test_en_i18n_has_english_labels(self, english_html):
        i18n = self._extract_js_object(english_html, "I18N")
        assert i18n["tab_ranking"] == "Ranking"
        assert i18n["h_ranking"] == "Engine Ranking"
        assert i18n["h_gallery"] == "Document Gallery"

    def test_fr_i18n_has_french_labels(self, french_html):
        i18n = self._extract_js_object(french_html, "I18N", suffix=" FR")
        assert i18n["tab_ranking"] == "Classement"
        assert i18n["h_ranking"] == "Classement des moteurs"

    def test_en_report_data_json_present(self, english_html):
        assert "const DATA" in english_html

    def test_en_report_date_locale(self, english_html):
        i18n = self._extract_js_object(english_html, "I18N")
        assert i18n["date_locale"] == "en-GB"

    def test_fr_report_date_locale(self, french_html):
        i18n = self._extract_js_object(french_html, "I18N", suffix=" FR")
        assert i18n["date_locale"] == "fr-FR"

    def test_en_report_has_data_i18n_attributes(self, english_html):
        assert 'data-i18n=' in english_html

    def test_en_report_engines_count(self, english_html):
        data = self._extract_js_object(english_html, "DATA")
        # 5 engines, as defined by the Sprint 10 fixtures.
        assert len(data["engines"]) == 5

    def test_report_generator_default_lang_is_fr(self):
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        bm = generate_sample_benchmark(n_docs=2, seed=1)
        gen = ReportGenerator(bm)
        assert gen.lang == "fr"

    def test_report_generator_lang_en(self):
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        bm = generate_sample_benchmark(n_docs=2, seed=1)
        gen = ReportGenerator(bm, lang="en")
        assert gen.lang == "en"
# ---------------------------------------------------------------------------
# CLI demo --lang
# ---------------------------------------------------------------------------
class TestDemoLangFlag:
    """Tests for the ``--lang`` flag of ``picarones demo``."""

    def test_demo_lang_en(self, tmp_path):
        from click.testing import CliRunner
        from picarones.cli import demo_cmd

        target = str(tmp_path / "demo_en.html")
        cli_args = ["--docs", "2", "--output", target, "--lang", "en"]
        result = CliRunner().invoke(demo_cmd, cli_args)
        assert result.exit_code == 0, result.output

        html = Path(target).read_text(encoding="utf-8")
        assert 'lang="en"' in html
        # The embedded I18N object must carry the English tab label.
        found = re.search(r"const I18N = (\{.*?\});", html, re.DOTALL)
        assert found
        labels = json.loads(found.group(1))
        assert labels["tab_ranking"] == "Ranking"

    def test_demo_lang_fr_default(self, tmp_path):
        from click.testing import CliRunner
        from picarones.cli import demo_cmd

        target = str(tmp_path / "demo_fr.html")
        # No --lang given: the generated page is expected to be French.
        result = CliRunner().invoke(demo_cmd, ["--docs", "2", "--output", target])
        assert result.exit_code == 0, result.output

        html = Path(target).read_text(encoding="utf-8")
        assert 'lang="fr"' in html

    def test_demo_invalid_lang_rejected(self, tmp_path):
        from click.testing import CliRunner
        from picarones.cli import demo_cmd

        target = str(tmp_path / "demo_de.html")
        cli_args = ["--docs", "2", "--output", target, "--lang", "de"]
        result = CliRunner().invoke(demo_cmd, cli_args)
        assert result.exit_code != 0
# ---------------------------------------------------------------------------
# Web API — language cookie
# ---------------------------------------------------------------------------
class TestWebLangCookie:
    """Tests for the ``/api/lang`` routes and the language cookie."""

    @pytest.fixture
    def client(self):
        """Return a fresh ``TestClient`` bound to the picarones web app."""
        from fastapi.testclient import TestClient
        from picarones.web.app import app

        return TestClient(app)

    def test_get_lang_default(self, client):
        r = client.get("/api/lang")
        assert r.status_code == 200
        data = r.json()
        assert data["lang"] in ("fr", "en")
        assert "supported" in data

    def test_set_lang_en(self, client):
        r = client.post("/api/lang/en")
        assert r.status_code == 200
        assert r.json()["lang"] == "en"
        # The cookie must be present. It is looked up both in the parsed
        # cookie jar and in the raw Set-Cookie header, because the jar's
        # contents depend on the HTTP client backing TestClient. The previous
        # assertion ended with `or True` and could never fail.
        set_cookie_header = r.headers.get("set-cookie", "")
        assert "picarones_lang" in r.cookies or "picarones_lang" in set_cookie_header

    def test_set_lang_fr(self, client):
        r = client.post("/api/lang/fr")
        assert r.status_code == 200
        assert r.json()["lang"] == "fr"

    def test_set_lang_invalid_returns_400(self, client):
        r = client.post("/api/lang/de")
        assert r.status_code == 400

    def test_supported_langs_in_response(self, client):
        r = client.get("/api/lang")
        data = r.json()
        assert "fr" in data["supported"]
        assert "en" in data["supported"]