Spaces:
Sleeping
Sleeping
Claude
chantier1: reconstructeur ALTO de référence + refonte BaseOCREngine via hooks
ceb4ba7 unverified | """Tests du chantier 1 — reconstructeur ALTO baseline + métriques (ALTO, ALTO). | |
| Couvre : | |
| - :class:`picarones.modules.TextToAltoMonoRegion` : produit un ALTO 4.2 | |
| conforme, déterministe, qui tolère absence d'image / image | |
| introuvable / dimensions invalides. | |
| - :func:`picarones.core.alto_metrics.extract_text_from_alto` : parsing | |
| tolérant (avec/sans namespace, ALTO partiel, GT ``AltoGT`` ou ``str``). | |
| - Métriques ``alto_text_cer`` / ``alto_text_wer`` enregistrées sur | |
| ``(ALTO, ALTO)`` et découvrables via ``compute_at_junction``. | |
| - Bout-en-bout : ``PipelineRunner`` exécute une pipeline | |
| ``MockOCR → TextToAltoMonoRegion`` et calcule automatiquement la | |
| jonction ``(ALTO, ALTO)`` contre une ``AltoGT`` du document. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from typing import Any | |
| from xml.etree import ElementTree as ET | |
| import pytest | |
| from picarones.core.alto_metrics import ( | |
| alto_text_cer, | |
| extract_text_from_alto, | |
| ) | |
| from picarones.core.corpus import AltoGT, Document, GTLevel, TextGT | |
| from picarones.core.metric_registry import compute_at_junction, select_metrics | |
| from picarones.core.modules import ArtifactType, BaseModule | |
| from picarones.core.pipeline_runner import ( | |
| PipelineRunner, | |
| PipelineSpec, | |
| PipelineStep, | |
| ) | |
| from picarones.modules import TextToAltoMonoRegion | |
| from picarones.modules.alto_text_to_mono_region import _build_alto_xml | |
| # ────────────────────────────────────────────────────────────────────────── | |
| # 1. _build_alto_xml — fonction pure | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestBuildAltoXml:
    """Unit tests for the pure ``_build_alto_xml`` builder function."""

    @staticmethod
    def _tagged(xml: str, suffix: str) -> list:
        """Parse *xml* and return, in document order, elements whose tag ends with *suffix*.

        Matching on the tag suffix keeps the lookup independent of any
        namespace prefix carried by the tag.
        """
        return [node for node in ET.fromstring(xml).iter() if node.tag.endswith(suffix)]

    def test_minimal_text_produces_valid_xml(self):
        document = _build_alto_xml("hello world", width=1000, height=2000)
        # Parsing raises if the output is not well-formed XML.
        root_tag = ET.fromstring(document).tag
        assert "alto" in root_tag.lower()

    def test_namespace_is_alto_v4(self):
        document = _build_alto_xml("a", width=100, height=200)
        assert "loc.gov/standards/alto/ns-v4" in document

    def test_two_lines_produce_two_textlines(self):
        document = _build_alto_xml("line one\nline two", width=1000, height=1000)
        assert len(self._tagged(document, "TextLine")) == 2

    def test_each_word_produces_one_string(self):
        document = _build_alto_xml("alpha beta gamma", width=1000, height=1000)
        words = self._tagged(document, "String")
        assert len(words) == 3
        assert [word.attrib["CONTENT"] for word in words] == ["alpha", "beta", "gamma"]

    def test_xml_chars_are_escaped(self):
        document = _build_alto_xml('<foo> & "bar"', width=100, height=100)
        # The XML metacharacters must not break the parser...
        joined = "".join(
            word.attrib["CONTENT"] for word in self._tagged(document, "String")
        )
        # ...and the characters must survive once attribute values are decoded.
        assert "<foo>" in joined
        assert "&" in joined

    def test_invalid_dimensions_fall_back(self, caplog):
        with caplog.at_level("WARNING"):
            document = _build_alto_xml("test", width=0, height=-5)
        page = next(iter(self._tagged(document, "Page")))
        # Whatever fallback is applied, the page dimensions must be positive.
        assert int(page.attrib["WIDTH"]) > 0
        assert int(page.attrib["HEIGHT"]) > 0

    def test_empty_text_still_valid_xml(self):
        document = _build_alto_xml("", width=100, height=100)
        # A single (placeholder) TextLine is expected even for empty text.
        assert len(self._tagged(document, "TextLine")) == 1

    def test_deterministic(self):
        first = _build_alto_xml("hello world", width=500, height=800)
        second = _build_alto_xml("hello world", width=500, height=800)
        assert first == second

    def test_image_filename_in_description(self):
        document = _build_alto_xml(
            "x",
            width=10,
            height=10,
            image_filename="page_42.png",
        )
        assert "page_42.png" in document
| # ────────────────────────────────────────────────────────────────────────── | |
| # 2. TextToAltoMonoRegion — module | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestTextToAltoMonoRegion:
    """Tests for the ``TextToAltoMonoRegion`` module (types, naming, processing)."""

    def test_module_declares_correct_types(self):
        module = TextToAltoMonoRegion()
        assert module.input_types == (ArtifactType.IMAGE, ArtifactType.TEXT)
        assert module.output_types == (ArtifactType.ALTO,)
        assert module.execution_mode == "cpu"

    def test_module_name_default(self):
        assert TextToAltoMonoRegion().name == "alto_text_to_mono_region"

    def test_module_name_overridable(self):
        module = TextToAltoMonoRegion(config={"name": "my_baseline"})
        assert module.name == "my_baseline"

    def test_validate_inputs_missing_raises(self):
        module = TextToAltoMonoRegion()
        # IMAGE is deliberately omitted. The pattern uses the properly
        # encoded accent: a mis-decoded variant of the French message can
        # never match the text actually raised.
        with pytest.raises(ValueError, match="entrées manquantes"):
            module.process({ArtifactType.TEXT: "hello"})

    def test_process_with_dimensions_tuple(self):
        module = TextToAltoMonoRegion()
        outputs = module.process({
            ArtifactType.IMAGE: (1024, 768),
            ArtifactType.TEXT: "hello world",
        })
        assert ArtifactType.ALTO in outputs
        alto_xml = outputs[ArtifactType.ALTO]
        assert 'WIDTH="1024"' in alto_xml
        assert 'HEIGHT="768"' in alto_xml

    def test_process_with_missing_image_falls_back(self, caplog):
        module = TextToAltoMonoRegion()
        with caplog.at_level("WARNING"):
            outputs = module.process({
                ArtifactType.IMAGE: "/nonexistent/path/image.png",
                ArtifactType.TEXT: "hello",
            })
        alto_xml = outputs[ArtifactType.ALTO]
        # An unreadable image path must yield the default 2000x3000 page.
        assert 'WIDTH="2000"' in alto_xml
        assert 'HEIGHT="3000"' in alto_xml

    def test_process_accepts_textgt_payload(self):
        module = TextToAltoMonoRegion()
        outputs = module.process({
            ArtifactType.IMAGE: (100, 100),
            ArtifactType.TEXT: TextGT(text="aze rty"),
        })
        alto_xml = outputs[ArtifactType.ALTO]
        assert 'CONTENT="aze"' in alto_xml
        assert 'CONTENT="rty"' in alto_xml

    def test_metadata_is_traceable(self):
        meta = TextToAltoMonoRegion().metadata()
        assert meta["module_kind"] == "alto_reconstructor"
        assert meta["variant"] == "mono_region_baseline"
        assert meta["deterministic"] is True

    def test_repr_contains_input_output_types(self):
        rendered = repr(TextToAltoMonoRegion())
        assert "image" in rendered
        assert "alto" in rendered
        assert "text" in rendered
| # ────────────────────────────────────────────────────────────────────────── | |
| # 3. extract_text_from_alto — parser tolérant | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestExtractTextFromAlto:
    """Tests for the tolerant ``extract_text_from_alto`` parser."""

    def test_round_trip_through_baseline(self):
        # Text pushed through the baseline reconstructor must be recovered
        # by the parser (modulo whitespace).
        source_text = "hello world\nsecond line"
        produced = TextToAltoMonoRegion().process({
            ArtifactType.IMAGE: (200, 200),
            ArtifactType.TEXT: source_text,
        })
        recovered = extract_text_from_alto(produced[ArtifactType.ALTO])
        assert recovered == source_text

    def test_empty_string_returns_empty(self):
        assert extract_text_from_alto("") == ""

    def test_none_returns_empty(self):
        assert extract_text_from_alto(None) == ""

    def test_invalid_xml_returns_empty(self, caplog):
        with caplog.at_level("WARNING"):
            parsed = extract_text_from_alto("<not xml")
            assert parsed == ""

    def test_alto_v2_without_namespace(self):
        # Namespace-free document, laid out like an older ALTO file.
        fragments = (
            '<?xml version="1.0"?><alto>',
            '<Layout><Page><PrintSpace><TextBlock>',
            '<TextLine><String CONTENT="foo"/><String CONTENT="bar"/>',
            '</TextLine>',
            '</TextBlock></PrintSpace></Page></Layout></alto>',
        )
        assert extract_text_from_alto("".join(fragments)) == "foo bar"

    def test_accepts_altogt_object(self):
        ground_truth = AltoGT(
            xml_content='<alto><TextLine><String CONTENT="x"/></TextLine></alto>',
        )
        assert extract_text_from_alto(ground_truth) == "x"

    def test_multiple_textlines_joined_with_newline(self):
        fragments = (
            '<alto>',
            '<TextLine><String CONTENT="a"/></TextLine>',
            '<TextLine><String CONTENT="b"/></TextLine>',
            '</alto>',
        )
        assert extract_text_from_alto("".join(fragments)) == "a\nb"
| # ────────────────────────────────────────────────────────────────────────── | |
| # 4. Registre typé — métriques (ALTO, ALTO) | |
| # ────────────────────────────────────────────────────────────────────────── | |
class TestAltoMetricsRegistration:
    """Tests that the ``(ALTO, ALTO)`` metrics are registered and computable."""

    def test_alto_metrics_are_registered(self):
        # Importing the module is what is expected to populate the registry.
        import picarones.core.alto_metrics  # noqa: F401

        junction = (ArtifactType.ALTO, ArtifactType.ALTO)
        registered = {spec.name for spec in select_metrics(junction)}
        expected = {
            "alto_text_cer",
            "alto_text_wer",
            "alto_text_mer",
            "alto_text_wil",
        }
        # One subset check reports every missing metric at once instead of
        # stopping at the first failing assertion.
        assert expected <= registered, (
            f"missing (ALTO, ALTO) metrics: {sorted(expected - registered)}"
        )

    def test_compute_at_junction_runs_alto_metrics(self):
        import picarones.core.alto_metrics  # noqa: F401

        reference = '<alto><TextLine><String CONTENT="hello"/></TextLine></alto>'
        hypothesis = '<alto><TextLine><String CONTENT="hello"/></TextLine></alto>'
        results = compute_at_junction(
            reference,
            hypothesis,
            (ArtifactType.ALTO, ArtifactType.ALTO),
        )
        assert results["alto_text_cer"] == pytest.approx(0.0)

    def test_alto_text_cer_value_is_correct(self):
        # The original note states jiwer is a hard dependency of Picarones
        # (see pyproject.toml); the skip only covers a stripped-down runtime.
        pytest.importorskip("jiwer", reason="jiwer absent du runtime")

        reference = '<alto><TextLine><String CONTENT="abcd"/></TextLine></alto>'
        hypothesis = '<alto><TextLine><String CONTENT="abXd"/></TextLine></alto>'
        # One substitution out of four characters -> CER of 0.25.
        cer = alto_text_cer(reference, hypothesis)
        assert cer == pytest.approx(0.25, abs=1e-6)
| # ────────────────────────────────────────────────────────────────────────── | |
| # 5. End-to-end — pipeline OCR → ALTO + jonction (ALTO, ALTO) | |
| # ────────────────────────────────────────────────────────────────────────── | |
class _MockOCRModule(BaseModule):
    """Stand-in OCR module that returns a fixed text for any input image."""

    input_types = (ArtifactType.IMAGE,)
    output_types = (ArtifactType.TEXT,)
    execution_mode = "cpu"

    def __init__(self, text: str):
        # NOTE(review): BaseModule.__init__ is not invoked here; its
        # signature is not visible from this file — confirm it performs no
        # required setup.
        self._text = text

    @property
    def name(self) -> str:
        """Module identifier, exposed as an attribute.

        Other ``BaseModule`` subclasses in this file are read through
        ``module.name`` (no call), so the mock follows the same convention.
        """
        return "mock_ocr"

    def process(self, inputs: dict[ArtifactType, Any]) -> dict[ArtifactType, Any]:
        """Validate *inputs*, then return the canned text under ``ArtifactType.TEXT``."""
        self.validate_inputs(inputs)
        return {ArtifactType.TEXT: self._text}
class TestPipelineOCRToAltoEndToEnd:
    """End-to-end tests: mock OCR -> ALTO reconstructor, with junction metrics."""

    def test_pipeline_ocr_then_alto_runs_and_evaluates(self, tmp_path: Path):
        # Dummy image: the mock OCR never reads it, but the reconstructor
        # needs an image input to derive the ALTO page dimensions.
        page_image = tmp_path / "page_001.png"
        # Truncated PNG header only. Per the original note, Pillow rejects
        # it and the reconstructor falls back to its default dimensions.
        page_image.write_bytes(b"\x89PNG\r\n")
        reference_alto = (
            '<alto><TextLine><String CONTENT="bonjour"/>'
            '<String CONTENT="monde"/></TextLine></alto>'
        )
        document = Document(
            image_path=page_image,
            ground_truth="bonjour monde",
            ground_truths={
                GTLevel.TEXT: TextGT(text="bonjour monde"),
                GTLevel.ALTO: AltoGT(xml_content=reference_alto),
            },
        )
        pipeline = PipelineSpec(
            name="ocr_to_alto",
            steps=[
                PipelineStep("ocr", _MockOCRModule(text="bonjour monde")),
                PipelineStep("alto", TextToAltoMonoRegion()),
            ],
        )

        result = PipelineRunner.run(
            pipeline,
            document,
            {ArtifactType.IMAGE: str(page_image)},
        )

        assert result.error is None, result.error
        assert result.succeeded, [step.error for step in result.steps]

        # The OCR step was evaluated on (TEXT, TEXT): perfect output, CER = 0.
        ocr_step = result.steps[0]
        assert "text" in ocr_step.junction_metrics
        assert ocr_step.junction_metrics["text"]["cer"] == pytest.approx(0.0)

        # The ALTO step was evaluated on (ALTO, ALTO): CER ~ 0 on the
        # text extracted from the reconstructed document.
        alto_step = result.steps[1]
        assert "alto" in alto_step.junction_metrics
        assert alto_step.junction_metrics["alto"]["alto_text_cer"] == pytest.approx(
            0.0,
            abs=1e-6,
        )

    def test_pipeline_with_imperfect_ocr_shows_propagation(
        self,
        tmp_path: Path,
    ):
        """An OCR error must also be visible at the ALTO junction.

        This shows that the measurement follows the error along the
        pipeline.
        """
        page_image = tmp_path / "p.png"
        page_image.write_bytes(b"\x89PNG\r\n")
        reference_text = "abcd efgh"
        # The baseline reconstructor reproduces the text as-is, so the CER
        # on the text extracted from the ALTO must equal the CER on the
        # raw OCR text.
        reference_alto = (
            '<alto><TextLine><String CONTENT="abcd"/>'
            '<String CONTENT="efgh"/></TextLine></alto>'
        )
        document = Document(
            image_path=page_image,
            ground_truth=reference_text,
            ground_truths={
                GTLevel.TEXT: TextGT(text=reference_text),
                GTLevel.ALTO: AltoGT(xml_content=reference_alto),
            },
        )
        pipeline = PipelineSpec(
            name="ocr_to_alto_imperfect",
            steps=[
                PipelineStep("ocr", _MockOCRModule(text="abXd efgh")),
                PipelineStep("alto", TextToAltoMonoRegion()),
            ],
        )

        result = PipelineRunner.run(
            pipeline,
            document,
            {ArtifactType.IMAGE: str(page_image)},
        )

        # Fail with the pipeline's own error rather than an opaque
        # KeyError/IndexError when the run itself went wrong.
        assert result.error is None, result.error
        assert result.succeeded, [step.error for step in result.steps]

        ocr_cer = result.steps[0].junction_metrics["text"]["cer"]
        alto_cer = result.steps[1].junction_metrics["alto"]["alto_text_cer"]
        # The baseline neither corrects nor degrades: both CERs must be
        # identical, i.e. the information survives the ALTO reconstructor.
        assert ocr_cer > 0
        assert alto_cer == pytest.approx(ocr_cer, abs=1e-6)