# Picarones / tests/test_sprint5_advanced_metrics.py
# Repository-viewer header (not part of the module): last commit by Claude,
# "fix: résoudre les 64 erreurs ruff pré-existantes révélées par le lint actif"
# (6362212, unverified) — file size 32.9 kB.
"""Tests Sprint 5 : métriques avancées patrimoniales.
Couvre :
- Matrice de confusion unicode (confusion.py)
- Scores ligatures et diacritiques (char_scores.py)
- Taxonomie des erreurs classes 1-9 (taxonomy.py)
- Analyse structurelle (structure.py)
- Qualité image (image_quality.py)
- Intégration dans les fixtures et le rapport HTML
"""
from __future__ import annotations
import pytest
# ===========================================================================
# Tests ConfusionMatrix
# ===========================================================================
from picarones.core.confusion import (
EMPTY_CHAR,
build_confusion_matrix,
aggregate_confusion_matrices,
top_confused_chars,
)
class TestBuildConfusionMatrix:
    """Checks on ``build_confusion_matrix`` for small GT/OCR string pairs."""

    def test_identical_texts(self):
        """Two equal strings produce no substitution, insertion or deletion."""
        confusion = build_confusion_matrix("abc", "abc")
        assert confusion.total_substitutions == 0
        assert confusion.total_insertions == 0
        assert confusion.total_deletions == 0

    def test_empty_texts(self):
        """Two empty strings produce zero errors."""
        assert build_confusion_matrix("", "").total_errors == 0

    def test_simple_substitution(self):
        """GT 'b' read as 'x' is counted under ``matrix['b']['x']``."""
        confusion = build_confusion_matrix("abc", "axc")
        assert "b" in confusion.matrix
        row = confusion.matrix["b"]
        assert "x" in row
        assert row["x"] >= 1

    def test_deletion_recorded(self):
        """A GT 'b' missing from the OCR is filed under ``EMPTY_CHAR``."""
        confusion = build_confusion_matrix("abc", "ac")
        assert "b" in confusion.matrix
        assert EMPTY_CHAR in confusion.matrix["b"]

    def test_insertion_recorded(self):
        """An OCR 'b' absent from the GT is filed under the ``EMPTY_CHAR`` row."""
        confusion = build_confusion_matrix("ac", "abc")
        assert EMPTY_CHAR in confusion.matrix
        assert "b" in confusion.matrix[EMPTY_CHAR]

    def test_no_whitespace_recorded_by_default(self):
        """The space character must not appear as a GT key of the matrix."""
        confusion = build_confusion_matrix("a b", "a x")
        assert " " not in confusion.matrix

    def test_as_dict_structure(self):
        """``as_dict`` exposes the matrix and the three error totals."""
        exported = build_confusion_matrix("hello", "hallo").as_dict()
        expected_keys = (
            "matrix",
            "total_substitutions",
            "total_insertions",
            "total_deletions",
        )
        for key in expected_keys:
            assert key in exported

    def test_top_confusions(self):
        """Four 'e' -> 'a' substitutions come first in ``top_confusions``."""
        ranked = build_confusion_matrix("eeee", "aaaa").top_confusions(n=5)
        assert len(ranked) >= 1
        best = ranked[0]
        assert best["gt"] == "e"
        assert best["ocr"] == "a"
        assert best["count"] == 4

    def test_medieval_chars_tracked(self):
        """Long s (ſ) read as 'f' is recorded like any other substitution."""
        confusion = build_confusion_matrix("maiſon", "maifon")
        assert "ſ" in confusion.matrix
        assert "f" in confusion.matrix["ſ"]

    def test_as_compact_dict_filters_low_count(self):
        """With ``min_count=2`` every count left in the compact matrix is >= 2."""
        # The single 'a' -> 'x' substitution occurs once, so it must be dropped.
        compact = build_confusion_matrix("aab", "axb").as_compact_dict(min_count=2)
        for row in compact["matrix"].values():
            for count in row.values():
                assert count >= 2
class TestAggregateConfusionMatrices:
    """Checks on ``aggregate_confusion_matrices``."""

    def test_empty_list(self):
        """Aggregating nothing gives a matrix with zero errors."""
        assert aggregate_confusion_matrices([]).total_errors == 0

    def test_single_matrix(self):
        """Aggregating one matrix reproduces its ``matrix`` content."""
        only = build_confusion_matrix("abc", "axc")
        merged = aggregate_confusion_matrices([only])
        assert merged.matrix == only.matrix

    def test_counts_sum(self):
        """The same 'b' -> 'x' confusion seen twice is counted at least twice."""
        first = build_confusion_matrix("abc", "axc")
        second = build_confusion_matrix("abc", "axc")
        merged = aggregate_confusion_matrices([first, second])
        row_b = merged.matrix.get("b", {})
        assert row_b.get("x", 0) >= 2

    def test_total_errors_sum(self):
        """The aggregated total is at least the sum of the individual totals."""
        first = build_confusion_matrix("abc", "axc")
        second = build_confusion_matrix("def", "dxf")
        merged = aggregate_confusion_matrices([first, second])
        expected_minimum = first.total_errors + second.total_errors
        assert merged.total_errors >= expected_minimum
class TestTopConfusedChars:
    """Checks on ``top_confused_chars``."""

    def test_returns_list(self):
        """The result is a ``list``."""
        confusion = build_confusion_matrix("aaabbb", "aaaxxx")
        assert isinstance(top_confused_chars(confusion, n=5), list)

    def test_sorted_by_errors_desc(self):
        """The first entry has at least as many errors as the second."""
        three_errors = build_confusion_matrix("bbb", "xxx")
        one_error = build_confusion_matrix("a", "y")
        merged = aggregate_confusion_matrices([three_errors, one_error])
        ranked = top_confused_chars(merged, n=10)
        # Nothing to compare when fewer than two entries are returned.
        if len(ranked) < 2:
            return
        assert ranked[0]["total_errors"] >= ranked[1]["total_errors"]

    def test_excludes_empty_char(self):
        """With ``exclude_empty=True`` no entry is keyed by ``EMPTY_CHAR``."""
        confusion = build_confusion_matrix("abc", "ac")  # 'b' deleted
        ranked = top_confused_chars(confusion, exclude_empty=True)
        assert not any(entry["char"] == EMPTY_CHAR for entry in ranked)
# ===========================================================================
# Tests LigatureScore
# ===========================================================================
from picarones.core.char_scores import (
LIGATURE_TABLE,
LigatureScore,
DiacriticScore,
compute_ligature_score,
compute_diacritic_score,
aggregate_ligature_scores,
aggregate_diacritic_scores,
)
class TestLigatureTable:
    """Checks on the content and shape of ``LIGATURE_TABLE``."""

    def test_fi_ligature_present(self):
        """U+FB01 (fi ligature) is a key of the table."""
        assert "\uFB01" in LIGATURE_TABLE

    def test_fl_ligature_present(self):
        """U+FB02 (fl ligature) is a key of the table."""
        assert "\uFB02" in LIGATURE_TABLE

    def test_oe_ligature_present(self):
        """U+0153 (œ) is a key of the table."""
        assert "\u0153" in LIGATURE_TABLE

    def test_ae_ligature_present(self):
        """U+00E6 (æ) is a key of the table."""
        assert "\u00E6" in LIGATURE_TABLE

    def test_ff_ligature_present(self):
        """U+FB00 (ff ligature) is a key of the table."""
        assert "\uFB00" in LIGATURE_TABLE

    def test_equivalents_are_lists(self):
        """Every table value is a non-empty ``list``."""
        for equivalents in LIGATURE_TABLE.values():
            assert isinstance(equivalents, list)
            assert len(equivalents) >= 1
class TestComputeLigatureScore:
    """Checks on ``compute_ligature_score``.

    Ligature inputs are written with explicit ``\\uXXXX`` escapes so the
    precomposed code point cannot be silently replaced by its ASCII spelling
    when the file passes through a tool that normalises ligatures.
    """

    def test_no_ligatures_in_gt(self):
        """A GT without ligatures scores 1.0 with zero ligatures counted."""
        result = compute_ligature_score("bonjour monde", "bonjour monde")
        assert result.score == pytest.approx(1.0)
        assert result.total_in_gt == 0

    def test_ligature_correctly_recognized(self):
        """GT holds U+FB01; the OCR spells it out as the two letters "fi"."""
        result = compute_ligature_score("\uFB01n", "fin")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_ligature_unicode_to_unicode(self):
        """GT and OCR both hold the U+FB01 ligature code point."""
        result = compute_ligature_score("\uFB01n", "\uFB01n")
        assert result.score == pytest.approx(1.0)

    def test_oe_ligature(self):
        """GT 'œ' matched by OCR 'oe' counts as one recognised ligature."""
        result = compute_ligature_score("œuvre", "oeuvre")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_ae_ligature(self):
        """GT 'æ' matched by OCR 'ae' counts as one recognised ligature."""
        result = compute_ligature_score("æther", "aether")
        assert result.total_in_gt == 1
        assert result.score == pytest.approx(1.0)

    def test_as_dict_structure(self):
        """``as_dict`` exposes the totals, the score and the per-ligature detail."""
        exported = compute_ligature_score("\uFB01n", "fin").as_dict()
        for key in ("total_in_gt", "correctly_recognized", "score", "per_ligature"):
            assert key in exported

    def test_empty_texts(self):
        """Two empty strings score 1.0 with zero ligatures counted."""
        result = compute_ligature_score("", "")
        assert result.score == pytest.approx(1.0)
        assert result.total_in_gt == 0
class TestComputeDiacriticScore:
    """Checks on ``compute_diacritic_score``."""

    def test_no_diacritics(self):
        """A GT without diacritics scores 1.0 with zero diacritics counted."""
        outcome = compute_diacritic_score("bonjour", "bonjour")
        assert outcome.score == pytest.approx(1.0)
        assert outcome.total_in_gt == 0

    def test_accent_preserved(self):
        """Identical accented strings: every GT diacritic is recognised."""
        outcome = compute_diacritic_score("été", "été")
        assert outcome.score == pytest.approx(1.0)
        assert outcome.correctly_recognized == outcome.total_in_gt

    def test_accent_lost(self):
        """'é' read as 'e' lowers both the recognised count and the score."""
        outcome = compute_diacritic_score("étude", "etude")
        assert outcome.total_in_gt >= 1
        assert outcome.correctly_recognized < outcome.total_in_gt
        assert outcome.score < 1.0

    def test_cedille_tracked(self):
        """'ç' read as 'c' is counted in the GT and lowers the score."""
        outcome = compute_diacritic_score("façon", "facon")
        assert outcome.total_in_gt >= 1
        assert outcome.score < 1.0

    def test_empty_texts(self):
        """Two empty strings score 1.0."""
        assert compute_diacritic_score("", "").score == pytest.approx(1.0)

    def test_as_dict_structure(self):
        """``as_dict`` exposes the two totals and the score."""
        exported = compute_diacritic_score("été", "ete").as_dict()
        for key in ("total_in_gt", "correctly_recognized", "score"):
            assert key in exported
class TestAggregateLigatureScores:
    """Checks on ``aggregate_ligature_scores``."""

    def test_empty_list(self):
        """Aggregating nothing gives a score of 1.0 and a GT total of 0."""
        summary = aggregate_ligature_scores([])
        assert summary["score"] == pytest.approx(1.0)
        assert summary["total_in_gt"] == 0

    def test_aggregation(self):
        """Totals are summed and the score is recognised / total (5 out of 6)."""
        partial = LigatureScore(total_in_gt=4, correctly_recognized=3, score=0.75)
        perfect = LigatureScore(total_in_gt=2, correctly_recognized=2, score=1.0)
        summary = aggregate_ligature_scores([partial, perfect])
        assert summary["total_in_gt"] == 6
        assert summary["correctly_recognized"] == 5
        assert summary["score"] == pytest.approx(5 / 6, abs=1e-4)
class TestAggregateDiacriticScores:
    """Checks on ``aggregate_diacritic_scores``."""

    def test_aggregation(self):
        """GT totals and recognised counts are summed across the inputs."""
        partial = DiacriticScore(total_in_gt=10, correctly_recognized=8, score=0.8)
        perfect = DiacriticScore(total_in_gt=5, correctly_recognized=5, score=1.0)
        summary = aggregate_diacritic_scores([partial, perfect])
        assert summary["total_in_gt"] == 15
        assert summary["correctly_recognized"] == 13
# ===========================================================================
# Tests TaxonomyResult
# ===========================================================================
from picarones.core.taxonomy import (
TaxonomyResult,
ERROR_CLASSES,
classify_errors,
aggregate_taxonomy,
)
class TestErrorClasses:
    """Checks on the ``ERROR_CLASSES`` collection."""

    def test_nine_classes(self):
        """The taxonomy defines exactly nine classes."""
        assert len(ERROR_CLASSES) == 9

    def test_class_names(self):
        """A selection of expected class names is present."""
        expected_names = (
            "visual_confusion",
            "diacritic_error",
            "case_error",
            "ligature_error",
            "lacuna",
        )
        for name in expected_names:
            assert name in ERROR_CLASSES
class TestClassifyErrors:
    """Checks on ``classify_errors`` and the ``TaxonomyResult`` it returns."""

    def test_identical_texts(self):
        """Equal strings yield zero errors."""
        result = classify_errors("bonjour monde", "bonjour monde")
        assert result.total_errors == 0

    def test_empty_texts(self):
        """Two empty strings yield zero errors."""
        result = classify_errors("", "")
        assert result.total_errors == 0

    def test_case_error_detected(self):
        """Lower-cased capitals are counted as ``case_error``."""
        result = classify_errors("Bonjour Monde", "bonjour monde")
        assert result.counts["case_error"] >= 1

    def test_diacritic_error_detected(self):
        """Dropped accents are counted as ``diacritic_error``."""
        result = classify_errors("été chez nous", "ete chez nous")
        assert result.counts["diacritic_error"] >= 1

    def test_lacuna_detected(self):
        """Missing trailing words are counted as ``lacuna``."""
        result = classify_errors("le chat dort paisiblement", "le chat")
        assert result.counts["lacuna"] >= 1

    def test_segmentation_detected(self):
        """Merging "hello world" into "helloworld" must classify without error."""
        result = classify_errors("hello world test", "helloworld test")
        # NOTE(review): ``>= 0`` always holds, so this only proves the key
        # exists and the call does not raise; the merge may be filed under
        # another class (hapax), which is why no stricter bound is asserted.
        assert result.counts["segmentation_error"] >= 0

    def test_ligature_error_detected(self):
        """GT with the U+FB01 ligature against its two-letter "fi" spelling."""
        # The escape keeps the ligature code point in the GT; a plain "fin"
        # on both sides would compare two identical strings.
        result = classify_errors("\uFB01n de siècle", "fin de siècle")
        # NOTE(review): ``>= 0`` always holds; this is a smoke check only.
        assert result.total_errors >= 0

    def test_as_dict_structure(self):
        """``as_dict`` exposes counts, total, distribution and examples."""
        exported = classify_errors("test erreur ici", "test erreur là").as_dict()
        for key in ("counts", "total_errors", "class_distribution", "examples"):
            assert key in exported

    def test_from_dict_roundtrip(self):
        """``from_dict(as_dict())`` preserves the total and the counts."""
        result = classify_errors("bonjour monde", "Bonjour monde")
        restored = TaxonomyResult.from_dict(result.as_dict())
        assert restored.total_errors == result.total_errors
        assert restored.counts == result.counts

    def test_class_distribution_sums_to_one(self):
        """A non-empty class distribution sums to 1 (within 1e-6)."""
        result = classify_errors("abc def ghi", "xyz uvw rst")
        distribution = result.class_distribution
        # An empty distribution is accepted as-is.
        if distribution:
            assert abs(sum(distribution.values()) - 1.0) < 1e-6

    def test_all_classes_in_counts(self):
        """Every name in ``ERROR_CLASSES`` is a key of ``counts``."""
        result = classify_errors("test", "teSt")
        for class_name in ERROR_CLASSES:
            assert class_name in result.counts
class TestAggregateTaxonomy:
    """Checks on ``aggregate_taxonomy``."""

    def test_empty(self):
        """Aggregating nothing reports zero errors."""
        assert aggregate_taxonomy([])["total_errors"] == 0

    def test_sums_counts(self):
        """Per-class counts and totals are summed across results."""

        def full_counts(visual, diacritic):
            # The two classes under test come first, then every remaining
            # class from ERROR_CLASSES is filled in with zero.
            counts = {"visual_confusion": visual, "diacritic_error": diacritic}
            for class_name in ERROR_CLASSES:
                if class_name not in ["visual_confusion", "diacritic_error"]:
                    counts[class_name] = 0
            return counts

        first = TaxonomyResult(counts=full_counts(2, 1), total_errors=3)
        second = TaxonomyResult(counts=full_counts(1, 3), total_errors=4)
        merged = aggregate_taxonomy([first, second])
        merged_counts = merged["counts"]
        assert merged_counts["visual_confusion"] == 3
        assert merged_counts["diacritic_error"] == 4
        assert merged["total_errors"] == 7
# ===========================================================================
# Tests StructureResult
# ===========================================================================
from picarones.core.structure import (
StructureResult,
analyze_structure,
aggregate_structure,
)
class TestAnalyzeStructure:
    """Checks on ``analyze_structure`` and ``StructureResult`` properties."""

    def test_identical_single_line(self):
        """One identical line: one line each side, no fusion or fragmentation."""
        outcome = analyze_structure("ligne unique", "ligne unique")
        assert outcome.gt_line_count == 1
        assert outcome.ocr_line_count == 1
        assert outcome.line_fusion_count == 0
        assert outcome.line_fragmentation_count == 0

    def test_empty_texts(self):
        """Empty inputs count zero lines on both sides."""
        outcome = analyze_structure("", "")
        assert outcome.gt_line_count == 0
        assert outcome.ocr_line_count == 0

    def test_multiline_equal(self):
        """Three identical lines are counted as three on both sides."""
        text = "ligne 1\nligne 2\nligne 3"
        outcome = analyze_structure(text, text)
        assert outcome.gt_line_count == 3
        assert outcome.ocr_line_count == 3

    def test_line_fusion_detected(self):
        """Two GT lines merged into one give fewer OCR lines than GT lines."""
        reference = "ligne 1\nligne 2\nligne 3"
        merged_ocr = "ligne 1 ligne 2\nligne 3"
        outcome = analyze_structure(reference, merged_ocr)
        assert outcome.ocr_line_count < outcome.gt_line_count

    def test_reading_order_score_perfect(self):
        """Identical text has a reading-order score above 0.9."""
        sentence = "le chat dort ici"
        assert analyze_structure(sentence, sentence).reading_order_score > 0.9

    def test_reading_order_score_low_for_scrambled(self):
        """Reversed word order has a reading-order score below 1.0."""
        reference = "le chat dort paisiblement sur le canapé"
        scrambled = "canapé sur le paisiblement dort chat le"
        outcome = analyze_structure(reference, scrambled)
        assert outcome.reading_order_score < 1.0

    def test_line_accuracy_perfect(self):
        """Identical two-line texts have a line accuracy of 1.0."""
        reference = "ligne 1\nligne 2"
        hypothesis = "ligne 1\nligne 2"
        outcome = analyze_structure(reference, hypothesis)
        assert outcome.line_accuracy == pytest.approx(1.0)

    def test_line_accuracy_degraded(self):
        """One OCR line for four GT lines has a line accuracy below 1.0."""
        reference = "ligne 1\nligne 2\nligne 3\nligne 4"
        outcome = analyze_structure(reference, "ligne 1")
        assert outcome.line_accuracy < 1.0

    def test_as_dict_structure(self):
        """``as_dict`` exposes every structural metric."""
        exported = analyze_structure("ligne 1\nligne 2", "ligne 1\nligne 2").as_dict()
        expected_keys = (
            "gt_line_count",
            "ocr_line_count",
            "line_fusion_count",
            "line_fragmentation_count",
            "reading_order_score",
            "paragraph_conservation_score",
            "line_accuracy",
        )
        for key in expected_keys:
            assert key in exported

    def test_from_dict_roundtrip(self):
        """``from_dict(as_dict())`` preserves both line counts."""
        outcome = analyze_structure("a\nb\nc", "a\nb")
        restored = StructureResult.from_dict(outcome.as_dict())
        assert restored.gt_line_count == outcome.gt_line_count
        assert restored.ocr_line_count == outcome.ocr_line_count

    def test_line_fusion_rate_property(self):
        """2 fusions over 10 GT lines give a fusion rate of 0.2."""
        sample = StructureResult(
            gt_line_count=10,
            ocr_line_count=8,
            line_fusion_count=2,
        )
        assert sample.line_fusion_rate == pytest.approx(0.2)

    def test_line_fragmentation_rate_property(self):
        """3 fragmentations over 5 GT lines give a fragmentation rate of 0.6."""
        sample = StructureResult(
            gt_line_count=5,
            ocr_line_count=8,
            line_fragmentation_count=3,
        )
        assert sample.line_fragmentation_rate == pytest.approx(0.6)
class TestAggregateStructure:
    """Checks on ``aggregate_structure``."""

    def test_empty(self):
        """Aggregating nothing returns an empty dict."""
        assert aggregate_structure([]) == {}

    def test_single_result(self):
        """One result: its reading-order score is the mean, document count is 1."""
        sample = StructureResult(
            gt_line_count=5,
            ocr_line_count=5,
            reading_order_score=0.9,
            paragraph_conservation_score=1.0,
        )
        summary = aggregate_structure([sample])
        assert summary["mean_reading_order_score"] == pytest.approx(0.9)
        assert summary["document_count"] == 1

    def test_mean_fusion_rate(self):
        """Fusion rates 0.2 and 0.4 average to 0.3."""
        low = StructureResult(gt_line_count=10, ocr_line_count=8, line_fusion_count=2)
        high = StructureResult(gt_line_count=10, ocr_line_count=6, line_fusion_count=4)
        summary = aggregate_structure([low, high])
        assert summary["mean_line_fusion_rate"] == pytest.approx(0.3, rel=1e-3)
# ===========================================================================
# Tests ImageQualityResult
# ===========================================================================
from picarones.core.image_quality import (
ImageQualityResult,
generate_mock_quality_scores,
aggregate_image_quality,
_global_quality_score,
)
class TestImageQualityResult:
    """Checks on ``ImageQualityResult`` tiers and (de)serialisation."""

    def test_quality_tier_good(self):
        """A score of 0.8 is tier "good" and flagged as good quality."""
        sample = ImageQualityResult(quality_score=0.8)
        assert sample.quality_tier == "good"
        assert sample.is_good_quality is True

    def test_quality_tier_medium(self):
        """A score of 0.55 is tier "medium" and not flagged as good quality."""
        sample = ImageQualityResult(quality_score=0.55)
        assert sample.quality_tier == "medium"
        assert sample.is_good_quality is False

    def test_quality_tier_poor(self):
        """A score of 0.2 is tier "poor"."""
        assert ImageQualityResult(quality_score=0.2).quality_tier == "poor"

    def test_as_dict_structure(self):
        """``as_dict`` exposes every metric, the tier and the analysis method."""
        sample = ImageQualityResult(
            sharpness_score=0.8,
            noise_level=0.1,
            rotation_degrees=0.5,
            contrast_score=0.9,
            quality_score=0.75,
            analysis_method="mock",
        )
        exported = sample.as_dict()
        expected_keys = (
            "sharpness_score",
            "noise_level",
            "rotation_degrees",
            "contrast_score",
            "quality_score",
            "quality_tier",
            "analysis_method",
        )
        for key in expected_keys:
            assert key in exported

    def test_from_dict_roundtrip(self):
        """``from_dict(as_dict())`` preserves sharpness, quality and method."""
        sample = ImageQualityResult(
            sharpness_score=0.7,
            noise_level=0.2,
            rotation_degrees=1.0,
            contrast_score=0.8,
            quality_score=0.65,
            analysis_method="pillow",
        )
        restored = ImageQualityResult.from_dict(sample.as_dict())
        assert restored.sharpness_score == pytest.approx(sample.sharpness_score, rel=1e-3)
        assert restored.quality_score == pytest.approx(sample.quality_score, rel=1e-3)
        assert restored.analysis_method == sample.analysis_method

    def test_from_dict_ignores_quality_tier(self):
        """``from_dict`` accepts a payload that carries a ``quality_tier`` key."""
        # quality_tier is a property rather than an init parameter, so
        # from_dict has to skip it instead of forwarding it.
        payload = {
            "sharpness_score": 0.5,
            "noise_level": 0.3,
            "rotation_degrees": 0.0,
            "contrast_score": 0.6,
            "quality_score": 0.5,
            "analysis_method": "mock",
            "quality_tier": "medium",
        }
        restored = ImageQualityResult.from_dict(payload)
        assert restored.quality_score == pytest.approx(0.5)
class TestGenerateMockQualityScores:
    """Checks on ``generate_mock_quality_scores``."""

    def test_returns_image_quality_result(self):
        """The return value is an ``ImageQualityResult``."""
        assert isinstance(generate_mock_quality_scores("folio_001"), ImageQualityResult)

    def test_scores_in_range(self):
        """Quality, sharpness, noise and contrast all lie within [0, 1]."""
        mock = generate_mock_quality_scores("folio_001", seed=42)
        bounded_values = (
            mock.quality_score,
            mock.sharpness_score,
            mock.noise_level,
            mock.contrast_score,
        )
        for value in bounded_values:
            assert 0.0 <= value <= 1.0

    def test_reproducible_with_seed(self):
        """The same document id and seed give the same quality score."""
        first = generate_mock_quality_scores("folio_001", seed=42)
        second = generate_mock_quality_scores("folio_001", seed=42)
        assert first.quality_score == second.quality_score

    def test_analysis_method_mock(self):
        """The analysis method is reported as "mock"."""
        assert generate_mock_quality_scores("folio_001").analysis_method == "mock"

    def test_no_error(self):
        """No error is attached to a mock result."""
        assert generate_mock_quality_scores("folio_001").error is None
class TestGlobalQualityScore:
    """Checks on the private ``_global_quality_score`` combiner."""

    def test_perfect_input(self):
        """Best value on every component gives 1.0."""
        best = _global_quality_score(
            sharpness=1.0, noise=0.0, rotation_abs=0.0, contrast=1.0
        )
        assert best == pytest.approx(1.0)

    def test_worst_input(self):
        """Worst value on every component gives 0.0."""
        worst = _global_quality_score(
            sharpness=0.0, noise=1.0, rotation_abs=10.0, contrast=0.0
        )
        assert worst == pytest.approx(0.0)

    def test_medium_input(self):
        """Mid-range components give a score strictly between 0 and 1."""
        middle = _global_quality_score(
            sharpness=0.5, noise=0.5, rotation_abs=0.0, contrast=0.5
        )
        assert 0.0 < middle < 1.0
class TestAggregateImageQuality:
    """Checks on ``aggregate_image_quality``."""

    def test_empty_list(self):
        """Aggregating nothing returns an empty dict."""
        assert aggregate_image_quality([]) == {}

    def test_single_result(self):
        """One result: its score is the mean and the document count is 1."""
        sample = ImageQualityResult(quality_score=0.75, analysis_method="mock")
        summary = aggregate_image_quality([sample])
        assert summary["mean_quality_score"] == pytest.approx(0.75)
        assert summary["document_count"] == 1

    def test_tier_distribution(self):
        """Scores 0.8 / 0.5 / 0.2 fall one each into good / medium / poor."""
        samples = [
            ImageQualityResult(quality_score=score, analysis_method="mock")
            for score in (0.8, 0.5, 0.2)
        ]
        distribution = aggregate_image_quality(samples)["quality_distribution"]
        assert distribution["good"] == 1
        assert distribution["medium"] == 1
        assert distribution["poor"] == 1

    def test_scores_list_present(self):
        """The summary carries a ``scores`` entry with one item per result."""
        samples = [ImageQualityResult(quality_score=0.6, analysis_method="mock")]
        summary = aggregate_image_quality(samples)
        assert "scores" in summary
        assert len(summary["scores"]) == 1

    def test_errors_excluded(self):
        """A result carrying an error is left out of the document count."""
        healthy = ImageQualityResult(quality_score=0.8, analysis_method="mock")
        failed = ImageQualityResult(
            quality_score=0.0,
            analysis_method="none",
            error="file not found",
        )
        summary = aggregate_image_quality([healthy, failed])
        assert summary["document_count"] == 1
# ===========================================================================
# Tests d'intégration Sprint 5 (fixtures + rapport)
# ===========================================================================
class TestFixturesSprint5:
    """The sample benchmark fixture carries the Sprint 5 metrics."""

    @staticmethod
    def _engine_reports():
        """Generate a fresh sample benchmark and return its engine reports."""
        # Imported lazily so the import happens when a test runs.
        from picarones.fixtures import generate_sample_benchmark

        return generate_sample_benchmark().engine_reports

    def test_doc_result_has_confusion_matrix(self):
        """Document results carry a confusion matrix."""
        for report in self._engine_reports():
            for doc in report.document_results:
                assert doc.confusion_matrix is not None, (
                    f"confusion_matrix manquante pour {report.engine_name}/{doc.doc_id}"
                )
                # NOTE(review): the source's indentation was lost; the break is
                # assumed to end the inner loop, i.e. only the first document
                # of each engine is inspected — confirm against the repository.
                break

    def test_doc_result_has_char_scores(self):
        """The first document of each engine has ligature and diacritic scores."""
        for report in self._engine_reports():
            first_doc = report.document_results[0]
            assert first_doc.char_scores is not None
            for key in ("ligature", "diacritic"):
                assert key in first_doc.char_scores

    def test_doc_result_has_taxonomy(self):
        """The first document of each engine has taxonomy counts and a total."""
        for report in self._engine_reports():
            first_doc = report.document_results[0]
            assert first_doc.taxonomy is not None
            for key in ("counts", "total_errors"):
                assert key in first_doc.taxonomy

    def test_doc_result_has_structure(self):
        """The first document of each engine has structure data."""
        for report in self._engine_reports():
            first_doc = report.document_results[0]
            assert first_doc.structure is not None
            assert "gt_line_count" in first_doc.structure

    def test_doc_result_has_image_quality(self):
        """The first document of each engine has image-quality data."""
        for report in self._engine_reports():
            first_doc = report.document_results[0]
            assert first_doc.image_quality is not None
            assert "quality_score" in first_doc.image_quality

    def test_engine_report_has_aggregated_confusion(self):
        """Every engine report has an aggregated confusion matrix."""
        for report in self._engine_reports():
            assert report.aggregated_confusion is not None
            assert "matrix" in report.aggregated_confusion

    def test_engine_report_has_aggregated_char_scores(self):
        """Every engine report has aggregated ligature and diacritic scores."""
        for report in self._engine_reports():
            assert report.aggregated_char_scores is not None
            for key in ("ligature", "diacritic"):
                assert key in report.aggregated_char_scores

    def test_engine_report_ligature_score_property(self):
        """``ligature_score`` is set and within [0, 1] for every engine."""
        for report in self._engine_reports():
            value = report.ligature_score
            assert value is not None
            assert 0.0 <= value <= 1.0

    def test_engine_report_diacritic_score_property(self):
        """``diacritic_score`` is set and within [0, 1] for every engine."""
        for report in self._engine_reports():
            value = report.diacritic_score
            assert value is not None
            assert 0.0 <= value <= 1.0

    def test_engine_report_has_aggregated_taxonomy(self):
        """Every engine report has an aggregated taxonomy with a total."""
        for report in self._engine_reports():
            assert report.aggregated_taxonomy is not None
            assert "total_errors" in report.aggregated_taxonomy

    def test_engine_report_has_aggregated_structure(self):
        """Every engine report has aggregated structure metrics."""
        for report in self._engine_reports():
            assert report.aggregated_structure is not None
            assert "mean_reading_order_score" in report.aggregated_structure

    def test_engine_report_has_aggregated_image_quality(self):
        """Every engine report has aggregated image-quality metrics."""
        for report in self._engine_reports():
            assert report.aggregated_image_quality is not None
            assert "mean_quality_score" in report.aggregated_image_quality

    def test_bad_engine_has_more_errors(self):
        """The legacy engine must have more taxonomy errors than pero_ocr."""
        reports = self._engine_reports()

        def total_errors_of(engine_name):
            # First report with that name; StopIteration if there is none.
            match = next(er for er in reports if er.engine_name == engine_name)
            return match.aggregated_taxonomy["total_errors"]

        pero_total = total_errors_of("pero_ocr")
        legacy_total = total_errors_of("ancien_moteur")
        assert legacy_total > pero_total
class TestReportSprint5:
    """The report data, the HTML report and the JSON export carry Sprint 5 data."""

    @staticmethod
    def _report_data():
        """Build the report data dict from a fresh sample benchmark."""
        # Imported lazily so the imports happen when a test runs.
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data

        return _build_report_data(generate_sample_benchmark(), {})

    @staticmethod
    def _render_html(tmp_path):
        """Generate the HTML report under ``tmp_path`` and return its text."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        benchmark = generate_sample_benchmark()
        out = tmp_path / "report.html"
        ReportGenerator(benchmark).generate(out)
        return out.read_text(encoding="utf-8")

    def test_report_data_has_ligature_score(self):
        """Every engine entry has a ``ligature_score`` key."""
        for eng in self._report_data()["engines"]:
            assert "ligature_score" in eng, f"ligature_score manquant pour {eng['name']}"

    def test_report_data_has_diacritic_score(self):
        """Every engine entry has a ``diacritic_score`` key."""
        for eng in self._report_data()["engines"]:
            assert "diacritic_score" in eng

    def test_report_data_has_aggregated_taxonomy(self):
        """Every engine entry has an ``aggregated_taxonomy`` key."""
        for eng in self._report_data()["engines"]:
            assert "aggregated_taxonomy" in eng

    def test_report_data_has_aggregated_image_quality(self):
        """Every engine entry has an ``aggregated_image_quality`` key."""
        for eng in self._report_data()["engines"]:
            assert "aggregated_image_quality" in eng

    def test_html_has_characters_tab(self, tmp_path):
        """The HTML contains the "Caractères" label."""
        assert "Caractères" in self._render_html(tmp_path)

    def test_html_has_ligatures_column(self, tmp_path):
        """The HTML contains the "Ligatures" label."""
        assert "Ligatures" in self._render_html(tmp_path)

    def test_html_has_diacritiques_column(self, tmp_path):
        """The HTML contains the "Diacritiques" label."""
        assert "Diacritiques" in self._render_html(tmp_path)

    def test_html_has_scatter_plot(self, tmp_path):
        """The HTML contains the ``chart-quality-cer`` identifier."""
        assert "chart-quality-cer" in self._render_html(tmp_path)

    def test_html_has_taxonomy_chart(self, tmp_path):
        """The HTML contains the ``chart-taxonomy`` identifier."""
        assert "chart-taxonomy" in self._render_html(tmp_path)

    def test_html_has_confusion_heatmap(self, tmp_path):
        """The HTML mentions the confusion heatmap by id or by French label."""
        html = self._render_html(tmp_path)
        assert "confusion-heatmap" in html or "matrice de confusion" in html.lower()

    def test_doc_results_have_image_quality_in_report(self):
        """At least one engine result of the first document has image quality."""
        first_doc = self._report_data()["documents"][0]
        has_iq = any("image_quality" in er for er in first_doc["engine_results"])
        assert has_iq, "Aucun document result n'a de données image_quality"

    def test_json_export_contains_sprint5_data(self, tmp_path):
        """The JSON export has Sprint 5 keys at engine and document level."""
        import json

        from picarones.fixtures import generate_sample_benchmark

        benchmark = generate_sample_benchmark()
        out = tmp_path / "results.json"
        benchmark.to_json(out)
        # Decode explicitly as UTF-8: without it read_text() uses the locale
        # encoding, which is platform-dependent. This also matches how the
        # HTML report is read in this class.
        data = json.loads(out.read_text(encoding="utf-8"))

        # Engine-report level.
        engine_report = data["engine_reports"][0]
        for key in ("aggregated_taxonomy", "aggregated_char_scores"):
            assert key in engine_report

        # Document-result level.
        doc_result = engine_report["document_results"][0]
        for key in ("taxonomy", "char_scores", "structure"):
            assert key in doc_result