Spaces:
Running
test(harness): caractérisation totale du cœur stateful run_orchestrator
Browse files
Précondition Phase B (décomposition stateful) demandée : verrouille
le comportement EXACT actuel là où Phase B va toucher. 17 tests,
5 groupes = 5 cas de risque :
1. doc_idx GLOBAL au run sur multi-pipeline (pas par-pipeline)
2. cancel pré-set ET mi-run (identité d'objet event à travers les
couches)
3. interruption réelle → resume (résultats/doc identiques au run
propre) + non-duplication
4. golden snapshot DÉTERMINISTE (manifest+pipeline+view+artifacts
index) sur 3 topologies (linéaire / multi-pipeline / DAG
branchant) + garde anti-flaky + déterminisme CROSS-PROCESS prouvé
(run_id scrubé, listes set-dérivées triées — sinon golden flaky
via randomisation hash chaînes)
5. isolation concurrente (2 execute() en threads : annuler A ne
fuit pas dans B)
⚠️ Le harnais a découvert un DÉFAUT pré-existant : au resume, le
partial store rejoue pipeline_results mais PAS view_results des docs
repris → view_results.jsonl incomplet → métriques agrégées
silencieusement faussées après reprise. Topologie-dépendant
(linéaire/DAG : vues ⊊ pipeline ; multi-pipeline : pas manifesté).
CARACTÉRISÉ tel quel (rôle d'un harnais : figer la réalité, warts
inclus) — fix traité en commit séparé.
https://claude.ai/code/session_01EmLiMPJJuB44QHEFzDWUvF
|
@@ -0,0 +1,528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"artifacts_index": [
|
| 3 |
+
{
|
| 4 |
+
"content_hash": null,
|
| 5 |
+
"document_id": "doc01",
|
| 6 |
+
"id": "doc01:image",
|
| 7 |
+
"pipeline_name": "ocr_then_correct",
|
| 8 |
+
"produced_by_step": null,
|
| 9 |
+
"provenance": null,
|
| 10 |
+
"type": "image",
|
| 11 |
+
"uri": "<PATH>"
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"content_hash": null,
|
| 15 |
+
"document_id": "doc01",
|
| 16 |
+
"id": "doc01:precomputed_corr:raw_text",
|
| 17 |
+
"pipeline_name": "ocr_then_correct",
|
| 18 |
+
"produced_by_step": "ocr",
|
| 19 |
+
"provenance": null,
|
| 20 |
+
"type": "raw_text",
|
| 21 |
+
"uri": "<PATH>"
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"content_hash": null,
|
| 25 |
+
"document_id": "doc01",
|
| 26 |
+
"id": "doc01:precomputed_tess:raw_text",
|
| 27 |
+
"pipeline_name": "ocr_then_correct",
|
| 28 |
+
"produced_by_step": "ocr",
|
| 29 |
+
"provenance": null,
|
| 30 |
+
"type": "raw_text",
|
| 31 |
+
"uri": "<PATH>"
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"content_hash": null,
|
| 35 |
+
"document_id": "doc02",
|
| 36 |
+
"id": "doc02:image",
|
| 37 |
+
"pipeline_name": "ocr_then_correct",
|
| 38 |
+
"produced_by_step": null,
|
| 39 |
+
"provenance": null,
|
| 40 |
+
"type": "image",
|
| 41 |
+
"uri": "<PATH>"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"content_hash": null,
|
| 45 |
+
"document_id": "doc02",
|
| 46 |
+
"id": "doc02:precomputed_corr:raw_text",
|
| 47 |
+
"pipeline_name": "ocr_then_correct",
|
| 48 |
+
"produced_by_step": "ocr",
|
| 49 |
+
"provenance": null,
|
| 50 |
+
"type": "raw_text",
|
| 51 |
+
"uri": "<PATH>"
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"content_hash": null,
|
| 55 |
+
"document_id": "doc02",
|
| 56 |
+
"id": "doc02:precomputed_tess:raw_text",
|
| 57 |
+
"pipeline_name": "ocr_then_correct",
|
| 58 |
+
"produced_by_step": "ocr",
|
| 59 |
+
"provenance": null,
|
| 60 |
+
"type": "raw_text",
|
| 61 |
+
"uri": "<PATH>"
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"content_hash": null,
|
| 65 |
+
"document_id": "doc03",
|
| 66 |
+
"id": "doc03:image",
|
| 67 |
+
"pipeline_name": "ocr_then_correct",
|
| 68 |
+
"produced_by_step": null,
|
| 69 |
+
"provenance": null,
|
| 70 |
+
"type": "image",
|
| 71 |
+
"uri": "<PATH>"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"content_hash": null,
|
| 75 |
+
"document_id": "doc03",
|
| 76 |
+
"id": "doc03:precomputed_corr:raw_text",
|
| 77 |
+
"pipeline_name": "ocr_then_correct",
|
| 78 |
+
"produced_by_step": "ocr",
|
| 79 |
+
"provenance": null,
|
| 80 |
+
"type": "raw_text",
|
| 81 |
+
"uri": "<PATH>"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"content_hash": null,
|
| 85 |
+
"document_id": "doc03",
|
| 86 |
+
"id": "doc03:precomputed_tess:raw_text",
|
| 87 |
+
"pipeline_name": "ocr_then_correct",
|
| 88 |
+
"produced_by_step": "ocr",
|
| 89 |
+
"provenance": null,
|
| 90 |
+
"type": "raw_text",
|
| 91 |
+
"uri": "<PATH>"
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"manifest": {
|
| 95 |
+
"adapter_kwargs": {
|
| 96 |
+
"corrector": {
|
| 97 |
+
"source_label": "corr"
|
| 98 |
+
},
|
| 99 |
+
"ocr": {
|
| 100 |
+
"source_label": "tess"
|
| 101 |
+
}
|
| 102 |
+
},
|
| 103 |
+
"code_version": "charac-1.0",
|
| 104 |
+
"corpus_name": "charac",
|
| 105 |
+
"dependencies_lock": {
|
| 106 |
+
"CacheControl": "0.14.4",
|
| 107 |
+
"Jinja2": "3.1.6",
|
| 108 |
+
"MarkupSafe": "3.0.3",
|
| 109 |
+
"PyGObject": "3.48.2",
|
| 110 |
+
"PyJWT": "2.7.0",
|
| 111 |
+
"PyYAML": "6.0.1",
|
| 112 |
+
"Pygments": "2.20.0",
|
| 113 |
+
"RapidFuzz": "3.14.5",
|
| 114 |
+
"annotated-doc": "0.0.4",
|
| 115 |
+
"annotated-types": "0.7.0",
|
| 116 |
+
"anyio": "4.13.0",
|
| 117 |
+
"argcomplete": "3.1.4",
|
| 118 |
+
"ast_serialize": "0.5.0",
|
| 119 |
+
"bandit": "1.9.4",
|
| 120 |
+
"blinker": "1.7.0",
|
| 121 |
+
"boolean.py": "5.0",
|
| 122 |
+
"certifi": "2026.2.25",
|
| 123 |
+
"charset-normalizer": "3.4.6",
|
| 124 |
+
"click": "8.4.0",
|
| 125 |
+
"colorama": "0.4.6",
|
| 126 |
+
"conan": "2.27.0",
|
| 127 |
+
"coverage": "7.14.0",
|
| 128 |
+
"cryptography": "41.0.7",
|
| 129 |
+
"cyclonedx-python-lib": "11.7.0",
|
| 130 |
+
"dbus-python": "1.3.2",
|
| 131 |
+
"defusedxml": "0.7.1",
|
| 132 |
+
"distro": "1.9.0",
|
| 133 |
+
"fastapi": "0.136.1",
|
| 134 |
+
"fasteners": "0.20",
|
| 135 |
+
"filelock": "3.29.0",
|
| 136 |
+
"h11": "0.16.0",
|
| 137 |
+
"httpcore": "1.0.9",
|
| 138 |
+
"httplib2": "0.20.4",
|
| 139 |
+
"httptools": "0.7.1",
|
| 140 |
+
"httpx": "0.28.1",
|
| 141 |
+
"idna": "3.11",
|
| 142 |
+
"iniconfig": "2.3.0",
|
| 143 |
+
"jiwer": "4.0.0",
|
| 144 |
+
"launchpadlib": "1.11.0",
|
| 145 |
+
"lazr.restfulclient": "0.14.6",
|
| 146 |
+
"lazr.uri": "1.0.6",
|
| 147 |
+
"librt": "0.11.0",
|
| 148 |
+
"license-expression": "30.4.4",
|
| 149 |
+
"markdown-it-py": "4.2.0",
|
| 150 |
+
"mdurl": "0.1.2",
|
| 151 |
+
"msgpack": "1.1.2",
|
| 152 |
+
"mypy": "2.1.0",
|
| 153 |
+
"mypy_extensions": "1.1.0",
|
| 154 |
+
"numpy": "2.4.6",
|
| 155 |
+
"oauthlib": "3.2.2",
|
| 156 |
+
"packageurl-python": "0.17.6",
|
| 157 |
+
"packaging": "24.0",
|
| 158 |
+
"patch-ng": "1.18.1",
|
| 159 |
+
"pathspec": "1.1.1",
|
| 160 |
+
"picarones": "1.1.0.dev311",
|
| 161 |
+
"pillow": "12.2.0",
|
| 162 |
+
"pip": "24.0",
|
| 163 |
+
"pip-api": "0.0.34",
|
| 164 |
+
"pip-requirements-parser": "32.0.1",
|
| 165 |
+
"pip_audit": "2.10.0",
|
| 166 |
+
"platformdirs": "4.9.6",
|
| 167 |
+
"pluggy": "1.6.0",
|
| 168 |
+
"py-serializable": "2.1.0",
|
| 169 |
+
"pydantic": "2.13.4",
|
| 170 |
+
"pydantic_core": "2.46.4",
|
| 171 |
+
"pyparsing": "3.1.1",
|
| 172 |
+
"pytesseract": "0.3.13",
|
| 173 |
+
"pytest": "9.0.3",
|
| 174 |
+
"pytest-cov": "7.1.0",
|
| 175 |
+
"pytest-timeout": "2.4.0",
|
| 176 |
+
"python-apt": "2.7.7+ubuntu5.2",
|
| 177 |
+
"python-dateutil": "2.9.0.post0",
|
| 178 |
+
"python-dotenv": "1.2.2",
|
| 179 |
+
"python-multipart": "0.0.29",
|
| 180 |
+
"requests": "2.33.1",
|
| 181 |
+
"rich": "15.0.0",
|
| 182 |
+
"setuptools": "68.1.2",
|
| 183 |
+
"six": "1.16.0",
|
| 184 |
+
"sortedcontainers": "2.4.0",
|
| 185 |
+
"starlette": "1.0.0",
|
| 186 |
+
"stevedore": "5.8.0",
|
| 187 |
+
"toml": "0.10.2",
|
| 188 |
+
"tomli": "2.4.1",
|
| 189 |
+
"tomli_w": "1.2.0",
|
| 190 |
+
"tqdm": "4.67.3",
|
| 191 |
+
"typing-inspection": "0.4.2",
|
| 192 |
+
"typing_extensions": "4.15.0",
|
| 193 |
+
"urllib3": "2.6.3",
|
| 194 |
+
"uvicorn": "0.47.0",
|
| 195 |
+
"uvloop": "0.22.1",
|
| 196 |
+
"wadllib": "1.3.6",
|
| 197 |
+
"watchfiles": "1.2.0",
|
| 198 |
+
"websockets": "16.0",
|
| 199 |
+
"wheel": "0.42.0",
|
| 200 |
+
"xmltodict": "0.13.0",
|
| 201 |
+
"yq": "3.1.0"
|
| 202 |
+
},
|
| 203 |
+
"metadata": {
|
| 204 |
+
"orchestrator": "picarones.app.services.run_orchestrator"
|
| 205 |
+
},
|
| 206 |
+
"n_documents": 3,
|
| 207 |
+
"pipeline_names": [
|
| 208 |
+
"ocr_then_correct"
|
| 209 |
+
],
|
| 210 |
+
"pipeline_specs": [
|
| 211 |
+
{
|
| 212 |
+
"description": "",
|
| 213 |
+
"initial_inputs": [
|
| 214 |
+
"image"
|
| 215 |
+
],
|
| 216 |
+
"name": "ocr_then_correct",
|
| 217 |
+
"steps": [
|
| 218 |
+
{
|
| 219 |
+
"adapter_name": "ocr",
|
| 220 |
+
"id": "ocr",
|
| 221 |
+
"input_types": [
|
| 222 |
+
"image"
|
| 223 |
+
],
|
| 224 |
+
"inputs_from": {},
|
| 225 |
+
"kind": "step",
|
| 226 |
+
"output_types": [
|
| 227 |
+
"raw_text"
|
| 228 |
+
],
|
| 229 |
+
"params": {}
|
| 230 |
+
},
|
| 231 |
+
{
|
| 232 |
+
"adapter_name": "corrector",
|
| 233 |
+
"id": "corrector",
|
| 234 |
+
"input_types": [
|
| 235 |
+
"image",
|
| 236 |
+
"raw_text"
|
| 237 |
+
],
|
| 238 |
+
"inputs_from": {
|
| 239 |
+
"raw_text": "ocr"
|
| 240 |
+
},
|
| 241 |
+
"kind": "step",
|
| 242 |
+
"output_types": [
|
| 243 |
+
"corrected_text"
|
| 244 |
+
],
|
| 245 |
+
"params": {}
|
| 246 |
+
}
|
| 247 |
+
]
|
| 248 |
+
}
|
| 249 |
+
],
|
| 250 |
+
"system_binaries_lock": {},
|
| 251 |
+
"view_specs": [
|
| 252 |
+
{
|
| 253 |
+
"candidate_types": [
|
| 254 |
+
"alto_xml",
|
| 255 |
+
"canonical_document",
|
| 256 |
+
"corrected_text",
|
| 257 |
+
"page_xml",
|
| 258 |
+
"raw_text"
|
| 259 |
+
],
|
| 260 |
+
"char_exclude": null,
|
| 261 |
+
"description": "Compare les sorties textuelles finales après projection éventuelle (ALTO/PAGE/markdown → texte plat).",
|
| 262 |
+
"ignored_dimensions": [
|
| 263 |
+
"block_structure",
|
| 264 |
+
"confidence",
|
| 265 |
+
"formatting",
|
| 266 |
+
"geometry",
|
| 267 |
+
"ids",
|
| 268 |
+
"reading_order"
|
| 269 |
+
],
|
| 270 |
+
"metric_names": [
|
| 271 |
+
"cer",
|
| 272 |
+
"mer",
|
| 273 |
+
"wer",
|
| 274 |
+
"wil"
|
| 275 |
+
],
|
| 276 |
+
"name": "text_final",
|
| 277 |
+
"normalization_profile": null,
|
| 278 |
+
"projection": null,
|
| 279 |
+
"projections_by_source_type": {
|
| 280 |
+
"alto_xml": {
|
| 281 |
+
"params": {},
|
| 282 |
+
"projector_name": "alto_to_text",
|
| 283 |
+
"source_type": "alto_xml",
|
| 284 |
+
"target_type": "raw_text"
|
| 285 |
+
},
|
| 286 |
+
"canonical_document": {
|
| 287 |
+
"params": {},
|
| 288 |
+
"projector_name": "canonical_to_text",
|
| 289 |
+
"source_type": "canonical_document",
|
| 290 |
+
"target_type": "raw_text"
|
| 291 |
+
},
|
| 292 |
+
"page_xml": {
|
| 293 |
+
"params": {},
|
| 294 |
+
"projector_name": "page_to_text",
|
| 295 |
+
"source_type": "page_xml",
|
| 296 |
+
"target_type": "raw_text"
|
| 297 |
+
}
|
| 298 |
+
},
|
| 299 |
+
"warnings": [
|
| 300 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 301 |
+
]
|
| 302 |
+
}
|
| 303 |
+
]
|
| 304 |
+
},
|
| 305 |
+
"pipeline_results": [
|
| 306 |
+
{
|
| 307 |
+
"document_id": "doc01",
|
| 308 |
+
"pipeline_name": "ocr_then_correct",
|
| 309 |
+
"step_results": [
|
| 310 |
+
{
|
| 311 |
+
"error": null,
|
| 312 |
+
"produced_artifacts": {
|
| 313 |
+
"raw_text": "doc01:precomputed_tess:raw_text"
|
| 314 |
+
},
|
| 315 |
+
"step_id": "ocr",
|
| 316 |
+
"succeeded": true
|
| 317 |
+
},
|
| 318 |
+
{
|
| 319 |
+
"error": "missing_output: ['corrected_text']",
|
| 320 |
+
"produced_artifacts": {},
|
| 321 |
+
"step_id": "corrector",
|
| 322 |
+
"succeeded": false
|
| 323 |
+
}
|
| 324 |
+
],
|
| 325 |
+
"succeeded": false
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"document_id": "doc02",
|
| 329 |
+
"pipeline_name": "ocr_then_correct",
|
| 330 |
+
"step_results": [
|
| 331 |
+
{
|
| 332 |
+
"error": null,
|
| 333 |
+
"produced_artifacts": {
|
| 334 |
+
"raw_text": "doc02:precomputed_tess:raw_text"
|
| 335 |
+
},
|
| 336 |
+
"step_id": "ocr",
|
| 337 |
+
"succeeded": true
|
| 338 |
+
},
|
| 339 |
+
{
|
| 340 |
+
"error": "missing_output: ['corrected_text']",
|
| 341 |
+
"produced_artifacts": {},
|
| 342 |
+
"step_id": "corrector",
|
| 343 |
+
"succeeded": false
|
| 344 |
+
}
|
| 345 |
+
],
|
| 346 |
+
"succeeded": false
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"document_id": "doc03",
|
| 350 |
+
"pipeline_name": "ocr_then_correct",
|
| 351 |
+
"step_results": [
|
| 352 |
+
{
|
| 353 |
+
"error": null,
|
| 354 |
+
"produced_artifacts": {
|
| 355 |
+
"raw_text": "doc03:precomputed_tess:raw_text"
|
| 356 |
+
},
|
| 357 |
+
"step_id": "ocr",
|
| 358 |
+
"succeeded": true
|
| 359 |
+
},
|
| 360 |
+
{
|
| 361 |
+
"error": "missing_output: ['corrected_text']",
|
| 362 |
+
"produced_artifacts": {},
|
| 363 |
+
"step_id": "corrector",
|
| 364 |
+
"succeeded": false
|
| 365 |
+
}
|
| 366 |
+
],
|
| 367 |
+
"succeeded": false
|
| 368 |
+
}
|
| 369 |
+
],
|
| 370 |
+
"view_results": [
|
| 371 |
+
{
|
| 372 |
+
"candidate_artifact_id": "doc01:precomputed_tess:raw_text",
|
| 373 |
+
"document_id": "doc01",
|
| 374 |
+
"failed_metrics": {},
|
| 375 |
+
"ground_truth_artifact_id": "doc01:gt:raw_text",
|
| 376 |
+
"ignored_dimensions": [
|
| 377 |
+
"block_structure",
|
| 378 |
+
"confidence",
|
| 379 |
+
"formatting",
|
| 380 |
+
"geometry",
|
| 381 |
+
"ids",
|
| 382 |
+
"reading_order"
|
| 383 |
+
],
|
| 384 |
+
"metric_values": {
|
| 385 |
+
"cer": 0.05,
|
| 386 |
+
"mer": 0.25,
|
| 387 |
+
"wer": 0.25,
|
| 388 |
+
"wil": 0.4375
|
| 389 |
+
},
|
| 390 |
+
"pipeline_name": "ocr_then_correct",
|
| 391 |
+
"projection_report": null,
|
| 392 |
+
"view_name": "text_final",
|
| 393 |
+
"warnings": [
|
| 394 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 395 |
+
]
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"candidate_artifact_id": "doc01:precomputed_corr:raw_text",
|
| 399 |
+
"document_id": "doc01",
|
| 400 |
+
"failed_metrics": {},
|
| 401 |
+
"ground_truth_artifact_id": "doc01:gt:raw_text",
|
| 402 |
+
"ignored_dimensions": [
|
| 403 |
+
"block_structure",
|
| 404 |
+
"confidence",
|
| 405 |
+
"formatting",
|
| 406 |
+
"geometry",
|
| 407 |
+
"ids",
|
| 408 |
+
"reading_order"
|
| 409 |
+
],
|
| 410 |
+
"metric_values": {
|
| 411 |
+
"cer": 0.05,
|
| 412 |
+
"mer": 0.25,
|
| 413 |
+
"wer": 0.25,
|
| 414 |
+
"wil": 0.4375
|
| 415 |
+
},
|
| 416 |
+
"pipeline_name": "ocr_then_correct",
|
| 417 |
+
"projection_report": null,
|
| 418 |
+
"view_name": "text_final",
|
| 419 |
+
"warnings": [
|
| 420 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 421 |
+
]
|
| 422 |
+
},
|
| 423 |
+
{
|
| 424 |
+
"candidate_artifact_id": "doc02:precomputed_tess:raw_text",
|
| 425 |
+
"document_id": "doc02",
|
| 426 |
+
"failed_metrics": {},
|
| 427 |
+
"ground_truth_artifact_id": "doc02:gt:raw_text",
|
| 428 |
+
"ignored_dimensions": [
|
| 429 |
+
"block_structure",
|
| 430 |
+
"confidence",
|
| 431 |
+
"formatting",
|
| 432 |
+
"geometry",
|
| 433 |
+
"ids",
|
| 434 |
+
"reading_order"
|
| 435 |
+
],
|
| 436 |
+
"metric_values": {
|
| 437 |
+
"cer": 0.05,
|
| 438 |
+
"mer": 0.25,
|
| 439 |
+
"wer": 0.25,
|
| 440 |
+
"wil": 0.4375
|
| 441 |
+
},
|
| 442 |
+
"pipeline_name": "ocr_then_correct",
|
| 443 |
+
"projection_report": null,
|
| 444 |
+
"view_name": "text_final",
|
| 445 |
+
"warnings": [
|
| 446 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 447 |
+
]
|
| 448 |
+
},
|
| 449 |
+
{
|
| 450 |
+
"candidate_artifact_id": "doc02:precomputed_corr:raw_text",
|
| 451 |
+
"document_id": "doc02",
|
| 452 |
+
"failed_metrics": {},
|
| 453 |
+
"ground_truth_artifact_id": "doc02:gt:raw_text",
|
| 454 |
+
"ignored_dimensions": [
|
| 455 |
+
"block_structure",
|
| 456 |
+
"confidence",
|
| 457 |
+
"formatting",
|
| 458 |
+
"geometry",
|
| 459 |
+
"ids",
|
| 460 |
+
"reading_order"
|
| 461 |
+
],
|
| 462 |
+
"metric_values": {
|
| 463 |
+
"cer": 0.05,
|
| 464 |
+
"mer": 0.25,
|
| 465 |
+
"wer": 0.25,
|
| 466 |
+
"wil": 0.4375
|
| 467 |
+
},
|
| 468 |
+
"pipeline_name": "ocr_then_correct",
|
| 469 |
+
"projection_report": null,
|
| 470 |
+
"view_name": "text_final",
|
| 471 |
+
"warnings": [
|
| 472 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 473 |
+
]
|
| 474 |
+
},
|
| 475 |
+
{
|
| 476 |
+
"candidate_artifact_id": "doc03:precomputed_tess:raw_text",
|
| 477 |
+
"document_id": "doc03",
|
| 478 |
+
"failed_metrics": {},
|
| 479 |
+
"ground_truth_artifact_id": "doc03:gt:raw_text",
|
| 480 |
+
"ignored_dimensions": [
|
| 481 |
+
"block_structure",
|
| 482 |
+
"confidence",
|
| 483 |
+
"formatting",
|
| 484 |
+
"geometry",
|
| 485 |
+
"ids",
|
| 486 |
+
"reading_order"
|
| 487 |
+
],
|
| 488 |
+
"metric_values": {
|
| 489 |
+
"cer": 0.05,
|
| 490 |
+
"mer": 0.25,
|
| 491 |
+
"wer": 0.25,
|
| 492 |
+
"wil": 0.4375
|
| 493 |
+
},
|
| 494 |
+
"pipeline_name": "ocr_then_correct",
|
| 495 |
+
"projection_report": null,
|
| 496 |
+
"view_name": "text_final",
|
| 497 |
+
"warnings": [
|
| 498 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 499 |
+
]
|
| 500 |
+
},
|
| 501 |
+
{
|
| 502 |
+
"candidate_artifact_id": "doc03:precomputed_corr:raw_text",
|
| 503 |
+
"document_id": "doc03",
|
| 504 |
+
"failed_metrics": {},
|
| 505 |
+
"ground_truth_artifact_id": "doc03:gt:raw_text",
|
| 506 |
+
"ignored_dimensions": [
|
| 507 |
+
"block_structure",
|
| 508 |
+
"confidence",
|
| 509 |
+
"formatting",
|
| 510 |
+
"geometry",
|
| 511 |
+
"ids",
|
| 512 |
+
"reading_order"
|
| 513 |
+
],
|
| 514 |
+
"metric_values": {
|
| 515 |
+
"cer": 0.05,
|
| 516 |
+
"mer": 0.25,
|
| 517 |
+
"wer": 0.25,
|
| 518 |
+
"wil": 0.4375
|
| 519 |
+
},
|
| 520 |
+
"pipeline_name": "ocr_then_correct",
|
| 521 |
+
"projection_report": null,
|
| 522 |
+
"view_name": "text_final",
|
| 523 |
+
"warnings": [
|
| 524 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 525 |
+
]
|
| 526 |
+
}
|
| 527 |
+
]
|
| 528 |
+
}
|
|
@@ -0,0 +1,592 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"artifacts_index": [
|
| 3 |
+
{
|
| 4 |
+
"content_hash": null,
|
| 5 |
+
"document_id": "doc01",
|
| 6 |
+
"id": "doc01:image",
|
| 7 |
+
"pipeline_name": "pero_only",
|
| 8 |
+
"produced_by_step": null,
|
| 9 |
+
"provenance": null,
|
| 10 |
+
"type": "image",
|
| 11 |
+
"uri": "<PATH>"
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"content_hash": null,
|
| 15 |
+
"document_id": "doc01",
|
| 16 |
+
"id": "doc01:precomputed_pero:raw_text",
|
| 17 |
+
"pipeline_name": "pero_only",
|
| 18 |
+
"produced_by_step": "ocr",
|
| 19 |
+
"provenance": null,
|
| 20 |
+
"type": "raw_text",
|
| 21 |
+
"uri": "<PATH>"
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"content_hash": null,
|
| 25 |
+
"document_id": "doc01",
|
| 26 |
+
"id": "doc01:image",
|
| 27 |
+
"pipeline_name": "tess_only",
|
| 28 |
+
"produced_by_step": null,
|
| 29 |
+
"provenance": null,
|
| 30 |
+
"type": "image",
|
| 31 |
+
"uri": "<PATH>"
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"content_hash": null,
|
| 35 |
+
"document_id": "doc01",
|
| 36 |
+
"id": "doc01:precomputed_tess:raw_text",
|
| 37 |
+
"pipeline_name": "tess_only",
|
| 38 |
+
"produced_by_step": "ocr",
|
| 39 |
+
"provenance": null,
|
| 40 |
+
"type": "raw_text",
|
| 41 |
+
"uri": "<PATH>"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"content_hash": null,
|
| 45 |
+
"document_id": "doc02",
|
| 46 |
+
"id": "doc02:image",
|
| 47 |
+
"pipeline_name": "pero_only",
|
| 48 |
+
"produced_by_step": null,
|
| 49 |
+
"provenance": null,
|
| 50 |
+
"type": "image",
|
| 51 |
+
"uri": "<PATH>"
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"content_hash": null,
|
| 55 |
+
"document_id": "doc02",
|
| 56 |
+
"id": "doc02:precomputed_pero:raw_text",
|
| 57 |
+
"pipeline_name": "pero_only",
|
| 58 |
+
"produced_by_step": "ocr",
|
| 59 |
+
"provenance": null,
|
| 60 |
+
"type": "raw_text",
|
| 61 |
+
"uri": "<PATH>"
|
| 62 |
+
},
|
| 63 |
+
{
|
| 64 |
+
"content_hash": null,
|
| 65 |
+
"document_id": "doc02",
|
| 66 |
+
"id": "doc02:image",
|
| 67 |
+
"pipeline_name": "tess_only",
|
| 68 |
+
"produced_by_step": null,
|
| 69 |
+
"provenance": null,
|
| 70 |
+
"type": "image",
|
| 71 |
+
"uri": "<PATH>"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"content_hash": null,
|
| 75 |
+
"document_id": "doc02",
|
| 76 |
+
"id": "doc02:precomputed_tess:raw_text",
|
| 77 |
+
"pipeline_name": "tess_only",
|
| 78 |
+
"produced_by_step": "ocr",
|
| 79 |
+
"provenance": null,
|
| 80 |
+
"type": "raw_text",
|
| 81 |
+
"uri": "<PATH>"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"content_hash": null,
|
| 85 |
+
"document_id": "doc03",
|
| 86 |
+
"id": "doc03:image",
|
| 87 |
+
"pipeline_name": "pero_only",
|
| 88 |
+
"produced_by_step": null,
|
| 89 |
+
"provenance": null,
|
| 90 |
+
"type": "image",
|
| 91 |
+
"uri": "<PATH>"
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"content_hash": null,
|
| 95 |
+
"document_id": "doc03",
|
| 96 |
+
"id": "doc03:precomputed_pero:raw_text",
|
| 97 |
+
"pipeline_name": "pero_only",
|
| 98 |
+
"produced_by_step": "ocr",
|
| 99 |
+
"provenance": null,
|
| 100 |
+
"type": "raw_text",
|
| 101 |
+
"uri": "<PATH>"
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"content_hash": null,
|
| 105 |
+
"document_id": "doc03",
|
| 106 |
+
"id": "doc03:image",
|
| 107 |
+
"pipeline_name": "tess_only",
|
| 108 |
+
"produced_by_step": null,
|
| 109 |
+
"provenance": null,
|
| 110 |
+
"type": "image",
|
| 111 |
+
"uri": "<PATH>"
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"content_hash": null,
|
| 115 |
+
"document_id": "doc03",
|
| 116 |
+
"id": "doc03:precomputed_tess:raw_text",
|
| 117 |
+
"pipeline_name": "tess_only",
|
| 118 |
+
"produced_by_step": "ocr",
|
| 119 |
+
"provenance": null,
|
| 120 |
+
"type": "raw_text",
|
| 121 |
+
"uri": "<PATH>"
|
| 122 |
+
}
|
| 123 |
+
],
|
| 124 |
+
"manifest": {
|
| 125 |
+
"adapter_kwargs": {
|
| 126 |
+
"ocr": {
|
| 127 |
+
"source_label": "tess"
|
| 128 |
+
},
|
| 129 |
+
"pero_only__ocr": {
|
| 130 |
+
"source_label": "pero"
|
| 131 |
+
}
|
| 132 |
+
},
|
| 133 |
+
"code_version": "charac-1.0",
|
| 134 |
+
"corpus_name": "charac",
|
| 135 |
+
"dependencies_lock": {
|
| 136 |
+
"CacheControl": "0.14.4",
|
| 137 |
+
"Jinja2": "3.1.6",
|
| 138 |
+
"MarkupSafe": "3.0.3",
|
| 139 |
+
"PyGObject": "3.48.2",
|
| 140 |
+
"PyJWT": "2.7.0",
|
| 141 |
+
"PyYAML": "6.0.1",
|
| 142 |
+
"Pygments": "2.20.0",
|
| 143 |
+
"RapidFuzz": "3.14.5",
|
| 144 |
+
"annotated-doc": "0.0.4",
|
| 145 |
+
"annotated-types": "0.7.0",
|
| 146 |
+
"anyio": "4.13.0",
|
| 147 |
+
"argcomplete": "3.1.4",
|
| 148 |
+
"ast_serialize": "0.5.0",
|
| 149 |
+
"bandit": "1.9.4",
|
| 150 |
+
"blinker": "1.7.0",
|
| 151 |
+
"boolean.py": "5.0",
|
| 152 |
+
"certifi": "2026.2.25",
|
| 153 |
+
"charset-normalizer": "3.4.6",
|
| 154 |
+
"click": "8.4.0",
|
| 155 |
+
"colorama": "0.4.6",
|
| 156 |
+
"conan": "2.27.0",
|
| 157 |
+
"coverage": "7.14.0",
|
| 158 |
+
"cryptography": "41.0.7",
|
| 159 |
+
"cyclonedx-python-lib": "11.7.0",
|
| 160 |
+
"dbus-python": "1.3.2",
|
| 161 |
+
"defusedxml": "0.7.1",
|
| 162 |
+
"distro": "1.9.0",
|
| 163 |
+
"fastapi": "0.136.1",
|
| 164 |
+
"fasteners": "0.20",
|
| 165 |
+
"filelock": "3.29.0",
|
| 166 |
+
"h11": "0.16.0",
|
| 167 |
+
"httpcore": "1.0.9",
|
| 168 |
+
"httplib2": "0.20.4",
|
| 169 |
+
"httptools": "0.7.1",
|
| 170 |
+
"httpx": "0.28.1",
|
| 171 |
+
"idna": "3.11",
|
| 172 |
+
"iniconfig": "2.3.0",
|
| 173 |
+
"jiwer": "4.0.0",
|
| 174 |
+
"launchpadlib": "1.11.0",
|
| 175 |
+
"lazr.restfulclient": "0.14.6",
|
| 176 |
+
"lazr.uri": "1.0.6",
|
| 177 |
+
"librt": "0.11.0",
|
| 178 |
+
"license-expression": "30.4.4",
|
| 179 |
+
"markdown-it-py": "4.2.0",
|
| 180 |
+
"mdurl": "0.1.2",
|
| 181 |
+
"msgpack": "1.1.2",
|
| 182 |
+
"mypy": "2.1.0",
|
| 183 |
+
"mypy_extensions": "1.1.0",
|
| 184 |
+
"numpy": "2.4.6",
|
| 185 |
+
"oauthlib": "3.2.2",
|
| 186 |
+
"packageurl-python": "0.17.6",
|
| 187 |
+
"packaging": "24.0",
|
| 188 |
+
"patch-ng": "1.18.1",
|
| 189 |
+
"pathspec": "1.1.1",
|
| 190 |
+
"picarones": "1.1.0.dev311",
|
| 191 |
+
"pillow": "12.2.0",
|
| 192 |
+
"pip": "24.0",
|
| 193 |
+
"pip-api": "0.0.34",
|
| 194 |
+
"pip-requirements-parser": "32.0.1",
|
| 195 |
+
"pip_audit": "2.10.0",
|
| 196 |
+
"platformdirs": "4.9.6",
|
| 197 |
+
"pluggy": "1.6.0",
|
| 198 |
+
"py-serializable": "2.1.0",
|
| 199 |
+
"pydantic": "2.13.4",
|
| 200 |
+
"pydantic_core": "2.46.4",
|
| 201 |
+
"pyparsing": "3.1.1",
|
| 202 |
+
"pytesseract": "0.3.13",
|
| 203 |
+
"pytest": "9.0.3",
|
| 204 |
+
"pytest-cov": "7.1.0",
|
| 205 |
+
"pytest-timeout": "2.4.0",
|
| 206 |
+
"python-apt": "2.7.7+ubuntu5.2",
|
| 207 |
+
"python-dateutil": "2.9.0.post0",
|
| 208 |
+
"python-dotenv": "1.2.2",
|
| 209 |
+
"python-multipart": "0.0.29",
|
| 210 |
+
"requests": "2.33.1",
|
| 211 |
+
"rich": "15.0.0",
|
| 212 |
+
"setuptools": "68.1.2",
|
| 213 |
+
"six": "1.16.0",
|
| 214 |
+
"sortedcontainers": "2.4.0",
|
| 215 |
+
"starlette": "1.0.0",
|
| 216 |
+
"stevedore": "5.8.0",
|
| 217 |
+
"toml": "0.10.2",
|
| 218 |
+
"tomli": "2.4.1",
|
| 219 |
+
"tomli_w": "1.2.0",
|
| 220 |
+
"tqdm": "4.67.3",
|
| 221 |
+
"typing-inspection": "0.4.2",
|
| 222 |
+
"typing_extensions": "4.15.0",
|
| 223 |
+
"urllib3": "2.6.3",
|
| 224 |
+
"uvicorn": "0.47.0",
|
| 225 |
+
"uvloop": "0.22.1",
|
| 226 |
+
"wadllib": "1.3.6",
|
| 227 |
+
"watchfiles": "1.2.0",
|
| 228 |
+
"websockets": "16.0",
|
| 229 |
+
"wheel": "0.42.0",
|
| 230 |
+
"xmltodict": "0.13.0",
|
| 231 |
+
"yq": "3.1.0"
|
| 232 |
+
},
|
| 233 |
+
"metadata": {
|
| 234 |
+
"orchestrator": "picarones.app.services.run_orchestrator"
|
| 235 |
+
},
|
| 236 |
+
"n_documents": 3,
|
| 237 |
+
"pipeline_names": [
|
| 238 |
+
"pero_only",
|
| 239 |
+
"tess_only"
|
| 240 |
+
],
|
| 241 |
+
"pipeline_specs": [
|
| 242 |
+
{
|
| 243 |
+
"description": "",
|
| 244 |
+
"initial_inputs": [
|
| 245 |
+
"image"
|
| 246 |
+
],
|
| 247 |
+
"name": "tess_only",
|
| 248 |
+
"steps": [
|
| 249 |
+
{
|
| 250 |
+
"adapter_name": "ocr",
|
| 251 |
+
"id": "ocr",
|
| 252 |
+
"input_types": [
|
| 253 |
+
"image"
|
| 254 |
+
],
|
| 255 |
+
"inputs_from": {},
|
| 256 |
+
"kind": "step",
|
| 257 |
+
"output_types": [
|
| 258 |
+
"raw_text"
|
| 259 |
+
],
|
| 260 |
+
"params": {}
|
| 261 |
+
}
|
| 262 |
+
]
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"description": "",
|
| 266 |
+
"initial_inputs": [
|
| 267 |
+
"image"
|
| 268 |
+
],
|
| 269 |
+
"name": "pero_only",
|
| 270 |
+
"steps": [
|
| 271 |
+
{
|
| 272 |
+
"adapter_name": "pero_only__ocr",
|
| 273 |
+
"id": "ocr",
|
| 274 |
+
"input_types": [
|
| 275 |
+
"image"
|
| 276 |
+
],
|
| 277 |
+
"inputs_from": {},
|
| 278 |
+
"kind": "step",
|
| 279 |
+
"output_types": [
|
| 280 |
+
"raw_text"
|
| 281 |
+
],
|
| 282 |
+
"params": {}
|
| 283 |
+
}
|
| 284 |
+
]
|
| 285 |
+
}
|
| 286 |
+
],
|
| 287 |
+
"system_binaries_lock": {},
|
| 288 |
+
"view_specs": [
|
| 289 |
+
{
|
| 290 |
+
"candidate_types": [
|
| 291 |
+
"alto_xml",
|
| 292 |
+
"canonical_document",
|
| 293 |
+
"corrected_text",
|
| 294 |
+
"page_xml",
|
| 295 |
+
"raw_text"
|
| 296 |
+
],
|
| 297 |
+
"char_exclude": null,
|
| 298 |
+
"description": "Compare les sorties textuelles finales après projection éventuelle (ALTO/PAGE/markdown → texte plat).",
|
| 299 |
+
"ignored_dimensions": [
|
| 300 |
+
"block_structure",
|
| 301 |
+
"confidence",
|
| 302 |
+
"formatting",
|
| 303 |
+
"geometry",
|
| 304 |
+
"ids",
|
| 305 |
+
"reading_order"
|
| 306 |
+
],
|
| 307 |
+
"metric_names": [
|
| 308 |
+
"cer",
|
| 309 |
+
"mer",
|
| 310 |
+
"wer",
|
| 311 |
+
"wil"
|
| 312 |
+
],
|
| 313 |
+
"name": "text_final",
|
| 314 |
+
"normalization_profile": null,
|
| 315 |
+
"projection": null,
|
| 316 |
+
"projections_by_source_type": {
|
| 317 |
+
"alto_xml": {
|
| 318 |
+
"params": {},
|
| 319 |
+
"projector_name": "alto_to_text",
|
| 320 |
+
"source_type": "alto_xml",
|
| 321 |
+
"target_type": "raw_text"
|
| 322 |
+
},
|
| 323 |
+
"canonical_document": {
|
| 324 |
+
"params": {},
|
| 325 |
+
"projector_name": "canonical_to_text",
|
| 326 |
+
"source_type": "canonical_document",
|
| 327 |
+
"target_type": "raw_text"
|
| 328 |
+
},
|
| 329 |
+
"page_xml": {
|
| 330 |
+
"params": {},
|
| 331 |
+
"projector_name": "page_to_text",
|
| 332 |
+
"source_type": "page_xml",
|
| 333 |
+
"target_type": "raw_text"
|
| 334 |
+
}
|
| 335 |
+
},
|
| 336 |
+
"warnings": [
|
| 337 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 338 |
+
]
|
| 339 |
+
}
|
| 340 |
+
]
|
| 341 |
+
},
|
| 342 |
+
"pipeline_results": [
|
| 343 |
+
{
|
| 344 |
+
"document_id": "doc01",
|
| 345 |
+
"pipeline_name": "pero_only",
|
| 346 |
+
"step_results": [
|
| 347 |
+
{
|
| 348 |
+
"error": null,
|
| 349 |
+
"produced_artifacts": {
|
| 350 |
+
"raw_text": "doc01:precomputed_pero:raw_text"
|
| 351 |
+
},
|
| 352 |
+
"step_id": "ocr",
|
| 353 |
+
"succeeded": true
|
| 354 |
+
}
|
| 355 |
+
],
|
| 356 |
+
"succeeded": true
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"document_id": "doc01",
|
| 360 |
+
"pipeline_name": "tess_only",
|
| 361 |
+
"step_results": [
|
| 362 |
+
{
|
| 363 |
+
"error": null,
|
| 364 |
+
"produced_artifacts": {
|
| 365 |
+
"raw_text": "doc01:precomputed_tess:raw_text"
|
| 366 |
+
},
|
| 367 |
+
"step_id": "ocr",
|
| 368 |
+
"succeeded": true
|
| 369 |
+
}
|
| 370 |
+
],
|
| 371 |
+
"succeeded": true
|
| 372 |
+
},
|
| 373 |
+
{
|
| 374 |
+
"document_id": "doc02",
|
| 375 |
+
"pipeline_name": "pero_only",
|
| 376 |
+
"step_results": [
|
| 377 |
+
{
|
| 378 |
+
"error": null,
|
| 379 |
+
"produced_artifacts": {
|
| 380 |
+
"raw_text": "doc02:precomputed_pero:raw_text"
|
| 381 |
+
},
|
| 382 |
+
"step_id": "ocr",
|
| 383 |
+
"succeeded": true
|
| 384 |
+
}
|
| 385 |
+
],
|
| 386 |
+
"succeeded": true
|
| 387 |
+
},
|
| 388 |
+
{
|
| 389 |
+
"document_id": "doc02",
|
| 390 |
+
"pipeline_name": "tess_only",
|
| 391 |
+
"step_results": [
|
| 392 |
+
{
|
| 393 |
+
"error": null,
|
| 394 |
+
"produced_artifacts": {
|
| 395 |
+
"raw_text": "doc02:precomputed_tess:raw_text"
|
| 396 |
+
},
|
| 397 |
+
"step_id": "ocr",
|
| 398 |
+
"succeeded": true
|
| 399 |
+
}
|
| 400 |
+
],
|
| 401 |
+
"succeeded": true
|
| 402 |
+
},
|
| 403 |
+
{
|
| 404 |
+
"document_id": "doc03",
|
| 405 |
+
"pipeline_name": "pero_only",
|
| 406 |
+
"step_results": [
|
| 407 |
+
{
|
| 408 |
+
"error": null,
|
| 409 |
+
"produced_artifacts": {
|
| 410 |
+
"raw_text": "doc03:precomputed_pero:raw_text"
|
| 411 |
+
},
|
| 412 |
+
"step_id": "ocr",
|
| 413 |
+
"succeeded": true
|
| 414 |
+
}
|
| 415 |
+
],
|
| 416 |
+
"succeeded": true
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"document_id": "doc03",
|
| 420 |
+
"pipeline_name": "tess_only",
|
| 421 |
+
"step_results": [
|
| 422 |
+
{
|
| 423 |
+
"error": null,
|
| 424 |
+
"produced_artifacts": {
|
| 425 |
+
"raw_text": "doc03:precomputed_tess:raw_text"
|
| 426 |
+
},
|
| 427 |
+
"step_id": "ocr",
|
| 428 |
+
"succeeded": true
|
| 429 |
+
}
|
| 430 |
+
],
|
| 431 |
+
"succeeded": true
|
| 432 |
+
}
|
| 433 |
+
],
|
| 434 |
+
"view_results": [
|
| 435 |
+
{
|
| 436 |
+
"candidate_artifact_id": "doc01:precomputed_tess:raw_text",
|
| 437 |
+
"document_id": "doc01",
|
| 438 |
+
"failed_metrics": {},
|
| 439 |
+
"ground_truth_artifact_id": "doc01:gt:raw_text",
|
| 440 |
+
"ignored_dimensions": [
|
| 441 |
+
"block_structure",
|
| 442 |
+
"confidence",
|
| 443 |
+
"formatting",
|
| 444 |
+
"geometry",
|
| 445 |
+
"ids",
|
| 446 |
+
"reading_order"
|
| 447 |
+
],
|
| 448 |
+
"metric_values": {
|
| 449 |
+
"cer": 0.05,
|
| 450 |
+
"mer": 0.25,
|
| 451 |
+
"wer": 0.25,
|
| 452 |
+
"wil": 0.4375
|
| 453 |
+
},
|
| 454 |
+
"pipeline_name": "tess_only",
|
| 455 |
+
"projection_report": null,
|
| 456 |
+
"view_name": "text_final",
|
| 457 |
+
"warnings": [
|
| 458 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 459 |
+
]
|
| 460 |
+
},
|
| 461 |
+
{
|
| 462 |
+
"candidate_artifact_id": "doc01:precomputed_pero:raw_text",
|
| 463 |
+
"document_id": "doc01",
|
| 464 |
+
"failed_metrics": {},
|
| 465 |
+
"ground_truth_artifact_id": "doc01:gt:raw_text",
|
| 466 |
+
"ignored_dimensions": [
|
| 467 |
+
"block_structure",
|
| 468 |
+
"confidence",
|
| 469 |
+
"formatting",
|
| 470 |
+
"geometry",
|
| 471 |
+
"ids",
|
| 472 |
+
"reading_order"
|
| 473 |
+
],
|
| 474 |
+
"metric_values": {
|
| 475 |
+
"cer": 0.05,
|
| 476 |
+
"mer": 0.25,
|
| 477 |
+
"wer": 0.25,
|
| 478 |
+
"wil": 0.4375
|
| 479 |
+
},
|
| 480 |
+
"pipeline_name": "pero_only",
|
| 481 |
+
"projection_report": null,
|
| 482 |
+
"view_name": "text_final",
|
| 483 |
+
"warnings": [
|
| 484 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 485 |
+
]
|
| 486 |
+
},
|
| 487 |
+
{
|
| 488 |
+
"candidate_artifact_id": "doc02:precomputed_tess:raw_text",
|
| 489 |
+
"document_id": "doc02",
|
| 490 |
+
"failed_metrics": {},
|
| 491 |
+
"ground_truth_artifact_id": "doc02:gt:raw_text",
|
| 492 |
+
"ignored_dimensions": [
|
| 493 |
+
"block_structure",
|
| 494 |
+
"confidence",
|
| 495 |
+
"formatting",
|
| 496 |
+
"geometry",
|
| 497 |
+
"ids",
|
| 498 |
+
"reading_order"
|
| 499 |
+
],
|
| 500 |
+
"metric_values": {
|
| 501 |
+
"cer": 0.05,
|
| 502 |
+
"mer": 0.25,
|
| 503 |
+
"wer": 0.25,
|
| 504 |
+
"wil": 0.4375
|
| 505 |
+
},
|
| 506 |
+
"pipeline_name": "tess_only",
|
| 507 |
+
"projection_report": null,
|
| 508 |
+
"view_name": "text_final",
|
| 509 |
+
"warnings": [
|
| 510 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 511 |
+
]
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"candidate_artifact_id": "doc02:precomputed_pero:raw_text",
|
| 515 |
+
"document_id": "doc02",
|
| 516 |
+
"failed_metrics": {},
|
| 517 |
+
"ground_truth_artifact_id": "doc02:gt:raw_text",
|
| 518 |
+
"ignored_dimensions": [
|
| 519 |
+
"block_structure",
|
| 520 |
+
"confidence",
|
| 521 |
+
"formatting",
|
| 522 |
+
"geometry",
|
| 523 |
+
"ids",
|
| 524 |
+
"reading_order"
|
| 525 |
+
],
|
| 526 |
+
"metric_values": {
|
| 527 |
+
"cer": 0.05,
|
| 528 |
+
"mer": 0.25,
|
| 529 |
+
"wer": 0.25,
|
| 530 |
+
"wil": 0.4375
|
| 531 |
+
},
|
| 532 |
+
"pipeline_name": "pero_only",
|
| 533 |
+
"projection_report": null,
|
| 534 |
+
"view_name": "text_final",
|
| 535 |
+
"warnings": [
|
| 536 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 537 |
+
]
|
| 538 |
+
},
|
| 539 |
+
{
|
| 540 |
+
"candidate_artifact_id": "doc03:precomputed_tess:raw_text",
|
| 541 |
+
"document_id": "doc03",
|
| 542 |
+
"failed_metrics": {},
|
| 543 |
+
"ground_truth_artifact_id": "doc03:gt:raw_text",
|
| 544 |
+
"ignored_dimensions": [
|
| 545 |
+
"block_structure",
|
| 546 |
+
"confidence",
|
| 547 |
+
"formatting",
|
| 548 |
+
"geometry",
|
| 549 |
+
"ids",
|
| 550 |
+
"reading_order"
|
| 551 |
+
],
|
| 552 |
+
"metric_values": {
|
| 553 |
+
"cer": 0.05,
|
| 554 |
+
"mer": 0.25,
|
| 555 |
+
"wer": 0.25,
|
| 556 |
+
"wil": 0.4375
|
| 557 |
+
},
|
| 558 |
+
"pipeline_name": "tess_only",
|
| 559 |
+
"projection_report": null,
|
| 560 |
+
"view_name": "text_final",
|
| 561 |
+
"warnings": [
|
| 562 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 563 |
+
]
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"candidate_artifact_id": "doc03:precomputed_pero:raw_text",
|
| 567 |
+
"document_id": "doc03",
|
| 568 |
+
"failed_metrics": {},
|
| 569 |
+
"ground_truth_artifact_id": "doc03:gt:raw_text",
|
| 570 |
+
"ignored_dimensions": [
|
| 571 |
+
"block_structure",
|
| 572 |
+
"confidence",
|
| 573 |
+
"formatting",
|
| 574 |
+
"geometry",
|
| 575 |
+
"ids",
|
| 576 |
+
"reading_order"
|
| 577 |
+
],
|
| 578 |
+
"metric_values": {
|
| 579 |
+
"cer": 0.05,
|
| 580 |
+
"mer": 0.25,
|
| 581 |
+
"wer": 0.25,
|
| 582 |
+
"wil": 0.4375
|
| 583 |
+
},
|
| 584 |
+
"pipeline_name": "pero_only",
|
| 585 |
+
"projection_report": null,
|
| 586 |
+
"view_name": "text_final",
|
| 587 |
+
"warnings": [
|
| 588 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 589 |
+
]
|
| 590 |
+
}
|
| 591 |
+
]
|
| 592 |
+
}
|
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"artifacts_index": [
|
| 3 |
+
{
|
| 4 |
+
"content_hash": null,
|
| 5 |
+
"document_id": "doc01",
|
| 6 |
+
"id": "doc01:image",
|
| 7 |
+
"pipeline_name": "tess_only",
|
| 8 |
+
"produced_by_step": null,
|
| 9 |
+
"provenance": null,
|
| 10 |
+
"type": "image",
|
| 11 |
+
"uri": "<PATH>"
|
| 12 |
+
},
|
| 13 |
+
{
|
| 14 |
+
"content_hash": null,
|
| 15 |
+
"document_id": "doc01",
|
| 16 |
+
"id": "doc01:precomputed_tess:raw_text",
|
| 17 |
+
"pipeline_name": "tess_only",
|
| 18 |
+
"produced_by_step": "ocr",
|
| 19 |
+
"provenance": null,
|
| 20 |
+
"type": "raw_text",
|
| 21 |
+
"uri": "<PATH>"
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"content_hash": null,
|
| 25 |
+
"document_id": "doc02",
|
| 26 |
+
"id": "doc02:image",
|
| 27 |
+
"pipeline_name": "tess_only",
|
| 28 |
+
"produced_by_step": null,
|
| 29 |
+
"provenance": null,
|
| 30 |
+
"type": "image",
|
| 31 |
+
"uri": "<PATH>"
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"content_hash": null,
|
| 35 |
+
"document_id": "doc02",
|
| 36 |
+
"id": "doc02:precomputed_tess:raw_text",
|
| 37 |
+
"pipeline_name": "tess_only",
|
| 38 |
+
"produced_by_step": "ocr",
|
| 39 |
+
"provenance": null,
|
| 40 |
+
"type": "raw_text",
|
| 41 |
+
"uri": "<PATH>"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"content_hash": null,
|
| 45 |
+
"document_id": "doc03",
|
| 46 |
+
"id": "doc03:image",
|
| 47 |
+
"pipeline_name": "tess_only",
|
| 48 |
+
"produced_by_step": null,
|
| 49 |
+
"provenance": null,
|
| 50 |
+
"type": "image",
|
| 51 |
+
"uri": "<PATH>"
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"content_hash": null,
|
| 55 |
+
"document_id": "doc03",
|
| 56 |
+
"id": "doc03:precomputed_tess:raw_text",
|
| 57 |
+
"pipeline_name": "tess_only",
|
| 58 |
+
"produced_by_step": "ocr",
|
| 59 |
+
"provenance": null,
|
| 60 |
+
"type": "raw_text",
|
| 61 |
+
"uri": "<PATH>"
|
| 62 |
+
}
|
| 63 |
+
],
|
| 64 |
+
"manifest": {
|
| 65 |
+
"adapter_kwargs": {
|
| 66 |
+
"ocr": {
|
| 67 |
+
"source_label": "tess"
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
"code_version": "charac-1.0",
|
| 71 |
+
"corpus_name": "charac",
|
| 72 |
+
"dependencies_lock": {
|
| 73 |
+
"CacheControl": "0.14.4",
|
| 74 |
+
"Jinja2": "3.1.6",
|
| 75 |
+
"MarkupSafe": "3.0.3",
|
| 76 |
+
"PyGObject": "3.48.2",
|
| 77 |
+
"PyJWT": "2.7.0",
|
| 78 |
+
"PyYAML": "6.0.1",
|
| 79 |
+
"Pygments": "2.20.0",
|
| 80 |
+
"RapidFuzz": "3.14.5",
|
| 81 |
+
"annotated-doc": "0.0.4",
|
| 82 |
+
"annotated-types": "0.7.0",
|
| 83 |
+
"anyio": "4.13.0",
|
| 84 |
+
"argcomplete": "3.1.4",
|
| 85 |
+
"ast_serialize": "0.5.0",
|
| 86 |
+
"bandit": "1.9.4",
|
| 87 |
+
"blinker": "1.7.0",
|
| 88 |
+
"boolean.py": "5.0",
|
| 89 |
+
"certifi": "2026.2.25",
|
| 90 |
+
"charset-normalizer": "3.4.6",
|
| 91 |
+
"click": "8.4.0",
|
| 92 |
+
"colorama": "0.4.6",
|
| 93 |
+
"conan": "2.27.0",
|
| 94 |
+
"coverage": "7.14.0",
|
| 95 |
+
"cryptography": "41.0.7",
|
| 96 |
+
"cyclonedx-python-lib": "11.7.0",
|
| 97 |
+
"dbus-python": "1.3.2",
|
| 98 |
+
"defusedxml": "0.7.1",
|
| 99 |
+
"distro": "1.9.0",
|
| 100 |
+
"fastapi": "0.136.1",
|
| 101 |
+
"fasteners": "0.20",
|
| 102 |
+
"filelock": "3.29.0",
|
| 103 |
+
"h11": "0.16.0",
|
| 104 |
+
"httpcore": "1.0.9",
|
| 105 |
+
"httplib2": "0.20.4",
|
| 106 |
+
"httptools": "0.7.1",
|
| 107 |
+
"httpx": "0.28.1",
|
| 108 |
+
"idna": "3.11",
|
| 109 |
+
"iniconfig": "2.3.0",
|
| 110 |
+
"jiwer": "4.0.0",
|
| 111 |
+
"launchpadlib": "1.11.0",
|
| 112 |
+
"lazr.restfulclient": "0.14.6",
|
| 113 |
+
"lazr.uri": "1.0.6",
|
| 114 |
+
"librt": "0.11.0",
|
| 115 |
+
"license-expression": "30.4.4",
|
| 116 |
+
"markdown-it-py": "4.2.0",
|
| 117 |
+
"mdurl": "0.1.2",
|
| 118 |
+
"msgpack": "1.1.2",
|
| 119 |
+
"mypy": "2.1.0",
|
| 120 |
+
"mypy_extensions": "1.1.0",
|
| 121 |
+
"numpy": "2.4.6",
|
| 122 |
+
"oauthlib": "3.2.2",
|
| 123 |
+
"packageurl-python": "0.17.6",
|
| 124 |
+
"packaging": "24.0",
|
| 125 |
+
"patch-ng": "1.18.1",
|
| 126 |
+
"pathspec": "1.1.1",
|
| 127 |
+
"picarones": "1.1.0.dev311",
|
| 128 |
+
"pillow": "12.2.0",
|
| 129 |
+
"pip": "24.0",
|
| 130 |
+
"pip-api": "0.0.34",
|
| 131 |
+
"pip-requirements-parser": "32.0.1",
|
| 132 |
+
"pip_audit": "2.10.0",
|
| 133 |
+
"platformdirs": "4.9.6",
|
| 134 |
+
"pluggy": "1.6.0",
|
| 135 |
+
"py-serializable": "2.1.0",
|
| 136 |
+
"pydantic": "2.13.4",
|
| 137 |
+
"pydantic_core": "2.46.4",
|
| 138 |
+
"pyparsing": "3.1.1",
|
| 139 |
+
"pytesseract": "0.3.13",
|
| 140 |
+
"pytest": "9.0.3",
|
| 141 |
+
"pytest-cov": "7.1.0",
|
| 142 |
+
"pytest-timeout": "2.4.0",
|
| 143 |
+
"python-apt": "2.7.7+ubuntu5.2",
|
| 144 |
+
"python-dateutil": "2.9.0.post0",
|
| 145 |
+
"python-dotenv": "1.2.2",
|
| 146 |
+
"python-multipart": "0.0.29",
|
| 147 |
+
"requests": "2.33.1",
|
| 148 |
+
"rich": "15.0.0",
|
| 149 |
+
"setuptools": "68.1.2",
|
| 150 |
+
"six": "1.16.0",
|
| 151 |
+
"sortedcontainers": "2.4.0",
|
| 152 |
+
"starlette": "1.0.0",
|
| 153 |
+
"stevedore": "5.8.0",
|
| 154 |
+
"toml": "0.10.2",
|
| 155 |
+
"tomli": "2.4.1",
|
| 156 |
+
"tomli_w": "1.2.0",
|
| 157 |
+
"tqdm": "4.67.3",
|
| 158 |
+
"typing-inspection": "0.4.2",
|
| 159 |
+
"typing_extensions": "4.15.0",
|
| 160 |
+
"urllib3": "2.6.3",
|
| 161 |
+
"uvicorn": "0.47.0",
|
| 162 |
+
"uvloop": "0.22.1",
|
| 163 |
+
"wadllib": "1.3.6",
|
| 164 |
+
"watchfiles": "1.2.0",
|
| 165 |
+
"websockets": "16.0",
|
| 166 |
+
"wheel": "0.42.0",
|
| 167 |
+
"xmltodict": "0.13.0",
|
| 168 |
+
"yq": "3.1.0"
|
| 169 |
+
},
|
| 170 |
+
"metadata": {
|
| 171 |
+
"orchestrator": "picarones.app.services.run_orchestrator"
|
| 172 |
+
},
|
| 173 |
+
"n_documents": 3,
|
| 174 |
+
"pipeline_names": [
|
| 175 |
+
"tess_only"
|
| 176 |
+
],
|
| 177 |
+
"pipeline_specs": [
|
| 178 |
+
{
|
| 179 |
+
"description": "",
|
| 180 |
+
"initial_inputs": [
|
| 181 |
+
"image"
|
| 182 |
+
],
|
| 183 |
+
"name": "tess_only",
|
| 184 |
+
"steps": [
|
| 185 |
+
{
|
| 186 |
+
"adapter_name": "ocr",
|
| 187 |
+
"id": "ocr",
|
| 188 |
+
"input_types": [
|
| 189 |
+
"image"
|
| 190 |
+
],
|
| 191 |
+
"inputs_from": {},
|
| 192 |
+
"kind": "step",
|
| 193 |
+
"output_types": [
|
| 194 |
+
"raw_text"
|
| 195 |
+
],
|
| 196 |
+
"params": {}
|
| 197 |
+
}
|
| 198 |
+
]
|
| 199 |
+
}
|
| 200 |
+
],
|
| 201 |
+
"system_binaries_lock": {},
|
| 202 |
+
"view_specs": [
|
| 203 |
+
{
|
| 204 |
+
"candidate_types": [
|
| 205 |
+
"alto_xml",
|
| 206 |
+
"canonical_document",
|
| 207 |
+
"corrected_text",
|
| 208 |
+
"page_xml",
|
| 209 |
+
"raw_text"
|
| 210 |
+
],
|
| 211 |
+
"char_exclude": null,
|
| 212 |
+
"description": "Compare les sorties textuelles finales après projection éventuelle (ALTO/PAGE/markdown → texte plat).",
|
| 213 |
+
"ignored_dimensions": [
|
| 214 |
+
"block_structure",
|
| 215 |
+
"confidence",
|
| 216 |
+
"formatting",
|
| 217 |
+
"geometry",
|
| 218 |
+
"ids",
|
| 219 |
+
"reading_order"
|
| 220 |
+
],
|
| 221 |
+
"metric_names": [
|
| 222 |
+
"cer",
|
| 223 |
+
"mer",
|
| 224 |
+
"wer",
|
| 225 |
+
"wil"
|
| 226 |
+
],
|
| 227 |
+
"name": "text_final",
|
| 228 |
+
"normalization_profile": null,
|
| 229 |
+
"projection": null,
|
| 230 |
+
"projections_by_source_type": {
|
| 231 |
+
"alto_xml": {
|
| 232 |
+
"params": {},
|
| 233 |
+
"projector_name": "alto_to_text",
|
| 234 |
+
"source_type": "alto_xml",
|
| 235 |
+
"target_type": "raw_text"
|
| 236 |
+
},
|
| 237 |
+
"canonical_document": {
|
| 238 |
+
"params": {},
|
| 239 |
+
"projector_name": "canonical_to_text",
|
| 240 |
+
"source_type": "canonical_document",
|
| 241 |
+
"target_type": "raw_text"
|
| 242 |
+
},
|
| 243 |
+
"page_xml": {
|
| 244 |
+
"params": {},
|
| 245 |
+
"projector_name": "page_to_text",
|
| 246 |
+
"source_type": "page_xml",
|
| 247 |
+
"target_type": "raw_text"
|
| 248 |
+
}
|
| 249 |
+
},
|
| 250 |
+
"warnings": [
|
| 251 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 252 |
+
]
|
| 253 |
+
}
|
| 254 |
+
]
|
| 255 |
+
},
|
| 256 |
+
"pipeline_results": [
|
| 257 |
+
{
|
| 258 |
+
"document_id": "doc01",
|
| 259 |
+
"pipeline_name": "tess_only",
|
| 260 |
+
"step_results": [
|
| 261 |
+
{
|
| 262 |
+
"error": null,
|
| 263 |
+
"produced_artifacts": {
|
| 264 |
+
"raw_text": "doc01:precomputed_tess:raw_text"
|
| 265 |
+
},
|
| 266 |
+
"step_id": "ocr",
|
| 267 |
+
"succeeded": true
|
| 268 |
+
}
|
| 269 |
+
],
|
| 270 |
+
"succeeded": true
|
| 271 |
+
},
|
| 272 |
+
{
|
| 273 |
+
"document_id": "doc02",
|
| 274 |
+
"pipeline_name": "tess_only",
|
| 275 |
+
"step_results": [
|
| 276 |
+
{
|
| 277 |
+
"error": null,
|
| 278 |
+
"produced_artifacts": {
|
| 279 |
+
"raw_text": "doc02:precomputed_tess:raw_text"
|
| 280 |
+
},
|
| 281 |
+
"step_id": "ocr",
|
| 282 |
+
"succeeded": true
|
| 283 |
+
}
|
| 284 |
+
],
|
| 285 |
+
"succeeded": true
|
| 286 |
+
},
|
| 287 |
+
{
|
| 288 |
+
"document_id": "doc03",
|
| 289 |
+
"pipeline_name": "tess_only",
|
| 290 |
+
"step_results": [
|
| 291 |
+
{
|
| 292 |
+
"error": null,
|
| 293 |
+
"produced_artifacts": {
|
| 294 |
+
"raw_text": "doc03:precomputed_tess:raw_text"
|
| 295 |
+
},
|
| 296 |
+
"step_id": "ocr",
|
| 297 |
+
"succeeded": true
|
| 298 |
+
}
|
| 299 |
+
],
|
| 300 |
+
"succeeded": true
|
| 301 |
+
}
|
| 302 |
+
],
|
| 303 |
+
"view_results": [
|
| 304 |
+
{
|
| 305 |
+
"candidate_artifact_id": "doc01:precomputed_tess:raw_text",
|
| 306 |
+
"document_id": "doc01",
|
| 307 |
+
"failed_metrics": {},
|
| 308 |
+
"ground_truth_artifact_id": "doc01:gt:raw_text",
|
| 309 |
+
"ignored_dimensions": [
|
| 310 |
+
"block_structure",
|
| 311 |
+
"confidence",
|
| 312 |
+
"formatting",
|
| 313 |
+
"geometry",
|
| 314 |
+
"ids",
|
| 315 |
+
"reading_order"
|
| 316 |
+
],
|
| 317 |
+
"metric_values": {
|
| 318 |
+
"cer": 0.05,
|
| 319 |
+
"mer": 0.25,
|
| 320 |
+
"wer": 0.25,
|
| 321 |
+
"wil": 0.4375
|
| 322 |
+
},
|
| 323 |
+
"pipeline_name": "tess_only",
|
| 324 |
+
"projection_report": null,
|
| 325 |
+
"view_name": "text_final",
|
| 326 |
+
"warnings": [
|
| 327 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 328 |
+
]
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"candidate_artifact_id": "doc02:precomputed_tess:raw_text",
|
| 332 |
+
"document_id": "doc02",
|
| 333 |
+
"failed_metrics": {},
|
| 334 |
+
"ground_truth_artifact_id": "doc02:gt:raw_text",
|
| 335 |
+
"ignored_dimensions": [
|
| 336 |
+
"block_structure",
|
| 337 |
+
"confidence",
|
| 338 |
+
"formatting",
|
| 339 |
+
"geometry",
|
| 340 |
+
"ids",
|
| 341 |
+
"reading_order"
|
| 342 |
+
],
|
| 343 |
+
"metric_values": {
|
| 344 |
+
"cer": 0.05,
|
| 345 |
+
"mer": 0.25,
|
| 346 |
+
"wer": 0.25,
|
| 347 |
+
"wil": 0.4375
|
| 348 |
+
},
|
| 349 |
+
"pipeline_name": "tess_only",
|
| 350 |
+
"projection_report": null,
|
| 351 |
+
"view_name": "text_final",
|
| 352 |
+
"warnings": [
|
| 353 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 354 |
+
]
|
| 355 |
+
},
|
| 356 |
+
{
|
| 357 |
+
"candidate_artifact_id": "doc03:precomputed_tess:raw_text",
|
| 358 |
+
"document_id": "doc03",
|
| 359 |
+
"failed_metrics": {},
|
| 360 |
+
"ground_truth_artifact_id": "doc03:gt:raw_text",
|
| 361 |
+
"ignored_dimensions": [
|
| 362 |
+
"block_structure",
|
| 363 |
+
"confidence",
|
| 364 |
+
"formatting",
|
| 365 |
+
"geometry",
|
| 366 |
+
"ids",
|
| 367 |
+
"reading_order"
|
| 368 |
+
],
|
| 369 |
+
"metric_values": {
|
| 370 |
+
"cer": 0.05,
|
| 371 |
+
"mer": 0.25,
|
| 372 |
+
"wer": 0.25,
|
| 373 |
+
"wil": 0.4375
|
| 374 |
+
},
|
| 375 |
+
"pipeline_name": "tess_only",
|
| 376 |
+
"projection_report": null,
|
| 377 |
+
"view_name": "text_final",
|
| 378 |
+
"warnings": [
|
| 379 |
+
"Cette vue compare les sorties textuelles finales après projection éventuelle. Les pipelines qui produisent ALTO/PAGE/markdown sont projetés vers du texte plat — leurs structures spatiale et documentaire ne sont PAS évaluées ici. Pour évaluer la qualité ALTO, voir AltoView (S15)."
|
| 380 |
+
]
|
| 381 |
+
}
|
| 382 |
+
]
|
| 383 |
+
}
|
|
@@ -0,0 +1,607 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Harnais de caractérisation du cœur stateful de ``RunOrchestrator``.
|
| 2 |
+
|
| 3 |
+
Audit prod — PRÉCONDITION de la Phase B (décomposition stateful).
|
| 4 |
+
Verrouille le comportement EXACT actuel là où Phase B va toucher,
|
| 5 |
+
pour transformer « la CI pourrait ne pas attraper » en « la CI
|
| 6 |
+
attrape ». Cinq groupes, un par cas de risque identifié :
|
| 7 |
+
|
| 8 |
+
1. :class:`TestGlobalDocIdxContract` — compteur ``doc_idx`` global
|
| 9 |
+
au run (pas par pipeline) sur **multi-pipeline**.
|
| 10 |
+
2. :class:`TestCancelPropagation` — annulation pré-set ET
|
| 11 |
+
mi-run (identité d'objet event à travers les couches).
|
| 12 |
+
3. :class:`TestCrashResumeConsistency` — interruption réelle puis
|
| 13 |
+
resume : sortie finale identique à un run complet propre.
|
| 14 |
+
4. :class:`TestGoldenMultiTopology` — snapshot normalisé
|
| 15 |
+
déterministe des artefacts (manifest/pipeline/view) sur 3
|
| 16 |
+
topologies (linéaire, multi-pipeline, DAG branchant).
|
| 17 |
+
5. :class:`TestConcurrencyIsolation` — deux ``execute()`` en
|
| 18 |
+
threads parallèles : isolation cancel/progress, pas de fuite
|
| 19 |
+
d'état entre instances.
|
| 20 |
+
|
| 21 |
+
Déterminisme : ``PrecomputedTextAdapter`` (lit ``<stem>.<label>.txt``,
|
| 22 |
+
aucun OCR/réseau). Un garde explicite
|
| 23 |
+
(:meth:`TestGoldenMultiTopology.test_snapshot_is_deterministic`)
|
| 24 |
+
échoue si le snapshot n'est pas reproductible — un golden flaky
|
| 25 |
+
serait pire que pas de golden.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from __future__ import annotations
|
| 29 |
+
|
| 30 |
+
import io
|
| 31 |
+
import json
|
| 32 |
+
import textwrap
|
| 33 |
+
import threading
|
| 34 |
+
import time
|
| 35 |
+
import zipfile
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
from typing import Any
|
| 38 |
+
|
| 39 |
+
import pytest
|
| 40 |
+
|
| 41 |
+
from picarones.app.schemas.run_spec import load_run_spec_from_yaml
|
| 42 |
+
from picarones.app.services import RunOrchestrator
|
| 43 |
+
|
| 44 |
+
# Directory holding the committed golden snapshots (``<name>.json``), resolved
# relative to this test module so it does not depend on the working directory.
_FIX_DIR = Path(__file__).parent / "fixtures" / "run_orchestrator"
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
# Briques déterministes
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
|
| 51 |
+
def _png_bytes() -> bytes:
    """Return a fixed 33-byte PNG stub: file signature plus one IHDR chunk.

    The payload is the same on every call, which keeps generated corpora
    reproducible. It contains no IDAT/IEND chunk.
    NOTE(review): presumably nothing in these tests decodes the image — confirm.
    """
    return (
        b"\x89PNG\r\n\x1a\n"  # 8-byte PNG signature
        b"\x00\x00\x00\rIHDR"  # chunk length (13) + chunk type
        b"\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00\x00\x00"  # 1x1, 8-bit RGBA
        b"\x1f\x15\xc4\x89"  # CRC of the IHDR chunk
    )


def _make_corpus_zip(
    tmp: Path, n_docs: int, *, sources: tuple[str, ...] = ("tess",),
) -> Path:
    """Write a byte-reproducible corpus archive and return its path.

    For each document ``docNN`` (1-based, zero-padded to two digits) the
    archive contains ``docNN.png``, a ground truth ``docNN.gt.txt`` and one
    ``docNN.<label>.txt`` per entry of ``sources`` (the file naming read by
    ``PrecomputedTextAdapter`` according to the module docstring). The
    per-source text is the ground truth with one extra trailing ``!`` so the
    error rate is non-zero but fixed.

    Args:
        tmp: Directory receiving ``corpus.zip``; created if missing.
        n_docs: Number of documents to generate (``0`` yields an empty archive).
        sources: Source labels for which a precomputed text file is written.

    Returns:
        Path of the written ``corpus.zip``.
    """
    # ``writestr(name, ...)`` would stamp each member with the current
    # wall-clock time, making the archive differ from run to run. A fixed
    # timestamp (the ZIP epoch) keeps the bytes identical across runs.
    fixed_date = (1980, 1, 1, 0, 0, 0)

    def add(zf: zipfile.ZipFile, name: str, data: str | bytes) -> None:
        info = zipfile.ZipInfo(name, date_time=fixed_date)
        # Same permission bits ``writestr`` applies when given a plain name.
        info.external_attr = 0o600 << 16
        zf.writestr(info, data)

    buf = io.BytesIO()
    with zipfile.ZipFile(buf, mode="w") as zf:
        for i in range(1, n_docs + 1):
            doc = f"doc{i:02d}"
            add(zf, f"{doc}.png", _png_bytes())
            add(zf, f"{doc}.gt.txt", f"Texte de reference {i}")
            for label in sources:
                # One stable extra character -> deterministic, non-zero CER.
                add(zf, f"{doc}.{label}.txt", f"Texte de reference {i}!")
    tmp.mkdir(parents=True, exist_ok=True)
    archive = tmp / "corpus.zip"
    archive.write_bytes(buf.getvalue())
    return archive
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _spec(corpus_zip: Path, out: Path, body: str) -> Any:
    """Build a run spec from a fixed YAML header plus a topology ``body``.

    The header and the body are assembled at column 0: ``body`` is dedented
    on its own, so it may be written at any uniform indentation without
    having to match a template. Paths are emitted as double-quoted scalars
    (JSON string syntax) so that a directory name containing characters
    such as ``#`` or ``: `` cannot change the meaning of the document.

    Args:
        corpus_zip: Path written to the ``corpus_zip`` key.
        out: Path written to the ``output_dir`` key.
        body: YAML fragment holding the remaining top-level keys
            (``pipelines:`` and any extra key appended by the caller).

    Returns:
        Whatever ``load_run_spec_from_yaml`` returns for the assembled text.
    """
    header = "\n".join((
        f"corpus_zip: {json.dumps(str(corpus_zip), ensure_ascii=False)}",
        "corpus_name: charac",
        f"output_dir: {json.dumps(str(out), ensure_ascii=False)}",
        'code_version: "charac-1.0"',
        "views: [text_final]",
    ))
    # Strip only newlines: the first key of the body must stay at column 0
    # relative to the other body lines after dedenting.
    fragment = textwrap.dedent(body).strip("\n")
    spec_yaml = f"{header}\n{fragment}\n"
    return load_run_spec_from_yaml(spec_yaml)
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
# Topology name -> (source labels to generate in the corpus, YAML body).
# Each body is a uniformly indented YAML fragment starting at ``pipelines:``;
# consumers are expected to dedent it before parsing.
_TOPOLOGIES: dict[str, tuple[tuple[str, ...], str]] = {
    # One pipeline, one OCR step.
    "single_linear": (("tess",), """
        pipelines:
          - name: tess_only
            initial_inputs: [image]
            steps:
              - id: ocr
                adapter_class: picarones.adapters.ocr.precomputed.PrecomputedTextAdapter
                adapter_kwargs: {source_label: tess}
                input_types: [image]
                output_types: [raw_text]
    """),
    # Two independent single-step pipelines over the same corpus.
    "multi_pipeline": (("tess", "pero"), """
        pipelines:
          - name: tess_only
            initial_inputs: [image]
            steps:
              - id: ocr
                adapter_class: picarones.adapters.ocr.precomputed.PrecomputedTextAdapter
                adapter_kwargs: {source_label: tess}
                input_types: [image]
                output_types: [raw_text]
          - name: pero_only
            initial_inputs: [image]
            steps:
              - id: ocr
                adapter_class: picarones.adapters.ocr.precomputed.PrecomputedTextAdapter
                adapter_kwargs: {source_label: pero}
                input_types: [image]
                output_types: [raw_text]
    """),
    # One pipeline whose second step consumes both the image and the
    # ``raw_text`` produced by the first step.
    "branching_dag": (("tess", "corr"), """
        pipelines:
          - name: ocr_then_correct
            initial_inputs: [image]
            preferred_text_output: corrector.corrected_text
            steps:
              - id: ocr
                adapter_class: picarones.adapters.ocr.precomputed.PrecomputedTextAdapter
                adapter_kwargs: {source_label: tess}
                input_types: [image]
                output_types: [raw_text]
              - id: corrector
                adapter_class: picarones.adapters.ocr.precomputed.PrecomputedTextAdapter
                adapter_kwargs: {source_label: corr}
                input_types: [image, raw_text]
                output_types: [corrected_text]
                inputs_from:
                  raw_text: ocr
    """),
}
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# ---------------------------------------------------------------------------
|
| 151 |
+
# Normalisation snapshot (déterministe : retire timestamps/durées/paths)
|
| 152 |
+
# ---------------------------------------------------------------------------
|
| 153 |
+
|
| 154 |
+
# Dictionary keys dropped from snapshots because their values change on
# every run (timestamps, durations, and the timestamped run identifier).
_VOLATILE_KEYS = {
    "started_at", "completed_at", "duration_seconds", "run_date",
    "created_at", "elapsed_seconds", "wall_clock_seconds",
    # ``run_id`` embeds a timestamp (``charac_YYYYMMDDThhmmssZ``): volatile,
    # not a behavioural characteristic.
    "run_id",
}


def _scrub(obj: Any) -> Any:
    """Return a canonical copy of ``obj`` suitable for snapshot comparison.

    Three normalisations are applied recursively:

    * dictionary entries whose key is in ``_VOLATILE_KEYS`` are removed;
    * strings that look like filesystem paths are replaced by ``"<PATH>"``;
    * lists made only of scalars are sorted.

    Sorting scalar lists matters because some of them are derived from a
    ``set`` upstream, whose serialisation order varies between processes
    (string hash randomisation); an in-process determinism check cannot
    catch that. Lists containing dicts are left in place: they are records
    whose order may be meaningful and is handled by the caller.

    Args:
        obj: Any JSON-like value (dict, list, scalar).

    Returns:
        The normalised value; inputs are not mutated.
    """
    if isinstance(obj, dict):
        return {k: _scrub(v) for k, v in obj.items() if k not in _VOLATILE_KEYS}
    if isinstance(obj, list):
        scrubbed = [_scrub(x) for x in obj]
        if all(isinstance(x, (str, int, float, bool, type(None))) for x in scrubbed):
            # ``None`` sorts last; values are ordered by their text form.
            # The type name breaks ties such as ``1`` vs ``"1"``, which
            # would otherwise keep their (possibly set-derived) input order.
            return sorted(
                scrubbed,
                key=lambda x: (x is None, str(x), type(x).__name__),
            )
        return scrubbed
    if isinstance(obj, str):
        # Absolute/temporary paths -> placeholder (tmp_path varies per run/CI).
        if "/" in obj and ("/tmp" in obj or "pytest" in obj or obj.startswith("/")):
            return "<PATH>"
        return obj
    return obj
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _jsonl(path: Path) -> list[dict]:
    """Parse a JSON Lines file into a list of records.

    Args:
        path: File to read as UTF-8.

    Returns:
        One decoded object per non-blank line, in file order; an empty list
        when the file does not exist.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except FileNotFoundError:
        return []
    # Split on "\n" only: ``str.splitlines`` also breaks on U+2028, U+2029
    # and U+0085, which may appear unescaped inside a JSON string and would
    # cut a record in two.
    return [json.loads(line) for line in text.split("\n") if line.strip()]
|
| 194 |
+
|
| 195 |
+
|
| 196 |
+
def _sorted_records(path: Path, *keys: str) -> list[dict]:
    """Load a JSONL file and order its records by the given fields.

    A missing or ``null`` field sorts as the empty string, so a record
    lacking one of the keys cannot make the comparison raise. The sort is
    stable: records with equal keys keep their order in the file.
    """
    def sort_key(record: dict) -> tuple:
        values = (record.get(k, "") for k in keys)
        return tuple("" if v is None else v for v in values)

    return sorted(_jsonl(path), key=sort_key)


def _normalized_snapshot(results_dir: Path) -> str:
    """Return the canonical, order-stable snapshot of a results directory.

    Covers four artefacts: ``run_manifest.json``, ``pipeline_results.jsonl``,
    ``view_results.jsonl`` and ``artifacts_index.jsonl``. Records are sorted
    by identifying fields, every value goes through ``_scrub``, and the
    result is serialised with sorted keys.

    Args:
        results_dir: Directory containing the artefacts. The manifest must
            exist; a missing JSONL file contributes an empty list.

    Returns:
        The snapshot as indented JSON text.
    """
    manifest = json.loads(
        (results_dir / "run_manifest.json").read_text(encoding="utf-8"),
    )
    pipe = _sorted_records(
        results_dir / "pipeline_results.jsonl", "document_id", "pipeline_name",
    )
    views = _sorted_records(
        results_dir / "view_results.jsonl", "document_id", "view_name",
    )
    # Fourth artefact: the artifacts index, stored separately from
    # pipeline_results, so it is snapshotted in its own right.
    arts = _sorted_records(
        results_dir / "artifacts_index.jsonl",
        "document_id", "pipeline_name", "id",
    )
    snap = {
        "manifest": _scrub(manifest),
        "pipeline_results": _scrub(pipe),
        "view_results": _scrub(views),
        "artifacts_index": _scrub(arts),
    }
    return json.dumps(snap, ensure_ascii=False, sort_keys=True, indent=2)


def _results_snapshot(results_dir: Path) -> str:
    """Return a snapshot of the computed results only (pipeline + views).

    Unlike ``_normalized_snapshot`` this leaves out the manifest (and the
    artifacts index), so it does not depend on configuration echoed there,
    such as ``partial_dir`` or ``output_dir``. It is meant for comparing a
    resumed run with a clean run, where only per-document outputs should
    match.

    Args:
        results_dir: Directory containing the two JSONL files; a missing
            file contributes an empty list.

    Returns:
        The snapshot as indented JSON text.
    """
    pipe = _sorted_records(
        results_dir / "pipeline_results.jsonl", "document_id", "pipeline_name",
    )
    views = _sorted_records(
        results_dir / "view_results.jsonl", "document_id", "view_name",
    )
    return json.dumps(
        {"pipeline_results": _scrub(pipe), "view_results": _scrub(views)},
        ensure_ascii=False, sort_keys=True, indent=2,
    )
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
def _golden(name: str, actual: str) -> None:
    """Compare ``actual`` with the committed golden file ``<name>.json``.

    When the golden file does not exist yet it is written from ``actual``
    and the test is failed on purpose, so that the new fixture has to be
    reviewed and committed. Otherwise the comparison is strict, ignoring
    only trailing newlines of the stored file.

    Args:
        name: Base name of the golden file inside ``_FIX_DIR``.
        actual: Snapshot text produced by the current run.
    """
    gp = _FIX_DIR / f"{name}.json"
    if not gp.exists():
        gp.parent.mkdir(parents=True, exist_ok=True)
        gp.write_text(actual + "\n", encoding="utf-8")
        pytest.fail(
            f"Golden créé : {gp} — vérifier puis committer le fixture.",
        )
    # The file is written with one trailing newline; drop it before comparing.
    expected = gp.read_text(encoding="utf-8").rstrip("\n")
    assert actual == expected, (
        f"Snapshot divergent vs {gp}.\nUne dérive ici = régression "
        "comportementale du cœur stateful (Phase B). Si intentionnel : "
        "supprimer le golden et relancer pour régénérer."
    )
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
def _run(tmp: Path, topo: str, *, n_docs: int = 3, **kw: Any) -> Any:
    """Generate a corpus for ``topo`` under ``tmp`` and execute one run.

    The corpus is written to ``tmp / "in"`` and the orchestrator is given
    ``tmp / "out"`` as its output directory.

    Args:
        tmp: Scratch directory for this run.
        topo: Key of ``_TOPOLOGIES`` selecting sources and pipeline layout.
        n_docs: Number of documents in the generated corpus.
        **kw: Forwarded unchanged to ``RunOrchestrator.execute`` (the tests
            pass ``progress_callback`` and ``cancel_event``).

    Returns:
        The value returned by ``RunOrchestrator.execute`` (not the
        orchestrator itself).
    """
    sources, body = _TOPOLOGIES[topo]
    corpus = _make_corpus_zip(tmp / "in", n_docs, sources=sources)
    out = tmp / "out"
    return RunOrchestrator(out).execute(_spec(corpus, out, body), **kw)
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# ---------------------------------------------------------------------------
|
| 275 |
+
# 1. Contrat compteur doc_idx GLOBAL (Risque 1)
|
| 276 |
+
# ---------------------------------------------------------------------------
|
| 277 |
+
|
| 278 |
+
class TestGlobalDocIdxContract:
    """The ``doc_idx`` counter is global to a run, not per pipeline.

    With 2 pipelines x N documents the indices reported to the progress
    callback form one contiguous sequence ``0..2N-1`` (NOT ``0..N-1``
    twice). A refactoring that creates the counting context per
    collaborator would break this first.
    """

    def test_multi_pipeline_counter_is_global_and_contiguous(
        self, tmp_path: Path,
    ) -> None:
        """Two pipelines over three documents report indices 0..5 once each."""
        calls: list[tuple[str, int, str]] = []
        _run(
            tmp_path, "multi_pipeline", n_docs=3,
            progress_callback=lambda e, i, d: calls.append((e, i, d)),
        )

        # 2 pipelines x 3 docs = 6 notifications.
        assert len(calls) == 6, calls
        # Sorted: only the set of indices is pinned, not the arrival order.
        indices = sorted(c[1] for c in calls)
        assert indices == [0, 1, 2, 3, 4, 5], (
            f"compteur NON global/contigu : {indices} — régression "
            "Phase B (contexte par-pipeline au lieu de global au run)"
        )
        engines = {c[0] for c in calls}
        assert engines == {"tess_only", "pero_only"}, engines

    def test_single_pipeline_counter_zero_based(
        self, tmp_path: Path,
    ) -> None:
        """A single pipeline over four documents reports indices 0..3."""
        calls: list[int] = []
        _run(
            tmp_path, "single_linear", n_docs=4,
            progress_callback=lambda e, i, d: calls.append(i),
        )
        assert sorted(calls) == [0, 1, 2, 3]
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
# ---------------------------------------------------------------------------
|
| 318 |
+
# 2. Propagation cancel_event (Risque 2)
|
| 319 |
+
# ---------------------------------------------------------------------------
|
| 320 |
+
|
| 321 |
+
class TestCancelPropagation:
    """The ``cancel_event`` given to ``execute`` must stop document processing.

    Covers an event that is already set before the run starts and an event
    that gets set from the progress callback while the run is in progress.
    """

    def test_preset_cancel_multipipeline_stops_fast_and_partial(
        self, tmp_path: Path,
    ) -> None:
        """A pre-set event short-circuits the run but still yields a manifest."""
        ev = threading.Event()
        ev.set()
        calls: list[int] = []
        t0 = time.monotonic()
        res = _run(
            tmp_path, "multi_pipeline", n_docs=5,
            progress_callback=lambda e, i, d: calls.append(i),
            cancel_event=ev,
        )
        elapsed = time.monotonic() - t0
        assert elapsed < 10.0, f"pas de short-circuit (elapsed={elapsed})"
        # 2 pipelines x 5 = 10 possible notifications; a cancelled run must
        # report strictly fewer (often 0).
        assert len(calls) < 10, (
            f"annulation pré-set ignorée : {len(calls)}/10 docs traités"
        )
        assert res.run_result.manifest is not None

    def test_cancel_mid_run_via_callback_propagates(
        self, tmp_path: Path,
    ) -> None:
        """Setting the event from the callback skips the remaining documents.

        The event is ``set()`` AFTER the first document, from inside the
        progress callback; this only works if the very same event object is
        the one observed by the layers that process documents.
        """
        ev = threading.Event()
        seen: list[str] = []

        def cb(engine: str, idx: int, doc_id: str) -> None:
            seen.append(doc_id)
            if len(seen) == 1:
                ev.set()  # cancel right after the very first document

        _run(
            tmp_path, "single_linear", n_docs=8,
            progress_callback=cb, cancel_event=ev,
        )
        # Without correct propagation all 8 documents go through. The bound
        # is deliberately loose: per the original author's note the runner
        # keeps up to two documents in flight (max_in_flight=2).
        assert len(seen) < 8, (
            f"cancel mi-run NON propagé : {len(seen)}/8 docs traités "
            "— régression identité d'objet event (Phase B)"
        )
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
# ---------------------------------------------------------------------------
|
| 363 |
+
# 3. Interruption réelle → resume cohérent (Risque 4)
|
| 364 |
+
# ---------------------------------------------------------------------------
|
| 365 |
+
|
| 366 |
+
class TestCrashResumeConsistency:
    """Real interruption followed by a resume on the same ``partial_dir``.

    The first run is stopped mid-way through ``cancel_event``; a second run
    is then started WITHOUT cancellation and with the same ``partial_dir``
    (this is not a pre-seeded, already complete partial store).

    PRE-EXISTING DEFECT FOUND BY THIS HARNESS
    The partial store persists ``PipelineResult`` but NOT ``ViewResult``.
    On resume, documents reloaded from the partial store get their
    ``pipeline_results`` back but **not** their ``view_results`` (they are
    never recomputed). As a consequence ``view_results.jsonl`` is incomplete
    after a resume, and any aggregate metric derived from the views is
    silently wrong for the resumed documents.

    These tests CHARACTERIZE the CURRENT behaviour, warts included, so that
    a refactoring does not make it worse AND so that a future fix is a
    conscious decision (the test will fail and force a review). The defect
    is reported, not fixed here.
    """

    #: Relation between ``view_results`` and ``pipeline_results`` AFTER a
    #: resume, per topology, as actually observed: the defect depends on
    #: the topology (present for linear/DAG, absent for multi-pipeline with
    #: this interruption timing).
    _RESUME_VIEW_RELATION = {
        "single_linear": "strict_subset",  # defect: views are a strict subset
        "branching_dag": "strict_subset",  # same defect
        "multi_pipeline": "equal",  # defect does not show up here
    }

    @staticmethod
    def _with_partial_dir(body: str, partial: Path) -> str:
        """Append a ``partial_dir`` key to a topology YAML body.

        The key is written at the indentation of the body's first non-blank
        line, so it lands next to ``pipelines:`` whatever uniform
        indentation the body uses.
        NOTE(review): assumes ``partial_dir`` is a top-level run-spec key —
        confirm against the run-spec schema.
        """
        first = next((ln for ln in body.split("\n") if ln.strip()), "")
        indent = first[: len(first) - len(first.lstrip())]
        quoted = json.dumps(str(partial), ensure_ascii=False)
        return f"{body.rstrip()}\n{indent}partial_dir: {quoted}\n"

    def _persisted_doc_ids(self, results_dir: Path) -> tuple[list[str], list[str]]:
        """Return the sorted distinct document ids found in the pipeline and
        view result files of ``results_dir`` (in that order)."""
        pr = sorted({r["document_id"] for r in _jsonl(results_dir / "pipeline_results.jsonl")})
        vr = sorted({r["document_id"] for r in _jsonl(results_dir / "view_results.jsonl")})
        return pr, vr

    def _run_interrupted_then_resumed(
        self, tmp_path: Path, n_docs: int, stop_after: int, topo: str,
    ) -> tuple[Path, Path, Any]:
        """Run once with a mid-run cancel, then once more without cancel.

        Both runs share ``tmp_path / "partial"`` as ``partial_dir`` and write
        to ``out1`` and ``out2`` respectively. The cancel event is set from
        the progress callback on its ``stop_after``-th invocation.

        Returns:
            ``(out1, out2, resumed)`` where ``resumed`` is the value returned
            by the second ``execute`` call.
        """
        sources, body = _TOPOLOGIES[topo]
        cz = _make_corpus_zip(tmp_path / "in", n_docs, sources=sources)
        partial = tmp_path / "partial"
        partial.mkdir()
        body_pd = self._with_partial_dir(body, partial)

        ev = threading.Event()
        notified: list[str] = []

        def cb(e: str, i: int, d: str) -> None:
            notified.append(d)
            if len(notified) == stop_after:
                ev.set()

        out1 = tmp_path / "out1"
        RunOrchestrator(out1).execute(
            _spec(cz, out1, body_pd), progress_callback=cb, cancel_event=ev,
        )
        out2 = tmp_path / "out2"
        resumed = RunOrchestrator(out2).execute(_spec(cz, out2, body_pd))
        return out1, out2, resumed

    def _interrupt_then_resume(
        self, tmp_path: Path, n_docs: int, stop_after: int,
        topo: str = "single_linear",
    ) -> tuple[Path, Path]:
        """Run the interrupt/resume scenario and return both ``results``
        directories: ``(interrupted, resumed)``."""
        out1, out2, _ = self._run_interrupted_then_resumed(
            tmp_path, n_docs, stop_after, topo,
        )
        return out1 / "results", out2 / "results"

    def test_clean_run_pipeline_and_views_are_consistent(
        self, tmp_path: Path,
    ) -> None:
        """Reference: in a CLEAN run, pipeline and view results cover the
        same documents (all of them on both sides)."""
        sources, body = _TOPOLOGIES["single_linear"]
        cz = _make_corpus_zip(tmp_path / "in", 5, sources=sources)
        out = tmp_path / "out"
        RunOrchestrator(out).execute(_spec(cz, out, body))
        pr, vr = self._persisted_doc_ids(out / "results")
        assert pr == vr == ["doc01", "doc02", "doc03", "doc04", "doc05"]

    def test_resume_pipeline_results_complete(self, tmp_path: Path) -> None:
        """The resume does rebuild every ``pipeline_results`` entry (loaded
        partial + replayed remainder) — this part is correct."""
        _, resumed = self._interrupt_then_resume(tmp_path, 5, stop_after=2)
        pr, _ = self._persisted_doc_ids(resumed)
        assert pr == ["doc01", "doc02", "doc03", "doc04", "doc05"]

    @pytest.mark.parametrize(
        "topo", ["single_linear", "multi_pipeline", "branching_dag"],
    )
    def test_resume_view_vs_pipeline_relation_DEFECT_characterized(
        self, tmp_path: Path, topo: str,
    ) -> None:
        """CHARACTERIZES THE DEFECT (topology-dependent).

        After a resume ``pipeline_results`` covers the whole corpus, but the
        relation between ``view_results`` and ``pipeline_results`` depends
        on the topology (see ``_RESUME_VIEW_RELATION``):

        - ``single_linear`` / ``branching_dag``: views are a strict subset
          of pipeline results — views of documents reloaded from the
          partial store are never recomputed.
        - ``multi_pipeline``: views equal pipeline results (the defect does
          not show up with this interruption timing).

        Any change to one of these relations (refactoring, or a fix of the
        defect) makes this test fail and forces a conscious review.
        """
        _, resumed = self._interrupt_then_resume(
            tmp_path, 5, stop_after=2, topo=topo,
        )
        pr, vr = self._persisted_doc_ids(resumed)
        full = ["doc01", "doc02", "doc03", "doc04", "doc05"]
        assert pr == full, f"pipeline incomplet au resume ({topo}): {pr}"
        rel = self._RESUME_VIEW_RELATION[topo]
        if rel == "strict_subset":
            assert set(vr) < set(pr), (
                f"[{topo}] défaut resume/vues changé : pipeline={pr} "
                f"vues={vr}. Attendu : vues ⊊ pipeline."
            )
        else:
            assert set(vr) == set(pr), (
                f"[{topo}] relation resume/vues changée : pipeline={pr}"
                f" vues={vr}. Attendu : vues == pipeline."
            )

    def test_resume_does_not_duplicate_documents(
        self, tmp_path: Path,
    ) -> None:
        """After an interruption on the first document, the resumed run
        reports each of the four documents exactly once."""
        _, _, res = self._run_interrupted_then_resumed(
            tmp_path, 4, stop_after=1, topo="single_linear",
        )
        doc_ids = [d.document_id for d in res.run_result.document_results]
        assert sorted(doc_ids) == ["doc01", "doc02", "doc03", "doc04"]
        assert len(doc_ids) == len(set(doc_ids)), (
            f"docs dupliqués au resume : {doc_ids}"
        )
|
| 514 |
+
|
| 515 |
+
|
| 516 |
+
# ---------------------------------------------------------------------------
|
| 517 |
+
# 4. Golden multi-topologie + garde déterminisme (Risques 3 & 4)
|
| 518 |
+
# ---------------------------------------------------------------------------
|
| 519 |
+
|
| 520 |
+
class TestGoldenMultiTopology:
    """Golden snapshots of the run artefacts, one per entry of ``_TOPOLOGIES``,
    plus a guard that refuses to freeze a non-reproducible snapshot."""

    @pytest.mark.parametrize("topo", sorted(_TOPOLOGIES))
    def test_snapshot_matches_golden(
        self, tmp_path: Path, topo: str,
    ) -> None:
        """The normalised snapshot of a 3-document run equals the committed
        golden file named after the topology."""
        _run(tmp_path, topo, n_docs=3)
        snap = _normalized_snapshot(tmp_path / "out" / "results")
        _golden(topo, snap)

    @pytest.mark.parametrize("topo", sorted(_TOPOLOGIES))
    def test_snapshot_is_deterministic(
        self, tmp_path: Path, topo: str,
    ) -> None:
        """Anti-flaky guard: two runs of the SAME spec must produce an
        identical normalised snapshot. If this fails the golden would be
        flaky, so it must not be frozen."""
        _run(tmp_path / "a", topo, n_docs=3)
        _run(tmp_path / "b", topo, n_docs=3)
        s1 = _normalized_snapshot(tmp_path / "a" / "out" / "results")
        s2 = _normalized_snapshot(tmp_path / "b" / "out" / "results")
        assert s1 == s2, (
            f"snapshot NON déterministe pour {topo} — un golden serait "
            "flaky ; corriger la normalisation AVANT de figer"
        )
|
| 544 |
+
|
| 545 |
+
|
| 546 |
+
# ---------------------------------------------------------------------------
|
| 547 |
+
# 5. Isolation concurrente (Risque 5)
|
| 548 |
+
# ---------------------------------------------------------------------------
|
| 549 |
+
|
| 550 |
+
class TestConcurrencyIsolation:
    """Two ``execute()`` calls in parallel threads, each with its own
    orchestrator instance, output directory and ``cancel_event``.

    Cancelling A must NOT disturb B (neither its progress nor its
    completeness). This pins the invariant that no state leaks between
    orchestrator instances, which a refactoring that reuses a collaborator
    across runs would break.
    """

    def test_cancel_in_thread_a_does_not_leak_into_thread_b(
        self, tmp_path: Path,
    ) -> None:
        """B processes all six documents while A, cancelled from the start,
        processes fewer than six."""
        sources, body = _TOPOLOGIES["single_linear"]
        cz = _make_corpus_zip(tmp_path / "in", 6, sources=sources)

        ev_a = threading.Event()
        ev_b = threading.Event()
        prog_a: list[int] = []
        prog_b: list[int] = []
        err: dict[str, BaseException] = {}

        def worker(name: str, ev: threading.Event, prog: list[int]) -> None:
            # Collect any failure so it surfaces in the main thread instead
            # of being lost with the worker.
            try:
                out = tmp_path / name
                RunOrchestrator(out).execute(
                    _spec(cz, out, body),
                    progress_callback=lambda e, i, d: prog.append(i),
                    cancel_event=ev,
                )
            except BaseException as e:  # noqa: BLE001
                err[name] = e

        ev_a.set()  # A is cancelled from the start; B is never cancelled.
        # Daemon threads: a hung run must not keep the interpreter alive
        # after the test has already failed.
        ta = threading.Thread(target=worker, args=("a", ev_a, prog_a), daemon=True)
        tb = threading.Thread(target=worker, args=("b", ev_b, prog_b), daemon=True)
        ta.start()
        tb.start()
        ta.join(timeout=30)
        tb.join(timeout=30)

        # ``join`` returns silently on timeout: check explicitly, otherwise
        # the assertions below would run against half-finished state.
        hung = [n for n, t in (("a", ta), ("b", tb)) if t.is_alive()]
        assert not hung, f"thread(s) encore actifs après 30 s : {hung}"
        assert not err, f"exception thread : {err}"
        # B (not cancelled) must have processed its 6 documents with a clean
        # 0..5 sequence — no leak from A's counter.
        assert sorted(prog_b) == [0, 1, 2, 3, 4, 5], (
            f"B perturbé par l'annulation de A : prog_b={sorted(prog_b)} "
            "— fuite d'état entre instances (Phase B)"
        )
        # A was cancelled: strictly fewer than 6.
        assert len(prog_a) < 6, f"A non annulé : {prog_a}"
|