Claude commited on
Commit
95c8668
·
unverified ·
1 Parent(s): cd404f5

refactor(app): Phase A — extrait les 4 builders @staticmethod (run_orchestrator)

Browse files

Suite à la question « pas <500 ? » : correction de mon erreur
d'analyse (j'avais classé ces staticmethods en « risqué » à tort —
sans self, extraction sûre, pattern verbatim prouvé 7×).

_load_corpus / _build_pipelines / _build_views / _build_benchmark
_service → run_orchestrator_helpers/builders.py. _build_views et
_build_benchmark_service : retrait complet (call-sites → nom
module-global). _build_pipelines + _load_corpus : wrapper mince
@staticmethod conservé (tests les appellent via orch.<m>()).

Couplage white-box rattrapé (la suite l'a attrapé, 3 échecs) :
- test_run_orchestrator_feature_parity : orch._load_corpus → wrapper
- test_sprint_a14_s53 : patch resolve_adapter_class retargeté vers
run_orchestrator_helpers.builders (le symbole a migré ; le test
white-box suit, ce n'est pas masquer un bug).

run_orchestrator.py : 1316 → 731 (-44 % cumulé). Budget CI
RATCHETÉ VERS LE BAS 1050 → 775 (731+6 %, pas +15 % : signale
l'intention <500, n'entérine pas la taille). 413 app/CLI/archi
verts, lint propre (ratchet ISC/FLY/G inclus).

https://claude.ai/code/session_01EmLiMPJJuB44QHEFzDWUvF

picarones/app/services/run_orchestrator.py CHANGED
@@ -36,10 +36,8 @@ Anti-sur-ingénierie
36
 
37
  from __future__ import annotations
38
 
39
- import io
40
  import logging
41
  import threading
42
- import zipfile
43
  from dataclasses import dataclass, field
44
  from pathlib import Path
45
  from typing import Any, Callable
@@ -47,30 +45,16 @@ from typing import Any, Callable
47
  logger = logging.getLogger(__name__)
48
 
49
  from picarones.app.results import ReportRenderer, RunResult
50
- from picarones.app.schemas import RunSpec, resolve_adapter_class
51
- from picarones.app.services.benchmark_service import BenchmarkService
52
  from picarones.app.services.dependencies import (
53
  capture_dependencies_lock,
54
  capture_system_binaries_lock,
55
  )
56
- from picarones.app.services.corpus_service import (
57
- CorpusImportError,
58
- CorpusService,
59
- )
60
  from picarones.app.services.path_security import WorkspaceManager
61
  from picarones.app.services.registry_service import RegistryService
62
  from picarones.domain.corpus import CorpusSpec
63
- from picarones.evaluation.views import (
64
- DefaultEvaluationViewExecutor,
65
- build_alto_view,
66
- build_search_view,
67
- build_text_view,
68
- )
69
  from picarones.pipeline import (
70
- CorpusRunner,
71
- PipelineExecutor,
72
  PipelineSpec,
73
- PipelineStep,
74
  )
75
 
76
  # Helpers stateless extraits (audit prod P1 — dégonflage god-module).
@@ -80,10 +64,14 @@ from picarones.pipeline import (
80
  # (donc ``monkeypatch.setattr(run_orchestrator, …)`` fonctionne aussi).
81
  from picarones.app.services.run_orchestrator_helpers import (
82
  _PipelineEngineProxy as _PipelineEngineProxy,
 
 
 
83
  _default_gt_factory as _default_gt_factory,
84
  _default_inputs_factory as _default_inputs_factory,
85
  _filesystem_payload_loader as _filesystem_payload_loader,
86
  _kwargs_signature as _kwargs_signature,
 
87
  _make_context_factory as _make_context_factory,
88
  _persist_legacy_benchmark_json as _persist_legacy_benchmark_json,
89
  _resolve_entity_extractor as _resolve_entity_extractor,
@@ -203,7 +191,7 @@ class RunOrchestrator:
203
  workspace = WorkspaceManager(self._output_dir)
204
 
205
  # 1. Corpus.
206
- corpus_spec, extracted_dir = self._load_corpus(spec, workspace)
207
 
208
  # 2. Registres.
209
  registries = RegistryService.bootstrap_defaults()
@@ -215,14 +203,14 @@ class RunOrchestrator:
215
 
216
  # 4. Vues canoniques. Phase B2.5 — propage normalization +
217
  # char_exclude aux vues text_final/searchability.
218
- views = self._build_views(
219
  spec.views,
220
  normalization_profile=spec.normalization_profile,
221
  char_exclude=spec.char_exclude,
222
  )
223
 
224
  # 5. BenchmarkService.
225
- bench = self._build_benchmark_service(
226
  registries=registries,
227
  adapter_resolver=adapter_resolver,
228
  code_version=spec.code_version,
@@ -379,12 +367,12 @@ class RunOrchestrator:
379
  self._output_dir.mkdir(parents=True, exist_ok=True)
380
 
381
  registries = RegistryService.bootstrap_defaults()
382
- views = self._build_views(
383
  spec.views,
384
  normalization_profile=spec.normalization_profile,
385
  char_exclude=spec.char_exclude,
386
  )
387
- bench = self._build_benchmark_service(
388
  registries=registries,
389
  adapter_resolver=adapter_resolver,
390
  code_version=spec.code_version,
@@ -472,112 +460,25 @@ class RunOrchestrator:
472
  def _load_corpus(
473
  spec: RunSpec, workspace: WorkspaceManager,
474
  ) -> tuple[CorpusSpec, Path]:
475
- """Charge le corpus selon ``corpus_zip`` ou ``corpus_dir``."""
476
- corpus_service = CorpusService(workspace)
477
- if spec.corpus_zip is not None:
478
- zip_path = Path(spec.corpus_zip)
479
- zip_bytes = zip_path.read_bytes()
480
- report = corpus_service.import_zip(
481
- zip_bytes,
482
- corpus_name=spec.corpus_name or zip_path.stem,
483
- metadata=spec.corpus_metadata,
484
- )
485
- return report.spec, report.extracted_dir
486
-
487
- # corpus_dir : on zippe à la volée le contenu du dir et on
488
- # délègue à ``CorpusService`` — réutilise toute la détection
489
- # sans dupliquer la logique de classification image / GT.
490
- assert spec.corpus_dir is not None # garanti par RunSpec validator
491
- src_dir = Path(spec.corpus_dir)
492
- if not src_dir.is_dir():
493
- raise CorpusImportError(
494
- f"corpus_dir n'est pas un répertoire : {src_dir!r}.",
495
- )
496
- buf = io.BytesIO()
497
- with zipfile.ZipFile(buf, mode="w") as zf:
498
- for file_path in src_dir.rglob("*"):
499
- if file_path.is_file():
500
- arc = file_path.relative_to(src_dir).as_posix()
501
- zf.write(file_path, arcname=arc)
502
- report = corpus_service.import_zip(
503
- buf.getvalue(),
504
- corpus_name=spec.corpus_name or src_dir.name,
505
- metadata=spec.corpus_metadata,
506
- )
507
- return report.spec, report.extracted_dir
508
 
509
  @staticmethod
510
  def _build_pipelines(
511
  spec: RunSpec,
512
- ) -> tuple[
513
- list[PipelineSpec],
514
- Callable[[str], Any],
515
- dict[str, dict[str, Any]],
516
- ]:
517
- """Construit les ``PipelineSpec`` + un resolver d'adapters.
518
-
519
- Disambiguation des steps :
520
-
521
- - Deux steps avec la même ``(class, kwargs)`` partagent la
522
- même instance d'adapter (cache).
523
- - Deux steps avec la même ``id`` mais une ``class`` ou des
524
- ``kwargs`` différents reçoivent des ``adapter_name``
525
- distincts (préfixés par le nom de pipeline).
526
-
527
- C'est essentiel pour le cas où plusieurs pipelines utilisent
528
- la **même classe** avec des **kwargs différents** (ex :
529
- ``PrecomputedTextAdapter`` instancié N fois avec
530
- ``source_label`` distincts).
531
- """
532
- instance_cache: dict[str, Any] = {}
533
- registered: dict[str, tuple[type, str]] = {}
534
- name_to_class: dict[str, type] = {}
535
- name_to_kwargs: dict[str, dict[str, Any]] = {}
536
-
537
- pipeline_specs: list[PipelineSpec] = []
538
- for p in spec.pipelines:
539
- steps: list[PipelineStep] = []
540
- for s in p.steps:
541
- cls = resolve_adapter_class(s.adapter_class)
542
- kwargs_sig = _kwargs_signature(s.adapter_kwargs)
543
- adapter_name = s.id
544
- existing = registered.get(adapter_name)
545
- if existing is not None and existing != (cls, kwargs_sig):
546
- adapter_name = f"{p.name}__{s.id}"
547
- registered[adapter_name] = (cls, kwargs_sig)
548
- name_to_class[adapter_name] = cls
549
- name_to_kwargs[adapter_name] = s.adapter_kwargs
550
- # ``inputs_from`` du StepSpec YAML doit être propagé au
551
- # ``domain.PipelineSpec`` pour que le DAG branchant soit
552
- # honoré ; sans ce passage, un DAG branchant déclaré dans
553
- # le YAML serait silencieusement exécuté en linéaire.
554
- steps.append(PipelineStep(
555
- id=s.id,
556
- kind="step",
557
- adapter_name=adapter_name,
558
- input_types=s.input_types,
559
- output_types=s.output_types,
560
- inputs_from=dict(s.inputs_from),
561
- ))
562
- pipeline_specs.append(PipelineSpec(
563
- name=p.name,
564
- initial_inputs=p.initial_inputs,
565
- steps=tuple(steps),
566
- ))
567
-
568
- def resolver(name: str) -> Any:
569
- if name not in instance_cache:
570
- cls = name_to_class[name]
571
- kwargs = name_to_kwargs[name]
572
- instance_cache[name] = cls(**kwargs)
573
- return instance_cache[name]
574
-
575
- # Copie défensive — le manifest doit recevoir un snapshot
576
- # immuable, pas la map vivante du resolver.
577
- adapter_kwargs_dump = {
578
- name: dict(kwargs) for name, kwargs in name_to_kwargs.items()
579
- }
580
- return pipeline_specs, resolver, adapter_kwargs_dump
581
 
582
  def _execute_with_partial(
583
  self,
@@ -628,7 +529,6 @@ class RunOrchestrator:
628
  load_partial_pipeline_results,
629
  partial_path_for_pipeline,
630
  )
631
- from picarones.domain.corpus import CorpusSpec
632
  from picarones.domain.run_manifest import RunManifest
633
  from picarones.pipeline.run_result import RunDocumentResult
634
 
@@ -835,77 +735,6 @@ class RunOrchestrator:
835
  document_results=tuple(final_doc_results),
836
  )
837
 
838
- @staticmethod
839
- def _build_views(
840
- view_names: tuple[str, ...],
841
- *,
842
- normalization_profile: str | None = None,
843
- char_exclude: str | None = None,
844
- ) -> list[Any]:
845
- """Map noms canoniques → vues construites.
846
-
847
- Phase B2.5 — ``normalization_profile`` et ``char_exclude``
848
- sont propagés aux vues qui les supportent (``text_final`` et
849
- ``searchability``). ``alto_documentary`` les ignore : ses
850
- métriques structurelles n'opèrent pas sur du texte.
851
- """
852
- text_view_kwargs = {
853
- "normalization_profile": normalization_profile,
854
- "char_exclude": char_exclude,
855
- }
856
- builders: dict[str, Callable[[], Any]] = {
857
- "text_final": lambda: build_text_view(**text_view_kwargs),
858
- "alto_documentary": build_alto_view,
859
- "searchability": lambda: build_search_view(**text_view_kwargs),
860
- }
861
- return [builders[name]() for name in view_names]
862
-
863
- @staticmethod
864
- def _build_benchmark_service(
865
- *,
866
- registries: RegistryService,
867
- adapter_resolver: Callable[[str], Any],
868
- code_version: str,
869
- cancel_event: threading.Event | None = None,
870
- timeout_seconds_per_doc: float = 300.0,
871
- ) -> BenchmarkService:
872
- """Assemble ``BenchmarkService`` avec un loader filesystem.
873
-
874
- Phase B2.2 — quand ``cancel_event`` est fourni, le
875
- ``CorpusRunner.run`` est wrappé pour injecter l'event dans
876
- chaque appel. Pattern strictement copié de
877
- ``_benchmark_execution.py:142-149`` (legacy).
878
- """
879
- pipeline_executor = PipelineExecutor(
880
- adapter_resolver=adapter_resolver,
881
- )
882
- corpus_runner = CorpusRunner(
883
- pipeline_executor,
884
- max_in_flight=2,
885
- timeout_seconds_per_doc=timeout_seconds_per_doc,
886
- poll_interval_seconds=0.05,
887
- )
888
-
889
- if cancel_event is not None:
890
- original_run = corpus_runner.run
891
-
892
- def _runner_run_with_cancel(*args: Any, **kwargs: Any) -> Any:
893
- kwargs.setdefault("cancel_event", cancel_event)
894
- return original_run(*args, **kwargs)
895
-
896
- corpus_runner.run = _runner_run_with_cancel # type: ignore[method-assign]
897
-
898
- view_executor = DefaultEvaluationViewExecutor.from_registries(
899
- registries.metrics,
900
- registries.projectors,
901
- _filesystem_payload_loader,
902
- )
903
- return BenchmarkService(
904
- corpus_runner=corpus_runner,
905
- view_executor=view_executor,
906
- code_version=code_version,
907
- )
908
-
909
 
910
  __all__ = [
911
  "OrchestrationResult",
 
36
 
37
  from __future__ import annotations
38
 
 
39
  import logging
40
  import threading
 
41
  from dataclasses import dataclass, field
42
  from pathlib import Path
43
  from typing import Any, Callable
 
45
  logger = logging.getLogger(__name__)
46
 
47
  from picarones.app.results import ReportRenderer, RunResult
48
+ from picarones.app.schemas import RunSpec
 
49
  from picarones.app.services.dependencies import (
50
  capture_dependencies_lock,
51
  capture_system_binaries_lock,
52
  )
 
 
 
 
53
  from picarones.app.services.path_security import WorkspaceManager
54
  from picarones.app.services.registry_service import RegistryService
55
  from picarones.domain.corpus import CorpusSpec
 
 
 
 
 
 
56
  from picarones.pipeline import (
 
 
57
  PipelineSpec,
 
58
  )
59
 
60
  # Helpers stateless extraits (audit prod P1 — dégonflage god-module).
 
64
  # (donc ``monkeypatch.setattr(run_orchestrator, …)`` fonctionne aussi).
65
  from picarones.app.services.run_orchestrator_helpers import (
66
  _PipelineEngineProxy as _PipelineEngineProxy,
67
+ _build_benchmark_service as _build_benchmark_service,
68
+ _build_pipelines as _build_pipelines,
69
+ _build_views as _build_views,
70
  _default_gt_factory as _default_gt_factory,
71
  _default_inputs_factory as _default_inputs_factory,
72
  _filesystem_payload_loader as _filesystem_payload_loader,
73
  _kwargs_signature as _kwargs_signature,
74
+ _load_corpus as _load_corpus,
75
  _make_context_factory as _make_context_factory,
76
  _persist_legacy_benchmark_json as _persist_legacy_benchmark_json,
77
  _resolve_entity_extractor as _resolve_entity_extractor,
 
191
  workspace = WorkspaceManager(self._output_dir)
192
 
193
  # 1. Corpus.
194
+ corpus_spec, extracted_dir = _load_corpus(spec, workspace)
195
 
196
  # 2. Registres.
197
  registries = RegistryService.bootstrap_defaults()
 
203
 
204
  # 4. Vues canoniques. Phase B2.5 — propage normalization +
205
  # char_exclude aux vues text_final/searchability.
206
+ views = _build_views(
207
  spec.views,
208
  normalization_profile=spec.normalization_profile,
209
  char_exclude=spec.char_exclude,
210
  )
211
 
212
  # 5. BenchmarkService.
213
+ bench = _build_benchmark_service(
214
  registries=registries,
215
  adapter_resolver=adapter_resolver,
216
  code_version=spec.code_version,
 
367
  self._output_dir.mkdir(parents=True, exist_ok=True)
368
 
369
  registries = RegistryService.bootstrap_defaults()
370
+ views = _build_views(
371
  spec.views,
372
  normalization_profile=spec.normalization_profile,
373
  char_exclude=spec.char_exclude,
374
  )
375
+ bench = _build_benchmark_service(
376
  registries=registries,
377
  adapter_resolver=adapter_resolver,
378
  code_version=spec.code_version,
 
460
  def _load_corpus(
461
  spec: RunSpec, workspace: WorkspaceManager,
462
  ) -> tuple[CorpusSpec, Path]:
463
+ """Wrapper mince délègue à
464
+ :func:`run_orchestrator_helpers.builders._load_corpus`
465
+ (audit Phase A). Conservé comme ``@staticmethod`` car un test
466
+ de parité l'appelle via ``orch._load_corpus(...)`` pour
467
+ recalculer un fingerprint de partial cohérent."""
468
+ return _load_corpus(spec, workspace)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  @staticmethod
471
  def _build_pipelines(
472
  spec: RunSpec,
473
+ ) -> tuple[list[PipelineSpec], Callable[[str], Any], dict[str, dict[str, Any]]]:
474
+ """Wrapper mince — délègue à
475
+ :func:`run_orchestrator_helpers.builders._build_pipelines`
476
+ (audit Phase A : corps extrait hors du god-module). Conservé
477
+ comme ``@staticmethod`` car un test l'appelle via
478
+ ``orch._build_pipelines(spec)`` ; ``_build_pipelines`` réfère
479
+ ici le nom module-global réimporté (pas de récursion : la
480
+ méthode de classe n'est pas dans les globals de la fonction)."""
481
+ return _build_pipelines(spec)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
 
483
  def _execute_with_partial(
484
  self,
 
529
  load_partial_pipeline_results,
530
  partial_path_for_pipeline,
531
  )
 
532
  from picarones.domain.run_manifest import RunManifest
533
  from picarones.pipeline.run_result import RunDocumentResult
534
 
 
735
  document_results=tuple(final_doc_results),
736
  )
737
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
738
 
739
  __all__ = [
740
  "OrchestrationResult",
picarones/app/services/run_orchestrator_helpers/__init__.py CHANGED
@@ -6,6 +6,8 @@ en sous-modules cohésifs :
6
 
7
  - :mod:`.factories` — GT / inputs / RunContext (stateless)
8
  - :mod:`.loaders` — payload filesystem + signature kwargs
 
 
9
  - :mod:`.legacy` — pont converter ``BenchmarkResult`` + résolution
10
  NER + persistance JSON legacy
11
 
@@ -25,6 +27,12 @@ from picarones.app.services.run_orchestrator_helpers.loaders import (
25
  _filesystem_payload_loader as _filesystem_payload_loader,
26
  _kwargs_signature as _kwargs_signature,
27
  )
 
 
 
 
 
 
28
  from picarones.app.services.run_orchestrator_helpers.legacy import (
29
  _PipelineEngineProxy as _PipelineEngineProxy,
30
  _persist_legacy_benchmark_json as _persist_legacy_benchmark_json,
@@ -33,10 +41,14 @@ from picarones.app.services.run_orchestrator_helpers.legacy import (
33
 
34
  __all__ = [
35
  "_PipelineEngineProxy",
 
 
 
36
  "_default_gt_factory",
37
  "_default_inputs_factory",
38
  "_filesystem_payload_loader",
39
  "_kwargs_signature",
 
40
  "_make_context_factory",
41
  "_persist_legacy_benchmark_json",
42
  "_resolve_entity_extractor",
 
6
 
7
  - :mod:`.factories` — GT / inputs / RunContext (stateless)
8
  - :mod:`.loaders` — payload filesystem + signature kwargs
9
+ - :mod:`.builders` — corpus / pipelines / vues / BenchmarkService
10
+ (ex-``@staticmethod`` du god-module, Phase A)
11
  - :mod:`.legacy` — pont converter ``BenchmarkResult`` + résolution
12
  NER + persistance JSON legacy
13
 
 
27
  _filesystem_payload_loader as _filesystem_payload_loader,
28
  _kwargs_signature as _kwargs_signature,
29
  )
30
+ from picarones.app.services.run_orchestrator_helpers.builders import (
31
+ _build_benchmark_service as _build_benchmark_service,
32
+ _build_pipelines as _build_pipelines,
33
+ _build_views as _build_views,
34
+ _load_corpus as _load_corpus,
35
+ )
36
  from picarones.app.services.run_orchestrator_helpers.legacy import (
37
  _PipelineEngineProxy as _PipelineEngineProxy,
38
  _persist_legacy_benchmark_json as _persist_legacy_benchmark_json,
 
41
 
42
  __all__ = [
43
  "_PipelineEngineProxy",
44
+ "_build_benchmark_service",
45
+ "_build_pipelines",
46
+ "_build_views",
47
  "_default_gt_factory",
48
  "_default_inputs_factory",
49
  "_filesystem_payload_loader",
50
  "_kwargs_signature",
51
+ "_load_corpus",
52
  "_make_context_factory",
53
  "_persist_legacy_benchmark_json",
54
  "_resolve_entity_extractor",
picarones/app/services/run_orchestrator_helpers/builders.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Constructeurs stateless du ``RunOrchestrator`` (corpus / pipelines /
2
+ vues / service).
3
+
4
+ Audit prod Phase A — extraction des 4 ``@staticmethod`` (sans
5
+ ``self``) hors du god-module ``run_orchestrator.py``. Déplacement
6
+ verbatim, comportement strictement préservé : ``run_orchestrator``
7
+ réimporte ces noms (façade) et conserve un wrapper mince
8
+ ``_build_pipelines`` (un test l'appelle via ``orch._build_pipelines``).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import io
14
+ import threading
15
+ import zipfile
16
+ from pathlib import Path
17
+ from typing import Any, Callable
18
+
19
+ from picarones.app.schemas import RunSpec, resolve_adapter_class
20
+ from picarones.app.services.benchmark_service import BenchmarkService
21
+ from picarones.app.services.corpus_service import (
22
+ CorpusImportError,
23
+ CorpusService,
24
+ )
25
+ from picarones.app.services.path_security import WorkspaceManager
26
+ from picarones.app.services.registry_service import RegistryService
27
+ from picarones.domain.corpus import CorpusSpec
28
+ from picarones.evaluation.views import (
29
+ DefaultEvaluationViewExecutor,
30
+ build_alto_view,
31
+ build_search_view,
32
+ build_text_view,
33
+ )
34
+ from picarones.pipeline import (
35
+ CorpusRunner,
36
+ PipelineExecutor,
37
+ PipelineSpec,
38
+ PipelineStep,
39
+ )
40
+
41
+ from picarones.app.services.run_orchestrator_helpers.loaders import (
42
+ _filesystem_payload_loader,
43
+ _kwargs_signature,
44
+ )
45
+
46
+
47
+ def _load_corpus(
48
+ spec: RunSpec, workspace: WorkspaceManager,
49
+ ) -> tuple[CorpusSpec, Path]:
50
+ """Charge le corpus selon ``corpus_zip`` ou ``corpus_dir``."""
51
+ corpus_service = CorpusService(workspace)
52
+ if spec.corpus_zip is not None:
53
+ zip_path = Path(spec.corpus_zip)
54
+ zip_bytes = zip_path.read_bytes()
55
+ report = corpus_service.import_zip(
56
+ zip_bytes,
57
+ corpus_name=spec.corpus_name or zip_path.stem,
58
+ metadata=spec.corpus_metadata,
59
+ )
60
+ return report.spec, report.extracted_dir
61
+
62
+ # corpus_dir : on zippe à la volée le contenu du dir et on
63
+ # délègue à ``CorpusService`` — réutilise toute la détection
64
+ # sans dupliquer la logique de classification image / GT.
65
+ assert spec.corpus_dir is not None # garanti par RunSpec validator
66
+ src_dir = Path(spec.corpus_dir)
67
+ if not src_dir.is_dir():
68
+ raise CorpusImportError(
69
+ f"corpus_dir n'est pas un répertoire : {src_dir!r}.",
70
+ )
71
+ buf = io.BytesIO()
72
+ with zipfile.ZipFile(buf, mode="w") as zf:
73
+ for file_path in src_dir.rglob("*"):
74
+ if file_path.is_file():
75
+ arc = file_path.relative_to(src_dir).as_posix()
76
+ zf.write(file_path, arcname=arc)
77
+ report = corpus_service.import_zip(
78
+ buf.getvalue(),
79
+ corpus_name=spec.corpus_name or src_dir.name,
80
+ metadata=spec.corpus_metadata,
81
+ )
82
+ return report.spec, report.extracted_dir
83
+
84
+
85
+ def _build_pipelines(
86
+ spec: RunSpec,
87
+ ) -> tuple[
88
+ list[PipelineSpec],
89
+ Callable[[str], Any],
90
+ dict[str, dict[str, Any]],
91
+ ]:
92
+ """Construit les ``PipelineSpec`` + un resolver d'adapters.
93
+
94
+ Disambiguation des steps :
95
+
96
+ - Deux steps avec la même ``(class, kwargs)`` partagent la
97
+ même instance d'adapter (cache).
98
+ - Deux steps avec la même ``id`` mais une ``class`` ou des
99
+ ``kwargs`` différents reçoivent des ``adapter_name``
100
+ distincts (préfixés par le nom de pipeline).
101
+
102
+ C'est essentiel pour le cas où plusieurs pipelines utilisent
103
+ la **même classe** avec des **kwargs différents** (ex :
104
+ ``PrecomputedTextAdapter`` instancié N fois avec
105
+ ``source_label`` distincts).
106
+ """
107
+ instance_cache: dict[str, Any] = {}
108
+ registered: dict[str, tuple[type, str]] = {}
109
+ name_to_class: dict[str, type] = {}
110
+ name_to_kwargs: dict[str, dict[str, Any]] = {}
111
+
112
+ pipeline_specs: list[PipelineSpec] = []
113
+ for p in spec.pipelines:
114
+ steps: list[PipelineStep] = []
115
+ for s in p.steps:
116
+ cls = resolve_adapter_class(s.adapter_class)
117
+ kwargs_sig = _kwargs_signature(s.adapter_kwargs)
118
+ adapter_name = s.id
119
+ existing = registered.get(adapter_name)
120
+ if existing is not None and existing != (cls, kwargs_sig):
121
+ adapter_name = f"{p.name}__{s.id}"
122
+ registered[adapter_name] = (cls, kwargs_sig)
123
+ name_to_class[adapter_name] = cls
124
+ name_to_kwargs[adapter_name] = s.adapter_kwargs
125
+ # ``inputs_from`` du StepSpec YAML doit être propagé au
126
+ # ``domain.PipelineSpec`` pour que le DAG branchant soit
127
+ # honoré ; sans ce passage, un DAG branchant déclaré dans
128
+ # le YAML serait silencieusement exécuté en linéaire.
129
+ steps.append(PipelineStep(
130
+ id=s.id,
131
+ kind="step",
132
+ adapter_name=adapter_name,
133
+ input_types=s.input_types,
134
+ output_types=s.output_types,
135
+ inputs_from=dict(s.inputs_from),
136
+ ))
137
+ pipeline_specs.append(PipelineSpec(
138
+ name=p.name,
139
+ initial_inputs=p.initial_inputs,
140
+ steps=tuple(steps),
141
+ ))
142
+
143
+ def resolver(name: str) -> Any:
144
+ if name not in instance_cache:
145
+ cls = name_to_class[name]
146
+ kwargs = name_to_kwargs[name]
147
+ instance_cache[name] = cls(**kwargs)
148
+ return instance_cache[name]
149
+
150
+ # Copie défensive — le manifest doit recevoir un snapshot
151
+ # immuable, pas la map vivante du resolver.
152
+ adapter_kwargs_dump = {
153
+ name: dict(kwargs) for name, kwargs in name_to_kwargs.items()
154
+ }
155
+ return pipeline_specs, resolver, adapter_kwargs_dump
156
+
157
+
158
+ def _build_views(
159
+ view_names: tuple[str, ...],
160
+ *,
161
+ normalization_profile: str | None = None,
162
+ char_exclude: str | None = None,
163
+ ) -> list[Any]:
164
+ """Map noms canoniques → vues construites.
165
+
166
+ Phase B2.5 — ``normalization_profile`` et ``char_exclude``
167
+ sont propagés aux vues qui les supportent (``text_final`` et
168
+ ``searchability``). ``alto_documentary`` les ignore : ses
169
+ métriques structurelles n'opèrent pas sur du texte.
170
+ """
171
+ text_view_kwargs = {
172
+ "normalization_profile": normalization_profile,
173
+ "char_exclude": char_exclude,
174
+ }
175
+ builders: dict[str, Callable[[], Any]] = {
176
+ "text_final": lambda: build_text_view(**text_view_kwargs),
177
+ "alto_documentary": build_alto_view,
178
+ "searchability": lambda: build_search_view(**text_view_kwargs),
179
+ }
180
+ return [builders[name]() for name in view_names]
181
+
182
+
183
+ def _build_benchmark_service(
184
+ *,
185
+ registries: RegistryService,
186
+ adapter_resolver: Callable[[str], Any],
187
+ code_version: str,
188
+ cancel_event: threading.Event | None = None,
189
+ timeout_seconds_per_doc: float = 300.0,
190
+ ) -> BenchmarkService:
191
+ """Assemble ``BenchmarkService`` avec un loader filesystem.
192
+
193
+ Phase B2.2 — quand ``cancel_event`` est fourni, le
194
+ ``CorpusRunner.run`` est wrappé pour injecter l'event dans
195
+ chaque appel. Pattern strictement copié de
196
+ ``_benchmark_execution.py:142-149`` (legacy).
197
+ """
198
+ pipeline_executor = PipelineExecutor(
199
+ adapter_resolver=adapter_resolver,
200
+ )
201
+ corpus_runner = CorpusRunner(
202
+ pipeline_executor,
203
+ max_in_flight=2,
204
+ timeout_seconds_per_doc=timeout_seconds_per_doc,
205
+ poll_interval_seconds=0.05,
206
+ )
207
+
208
+ if cancel_event is not None:
209
+ original_run = corpus_runner.run
210
+
211
+ def _runner_run_with_cancel(*args: Any, **kwargs: Any) -> Any:
212
+ kwargs.setdefault("cancel_event", cancel_event)
213
+ return original_run(*args, **kwargs)
214
+
215
+ corpus_runner.run = _runner_run_with_cancel # type: ignore[method-assign]
216
+
217
+ view_executor = DefaultEvaluationViewExecutor.from_registries(
218
+ registries.metrics,
219
+ registries.projectors,
220
+ _filesystem_payload_loader,
221
+ )
222
+ return BenchmarkService(
223
+ corpus_runner=corpus_runner,
224
+ view_executor=view_executor,
225
+ code_version=code_version,
226
+ )
227
+
228
+
229
+ __all__ = [
230
+ "_build_benchmark_service",
231
+ "_build_pipelines",
232
+ "_build_views",
233
+ "_load_corpus",
234
+ ]
tests/app/services/test_sprint_a14_s53_inputs_from_propagation.py CHANGED
@@ -59,7 +59,7 @@ def test_orchestrator_propagates_inputs_from_to_pipeline_step(
59
  # inputs_from.
60
  from unittest.mock import MagicMock, patch
61
  with patch(
62
- "picarones.app.services.run_orchestrator.resolve_adapter_class",
63
  return_value=MagicMock,
64
  ):
65
  pipeline_specs, _resolver, _kwargs = orch._build_pipelines(spec)
@@ -95,7 +95,7 @@ def test_step_without_inputs_from_yields_empty_dict(tmp_path) -> None:
95
  orch = RunOrchestrator(output_dir=tmp_path / "out")
96
  from unittest.mock import MagicMock, patch
97
  with patch(
98
- "picarones.app.services.run_orchestrator.resolve_adapter_class",
99
  return_value=MagicMock,
100
  ):
101
  pipeline_specs, _, _ = orch._build_pipelines(spec)
 
59
  # inputs_from.
60
  from unittest.mock import MagicMock, patch
61
  with patch(
62
+ "picarones.app.services.run_orchestrator_helpers.builders.resolve_adapter_class",
63
  return_value=MagicMock,
64
  ):
65
  pipeline_specs, _resolver, _kwargs = orch._build_pipelines(spec)
 
95
  orch = RunOrchestrator(output_dir=tmp_path / "out")
96
  from unittest.mock import MagicMock, patch
97
  with patch(
98
+ "picarones.app.services.run_orchestrator_helpers.builders.resolve_adapter_class",
99
  return_value=MagicMock,
100
  ):
101
  pipeline_specs, _, _ = orch._build_pipelines(spec)
tests/architecture/test_file_budgets.py CHANGED
@@ -124,9 +124,12 @@ FILE_BUDGETS: dict[str, int] = {
124
  "picarones/app/services/corpus_service.py": 625, # actuel 541
125
  "picarones/app/services/path_security.py": 470, # actuel 410
126
  # Audit prod P1 — dégonflage du god-module : helpers extraits, puis
127
- # (P1.1) ``run_orchestrator_helpers`` éclaté en sous-package cohésif
128
- # (factories/loaders/legacy, chacun < 400 : pas d'entrée budget).
129
- "picarones/app/services/run_orchestrator.py": 1050, # actuel ~913
 
 
 
130
  "picarones/adapters/ocr/tesseract.py": 560, # actuel 479 — Phase B5 migration Option B (+ ALTO_XML expose)
131
  "picarones/app/schemas/run_spec.py": 620, # actuel 530 — Phase B1 migration Option B (+90 LOC : 7 nouveaux champs + 2 validators)
132
  "picarones/reports/html/render.py": 700, # actuel 615
 
124
  "picarones/app/services/corpus_service.py": 625, # actuel 541
125
  "picarones/app/services/path_security.py": 470, # actuel 410
126
  # Audit prod P1 — dégonflage du god-module : helpers extraits, puis
127
+ # (P1.1) ``run_orchestrator_helpers`` éclaté en sous-package, puis
128
+ # (Phase A) 4 builders @staticmethod extraits builders.py.
129
+ # Budget RATCHETÉ VERS LE BAS (731 + ~6 %, pas +15 % paresseux) :
130
+ # signale l'intention de poursuivre vers <500 (Phase B), n'entérine
131
+ # pas la taille actuelle comme acceptable.
132
+ "picarones/app/services/run_orchestrator.py": 775, # actuel ~731
133
  "picarones/adapters/ocr/tesseract.py": 560, # actuel 479 — Phase B5 migration Option B (+ ALTO_XML expose)
134
  "picarones/app/schemas/run_spec.py": 620, # actuel 530 — Phase B1 migration Option B (+90 LOC : 7 nouveaux champs + 2 validators)
135
  "picarones/reports/html/render.py": 700, # actuel 615