# stage-a-grid-v3-gpu Summary

- generated_at: 2026-04-24T11:28:02.141941+00:00
- artifact_root: `Sentinel_models/artifacts/stage-a-grid-v3-gpu`
- model_count: 48
- ranking: test.stage_a_selection_score desc, then violation_f1 desc, then severity_accuracy desc

## Candidate Ranking

| Rank | Model | Encoder | Head | Variant | StageA | vF1 | Sev | Dom | Sub | Jur | Why | Prin | Rem | Diff | Agg |
| ---: | --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |
| 1 | `sentinel-rb-c-d11` | FacebookAI/roberta-large | columnar | d11 | 0.799 | 1.000 | 0.733 | 0.910 | 0.830 | 0.753 | 0.686 | 0.775 | 0.707 | 0.527 | 0.710 |
| 2 | `sentinel-rb-c-d30` | FacebookAI/roberta-large | columnar | d30 | 0.798 | 0.996 | 0.700 | 0.885 | 0.846 | 0.793 | 0.707 | 0.770 | 0.687 | 0.493 | 0.715 |
| 3 | `sentinel-rb-r-d31` | FacebookAI/roberta-large | recombine | d31 | 0.798 | 1.000 | 0.713 | 0.882 | 0.853 | 0.787 | 0.707 | 0.769 | 0.674 | 0.473 | 0.711 |
| 4 | `sentinel-rb-c-d10` | FacebookAI/roberta-large | columnar | d10 | 0.798 | 0.996 | 0.747 | 0.854 | 0.829 | 0.773 | 0.711 | 0.778 | 0.694 | 0.493 | 0.691 |
| 5 | `sentinel-rb-r-d30` | FacebookAI/roberta-large | recombine | d30 | 0.794 | 1.000 | 0.747 | 0.861 | 0.824 | 0.787 | 0.679 | 0.785 | 0.673 | 0.567 | 0.690 |
| 6 | `sentinel-gte-r-d31` | Alibaba-NLP/gte-modernbert-base | recombine | d31 | 0.793 | 0.996 | 0.720 | 0.933 | 0.830 | 0.747 | 0.695 | 0.753 | 0.673 | 0.487 | 0.640 |
| 7 | `sentinel-rb-d` | FacebookAI/roberta-large | direct | base | 0.791 | 0.996 | 0.727 | 0.883 | 0.836 | 0.760 | 0.702 | 0.763 | 0.665 | 0.500 | 0.708 |
| 8 | `sentinel-rb-r-d11` | FacebookAI/roberta-large | recombine | d11 | 0.791 | 0.996 | 0.740 | 0.843 | 0.845 | 0.747 | 0.696 | 0.776 | 0.687 | 0.560 | 0.733 |
| 9 | `sentinel-gte-c-d10` | Alibaba-NLP/gte-modernbert-base | columnar | d10 | 0.788 | 0.989 | 0.693 | 0.898 | 0.857 | 0.727 | 0.686 | 0.779 | 0.676 | 0.460 | 0.719 |
| 10 | `sentinel-rb-c-d31` | FacebookAI/roberta-large | columnar | d31 | 0.788 | 0.996 | 0.727 | 0.864 | 0.833 | 0.720 | 0.695 | 0.779 | 0.689 | 0.480 | 0.713 |
| 11 | `sentinel-rb-r-d51` | FacebookAI/roberta-large | recombine | d51 | 0.785 | 1.000 | 0.740 | 0.850 | 0.826 | 0.747 | 0.684 | 0.764 | 0.670 | 0.540 | 0.719 |
| 12 | `sentinel-mb-r-d31` | answerdotai/ModernBERT-base | recombine | d31 | 0.783 | 0.989 | 0.687 | 0.849 | 0.826 | 0.767 | 0.707 | 0.774 | 0.662 | 0.493 | 0.705 |
| 13 | `sentinel-mb-r-d10` | answerdotai/ModernBERT-base | recombine | d10 | 0.782 | 0.996 | 0.713 | 0.861 | 0.760 | 0.773 | 0.700 | 0.774 | 0.674 | 0.507 | 0.638 |
| 14 | `sentinel-gte-r-d10` | Alibaba-NLP/gte-modernbert-base | recombine | d10 | 0.781 | 0.993 | 0.693 | 0.876 | 0.859 | 0.733 | 0.683 | 0.740 | 0.670 | 0.513 | 0.715 |
| 15 | `sentinel-db-r-d11` | microsoft/deberta-v3-base | recombine | d11 | 0.780 | 1.000 | 0.740 | 0.829 | 0.819 | 0.733 | 0.673 | 0.787 | 0.662 | 0.473 | 0.684 |
| 16 | `sentinel-gte-r-d11` | Alibaba-NLP/gte-modernbert-base | recombine | d11 | 0.779 | 0.993 | 0.700 | 0.894 | 0.853 | 0.720 | 0.671 | 0.751 | 0.646 | 0.473 | 0.670 |
| 17 | `sentinel-gte-d` | Alibaba-NLP/gte-modernbert-base | direct | base | 0.778 | 0.978 | 0.713 | 0.871 | 0.837 | 0.733 | 0.684 | 0.744 | 0.665 | 0.467 | 0.600 |
| 18 | `sentinel-rb-c-d51` | FacebookAI/roberta-large | columnar | d51 | 0.777 | 0.996 | 0.747 | 0.832 | 0.836 | 0.727 | 0.661 | 0.766 | 0.655 | 0.533 | 0.669 |
| 19 | `sentinel-gte-c-d11` | Alibaba-NLP/gte-modernbert-base | columnar | d11 | 0.777 | 0.986 | 0.687 | 0.848 | 0.831 | 0.733 | 0.704 | 0.753 | 0.675 | 0.453 | 0.659 |
| 20 | `sentinel-gte-c-d30` | Alibaba-NLP/gte-modernbert-base | columnar | d30 | 0.777 | 0.989 | 0.693 | 0.896 | 0.821 | 0.707 | 0.689 | 0.754 | 0.667 | 0.487 | 0.653 |
| 21 | `sentinel-gte-c-d31` | Alibaba-NLP/gte-modernbert-base | columnar | d31 | 0.777 | 0.986 | 0.727 | 0.896 | 0.823 | 0.707 | 0.681 | 0.743 | 0.651 | 0.453 | 0.642 |
| 22 | `sentinel-bge-c-d31` | BAAI/bge-large-en-v1.5 | columnar | d31 | 0.776 | 0.982 | 0.733 | 0.885 | 0.822 | 0.733 | 0.685 | 0.712 | 0.658 | 0.453 | 0.657 |
| 23 | `sentinel-rb-r-d10` | FacebookAI/roberta-large | recombine | d10 | 0.775 | 1.000 | 0.713 | 0.812 | 0.806 | 0.760 | 0.671 | 0.770 | 0.667 | 0.513 | 0.701 |
| 24 | `sentinel-mb-r-d11` | answerdotai/ModernBERT-base | recombine | d11 | 0.773 | 1.000 | 0.667 | 0.823 | 0.780 | 0.753 | 0.687 | 0.778 | 0.693 | 0.487 | 0.673 |
| 25 | `sentinel-bge-r-d31` | BAAI/bge-large-en-v1.5 | recombine | d31 | 0.772 | 0.982 | 0.720 | 0.896 | 0.798 | 0.693 | 0.717 | 0.716 | 0.652 | 0.507 | 0.644 |
| 26 | `sentinel-e5-r-d31` | intfloat/e5-large-v2 | recombine | d31 | 0.769 | 0.996 | 0.680 | 0.865 | 0.781 | 0.767 | 0.677 | 0.730 | 0.659 | 0.600 | 0.636 |
| 27 | `sentinel-gte-c-d51` | Alibaba-NLP/gte-modernbert-base | columnar | d51 | 0.769 | 0.989 | 0.753 | 0.896 | 0.803 | 0.687 | 0.675 | 0.718 | 0.633 | 0.493 | 0.606 |
| 28 | `sentinel-db-c-d51` | microsoft/deberta-v3-base | columnar | d51 | 0.768 | 1.000 | 0.713 | 0.835 | 0.848 | 0.720 | 0.663 | 0.733 | 0.637 | 0.480 | 0.629 |
| 29 | `sentinel-db-c-d10` | microsoft/deberta-v3-base | columnar | d10 | 0.767 | 1.000 | 0.673 | 0.819 | 0.818 | 0.733 | 0.653 | 0.775 | 0.665 | 0.493 | 0.694 |
| 30 | `sentinel-mb-r-d30` | answerdotai/ModernBERT-base | recombine | d30 | 0.766 | 1.000 | 0.687 | 0.821 | 0.829 | 0.747 | 0.669 | 0.720 | 0.658 | 0.493 | 0.655 |
| 31 | `sentinel-db-r-d31` | microsoft/deberta-v3-base | recombine | d31 | 0.765 | 1.000 | 0.713 | 0.825 | 0.807 | 0.700 | 0.672 | 0.753 | 0.654 | 0.520 | 0.690 |
| 32 | `sentinel-gte-r-d30` | Alibaba-NLP/gte-modernbert-base | recombine | d30 | 0.765 | 0.993 | 0.687 | 0.863 | 0.837 | 0.693 | 0.662 | 0.751 | 0.637 | 0.533 | 0.660 |
| 33 | `sentinel-db-c-d31` | microsoft/deberta-v3-base | columnar | d31 | 0.765 | 1.000 | 0.713 | 0.849 | 0.840 | 0.673 | 0.661 | 0.733 | 0.646 | 0.493 | 0.620 |
| 34 | `sentinel-db-r-d30` | microsoft/deberta-v3-base | recombine | d30 | 0.764 | 1.000 | 0.720 | 0.818 | 0.802 | 0.700 | 0.671 | 0.788 | 0.615 | 0.487 | 0.687 |
| 35 | `sentinel-mb-r-d51` | answerdotai/ModernBERT-base | recombine | d51 | 0.763 | 0.996 | 0.673 | 0.840 | 0.794 | 0.727 | 0.684 | 0.735 | 0.656 | 0.453 | 0.639 |
| 36 | `sentinel-db-r-d51` | microsoft/deberta-v3-base | recombine | d51 | 0.763 | 1.000 | 0.727 | 0.834 | 0.835 | 0.687 | 0.646 | 0.742 | 0.632 | 0.500 | 0.666 |
| 37 | `sentinel-mb-c-d30` | answerdotai/ModernBERT-base | columnar | d30 | 0.761 | 1.000 | 0.687 | 0.857 | 0.791 | 0.720 | 0.654 | 0.732 | 0.646 | 0.467 | 0.626 |
| 38 | `sentinel-e5-c-d31` | intfloat/e5-large-v2 | columnar | d31 | 0.760 | 0.982 | 0.673 | 0.845 | 0.841 | 0.733 | 0.674 | 0.686 | 0.645 | 0.447 | 0.607 |
| 39 | `sentinel-db-c-d11` | microsoft/deberta-v3-base | columnar | d11 | 0.757 | 1.000 | 0.673 | 0.818 | 0.795 | 0.727 | 0.643 | 0.740 | 0.662 | 0.513 | 0.626 |
| 40 | `sentinel-gte-r-d51` | Alibaba-NLP/gte-modernbert-base | recombine | d51 | 0.757 | 0.975 | 0.707 | 0.851 | 0.806 | 0.727 | 0.655 | 0.698 | 0.641 | 0.467 | 0.631 |
| 41 | `sentinel-mb-c-d51` | answerdotai/ModernBERT-base | columnar | d51 | 0.756 | 1.000 | 0.667 | 0.826 | 0.826 | 0.707 | 0.675 | 0.712 | 0.637 | 0.480 | 0.563 |
| 42 | `sentinel-db-d` | microsoft/deberta-v3-base | direct | base | 0.754 | 1.000 | 0.740 | 0.801 | 0.839 | 0.667 | 0.630 | 0.730 | 0.621 | 0.500 | 0.619 |
| 43 | `sentinel-db-r-d10` | microsoft/deberta-v3-base | recombine | d10 | 0.752 | 1.000 | 0.680 | 0.809 | 0.770 | 0.700 | 0.644 | 0.767 | 0.643 | 0.493 | 0.700 |
| 44 | `sentinel-db-c-d30` | microsoft/deberta-v3-base | columnar | d30 | 0.751 | 1.000 | 0.720 | 0.810 | 0.803 | 0.693 | 0.636 | 0.724 | 0.620 | 0.560 | 0.658 |
| 45 | `sentinel-mb-c-d11` | answerdotai/ModernBERT-base | columnar | d11 | 0.751 | 0.993 | 0.727 | 0.803 | 0.738 | 0.740 | 0.684 | 0.703 | 0.618 | 0.473 | 0.655 |
| 46 | `sentinel-mb-c-d10` | answerdotai/ModernBERT-base | columnar | d10 | 0.747 | 1.000 | 0.627 | 0.838 | 0.779 | 0.727 | 0.657 | 0.702 | 0.646 | 0.420 | 0.628 |
| 47 | `sentinel-mb-d` | answerdotai/ModernBERT-base | direct | base | 0.746 | 0.996 | 0.667 | 0.836 | 0.791 | 0.713 | 0.646 | 0.687 | 0.634 | 0.433 | 0.574 |
| 48 | `sentinel-mb-c-d31` | answerdotai/ModernBERT-base | columnar | d31 | 0.743 | 0.996 | 0.640 | 0.801 | 0.803 | 0.700 | 0.678 | 0.678 | 0.645 | 0.393 | 0.630 |

## Candidate Details

### `sentinel-rb-c-d11`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-c-d11/260424_131334_sentinel-rb-c-d11`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131147_sentinel-rb-c-d11.log`
- test_stage_a: 0.799
- test_violation_f1: 1.000
- test_domain_f1: 0.910

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-rb-c-d30`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-c-d30/260424_131513_sentinel-rb-c-d30`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131339_sentinel-rb-c-d30.log`
- test_stage_a: 0.798
- test_violation_f1: 0.996
- test_domain_f1: 0.885

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-rb-r-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-r-d31/260424_130815_sentinel-rb-r-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_130613_sentinel-rb-r-d31.log`
- test_stage_a: 0.798
- test_violation_f1: 1.000
- test_domain_f1: 0.882

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-rb-c-d10`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-c-d10/260424_131142_sentinel-rb-c-d10`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131007_sentinel-rb-c-d10.log`
- test_stage_a: 0.798
- test_violation_f1: 0.996
- test_domain_f1: 0.854

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-rb-r-d30`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-r-d30/260424_130608_sentinel-rb-r-d30`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_130405_sentinel-rb-r-d30.log`
- test_stage_a: 0.794
- test_violation_f1: 1.000
- test_domain_f1: 0.861

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-gte-r-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-r-d31/260424_141249_sentinel-gte-r-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141137_sentinel-gte-r-d31.log`
- test_stage_a: 0.793
- test_violation_f1: 0.996
- test_domain_f1: 0.933

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-rb-d`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-d/260424_124258_sentinel-rb-d`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_124125_sentinel-rb-d.log`
- test_stage_a: 0.791
- test_violation_f1: 0.996
- test_domain_f1: 0.883

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-rb-r-d11`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-r-d11/260424_130400_sentinel-rb-r-d11`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_130226_sentinel-rb-r-d11.log`
- test_stage_a: 0.791
- test_violation_f1: 0.996
- test_domain_f1: 0.843

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-gte-c-d10`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-c-d10/260424_141540_sentinel-gte-c-d10`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141421_sentinel-gte-c-d10.log`
- test_stage_a: 0.788
- test_violation_f1: 0.989
- test_domain_f1: 0.898

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-rb-c-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-c-d31/260424_131658_sentinel-rb-c-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131518_sentinel-rb-c-d31.log`
- test_stage_a: 0.788
- test_violation_f1: 0.996
- test_domain_f1: 0.864

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-rb-r-d51`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-r-d51/260424_131003_sentinel-rb-r-d51`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_130820_sentinel-rb-r-d51.log`
- test_stage_a: 0.785
- test_violation_f1: 1.000
- test_domain_f1: 0.850

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-mb-r-d31` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-r-d31/260424_135349_sentinel-mb-r-d31` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_135149_sentinel-mb-r-d31.log` - test_stage_a: 0.783 - test_violation_f1: 0.989 - test_domain_f1: 0.849 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-mb-r-d10` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-r-d10/260424_134805_sentinel-mb-r-d10` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_134614_sentinel-mb-r-d10.log` - test_stage_a: 0.782 - test_violation_f1: 0.996 - test_domain_f1: 0.861 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-gte-r-d10` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-r-d10/260424_140759_sentinel-gte-r-d10` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_140549_sentinel-gte-r-d10.log` - test_stage_a: 0.781 - test_violation_f1: 0.993 - test_domain_f1: 0.876 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-db-r-d11` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-r-d11/260424_132315_sentinel-db-r-d11` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_132146_sentinel-db-r-d11.log` - test_stage_a: 0.780 - test_violation_f1: 1.000 - test_domain_f1: 0.829 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-gte-r-d11` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-r-d11/260424_140906_sentinel-gte-r-d11` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_140804_sentinel-gte-r-d11.log` - test_stage_a: 0.779 - test_violation_f1: 0.993 - test_domain_f1: 0.894 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-gte-d` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-d/260424_140545_sentinel-gte-d` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_140424_sentinel-gte-d.log` - test_stage_a: 0.778 - test_violation_f1: 0.978 - test_domain_f1: 0.871 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-rb-c-d51` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-c-d51/260424_131821_sentinel-rb-c-d51` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131702_sentinel-rb-c-d51.log` - test_stage_a: 0.777 - test_violation_f1: 0.996 - test_domain_f1: 0.832 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-gte-c-d11` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-c-d11/260424_141656_sentinel-gte-c-d11` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141545_sentinel-gte-c-d11.log` - test_stage_a: 0.777 - test_violation_f1: 0.986 - test_domain_f1: 0.848 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-gte-c-d30` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-c-d30/260424_141807_sentinel-gte-c-d30` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141701_sentinel-gte-c-d30.log` - test_stage_a: 0.777 - test_violation_f1: 0.989 - test_domain_f1: 0.896 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-gte-c-d31` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-c-d31/260424_141915_sentinel-gte-c-d31` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141812_sentinel-gte-c-d31.log` - test_stage_a: 0.777 - test_violation_f1: 0.986 - test_domain_f1: 0.896 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-bge-c-d31` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-bge-c-d31/260424_142358_sentinel-bge-c-d31` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_142209_sentinel-bge-c-d31.log` - test_stage_a: 0.776 - test_violation_f1: 0.982 - test_domain_f1: 0.885 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-rb-r-d10` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-rb-r-d10/260424_130221_sentinel-rb-r-d10` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_130036_sentinel-rb-r-d10.log` - test_stage_a: 0.775 - test_violation_f1: 1.000 - test_domain_f1: 0.812 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-mb-r-d11` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-r-d11/260424_134934_sentinel-mb-r-d11` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_134810_sentinel-mb-r-d11.log` - test_stage_a: 0.773 - test_violation_f1: 1.000 - test_domain_f1: 0.823 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ##### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-bge-r-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-bge-r-d31/260424_142204_sentinel-bge-r-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_142029_sentinel-bge-r-d31.log`
- test_stage_a: 0.772
- test_violation_f1: 0.982
- test_domain_f1: 0.896

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-e5-r-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-e5-r-d31/260424_142549_sentinel-e5-r-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_142403_sentinel-e5-r-d31.log`
- test_stage_a: 0.769
- test_violation_f1: 0.996
- test_domain_f1: 0.865

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-gte-c-d51`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-c-d51/260424_142025_sentinel-gte-c-d51`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141920_sentinel-gte-c-d51.log`
- test_stage_a: 0.769
- test_violation_f1: 0.989
- test_domain_f1: 0.896

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-db-c-d51`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-c-d51/260424_134449_sentinel-db-c-d51`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_134251_sentinel-db-c-d51.log`
- test_stage_a: 0.768
- test_violation_f1: 1.000
- test_domain_f1: 0.835

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-db-c-d10`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-c-d10/260424_133302_sentinel-db-c-d10`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_133035_sentinel-db-c-d10.log`
- test_stage_a: 0.767
- test_violation_f1: 1.000
- test_domain_f1: 0.819

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-mb-r-d30`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-r-d30/260424_135144_sentinel-mb-r-d30`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_134939_sentinel-mb-r-d30.log`
- test_stage_a: 0.766
- test_violation_f1: 1.000
- test_domain_f1: 0.821

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-db-r-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-r-d31/260424_132827_sentinel-db-r-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_132556_sentinel-db-r-d31.log`
- test_stage_a: 0.765
- test_violation_f1: 1.000
- test_domain_f1: 0.825

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-gte-r-d30`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-r-d30/260424_141132_sentinel-gte-r-d30`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_140911_sentinel-gte-r-d30.log`
- test_stage_a: 0.765
- test_violation_f1: 0.993
- test_domain_f1: 0.863

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-db-c-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-c-d31/260424_134247_sentinel-db-c-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_133732_sentinel-db-c-d31.log`
- test_stage_a: 0.765
- test_violation_f1: 1.000
- test_domain_f1: 0.849

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-db-r-d30`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-r-d30/260424_132552_sentinel-db-r-d30`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_132319_sentinel-db-r-d30.log`
- test_stage_a: 0.764
- test_violation_f1: 1.000
- test_domain_f1: 0.818

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-mb-r-d51`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-r-d51/260424_135601_sentinel-mb-r-d51`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_135353_sentinel-mb-r-d51.log`
- test_stage_a: 0.763
- test_violation_f1: 0.996
- test_domain_f1: 0.840

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-db-r-d51`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-r-d51/260424_133030_sentinel-db-r-d51`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_132831_sentinel-db-r-d51.log`
- test_stage_a: 0.763
- test_violation_f1: 1.000
- test_domain_f1: 0.834

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-mb-c-d30`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-c-d30/260424_140046_sentinel-mb-c-d30`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_135917_sentinel-mb-c-d30.log`
- test_stage_a: 0.761
- test_violation_f1: 1.000
- test_domain_f1: 0.857

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling.
- The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit.
- Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights.
- Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone.
- Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.

### `sentinel-e5-c-d31`

- checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-e5-c-d31/260424_142708_sentinel-e5-c-d31`
- log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_142553_sentinel-e5-c-d31.log`
- test_stage_a: 0.760
- test_violation_f1: 0.982
- test_domain_f1: 0.845

#### Leakage Review

##### Prior PoC Leakage Review

- The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning.
- Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text.
- A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest.

##### Mitigations Applied

- The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk.
- The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-db-c-d11` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-c-d11/260424_133520_sentinel-db-c-d11` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_133306_sentinel-db-c-d11.log` - test_stage_a: 0.757 - test_violation_f1: 1.000 - test_domain_f1: 0.818 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-gte-r-d51` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-gte-r-d51/260424_141417_sentinel-gte-r-d51` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_141253_sentinel-gte-r-d51.log` - test_stage_a: 0.757 - test_violation_f1: 0.975 - test_domain_f1: 0.851 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-mb-c-d51` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-c-d51/260424_140419_sentinel-mb-c-d51` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_140244_sentinel-mb-c-d51.log` - test_stage_a: 0.756 - test_violation_f1: 1.000 - test_domain_f1: 0.826 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-db-d` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-d/260424_131933_sentinel-db-d` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131826_sentinel-db-d.log` - test_stage_a: 0.754 - test_violation_f1: 1.000 - test_domain_f1: 0.801 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-db-r-d10` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-r-d10/260424_132141_sentinel-db-r-d10` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_131938_sentinel-db-r-d10.log` - test_stage_a: 0.752 - test_violation_f1: 1.000 - test_domain_f1: 0.809 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-db-c-d30` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-db-c-d30/260424_133727_sentinel-db-c-d30` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_133525_sentinel-db-c-d30.log` - test_stage_a: 0.751 - test_violation_f1: 1.000 - test_domain_f1: 0.810 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-mb-c-d11` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-c-d11/260424_135913_sentinel-mb-c-d11` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_135746_sentinel-mb-c-d11.log` - test_stage_a: 0.751 - test_violation_f1: 0.993 - test_domain_f1: 0.803 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
### `sentinel-mb-c-d10` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-c-d10/260424_135742_sentinel-mb-c-d10` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_135606_sentinel-mb-c-d10.log` - test_stage_a: 0.747 - test_violation_f1: 1.000 - test_domain_f1: 0.838 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation.
- The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-mb-d` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-d/260424_134610_sentinel-mb-d` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_134454_sentinel-mb-d.log` - test_stage_a: 0.746 - test_violation_f1: 0.996 - test_domain_f1: 0.836 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises. - Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly.
- The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence. ### `sentinel-mb-c-d31` - checkpoint: `_models/stage-a-grid-v3-gpu/sentinel-mb-c-d31/260424_140239_sentinel-mb-c-d31` - log: `_cache/logs/legacy/stage-a-grid-v3-gpu/raw/260424_140050_sentinel-mb-c-d31.log` - test_stage_a: 0.743 - test_violation_f1: 0.996 - test_domain_f1: 0.801 #### Leakage Review ##### Prior PoC Leakage Review - The previous PoC reused the same 17 synthetic families across train, dev, and test, so the model mostly learned family signatures rather than broad compliance reasoning. - Every prior observation carried extra structural cues such as source metadata, evidence snippets, and explicit jurisdiction sentences appended to the text. - A later dataset refactor silently dropped jurisdiction, impacted-principle, and remediation heads, which made the reported Stage A contract narrower than the product actually promises.
- Reported micro metrics on dense negative label maps made performance look cleaner than a realistic class-by-class review would suggest. ###### Mitigations Applied - The data pipeline now uses a 150-row agent-authored pilot plus a hard human-review gate before any 1000/100/100 release split is allowed to exist on disk. - The generation workflow now keeps Python limited to validation, formatting, duplicate review, and statistics while the agent authors and labels each observation directly. - The encoder default still uses a 512-token window, which comfortably covers the current 1000-character manual-authoring ceiling. - The full Stage A diagnose/prescribe contract is restored in both dataset and model outputs: jurisdiction, why, impacted principles, remediation actions, detection difficulty, and aggravating factors are all explicit. - Dataset generation now validates the mock contract keys directly and requires a human-reviewed approval hash before contract changes can pass validation. - The model factory now constructs full model bundles, while checkpoints store the trained projection and heads plus the frozen encoder reference instead of duplicating immutable backbone weights. - Evaluation artifacts now report scenario-family macro violation metrics and worst-family binary performance so repeated rows inside a narrow split cannot hide behind a flattering row-average alone. - Cross-checkpoint comparison artifacts are only kept when they are refreshed against the current dataset, preventing stale benchmark reports from masquerading as current evidence.
## Included Root Reports ### `manifest.json` { "created_at": "2026-04-24T12:41:03", "run_id": "stage-a-grid-v3-gpu", "artifact_root": "Sentinel_models/artifacts/stage-a-grid-v3-gpu", "models_dir": "_models/stage-a-grid-v3-gpu", "logs_dir": "_logs/stage-a-grid-v3-gpu", "comparison_artifact_dir": "Sentinel_models/artifacts/model-comparison-v3-gpu", "encoded_cache_dir": "_cache/stage-a-grid-v3-gpu-encoded", "cycles": 3, "epochs": 50, "patience": 6, "batch_size": 32, "candidates": [ { "model_key": "sentinel-rb-d", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "direct", "head_code": "d", "variant": null, "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-rb-r-d10", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "recombine", "head_code": "r", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-rb-r-d11", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "recombine", "head_code": "r", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-rb-r-d30", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "recombine", "head_code": "r", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-rb-r-d31", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "recombine", "head_code": "r", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-rb-r-d51", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "recombine", "head_code": "r", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-rb-c-d10", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "columnar", "head_code": "c", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, 
"head_skip": false }, { "model_key": "sentinel-rb-c-d11", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "columnar", "head_code": "c", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-rb-c-d30", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "columnar", "head_code": "c", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-rb-c-d31", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "columnar", "head_code": "c", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-rb-c-d51", "encoder_model_name": "FacebookAI/roberta-large", "head_type": "columnar", "head_code": "c", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-db-d", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "direct", "head_code": "d", "variant": null, "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-db-r-d10", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "recombine", "head_code": "r", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-db-r-d11", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "recombine", "head_code": "r", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-db-r-d30", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "recombine", "head_code": "r", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-db-r-d31", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "recombine", "head_code": "r", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, 
{ "model_key": "sentinel-db-r-d51", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "recombine", "head_code": "r", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-db-c-d10", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "columnar", "head_code": "c", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-db-c-d11", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "columnar", "head_code": "c", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-db-c-d30", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "columnar", "head_code": "c", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-db-c-d31", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "columnar", "head_code": "c", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-db-c-d51", "encoder_model_name": "microsoft/deberta-v3-base", "head_type": "columnar", "head_code": "c", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-mb-d", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "direct", "head_code": "d", "variant": null, "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-mb-r-d10", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "recombine", "head_code": "r", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-mb-r-d11", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "recombine", "head_code": "r", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { 
"model_key": "sentinel-mb-r-d30", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "recombine", "head_code": "r", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-mb-r-d31", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "recombine", "head_code": "r", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-mb-r-d51", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "recombine", "head_code": "r", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-mb-c-d10", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "columnar", "head_code": "c", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-mb-c-d11", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "columnar", "head_code": "c", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-mb-c-d30", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "columnar", "head_code": "c", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-mb-c-d31", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "columnar", "head_code": "c", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-mb-c-d51", "encoder_model_name": "answerdotai/ModernBERT-base", "head_type": "columnar", "head_code": "c", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-gte-d", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "direct", "head_code": "d", "variant": null, "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false 
}, { "model_key": "sentinel-gte-r-d10", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "recombine", "head_code": "r", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-gte-r-d11", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "recombine", "head_code": "r", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-gte-r-d30", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "recombine", "head_code": "r", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-gte-r-d31", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "recombine", "head_code": "r", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-gte-r-d51", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "recombine", "head_code": "r", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-gte-c-d10", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "columnar", "head_code": "c", "variant": "d10", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": false }, { "model_key": "sentinel-gte-c-d11", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "columnar", "head_code": "c", "variant": "d11", "head_dropout": 0.1, "head_div": 1, "head_mul": 1, "head_skip": true }, { "model_key": "sentinel-gte-c-d30", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "columnar", "head_code": "c", "variant": "d30", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": false }, { "model_key": "sentinel-gte-c-d31", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "columnar", "head_code": "c", "variant": "d31", "head_dropout": 
0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-gte-c-d51", "encoder_model_name": "Alibaba-NLP/gte-modernbert-base", "head_type": "columnar", "head_code": "c", "variant": "d51", "head_dropout": 0.5, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-bge-r-d31", "encoder_model_name": "BAAI/bge-large-en-v1.5", "head_type": "recombine", "head_code": "r", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-bge-c-d31", "encoder_model_name": "BAAI/bge-large-en-v1.5", "head_type": "columnar", "head_code": "c", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-e5-r-d31", "encoder_model_name": "intfloat/e5-large-v2", "head_type": "recombine", "head_code": "r", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true }, { "model_key": "sentinel-e5-c-d31", "encoder_model_name": "intfloat/e5-large-v2", "head_type": "columnar", "head_code": "c", "variant": "d31", "head_dropout": 0.3, "head_div": 2, "head_mul": 2, "head_skip": true } ], "command_template": "TRANSFORMERS_OFFLINE=1 HF_HUB_OFFLINE=1 python3 Sentinel_models/scripts/train_stage_a.py --model-key <model_key> --artifact-dir Sentinel_models/artifacts/stage-a-grid-v3-gpu/<model_key> --comparison-artifact-dir Sentinel_models/artifacts/model-comparison-v3-gpu --logs-dir _logs/stage-a-grid-v3-gpu --models-dir _models/stage-a-grid-v3-gpu --encoded-cache-dir _cache/stage-a-grid-v3-gpu-encoded --cycles 3 --epochs 50 --patience 6 --batch-size 32 --skip-transformers-export", "skip_transformers_export_for_grid_metrics": true }