{
  "current_public_matrix_row": "qwen3_omni_v6_lora",
  "generated_at_utc": "2026-06-21T11:47:45+00:00",
  "interpretation_rule": "Do not confuse the Qwen run versions with the project evidence lines. The project has two evidence lines: one public sample episode and the selected 128-episode artifacts. Qwen v1-v6 are only the Qwen3-Omni run lineage inside the selected-128 line. The 20-task matrix uses Qwen3-Omni v6 LoRA; v5 remains the pinned prior release; v1-v4 are lineage and ablation evidence.",
  "pinned_prior_release": "v5",
  "related_engineering_artifacts": [
    {
      "name": "Full-parameter gates",
      "path": "results/omni_finetune/QWEN3_FULL_PARAMETER_GATES_20260609.md",
      "role": "Feasibility and short-train gates; not a public 20-task matrix method row."
    },
    {
      "name": "Alternate fullsplit v6 package",
      "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_full_train_noval_tail_logits_fullstatesave_v6_eval_test_full",
      "role": "Verified alternate no-validation/fullsplit artifact retained for audit, not the current matrix row."
    }
  ],
  "runs": [
    {
      "change_from_previous": "First verified Qwen3-Omni selected-128 LoRA run.",
      "dataset_contract": "xperience10m_episode_json_qa_v1",
      "dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
      "eval_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
      "eval_samples": 448,
      "metrics": {
        "action_macro_f1": 0.0026621494447581404,
        "contact_accuracy": 0.6450892857142857,
        "json_validity_rate": 0.875,
        "next_action_accuracy": 0.024553571428571428,
        "object_micro_f1": 0.22299431459254582,
        "subtask_accuracy": 0.006696428571428571,
        "transition_accuracy": 0.8504464285714286
      },
      "package": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
      "package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
      "public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
      "purpose": "Prove that the selected-128 split, LoRA training, held-out eval, validation, and public packaging loop works end to end.",
      "reader_use": "Use only as lineage evidence for the first working pipeline.",
      "role": "First verified 96/16/16 selected-episode Qwen3-Omni LoRA package; establishes dataset, training, eval, and packaging plumbing.",
      "status": "verified",
      "title": "Selected-128 validation-aware LoRA baseline",
      "train_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_lora",
      "version": "v1"
    },
    {
      "change_from_previous": "Reused the selected-128 split with a stricter structured-JSON answer contract and full 8-GPU LoRA training.",
      "dataset_contract": "xperience10m_episode_json_qa_v1",
      "dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
      "eval_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
      "eval_samples": 448,
      "metrics": {
        "action_macro_f1": 0.0024331644885523347,
        "contact_accuracy": 0.71875,
        "json_validity_rate": 0.9977678571428571,
        "next_action_accuracy": 0.029017857142857144,
        "object_micro_f1": 0.30160427807486634,
        "subtask_accuracy": 0.002232142857142857,
        "transition_accuracy": 0.9709821428571429
      },
      "package": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
      "package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
      "public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
      "purpose": "Make the answer format schema-checked and reduce invalid JSON before expanding scale.",
      "reader_use": "Use as evidence that schema-constrained evaluation improved validity and contact accuracy over v1.",
      "role": "Reuses the selected-128 split with a stricter structured JSON answer contract and full 8-GPU LoRA training.",
      "status": "verified",
      "title": "Structured-JSON reuse full-8-GPU LoRA",
      "train_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora",
      "version": "v2"
    },
    {
      "change_from_previous": "Evaluated the v2 adapter with stricter labels and prompts; no new adapter training.",
      "dataset_contract": "xperience10m_episode_json_qa_v1",
      "dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
      "eval_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
      "eval_samples": 448,
      "metrics": {
        "action_macro_f1": 0.0021983997167007384,
        "contact_accuracy": 0.7209821428571429,
        "json_validity_rate": 1.0,
        "next_action_accuracy": 0.03125,
        "object_micro_f1": 0.30688228657389993,
        "subtask_accuracy": 0.002232142857142857,
        "transition_accuracy": 0.9732142857142857
      },
      "package": "xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
      "package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
      "public_matrix_role": "superseded prompt/eval lineage evidence",
      "purpose": "Separate prompt/eval formatting effects from adapter-training effects.",
      "reader_use": "Use as prompt/eval ablation evidence, not as a separate trained model.",
      "role": "Strict-label prompt/eval pass over the v2 adapter; improves JSON validity without introducing a new adapter training run.",
      "status": "verified",
      "title": "Strict-label prompt evaluation",
      "train_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora",
      "version": "v3"
    },
    {
      "change_from_previous": "Trained a new four-epoch full-8-GPU LoRA adapter on the structured-JSON setup.",
      "dataset_contract": "xperience10m_episode_json_qa_v1",
      "dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
      "eval_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
      "eval_samples": 448,
      "metrics": {
        "action_macro_f1": 0.0018678269676001454,
        "contact_accuracy": 0.7299107142857143,
        "json_validity_rate": 1.0,
        "next_action_accuracy": 0.033482142857142856,
        "object_micro_f1": 0.31099781500364165,
        "subtask_accuracy": 0.0,
        "transition_accuracy": 0.9732142857142857
      },
      "package": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
      "package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
      "public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
      "purpose": "Test whether longer structured-JSON LoRA training improves the same selected split.",
      "reader_use": "Use as overfit and metric-tradeoff evidence before the multiscale export.",
      "role": "Four-epoch full-8-GPU LoRA run on the same selected split; useful for overfit/metric tradeoff analysis.",
      "status": "verified",
      "title": "Four-epoch structured-JSON LoRA",
      "train_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora",
      "version": "v4"
    },
    {
      "change_from_previous": "Introduced the multiscale cap96 export and a larger held-out evaluation surface.",
      "dataset_contract": "xperience10m_episode_json_qa_v1",
      "dataset_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora",
      "eval_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
      "eval_samples": 4032,
      "metrics": {
        "action_macro_f1": 0.002289711036077459,
        "contact_accuracy": 0.7864583333333334,
        "json_validity_rate": 1.0,
        "next_action_accuracy": 0.053618594823032224,
        "object_micro_f1": 0.31614599936244814,
        "subtask_accuracy": 0.011194029850746268,
        "transition_accuracy": 0.9908234126984127
      },
      "package": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
      "package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
      "public_matrix_role": "pinned prior release row and comparison baseline",
      "purpose": "Move from the 448-sample compact eval to a denser multiscale 4,032-sample held-out eval.",
      "reader_use": "Use as the pinned prior release; it remains stronger on JSON validity, subtask, next-action, object, and transition metrics.",
      "role": "Dense/multiscale selected-128 run with 4,032 held-out predictions; kept as the pinned prior release because several metrics remain stronger than v6.",
      "status": "verified",
      "title": "Multiscale cap96 LoRA",
      "train_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora",
      "version": "v5"
    },
    {
      "change_from_previous": "Kept the multiscale setup, changed LoRA rank/lr to rank64/lr5e-5, and added verified task-specific probes for full 20-task coverage.",
      "dataset_contract": "xperience10m_episode_json_qa_v1",
      "dataset_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora",
      "eval_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
      "eval_samples": 4032,
      "metrics": {
        "action_macro_f1": 0.0028830723979596335,
        "contact_accuracy": 0.8177083333333334,
        "json_validity_rate": 0.9990079365079365,
        "next_action_accuracy": 0.04305335446381405,
        "object_micro_f1": 0.3064982378331287,
        "subtask_accuracy": 0.0037313432835820895,
        "transition_accuracy": 0.9898313492063492
      },
      "package": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
      "package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
      "public_matrix_role": "current public 20-task Qwen3-Omni v6 LoRA row",
      "purpose": "Promote the current public Qwen3-Omni 20-task row with multiscale LoRA plus task-specific probes.",
      "reader_use": "Use as the current public 20-task Qwen row; it improves action macro-F1 and contact accuracy while v5 remains the prior comparator.",
      "role": "Current verified Qwen3-Omni row: rank64/lr5e-5 multiscale LoRA plus task-specific probe artifacts used for the 20/20 Qwen matrix coverage.",
      "status": "verified",
      "title": "Rank64 lr5e-5 multiscale LoRA",
      "train_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora",
      "version": "v6"
    }
  ],
  "scope": "Verified public-safe Qwen3-Omni LoRA/eval packages over the selected Xperience-10M 128-episode surface.",
  "status": "pass",
  "title": "Qwen3-Omni v1-v6 Run Lineage"
}