#!/usr/bin/env python3
"""Build the public Qwen3-Omni v1-v6 run-lineage summary."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
# Directory two levels above this file (the parent of the folder that holds
# this script) -- presumably the repository root; every other path hangs off it.
ROOT = Path(__file__).resolve().parents[1]
# Folder that holds one sub-directory per verified package named in RUNS.
VERIFIED = ROOT / "results/omni_finetune/verified_public"
# Destination of the JSON lineage payload written by write_outputs().
OUTPUT_JSON = ROOT / "docs/data/qwen3_omni_run_lineage.json"
# Destination of the Markdown rendering of the same payload, directly under ROOT.
OUTPUT_MD = ROOT / "QWEN3_OMNI_RUN_LINEAGE.md"
# Ordered run specifications, one dict per version (v1 through v6).
# build_payload() copies every key of a spec into the emitted row and uses
# "package" as the sub-directory name under VERIFIED to load that run's files.
# The remaining keys are human-readable text that write_outputs() places into
# the Markdown tables ("title", "purpose", "change_from_previous", "role",
# "reader_use"); "public_matrix_role" is only carried into the JSON payload.
RUNS = [
    {
        "version": "v1",
        "title": "Selected-128 validation-aware LoRA baseline",
        "package": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
        "purpose": "Prove that the selected-128 split, LoRA training, held-out eval, validation, and public packaging loop works end to end.",
        "change_from_previous": "First verified Qwen3-Omni selected-128 LoRA run.",
        "role": "First verified 96/16/16 selected-episode Qwen3-Omni LoRA package; establishes dataset, training, eval, and packaging plumbing.",
        "reader_use": "Use only as lineage evidence for the first working pipeline.",
        "public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
    },
    {
        "version": "v2",
        "title": "Structured-JSON reuse full-8-GPU LoRA",
        "package": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
        "purpose": "Make the answer format schema-checked and reduce invalid JSON before expanding scale.",
        "change_from_previous": "Reused the selected-128 split with a stricter structured-JSON answer contract and full 8-GPU LoRA training.",
        "role": "Reuses the selected-128 split with a stricter structured JSON answer contract and full 8-GPU LoRA training.",
        "reader_use": "Use as evidence that schema-constrained evaluation improved validity and contact accuracy over v1.",
        "public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
    },
    {
        "version": "v3",
        "title": "Strict-label prompt evaluation",
        "package": "xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
        "purpose": "Separate prompt/eval formatting effects from adapter-training effects.",
        "change_from_previous": "Evaluated the v2 adapter with stricter labels and prompts; no new adapter training.",
        "role": "Strict-label prompt/eval pass over the v2 adapter; improves JSON validity without introducing a new adapter training run.",
        "reader_use": "Use as prompt/eval ablation evidence, not as a separate trained model.",
        "public_matrix_role": "superseded prompt/eval lineage evidence",
    },
    {
        "version": "v4",
        "title": "Four-epoch structured-JSON LoRA",
        "package": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
        "purpose": "Test whether longer structured-JSON LoRA training improves the same selected split.",
        "change_from_previous": "Trained a new four-epoch full-8-GPU LoRA adapter on the structured-JSON setup.",
        "role": "Four-epoch full-8-GPU LoRA run on the same selected split; useful for overfit/metric tradeoff analysis.",
        "reader_use": "Use as overfit and metric-tradeoff evidence before the multiscale export.",
        "public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
    },
    {
        "version": "v5",
        "title": "Multiscale cap96 LoRA",
        "package": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
        "purpose": "Move from the 448-sample compact eval to a denser multiscale 4,032-sample held-out eval.",
        "change_from_previous": "Introduced the multiscale cap96 export and larger held-out evaluation surface.",
        "role": "Dense/multiscale selected-128 run with 4,032 held-out predictions; kept as the pinned prior release because several metrics remain stronger than v6.",
        "reader_use": "Use as the pinned prior release; it remains stronger on JSON validity, subtask, next-action, object, and transition metrics.",
        "public_matrix_role": "pinned prior release row and comparison baseline",
    },
    {
        "version": "v6",
        "title": "Rank64 lr5e-5 multiscale LoRA",
        "package": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
        "purpose": "Promote the current public Qwen3-Omni 20-task row with multiscale LoRA plus task-specific probes.",
        "change_from_previous": "Kept the multiscale setup, changed LoRA rank/lr to rank64/lr5e-5, and added verified task-specific probes for full 20-task coverage.",
        "role": "Current verified Qwen3-Omni row: rank64/lr5e-5 multiscale LoRA plus task-specific probe artifacts used for the 20/20 Qwen matrix coverage.",
        "reader_use": "Use as the current public 20-task Qwen row; it improves action macro-F1 and contact accuracy while v5 remains the prior comparator.",
        "public_matrix_role": "current public 20-task Qwen3-Omni v6 LoRA row",
    },
]
# Metric names that build_payload() copies from each package's
# eval/metrics.json into the per-run "metrics" mapping. A key that is absent
# from the file is stored as None (see metric_value).
METRIC_KEYS = [
    "json_validity_rate",
    "action_macro_f1",
    "subtask_accuracy",
    "transition_accuracy",
    "next_action_accuracy",
    "contact_accuracy",
    "object_micro_f1",
]
def read_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON file at ``path`` and return its content."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
def metric_value(metrics: dict, key: str):
    """Look up ``key`` in ``metrics``; a missing key yields ``None``."""
    found = metrics.get(key, None)
    return found
def fmt(value) -> str:
    """Turn a value into table-cell text.

    ``None`` becomes an empty string, floats are rendered with four decimal
    places, and anything else is passed through ``str()``.
    """
    if value is None:
        return ""
    return format(value, ".4f") if isinstance(value, float) else str(value)
def markdown_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render ``headers`` and ``rows`` as a pipe-delimited Markdown table.

    Every header and every cell is converted with ``str()``; literal ``|``
    characters are backslash-escaped and newlines are replaced by a space so
    that a value can never open an extra column or terminate its row early.

    Args:
        headers: Column titles; also determines how many ``---`` separator
            cells are emitted.
        rows: One list of cell values per table row.

    Returns:
        The table as a single string with lines joined by ``"\\n"`` and no
        trailing newline.
    """

    def _cell(value) -> str:
        # Same escaping for headers and body cells keeps the column count stable.
        return str(value).replace("|", "\\|").replace("\n", " ")

    lines = [
        "| " + " | ".join(_cell(header) for header in headers) + " |",
        "| " + " | ".join("---" for _ in headers) + " |",
    ]
    lines.extend("| " + " | ".join(_cell(cell) for cell in row) + " |" for row in rows)
    return "\n".join(lines)
def _first_present(mapping: dict, *keys: str):
    """Return the value of the first key in ``keys`` whose value is not ``None``."""
    for key in keys:
        value = mapping.get(key)
        if value is not None:
            return value
    return None


def build_payload() -> dict:
    """Assemble the lineage payload from the verified package directories.

    For every spec in ``RUNS`` this reads ``verified_result_summary.json`` and
    ``eval/metrics.json`` from ``VERIFIED / spec["package"]`` and merges the
    spec with the run ids, dataset contract, sample count and the metrics
    named in ``METRIC_KEYS``.

    Returns:
        A dict with document-level fields (title, status, generation
        timestamp in UTC, scope, interpretation rule), the per-run rows under
        ``"runs"`` and a fixed list of related engineering artifacts.

    Raises:
        Whatever ``read_json`` raises when a package file is missing or is
        not valid JSON.
    """
    rows = []
    for spec in RUNS:
        package_dir = VERIFIED / spec["package"]
        summary = read_json(package_dir / "verified_result_summary.json")
        metrics = read_json(package_dir / "eval/metrics.json")
        rows.append(
            {
                **spec,
                "status": summary.get("status", "verified"),
                # POSIX separators keep the published path identical on every OS.
                "package_path": package_dir.relative_to(ROOT).as_posix(),
                "dataset_run_id": summary.get("dataset_run_id"),
                "train_run_id": summary.get("train_run_id"),
                "eval_run_id": summary.get("eval_run_id"),
                "dataset_contract": summary.get("dataset_contract"),
                # Fall back only when "num_samples" is absent/None, so a real
                # count of 0 is reported instead of being treated as missing.
                "eval_samples": _first_present(metrics, "num_samples", "eval_samples"),
                "metrics": {key: metric_value(metrics, key) for key in METRIC_KEYS},
            }
        )
    return {
        "title": "Qwen3-Omni v1-v6 Run Lineage",
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "scope": "Verified public-safe Qwen3-Omni LoRA/eval packages over the selected Xperience-10M 128-episode surface.",
        "interpretation_rule": (
            "Do not confuse the Qwen run versions with the project evidence lines. "
            "The project evidence lines are one public sample episode and selected 128-episode "
            "artifacts. Qwen v1-v6 are only the Qwen3-Omni run lineage inside the selected-128 line. "
            "The 20-task matrix uses Qwen3-Omni v6 LoRA; v5 remains the pinned prior release; "
            "v1-v4 are lineage and ablation evidence."
        ),
        "current_public_matrix_row": "qwen3_omni_v6_lora",
        "pinned_prior_release": "v5",
        "runs": rows,
        "related_engineering_artifacts": [
            {
                "name": "Full-parameter gates",
                "path": "results/omni_finetune/QWEN3_FULL_PARAMETER_GATES_20260609.md",
                "role": "Feasibility and short-train gates; not a public 20-task matrix method row.",
            },
            {
                "name": "Alternate fullsplit v6 package",
                "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_full_train_noval_tail_logits_fullstatesave_v6_eval_test_full",
                "role": "Verified alternate no-validation/fullsplit artifact retained for audit, not the current matrix row.",
            },
        ],
    }
def write_outputs(payload: dict) -> None:
    """Write ``payload`` to ``OUTPUT_JSON`` and render it to ``OUTPUT_MD``.

    The JSON file is written with two-space indentation, sorted keys and a
    trailing newline; its parent directory is created when missing. The
    Markdown file contains a short preamble followed by three tables: the
    compact lineage, the run ids/packages, and the related artifacts.

    Args:
        payload: Mapping shaped like the result of ``build_payload()``; it
            must provide ``generated_at_utc``, ``scope``,
            ``interpretation_rule``, ``runs`` and
            ``related_engineering_artifacts``.
    """
    OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)
    OUTPUT_JSON.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")

    runs = payload["runs"]
    # Values that may be None (they come from optional JSON keys) go through
    # fmt() so an absent value renders as an empty cell, not the text "None".
    rows = [
        [
            run["version"],
            run["title"],
            run["purpose"],
            run["change_from_previous"],
            fmt(run["eval_samples"]),
            fmt(run["metrics"]["json_validity_rate"]),
            fmt(run["metrics"]["action_macro_f1"]),
            fmt(run["metrics"]["contact_accuracy"]),
            run["reader_use"],
        ]
        for run in runs
    ]
    detail_rows = [
        [
            run["version"],
            fmt(run["train_run_id"]),
            fmt(run["eval_run_id"]),
            run["role"],
            run["package_path"],
        ]
        for run in runs
    ]
    artifact_rows = [
        [row["name"], row["path"], row["role"]]
        for row in payload["related_engineering_artifacts"]
    ]
    text = f"""# Qwen3-Omni v1-v6 Run Lineage
Generated: `{payload['generated_at_utc']}`.
Scope: {payload['scope']}
Interpretation rule: {payload['interpretation_rule']}
Read the versions as an engineering audit trail, not as six separate benchmark
rows. v1-v4 explain how the Qwen3-Omni pipeline was hardened, v5 is the pinned
prior multiscale release, and v6 is the current 20-task Qwen3-Omni row.
## Compact Lineage
{markdown_table(['Version', 'Run', 'Purpose', 'Change from previous', 'Eval samples', 'JSON validity', 'Action macro-F1', 'Contact acc.', 'Use now'], rows)}
## Run IDs And Packages
{markdown_table(['Version', 'Train run', 'Eval run', 'Role', 'Package'], detail_rows)}
## Related Engineering Artifacts
{markdown_table(['Artifact', 'Path', 'Role'], artifact_rows)}
"""
    OUTPUT_MD.write_text(text, encoding="utf-8")
def main() -> int:
    """Build the payload, write both output files, and report their paths.

    Returns:
        ``0`` once both files have been written and announced on stdout.
    """
    payload = build_payload()
    write_outputs(payload)
    # Paths are printed relative to ROOT, JSON first and Markdown second.
    for written in (OUTPUT_JSON, OUTPUT_MD):
        print(f"Wrote {written.relative_to(ROOT)}")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    raise SystemExit(main())