ropedia-xperience-10m-task-baselines / data /qwen3_omni_run_lineage.json

Add files using upload-large-folder tool

fcaf77a verified 8 days ago

11.5 kB

	{
	"current_public_matrix_row": "qwen3_omni_v6_lora",
	"generated_at_utc": "2026-06-21T11:47:45+00:00",
	"interpretation_rule": "Do not confuse the Qwen run versions with the project evidence lines. The project evidence lines are one public sample episode and selected 128-episode artifacts. Qwen v1-v6 are only the Qwen3-Omni run lineage inside the selected-128 line. The 20-task matrix uses Qwen3-Omni v6 LoRA; v5 remains the pinned prior release; v1-v4 are lineage and ablation evidence.",
	"pinned_prior_release": "v5",
	"related_engineering_artifacts": [
	{
	"name": "Full-parameter gates",
	"path": "results/omni_finetune/QWEN3_FULL_PARAMETER_GATES_20260609.md",
	"role": "Feasibility and short-train gates; not a public 20-task matrix method row."
	},
	{
	"name": "Alternate fullsplit v6 package",
	"path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_fullsplit_fast8gpu_lora_fsdp_full_train_noval_tail_logits_fullstatesave_v6_eval_test_full",
	"role": "Verified alternate no-validation/fullsplit artifact retained for audit, not the current matrix row."
	}
	],
	"runs": [
	{
	"change_from_previous": "First verified Qwen3-Omni selected-128 LoRA run.",
	"dataset_contract": "xperience10m_episode_json_qa_v1",
	"dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
	"eval_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
	"eval_samples": 448,
	"metrics": {
	"action_macro_f1": 0.0026621494447581404,
	"contact_accuracy": 0.6450892857142857,
	"json_validity_rate": 0.875,
	"next_action_accuracy": 0.024553571428571428,
	"object_micro_f1": 0.22299431459254582,
	"subtask_accuracy": 0.006696428571428571,
	"transition_accuracy": 0.8504464285714286
	},
	"package": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
	"package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
	"public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
	"purpose": "Prove that the selected-128 split, LoRA training, held-out eval, validation, and public packaging loop works end to end.",
	"reader_use": "Use only as lineage evidence for the first working pipeline.",
	"role": "First verified 96/16/16 selected-episode Qwen3-Omni LoRA package; establishes dataset, training, eval, and packaging plumbing.",
	"status": "verified",
	"title": "Selected-128 validation-aware LoRA baseline",
	"train_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_lora",
	"version": "v1"
	},
	{
	"change_from_previous": "Reused the selected-128 split with a stricter structured-JSON answer contract and full 8-GPU LoRA training.",
	"dataset_contract": "xperience10m_episode_json_qa_v1",
	"dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
	"eval_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
	"eval_samples": 448,
	"metrics": {
	"action_macro_f1": 0.0024331644885523347,
	"contact_accuracy": 0.71875,
	"json_validity_rate": 0.9977678571428571,
	"next_action_accuracy": 0.029017857142857144,
	"object_micro_f1": 0.30160427807486634,
	"subtask_accuracy": 0.002232142857142857,
	"transition_accuracy": 0.9709821428571429
	},
	"package": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
	"package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora_eval_test_full",
	"public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
	"purpose": "Make the answer format schema-checked and reduce invalid JSON before expanding scale.",
	"reader_use": "Use as evidence that schema-constrained evaluation improved validity and contact accuracy over v1.",
	"role": "Reuses the selected-128 split with a stricter structured JSON answer contract and full 8-GPU LoRA training.",
	"status": "verified",
	"title": "Structured-JSON reuse full-8-GPU LoRA",
	"train_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora",
	"version": "v2"
	},
	{
	"change_from_previous": "Evaluated the v2 adapter with stricter labels and prompts; no new adapter training.",
	"dataset_contract": "xperience10m_episode_json_qa_v1",
	"dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
	"eval_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
	"eval_samples": 448,
	"metrics": {
	"action_macro_f1": 0.0021983997167007384,
	"contact_accuracy": 0.7209821428571429,
	"json_validity_rate": 1.0,
	"next_action_accuracy": 0.03125,
	"object_micro_f1": 0.30688228657389993,
	"subtask_accuracy": 0.002232142857142857,
	"transition_accuracy": 0.9732142857142857
	},
	"package": "xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
	"package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
	"public_matrix_role": "superseded prompt/eval lineage evidence",
	"purpose": "Separate prompt/eval formatting effects from adapter-training effects.",
	"reader_use": "Use as prompt/eval ablation evidence, not as a separate trained model.",
	"role": "Strict-label prompt/eval pass over the v2 adapter; improves JSON validity without introducing a new adapter training run.",
	"status": "verified",
	"title": "Strict-label prompt evaluation",
	"train_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v2_reuse_full8gpu_lora",
	"version": "v3"
	},
	{
	"change_from_previous": "Trained a new four-epoch full-8-GPU LoRA adapter on the structured-JSON setup.",
	"dataset_contract": "xperience10m_episode_json_qa_v1",
	"dataset_run_id": "xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605",
	"eval_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
	"eval_samples": 448,
	"metrics": {
	"action_macro_f1": 0.0018678269676001454,
	"contact_accuracy": 0.7299107142857143,
	"json_validity_rate": 1.0,
	"next_action_accuracy": 0.033482142857142856,
	"object_micro_f1": 0.31099781500364165,
	"subtask_accuracy": 0.0,
	"transition_accuracy": 0.9732142857142857
	},
	"package": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
	"package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora_eval_test_full",
	"public_matrix_role": "superseded lineage evidence, not the current 20-task Qwen row",
	"purpose": "Test whether longer structured-JSON LoRA training improves the same selected split.",
	"reader_use": "Use as overfit and metric-tradeoff evidence before the multiscale export.",
	"role": "Four-epoch full-8-GPU LoRA run on the same selected split; useful for overfit/metric tradeoff analysis.",
	"status": "verified",
	"title": "Four-epoch structured-JSON LoRA",
	"train_run_id": "xperience10m_qwen3_omni_128ep_structured_json_v4_4epoch_full8gpu_lora",
	"version": "v4"
	},
	{
	"change_from_previous": "Introduced the multiscale cap96 export and larger held-out evaluation surface.",
	"dataset_contract": "xperience10m_episode_json_qa_v1",
	"dataset_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora",
	"eval_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
	"eval_samples": 4032,
	"metrics": {
	"action_macro_f1": 0.002289711036077459,
	"contact_accuracy": 0.7864583333333334,
	"json_validity_rate": 1.0,
	"next_action_accuracy": 0.053618594823032224,
	"object_micro_f1": 0.31614599936244814,
	"subtask_accuracy": 0.011194029850746268,
	"transition_accuracy": 0.9908234126984127
	},
	"package": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
	"package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora_eval_test_full",
	"public_matrix_role": "pinned prior release row and comparison baseline",
	"purpose": "Move from the 448-sample compact eval to a denser multiscale 4,032-sample held-out eval.",
	"reader_use": "Use as the pinned prior release; it remains stronger on JSON validity, subtask, next-action, object, and transition metrics.",
	"role": "Dense/multiscale selected-128 run with 4,032 held-out predictions; kept as the pinned prior release because several metrics remain stronger than v6.",
	"status": "verified",
	"title": "Multiscale cap96 LoRA",
	"train_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora",
	"version": "v5"
	},
	{
	"change_from_previous": "Kept the multiscale setup, changed LoRA rank/lr to rank64/lr5e-5, and added verified task-specific probes for full 20-task coverage.",
	"dataset_contract": "xperience10m_episode_json_qa_v1",
	"dataset_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v5_full8gpu_lora",
	"eval_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
	"eval_samples": 4032,
	"metrics": {
	"action_macro_f1": 0.0028830723979596335,
	"contact_accuracy": 0.8177083333333334,
	"json_validity_rate": 0.9990079365079365,
	"next_action_accuracy": 0.04305335446381405,
	"object_micro_f1": 0.3064982378331287,
	"subtask_accuracy": 0.0037313432835820895,
	"transition_accuracy": 0.9898313492063492
	},
	"package": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
	"package_path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full",
	"public_matrix_role": "current public 20-task Qwen3-Omni v6 LoRA row",
	"purpose": "Promote the current public Qwen3-Omni 20-task row with multiscale LoRA plus task-specific probes.",
	"reader_use": "Use as the current public 20-task Qwen row; it improves action macro-F1 and contact accuracy while v5 remains the prior comparator.",
	"role": "Current verified Qwen3-Omni row: rank64/lr5e-5 multiscale LoRA plus task-specific probe artifacts used for the 20/20 Qwen matrix coverage.",
	"status": "verified",
	"title": "Rank64 lr5e-5 multiscale LoRA",
	"train_run_id": "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora",
	"version": "v6"
	}
	],
	"scope": "Verified public-safe Qwen3-Omni LoRA/eval packages over the selected Xperience-10M 128-episode surface.",
	"status": "pass",
	"title": "Qwen3-Omni v1-v6 Run Lineage"
	}