{
"title": "Verified Qwen3-Omni LoRA 128-Episode Held-Out Result",
"status": "verified_full_128_episode_diagnostic_result",
"status_date": "2026-06-07",
"backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"adapter": "Qwen3-Omni LoRA",
"dataset": "Ropedia Xperience-10M selected 128-episode pilot",
"split_policy": {
"unit": "episode",
"selected_episode_counts": {
"train": 96,
"val": 16,
"test": 16
},
"exported_window_counts": {
"train": 2848,
"val": 512,
"test": 448
},
"exported_episode_counts": {
"train": 89,
"val": 16,
"test": 14
},
"skipped_selected_episodes": 9,
"leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation."
},
"training": {
"num_processes": 8,
"epochs": 2,
"lora_rank": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"num_train_samples": 2848,
"num_val_samples": 512,
"history": [
{
"epoch": 1,
"train_loss": 0.41282760031950355,
"val_loss": 0.03288277983665466,
"global_step": 356
},
{
"epoch": 2,
"train_loss": 0.027745448225544075,
"val_loss": 0.027823254466056824,
"global_step": 712
}
],
"loss": "answer-token cross entropy over supervised JSON tokens",
"note": "The current Qwen3-Omni LoRA result reuses the selected 96/16/16 (train/val/test) episode setup and the same trained adapter as v2, then applies the stricter label-contract prompt for held-out evaluation."
},
"evaluation": {
"split": "test",
"num_samples": 448,
"held_out_episode_count": 14,
"json_validity_rate": 1.0,
"action_macro_f1": 0.0021983997167007384,
"subtask_accuracy": 0.002232142857142857,
"transition_accuracy": 0.9732142857142857,
"next_action_accuracy": 0.03125,
"contact_accuracy": 0.7209821428571429,
"object_micro_f1": 0.30688228657389993,
"quality_target": {
"json_validity_rate": 0.98,
"status": "met"
},
"previous_validation_aware_json_validity_rate": 0.875,
"previous_structured_json_v2_json_validity_rate": 0.9977678571428571
},
"interpretation": "This is the current verified Qwen3-Omni LoRA diagnostic result for the selected 128-episode setup. It reuses the same trained LoRA adapter as v2 but tightens the prompt-side label contract at evaluation time, reaching 100% JSON validity and small gains in transition accuracy, contact accuracy, next-action exact accuracy, and object micro-F1. Action and subtask classification remain weak on held-out episodes, so this is still a baseline-quality diagnostic model rather than a strong Xperience-10M action recognizer.",
"public_package": {
"path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
"audit_status": "pass",
"contains_raw_xperience10m_data": false,
"contains_qwen_base_weights": false,
"contains_lora_weights": false,
"adapter_weights_repo": "cy0307/ropedia-qwen3-omni-lora-128ep"
},
"required_next_steps": [
"Use the v3 strict-label predictions for action/subtask error analysis and unseen-label debugging.",
"Keep the existing Qwen LoRA adapter repository as the weight-bearing artifact; v3 is an evaluation/package refresh over the same adapter, not new weights.",
"Implement the Cosmos3-Super pipeline-loaded batch packer and one-sample forward-dynamics overfit before claiming Cosmos3 fine-tuning; camera-pose proxy targets are now exported, contract-audited, and schema-packed, but no Cosmos weights have been updated.",
"Use sharded Qwen eval for future long held-out passes to improve GPU utilization."
]
}