{ "title": "Verified Qwen3-Omni LoRA 128-Episode Held-Out Result", "status": "verified_full_128_episode_diagnostic_result", "status_date": "2026-06-07", "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct", "adapter": "Qwen3-Omni LoRA", "dataset": "Ropedia Xperience-10M selected 128-episode pilot", "split_policy": { "unit": "episode", "selected_episode_counts": { "train": 96, "val": 16, "test": 16 }, "exported_window_counts": { "train": 2848, "val": 512, "test": 448 }, "exported_episode_counts": { "train": 89, "val": 16, "test": 14 }, "skipped_selected_episodes": 9, "leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation." }, "training": { "num_processes": 8, "epochs": 2, "lora_rank": 16, "lora_alpha": 32, "lora_dropout": 0.05, "num_train_samples": 2848, "num_val_samples": 512, "history": [ { "epoch": 1, "train_loss": 0.41282760031950355, "val_loss": 0.03288277983665466, "global_step": 356 }, { "epoch": 2, "train_loss": 0.027745448225544075, "val_loss": 0.027823254466056824, "global_step": 712 } ], "loss": "answer-token cross entropy over supervised JSON tokens", "note": "This current Qwen3-Omni LoRA result reuses the selected 96/16/16 episode setup and the v2 trained adapter, then applies the stricter label-contract prompt for held-out evaluation." 
}, "evaluation": { "split": "test", "num_samples": 448, "held_out_episode_count": 14, "json_validity_rate": 1.0, "action_macro_f1": 0.0021983997167007384, "subtask_accuracy": 0.002232142857142857, "transition_accuracy": 0.9732142857142857, "next_action_accuracy": 0.03125, "contact_accuracy": 0.7209821428571429, "object_micro_f1": 0.30688228657389993, "quality_target": { "json_validity_rate": 0.98, "status": "met" }, "previous_validation_aware_json_validity_rate": 0.875, "previous_structured_json_v2_json_validity_rate": 0.9977678571428571 }, "interpretation": "This is the current verified Qwen3-Omni LoRA diagnostic result for the selected 128-episode setup. It reuses the same trained LoRA adapter as v2 but tightens the prompt-side label contract at evaluation time, reaching 100% JSON validity and small gains in transition, contact, next-action exact accuracy, and object micro-F1. Action and subtask classification remain weak on held-out episodes, so this is still a baseline-quality diagnostic model rather than a strong Xperience-10M action recognizer.", "public_package": { "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full", "audit_status": "pass", "contains_raw_xperience10m_data": false, "contains_qwen_base_weights": false, "contains_lora_weights": false, "adapter_weights_repo": "cy0307/ropedia-qwen3-omni-lora-128ep" }, "required_next_steps": [ "Use the v3 strict-label predictions for action/subtask error analysis and unseen-label debugging.", "Keep the existing Qwen LoRA adapter repository as the weight-bearing artifact; v3 is an evaluation/package refresh over the same adapter, not new weights.", "Implement the Cosmos3-Super pipeline-loaded batch packer and one-sample forward-dynamics overfit before claiming Cosmos3 fine-tuning; camera-pose proxy targets are now exported, contract-audited, and schema-packed, but no Cosmos weights have been updated.", "Use sharded Qwen eval for future long held-out passes to improve GPU utilization." ] }