| { |
| "title": "Verified Qwen3-Omni LoRA 128-Episode Held-Out Result", |
| "status": "verified_full_128_episode_diagnostic_result", |
| "status_date": "2026-06-07", |
| "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct", |
| "adapter": "Qwen3-Omni LoRA", |
| "dataset": "Ropedia Xperience-10M selected 128-episode pilot", |
| "split_policy": { |
| "unit": "episode", |
| "selected_episode_counts": { |
| "train": 96, |
| "val": 16, |
| "test": 16 |
| }, |
| "exported_window_counts": { |
| "train": 2848, |
| "val": 512, |
| "test": 448 |
| }, |
| "exported_episode_counts": { |
| "train": 89, |
| "val": 16, |
| "test": 14 |
| }, |
| "skipped_selected_episodes": 9, |
| "leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation." |
| }, |
| "training": { |
| "num_processes": 8, |
| "epochs": 2, |
| "lora_rank": 16, |
| "lora_alpha": 32, |
| "lora_dropout": 0.05, |
| "num_train_samples": 2848, |
| "num_val_samples": 512, |
| "history": [ |
| { |
| "epoch": 1, |
| "train_loss": 0.41282760031950355, |
| "val_loss": 0.03288277983665466, |
| "global_step": 356 |
| }, |
| { |
| "epoch": 2, |
| "train_loss": 0.027745448225544075, |
| "val_loss": 0.027823254466056824, |
| "global_step": 712 |
| } |
| ], |
| "loss": "answer-token cross entropy over supervised JSON tokens", |
| "note": "This Qwen3-Omni LoRA result reuses the selected 96/16/16 (train/val/test) episode setup, of which 89/16/14 episodes were exported, together with the v2 trained adapter; it then applies the stricter label-contract prompt for held-out evaluation." |
| }, |
| "evaluation": { |
| "split": "test", |
| "num_samples": 448, |
| "held_out_episode_count": 14, |
| "json_validity_rate": 1.0, |
| "action_macro_f1": 0.0021983997167007384, |
| "subtask_accuracy": 0.002232142857142857, |
| "transition_accuracy": 0.9732142857142857, |
| "next_action_accuracy": 0.03125, |
| "contact_accuracy": 0.7209821428571429, |
| "object_micro_f1": 0.30688228657389993, |
| "quality_target": { |
| "json_validity_rate": 0.98, |
| "status": "met" |
| }, |
| "previous_validation_aware_json_validity_rate": 0.875, |
| "previous_structured_json_v2_json_validity_rate": 0.9977678571428571 |
| }, |
| "interpretation": "This is the current verified Qwen3-Omni LoRA diagnostic result for the selected 128-episode setup. It reuses the same trained LoRA adapter as v2 but tightens the prompt-side label contract at evaluation time, reaching 100% JSON validity (up from 99.78% for v2) and small gains over v2 in transition accuracy, contact accuracy, next-action exact accuracy, and object micro-F1. Action and subtask classification remain weak on held-out episodes (action macro-F1 and subtask accuracy are both about 0.002), so this is still a baseline-quality diagnostic model rather than a strong Xperience-10M action recognizer.", |
| "public_package": { |
| "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full", |
| "audit_status": "pass", |
| "contains_raw_xperience10m_data": false, |
| "contains_qwen_base_weights": false, |
| "contains_lora_weights": false, |
| "adapter_weights_repo": "cy0307/ropedia-qwen3-omni-lora-128ep" |
| }, |
| "required_next_steps": [ |
| "Use the v3 strict-label predictions for action/subtask error analysis and unseen-label debugging.", |
| "Keep the existing Qwen LoRA adapter repository as the weight-bearing artifact; v3 is an evaluation/package refresh over the same adapter, not new weights.", |
| "Implement the Cosmos3-Super pipeline-loaded batch packer and run a one-sample forward-dynamics overfit check before claiming Cosmos3 fine-tuning. Camera-pose proxy targets are now exported, contract-audited, and schema-packed, but no Cosmos weights have been updated.", |
| "Use sharded Qwen eval for future long held-out passes to improve GPU utilization." |
| ] |
| } |
|
|