ropedia-xperience-10m-task-baselines / data /omni_finetune_verified_result.json

Publish Ropedia Xperience-10M task baseline cards

3a10443 verified 19 days ago

3.77 kB

	{
	"title": "Verified Qwen3-Omni LoRA 128-Episode Held-Out Result",
	"status": "verified_full_128_episode_diagnostic_result",
	"status_date": "2026-06-07",
	"backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
	"adapter": "Qwen3-Omni LoRA",
	"dataset": "Ropedia Xperience-10M selected 128-episode pilot",
	"split_policy": {
	"unit": "episode",
	"selected_episode_counts": {
	"train": 96,
	"val": 16,
	"test": 16
	},
	"exported_window_counts": {
	"train": 2848,
	"val": 512,
	"test": 448
	},
	"exported_episode_counts": {
	"train": 89,
	"val": 16,
	"test": 14
	},
	"skipped_selected_episodes": 9,
	"leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation."
	},
	"training": {
	"num_processes": 8,
	"epochs": 2,
	"lora_rank": 16,
	"lora_alpha": 32,
	"lora_dropout": 0.05,
	"num_train_samples": 2848,
	"num_val_samples": 512,
	"history": [
	{
	"epoch": 1,
	"train_loss": 0.41282760031950355,
	"val_loss": 0.03288277983665466,
	"global_step": 356
	},
	{
	"epoch": 2,
	"train_loss": 0.027745448225544075,
	"val_loss": 0.027823254466056824,
	"global_step": 712
	}
	],
	"loss": "answer-token cross entropy over supervised JSON tokens",
	"note": "This current Qwen3-Omni LoRA result reuses the selected 96/16/16 episode setup and the v2 trained adapter, then applies the stricter label-contract prompt for held-out evaluation."
	},
	"evaluation": {
	"split": "test",
	"num_samples": 448,
	"held_out_episode_count": 14,
	"json_validity_rate": 1.0,
	"action_macro_f1": 0.0021983997167007384,
	"subtask_accuracy": 0.002232142857142857,
	"transition_accuracy": 0.9732142857142857,
	"next_action_accuracy": 0.03125,
	"contact_accuracy": 0.7209821428571429,
	"object_micro_f1": 0.30688228657389993,
	"quality_target": {
	"json_validity_rate": 0.98,
	"status": "met"
	},
	"previous_validation_aware_json_validity_rate": 0.875,
	"previous_structured_json_v2_json_validity_rate": 0.9977678571428571
	},
	"interpretation": "This is the current verified Qwen3-Omni LoRA diagnostic result for the selected 128-episode setup. It reuses the same trained LoRA adapter as v2 but tightens the prompt-side label contract at evaluation time, reaching 100% JSON validity and small gains in transition, contact, next-action exact accuracy, and object micro-F1. Action and subtask classification remain weak on held-out episodes, so this is still a baseline-quality diagnostic model rather than a strong Xperience-10M action recognizer.",
	"public_package": {
	"path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
	"audit_status": "pass",
	"contains_raw_xperience10m_data": false,
	"contains_qwen_base_weights": false,
	"contains_lora_weights": false,
	"adapter_weights_repo": "cy0307/ropedia-qwen3-omni-lora-128ep"
	},
	"required_next_steps": [
	"Use the v3 strict-label predictions for action/subtask error analysis and unseen-label debugging.",
	"Keep the existing Qwen LoRA adapter repository as the weight-bearing artifact; v3 is an evaluation/package refresh over the same adapter, not new weights.",
	"Implement the Cosmos3-Super pipeline-loaded batch packer and one-sample forward-dynamics overfit before claiming Cosmos3 fine-tuning; camera-pose proxy targets are now exported, contract-audited, and schema-packed, but no Cosmos weights have been updated.",
	"Use sharded Qwen eval for future long held-out passes to improve GPU utilization."
	]
	}