File size: 3,768 Bytes
2bd8497
eeac43c
 
 
2bd8497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eeac43c
2bd8497
 
 
 
 
 
 
 
eeac43c
 
2bd8497
eeac43c
 
 
 
 
 
2bd8497
 
 
eeac43c
2bd8497
 
 
 
 
eeac43c
 
 
 
 
 
 
2bd8497
 
eeac43c
2bd8497
eeac43c
 
2bd8497
eeac43c
2bd8497
eeac43c
2bd8497
 
 
91b502e
eeac43c
2bd8497
 
eeac43c
 
3a10443
eeac43c
2bd8497
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
{
  "title": "Verified Qwen3-Omni LoRA 128-Episode Held-Out Result",
  "status": "verified_full_128_episode_diagnostic_result",
  "status_date": "2026-06-07",
  "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
  "adapter": "Qwen3-Omni LoRA",
  "dataset": "Ropedia Xperience-10M selected 128-episode pilot",
  "split_policy": {
    "unit": "episode",
    "selected_episode_counts": {
      "train": 96,
      "val": 16,
      "test": 16
    },
    "exported_window_counts": {
      "train": 2848,
      "val": 512,
      "test": 448
    },
    "exported_episode_counts": {
      "train": 89,
      "val": 16,
      "test": 14
    },
    "skipped_selected_episodes": 9,
    "leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation."
  },
  "training": {
    "num_processes": 8,
    "epochs": 2,
    "lora_rank": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "num_train_samples": 2848,
    "num_val_samples": 512,
    "history": [
      {
        "epoch": 1,
        "train_loss": 0.41282760031950355,
        "val_loss": 0.03288277983665466,
        "global_step": 356
      },
      {
        "epoch": 2,
        "train_loss": 0.027745448225544075,
        "val_loss": 0.027823254466056824,
        "global_step": 712
      }
    ],
    "loss": "answer-token cross entropy over supervised JSON tokens",
    "note": "This current Qwen3-Omni LoRA result reuses the selected 96/16/16 episode setup and the v2 trained adapter, then applies the stricter label-contract prompt for held-out evaluation."
  },
  "evaluation": {
    "split": "test",
    "num_samples": 448,
    "held_out_episode_count": 14,
    "json_validity_rate": 1.0,
    "action_macro_f1": 0.0021983997167007384,
    "subtask_accuracy": 0.002232142857142857,
    "transition_accuracy": 0.9732142857142857,
    "next_action_accuracy": 0.03125,
    "contact_accuracy": 0.7209821428571429,
    "object_micro_f1": 0.30688228657389993,
    "quality_target": {
      "json_validity_rate": 0.98,
      "status": "met"
    },
    "previous_validation_aware_json_validity_rate": 0.875,
    "previous_structured_json_v2_json_validity_rate": 0.9977678571428571
  },
  "interpretation": "This is the current verified Qwen3-Omni LoRA diagnostic result for the selected 128-episode setup. It reuses the same trained LoRA adapter as v2 but tightens the prompt-side label contract at evaluation time, reaching 100% JSON validity and small gains in transition, contact, next-action exact accuracy, and object micro-F1. Action and subtask classification remain weak on held-out episodes, so this is still a baseline-quality diagnostic model rather than a strong Xperience-10M action recognizer.",
  "public_package": {
    "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_structured_json_v3_strict_label_prompt_reuse_lora_eval_test_full",
    "audit_status": "pass",
    "contains_raw_xperience10m_data": false,
    "contains_qwen_base_weights": false,
    "contains_lora_weights": false,
    "adapter_weights_repo": "cy0307/ropedia-qwen3-omni-lora-128ep"
  },
  "required_next_steps": [
    "Use the v3 strict-label predictions for action/subtask error analysis and unseen-label debugging.",
    "Keep the existing Qwen LoRA adapter repository as the weight-bearing artifact; v3 is an evaluation/package refresh over the same adapter, not new weights.",
    "Implement the Cosmos3-Super pipeline-loaded batch packer and one-sample forward-dynamics overfit before claiming Cosmos3 fine-tuning; camera-pose proxy targets are now exported, contract-audited, and schema-packed, but no Cosmos weights have been updated.",
    "Use sharded Qwen eval for future long held-out passes to improve GPU utilization."
  ]
}