ropedia-xperience-10m-task-baselines / data /omni_finetune_verified_result.json
cy0307's picture
Update Qwen3-Omni error-analysis next steps
768fd2e verified
Raw
History Blame
4.21 kB
{
"title": "Verified Qwen3-Omni LoRA Validation-Aware Held-Out Pilot",
"status": "verified_validation_aware_diagnostic_pilot",
"status_date": "2026-06-06",
"backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
"adapter": "Qwen3-Omni LoRA",
"dataset": "Ropedia Xperience-10M selected 128-episode pilot",
"split_policy": {
"unit": "episode",
"selected_episode_counts": {
"train": 96,
"val": 16,
"test": 16
},
"exported_window_counts": {
"train": 2848,
"val": 512,
"test": 448
},
"exported_episode_counts": {
"train": 89,
"val": 16,
"test": 14
},
"skipped_selected_episodes": 9,
"leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation."
},
"training": {
"num_processes": 8,
"epochs": 1,
"lora_rank": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"num_train_samples": 2848,
"num_val_samples": 512,
"history": [
{
"epoch": 1,
"train_loss": 0.41304643672440994,
"val_loss": 0.0330660454928875,
"global_step": 356
}
],
"loss": "answer-token cross entropy over supervised JSON tokens",
"note": "This validation-aware run uses the selected validation split for validation-loss monitoring during training and preserves the held-out test split for final evaluation."
},
"evaluation": {
"split": "test",
"num_samples": 448,
"held_out_episode_count": 14,
"json_validity_rate": 0.875,
"action_macro_f1": 0.0026621494447581404,
"subtask_accuracy": 0.006696428571428571,
"transition_accuracy": 0.8504464285714286,
"next_action_accuracy": 0.024553571428571428,
"contact_accuracy": 0.6450892857142857,
"object_micro_f1": 0.22299431459254582,
"quality_target": {
"json_validity_rate": 0.98,
"status": "not_met"
},
"previous_diagnostic_json_validity_rate": 0.8526785714285714
},
"interpretation": "This is a real held-out, multi-episode, validation-aware diagnostic pilot that proves the full loop end to end: export, LoRA training with validation monitoring, evaluation, validation, and public-safe packaging. JSON validity improved over the earlier no-validation diagnostic run but is still below the quality target, and task-quality metrics remain weak, so it should be used as a baseline and error-analysis starting point rather than a strong Xperience-10M model.",
"public_package": {
"path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval",
"audit_status": "pass",
"contains_raw_xperience10m_data": false,
"contains_qwen_base_weights": false,
"contains_lora_weights": false,
"error_analysis": {
"status": "pass",
"path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/error_analysis_summary.json",
"markdown_report": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/ERROR_ANALYSIS.md",
"groupings": [
"episode",
"action_family",
"train_seen_status",
"required_modality_state",
"object_category"
],
"key_readouts": {
"parsed_prediction_rate": 0.8772321428571429,
"weakest_action_family": "locomotion",
"weakest_action_family_samples": 23,
"weakest_action_family_parsed_prediction_rate": 0.2608695652173913,
"seen_action_exact_rate": 0.04580152671755725,
"unseen_action_exact_rate": 0.015772870662460567,
"required_modality_state": "rrd_missing_only_required_modalities_present"
}
}
},
"required_next_steps": [
"Improve JSON-format reliability through prompt, decoding, constrained parsing, or target formatting changes.",
"Use the published held-out error analysis to prioritize JSON constraints, action/subtask formatting, object vocabulary handling, and missing-modality robustness.",
"Run a second validation-aware Qwen3-Omni pass only after the JSON/output contract is tightened.",
"Keep the same verified package contract for Cosmos-style world-model and VLA/policy branches."
]
}