Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library.
# To use this model, check the repository files and the library's documentation.
# Want to help? PRs adding snippets are welcome at:
# https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "Verified Qwen3-Omni LoRA Validation-Aware Held-Out Pilot", | |
| "status": "verified_validation_aware_diagnostic_pilot", | |
| "status_date": "2026-06-06", | |
| "backbone": "Qwen/Qwen3-Omni-30B-A3B-Instruct", | |
| "adapter": "Qwen3-Omni LoRA", | |
| "dataset": "Ropedia Xperience-10M selected 128-episode pilot", | |
| "split_policy": { | |
| "unit": "episode", | |
| "selected_episode_counts": { | |
| "train": 96, | |
| "val": 16, | |
| "test": 16 | |
| }, | |
| "exported_window_counts": { | |
| "train": 2848, | |
| "val": 512, | |
| "test": 448 | |
| }, | |
| "exported_episode_counts": { | |
| "train": 89, | |
| "val": 16, | |
| "test": 14 | |
| }, | |
| "skipped_selected_episodes": 9, | |
| "leakage_policy": "Train, validation, and test are separated by episode/session; test windows are used only for held-out evaluation." | |
| }, | |
| "training": { | |
| "num_processes": 8, | |
| "epochs": 1, | |
| "lora_rank": 16, | |
| "lora_alpha": 32, | |
| "lora_dropout": 0.05, | |
| "num_train_samples": 2848, | |
| "num_val_samples": 512, | |
| "history": [ | |
| { | |
| "epoch": 1, | |
| "train_loss": 0.41304643672440994, | |
| "val_loss": 0.0330660454928875, | |
| "global_step": 356 | |
| } | |
| ], | |
| "loss": "answer-token cross entropy over supervised JSON tokens", | |
| "note": "This validation-aware run uses the selected validation split for validation-loss monitoring during training and preserves the held-out test split for final evaluation." | |
| }, | |
| "evaluation": { | |
| "split": "test", | |
| "num_samples": 448, | |
| "held_out_episode_count": 14, | |
| "json_validity_rate": 0.875, | |
| "action_macro_f1": 0.0026621494447581404, | |
| "subtask_accuracy": 0.006696428571428571, | |
| "transition_accuracy": 0.8504464285714286, | |
| "next_action_accuracy": 0.024553571428571428, | |
| "contact_accuracy": 0.6450892857142857, | |
| "object_micro_f1": 0.22299431459254582, | |
| "quality_target": { | |
| "json_validity_rate": 0.98, | |
| "status": "not_met" | |
| }, | |
| "previous_diagnostic_json_validity_rate": 0.8526785714285714 | |
| }, | |
| "interpretation": "This is a real held-out, multi-episode, validation-aware diagnostic pilot that proves out the end-to-end loop: export, LoRA training with validation monitoring, evaluation, validation, and public-safe packaging. JSON validity improved over the earlier no-validation diagnostic run, but task-quality metrics remain weak, so it should be used as a baseline and error-analysis starting point rather than as a strong Xperience-10M model.", | |
| "public_package": { | |
| "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval", | |
| "audit_status": "pass", | |
| "contains_raw_xperience10m_data": false, | |
| "contains_qwen_base_weights": false, | |
| "contains_lora_weights": false, | |
| "error_analysis": { | |
| "status": "pass", | |
| "path": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/error_analysis_summary.json", | |
| "markdown_report": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_96train_16val_16test_valmon_20260605_eval/analysis/ERROR_ANALYSIS.md", | |
| "groupings": [ | |
| "episode", | |
| "action_family", | |
| "train_seen_status", | |
| "required_modality_state", | |
| "object_category" | |
| ], | |
| "key_readouts": { | |
| "parsed_prediction_rate": 0.8772321428571429, | |
| "weakest_action_family": "locomotion", | |
| "weakest_action_family_samples": 23, | |
| "weakest_action_family_parsed_prediction_rate": 0.2608695652173913, | |
| "seen_action_exact_rate": 0.04580152671755725, | |
| "unseen_action_exact_rate": 0.015772870662460567, | |
| "required_modality_state": "rrd_missing_only_required_modalities_present" | |
| } | |
| } | |
| }, | |
| "required_next_steps": [ | |
| "Improve JSON-format reliability through prompt, decoding, constrained parsing, or target formatting changes.", | |
| "Use the published held-out error analysis to prioritize JSON constraints, action/subtask formatting, object vocabulary handling, and missing-modality robustness.", | |
| "Run a second validation-aware Qwen3-Omni pass only after the JSON/output contract is tightened.", | |
| "Keep the same verified package contract for Cosmos-style world-model and VLA/policy branches." | |
| ] | |
| } | |