Robotics
PyTorch
Cosmos
xperience10m_task_baseline_suite
embodied-ai
multimodal
xperience-10m
baseline
evaluation
qwen3-omni
Instructions to use cy0307/ropedia-xperience-10m-task-baselines with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Cosmos
How to use cy0307/ropedia-xperience-10m-task-baselines with Cosmos:
# No code snippets available yet for this library. # To use this model, check the repository files and the library's documentation. # Want to help? PRs adding snippets are welcome at: # https://github.com/huggingface/huggingface.js
- Notebooks
- Google Colab
- Kaggle
| { | |
| "title": "128-Episode 20-Task Radar", | |
| "status": "pass", | |
| "generated_at_utc": "2026-06-19T11:30:03+00:00", | |
| "description": "Selected 128-episode metadata/raw baselines plus verified Qwen3/Cosmos branches. Every method has 20 records; numeric scores appear only where the public artifact produced that task target.", | |
| "task_count": 20, | |
| "method_count": 7, | |
| "method_task_record_count": 140, | |
| "scored_method_task_count": 116, | |
| "normalization_policy": { | |
| "higher_is_better": "bounded metrics are plotted directly on 0-1 axes after clipping to [0, 1]", | |
| "lower_is_better": "lower-error metrics are converted to best_observed_value / raw_value within the same task", | |
| "raw_values": "raw metric values, metric keys, and sources are retained in this JSON; the SVG is an overview, not a replacement for the metric table", | |
| "result_record_policy": "every method has 20 task records; records without a numeric score carry explicit unsupported/not-evaluated status and reason fields", | |
| "foundation_model_overlay": "Qwen3/Cosmos points are plotted only on task-aligned axes. Scoreless records mean the public result does not evaluate that task contract.", | |
| "metadata_128_overlay": "128-episode aligned baselines have 20 records. Numeric scores come from JSONL metadata/text tasks plus staged sensor-block targets when the processed target exists; raw interaction text and paired camera-view embeddings remain explicit gaps.", | |
| "raw_128_overlay": "128-episode raw-feature baselines use staged sensor NPZ features. Eighteen axes use direct task targets; interaction text and camera-view sync are completed with documented compact proxies because raw interaction strings and paired video-view embeddings are absent from the 128 export." | |
| }, | |
| "source_unified_radar": "docs/data/unified_task_model_radar.json", | |
| "source_result_matrix": "docs/data/task_method_20_result_matrix.json", | |
| "series": [ | |
| { | |
| "id": "metadata128_simple", | |
| "label": "128ep Aligned Simple", | |
| "short_label": "128-S", | |
| "color": "#ffd166", | |
| "kind": "partial_128_episode_aligned_baseline", | |
| "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available", | |
| "stroke_dasharray": "9 6", | |
| "method_detail": "128-episode aligned simple baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 18, | |
| "covered_task_count": 18, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 2, | |
| "unsupported_task_count": 2, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "scored": 18, | |
| "unsupported_without_required_target": 2 | |
| }, | |
| "coverage_fraction": 0.9, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "metadata128_neural_mlp", | |
| "label": "128ep Aligned NN", | |
| "short_label": "128-NN", | |
| "color": "#f472b6", | |
| "kind": "partial_128_episode_aligned_baseline", | |
| "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available", | |
| "stroke_dasharray": "3 6", | |
| "method_detail": "128-episode aligned MLP baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 18, | |
| "covered_task_count": 18, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 2, | |
| "unsupported_task_count": 2, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "not_supported_by_metadata_only_package": 2, | |
| "scored": 18 | |
| }, | |
| "coverage_fraction": 0.9, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "raw128_simple", | |
| "label": "128ep Raw Simple", | |
| "short_label": "128-RS", | |
| "color": "#f59e0b", | |
| "kind": "complete_128_episode_raw_feature_baseline", | |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", | |
| "stroke_dasharray": "8 4", | |
| "method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 20, | |
| "covered_task_count": 20, | |
| "proxy_scored_task_count": 2, | |
| "scoreless_task_count": 0, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "proxy_scored": 2, | |
| "scored": 18 | |
| }, | |
| "coverage_fraction": 1.0, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "raw128_neural_mlp", | |
| "label": "128ep Raw NN", | |
| "short_label": "128-RN", | |
| "color": "#22d3ee", | |
| "kind": "complete_128_episode_raw_feature_baseline", | |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", | |
| "stroke_dasharray": "2 5", | |
| "method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 20, | |
| "covered_task_count": 20, | |
| "proxy_scored_task_count": 2, | |
| "scoreless_task_count": 0, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 0, | |
| "status_counts": { | |
| "proxy_scored": 2, | |
| "scored": 18 | |
| }, | |
| "coverage_fraction": 1.0, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "qwen3_omni_v6_lora", | |
| "label": "Qwen3-Omni v6 LoRA", | |
| "short_label": "Qwen3", | |
| "color": "#9bb8ff", | |
| "kind": "partial_128_episode_foundation_model_overlay", | |
| "scope": "128 selected episodes, held-out test", | |
| "stroke_dasharray": "7 7", | |
| "method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future/retrieval/sensor-target probes scored from task-specific JSON.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 19, | |
| "covered_task_count": 19, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 1, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 1, | |
| "status_counts": { | |
| "not_evaluated_in_verified_package": 1, | |
| "scored": 19 | |
| }, | |
| "coverage_fraction": 0.95, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "cosmos3_super_reasoner", | |
| "label": "Cosmos3-Super Reasoner", | |
| "short_label": "C3-S", | |
| "color": "#ff9c7a", | |
| "kind": "partial_128_episode_foundation_model_overlay", | |
| "scope": "128 selected episodes, held-out test", | |
| "stroke_dasharray": "4 7", | |
| "method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 8/16 and a derived task-20 action-boundary timing probe scored from existing verified JSON.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 10, | |
| "covered_task_count": 10, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 10, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 10, | |
| "status_counts": { | |
| "not_evaluated_in_verified_package": 10, | |
| "scored": 10 | |
| }, | |
| "coverage_fraction": 0.5, | |
| "result_record_fraction": 1.0 | |
| }, | |
| { | |
| "id": "cosmos3_nano_future_window", | |
| "label": "Cosmos3-Nano Future Window", | |
| "short_label": "C3-N", | |
| "color": "#d9c7ff", | |
| "kind": "partial_128_episode_world_model_overlay", | |
| "scope": "128 selected episodes, held-out test", | |
| "stroke_dasharray": "2 7", | |
| "method_detail": "Verified Cosmos3-Nano future-window compatibility metrics, plus tasks 10/13/14/16/17 and a derived task-20 boundary timing probe scored from existing held-out future-window artifacts.", | |
| "plotted_as": "colored point overlay", | |
| "result_record_count": 20, | |
| "scored_task_count": 11, | |
| "covered_task_count": 11, | |
| "proxy_scored_task_count": 0, | |
| "scoreless_task_count": 9, | |
| "unsupported_task_count": 0, | |
| "not_evaluated_task_count": 9, | |
| "status_counts": { | |
| "not_evaluated_in_verified_package": 9, | |
| "scored": 11 | |
| }, | |
| "coverage_fraction": 0.55, | |
| "result_record_fraction": 1.0 | |
| } | |
| ], | |
| "tasks": [ | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "label": "Action Recognition", | |
| "axis_label": "01 Action Recognition", | |
| "short_label": "Action", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.008252821966746326, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.008252821966746326, | |
| "raw_text": "0.0083", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.004175793689174209, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.004175793689174209, | |
| "raw_text": "0.0042", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.002915061325704321, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.002915061325704321, | |
| "raw_text": "0.0029", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.0014955083181204041, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0014955083181204041, | |
| "raw_text": "0.0015", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.0028830723979596335, | |
| "metric_key": "action_macro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0028830723979596335, | |
| "raw_text": "0.0029", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.0008284021201089245, | |
| "metric_key": "action_macro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0008284021201089245, | |
| "raw_text": "0.0008", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.007936507936507936, | |
| "metric_key": "action_accuracy_from_retrieved_future", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.007936507936507936, | |
| "raw_text": "0.0079", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "label": "Procedure Step Recognition", | |
| "axis_label": "02 Procedure Step Recognition", | |
| "short_label": "Step", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.00019512195121951218, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.00019512195121951218, | |
| "raw_text": "0.0002", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 7.207207207207208e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 7.207207207207208e-05, | |
| "raw_text": "0.0001", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 7.35632183908046e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 7.35632183908046e-05, | |
| "raw_text": "0.0001", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.0037313432835820895, | |
| "metric_key": "subtask_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0037313432835820895, | |
| "raw_text": "0.0037", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.0, | |
| "metric_key": "subtask_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "label": "Action Boundary Detection", | |
| "axis_label": "03 Action Boundary Detection", | |
| "short_label": "Boundary", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.29652162550029315, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.29652162550029315, | |
| "raw_text": "0.2965", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.4841733292368365, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.4841733292368365, | |
| "raw_text": "0.4842", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.4203613574238283, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.4203613574238283, | |
| "raw_text": "0.4204", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.4902206914147213, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.4902206914147213, | |
| "raw_text": "0.4902", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.9898313492063492, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.9898313492063492, | |
| "raw_text": "0.9898", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.36830357142857145, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.36830357142857145, | |
| "raw_text": "0.3683", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.9682539682539683, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.9682539682539683, | |
| "raw_text": "0.9683", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "label": "Next-Action Prediction", | |
| "axis_label": "04 Next-Action Prediction", | |
| "short_label": "Next act", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.006514774539765508, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.006514774539765508, | |
| "raw_text": "0.0065", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.004910507980164745, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.004910507980164745, | |
| "raw_text": "0.0049", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.003285273363482094, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.003285273363482094, | |
| "raw_text": "0.0033", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.0018477984371755407, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0018477984371755407, | |
| "raw_text": "0.0018", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.04305335446381405, | |
| "metric_key": "next_action_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.04305335446381405, | |
| "raw_text": "0.0431", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.013392857142857142, | |
| "metric_key": "next_action_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.013392857142857142, | |
| "raw_text": "0.0134", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.007936507936507936, | |
| "metric_key": "action_accuracy_from_retrieved_future", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.007936507936507936, | |
| "raw_text": "0.0079", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "label": "Hand Trajectory Forecasting", | |
| "axis_label": "05 Hand Trajectory Forecasting", | |
| "short_label": "Hand traj", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "mpjpe", | |
| "metric_name": "MPJPE", | |
| "metric_direction": "lower", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 8.817333221435547, | |
| "metric_key": "mpjpe", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.012231610603598841, | |
| "raw_text": "8.817", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.429434210062027, | |
| "metric_key": "mpjpe", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.25114484128127007, | |
| "raw_text": "0.4294", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.2729249894618988, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.39516420515180267, | |
| "raw_text": "0.2729", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.18475216627120972, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.5837560051580399, | |
| "raw_text": "0.1848", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.7216105627267382, | |
| "metric_key": "hand_trajectory_forecast_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.149457605109387, | |
| "raw_text": "0.7216", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "label": "Contact State Prediction", | |
| "axis_label": "06 Contact State Prediction", | |
| "short_label": "Contact", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.4381481308057444, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.4381481308057444, | |
| "raw_text": "0.4381", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.5682695682695682, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.5682695682695682, | |
| "raw_text": "0.5683", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.886990707397193, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.886990707397193, | |
| "raw_text": "0.8870", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 1.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 1.0, | |
| "raw_text": "1.000", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.8177083333333334, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.8177083333333334, | |
| "raw_text": "0.8177", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.32142857142857145, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.32142857142857145, | |
| "raw_text": "0.3214", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.7433862433862434, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.7433862433862434, | |
| "raw_text": "0.7434", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "label": "Object Relevance Prediction", | |
| "axis_label": "07 Object Relevance Prediction", | |
| "short_label": "Objects", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "micro_f1", | |
| "metric_name": "micro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.17764578833693304, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.17764578833693304, | |
| "raw_text": "0.1776", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.18662723837686876, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.18662723837686876, | |
| "raw_text": "0.1866", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.0655376369662084, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0655376369662084, | |
| "raw_text": "0.0655", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.1765890386972509, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.1765890386972509, | |
| "raw_text": "0.1766", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.3064982378331287, | |
| "metric_key": "object_micro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.3064982378331287, | |
| "raw_text": "0.3065", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.13704276146316333, | |
| "metric_key": "object_micro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.13704276146316333, | |
| "raw_text": "0.1370", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "label": "Language Grounding", | |
| "axis_label": "08 Language Grounding", | |
| "short_label": "Language", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.002332374220713973, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.002332374220713973, | |
| "raw_text": "0.0023", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.008236799389123917, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.008236799389123917, | |
| "raw_text": "0.0082", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.011138836853206158, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.011138836853206158, | |
| "raw_text": "0.0111", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.0063402121886610985, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0063402121886610985, | |
| "raw_text": "0.0063", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.8764467592592605, | |
| "metric_key": "caption_grounding_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.8764467592592605, | |
| "raw_text": "0.8764", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.30639899644580487, | |
| "metric_key": "caption_grounding_iou", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/caption_grounding/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.30639899644580487, | |
| "raw_text": "0.3064", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "label": "Cross-Modal Retrieval", | |
| "axis_label": "09 Cross-Modal Retrieval", | |
| "short_label": "X-modal", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.002587692579254508, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.002587692579254508, | |
| "raw_text": "0.0026", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.0026067993603646755, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0026067993603646755, | |
| "raw_text": "0.0026", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.003459817497059703, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.003459817497059703, | |
| "raw_text": "0.0035", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.002535284962505102, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.002535284962505102, | |
| "raw_text": "0.0025", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.5080191798941732, | |
| "metric_key": "cross_modal_retrieval_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_cross_modal_retrieval_probe_a100_20260618T000000Z/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.5080191798941732, | |
| "raw_text": "0.5080", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.022138720585222767, | |
| "metric_key": "future_retrieval_mrr", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.022138720585222767, | |
| "raw_text": "0.0221", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "label": "Cross-Modal Reconstruction", | |
| "axis_label": "10 Cross-Modal Reconstruction", | |
| "short_label": "Recon", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "r2", | |
| "metric_name": "R2", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": -190.66106203944798, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "-190.66", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": -0.43481132003942147, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "-0.4348", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": -1.3450960391924882, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "-1.345", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": -1.3974418160502369, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "-1.397", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.9670547540707002, | |
| "metric_key": "modality_reconstruction_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.9670547540707002, | |
| "raw_text": "0.9671", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "r2", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.0002873382957286892, | |
| "metric_key": "feature_reconstruction_quality", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/modality_reconstruction/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0002873382957286892, | |
| "raw_text": "0.0003", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "label": "Temporal Order Verification", | |
| "axis_label": "11 Temporal Order Verification", | |
| "short_label": "Order", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "f1", | |
| "metric_name": "F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.4198864140782312, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.4198864140782312, | |
| "raw_text": "0.4199", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.8252408266656923, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.8252408266656923, | |
| "raw_text": "0.8252", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.49824413370686593, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.49824413370686593, | |
| "raw_text": "0.4982", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.8030047098504103, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.8030047098504103, | |
| "raw_text": "0.8030", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.40984631701404173, | |
| "metric_key": "temporal_order_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.40984631701404173, | |
| "raw_text": "0.4098", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "label": "Multimodal Synchronization Detection", | |
| "axis_label": "12 Multimodal Synchronization Detection", | |
| "short_label": "Sync", | |
| "origin": "original_public_sample_tasks", | |
| "metric_key": "f1", | |
| "metric_name": "F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.49980060227663614, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.49980060227663614, | |
| "raw_text": "0.4998", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.7773773780941162, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.7773773780941162, | |
| "raw_text": "0.7774", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.4958867673901769, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.4958867673901769, | |
| "raw_text": "0.4959", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.8272709077974252, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.8272709077974252, | |
| "raw_text": "0.8273", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.3344936184319576, | |
| "metric_key": "misalignment_detection_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.3344936184319576, | |
| "raw_text": "0.3345", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "label": "Long-Horizon Next-Action Forecasting", | |
| "axis_label": "13 Long-Horizon Next-Action Forecasting", | |
| "short_label": "Long act", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.004579592783699693, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.004579592783699693, | |
| "raw_text": "0.0046", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.0029821307969142615, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0029821307969142615, | |
| "raw_text": "0.0030", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.0024280172369056294, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0024280172369056294, | |
| "raw_text": "0.0024", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.001063859887389299, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.001063859887389299, | |
| "raw_text": "0.0011", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.0023356666867101906, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0023356666867101906, | |
| "raw_text": "0.0023", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.008807588075880758, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.008807588075880758, | |
| "raw_text": "0.0088", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.0024906600249066007, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0024906600249066007, | |
| "raw_text": "0.0025", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "label": "Long-Horizon Next-Subtask Forecasting", | |
| "axis_label": "14 Long-Horizon Next-Subtask Forecasting", | |
| "short_label": "Long step", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.0001206030150753769, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0001206030150753769, | |
| "raw_text": "0.0001", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 2.086049543676662e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 2.086049543676662e-05, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.004206715978529301, | |
| "metric_key": "next_subtask_forecast_macro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.004206715978529301, | |
| "raw_text": "0.0042", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.006614876224708678, | |
| "metric_key": "next_subtask_forecast_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/next_subtask_forecast/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.006614876224708678, | |
| "raw_text": "0.0066", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "label": "Interaction Text Prediction", | |
| "axis_label": "15 Interaction Text Prediction", | |
| "short_label": "Interact txt", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": true, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "unsupported_without_required_target", | |
| "reason": "requires raw annotation.hdf5 caption interaction text; the public 128 JSONL keeps only structured labels and derived metadata", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "unsupported" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "not_supported_by_metadata_only_package", | |
| "reason": "the 128-episode aligned rerun did not produce this task target; raw interaction text, paired camera-view embeddings, or a task-specific target builder is required", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not supported" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.012611998261547169, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "proxy_scored", | |
| "reason": "documented compact proxy completion for this raw128 task axis", | |
| "normalized_score": 0.012611998261547169, | |
| "raw_text": "0.0126", | |
| "status_label": "proxy scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.009791421280985521, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "proxy_scored", | |
| "reason": "documented compact proxy completion for this raw128 task axis", | |
| "normalized_score": 0.009791421280985521, | |
| "raw_text": "0.0098", | |
| "status_label": "proxy scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "label": "Action-Object Relation Prediction", | |
| "axis_label": "16 Action-Object Relation Prediction", | |
| "short_label": "Act+obj", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "macro_f1", | |
| "metric_name": "macro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.0002220083079671497, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0002220083079671497, | |
| "raw_text": "0.0002", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 0.0, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.0, | |
| "raw_text": "0.0000", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.002794157670325683, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.002794157670325683, | |
| "raw_text": "0.0028", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "label": "Future Object-Set Forecasting", | |
| "axis_label": "17 Future Object-Set Forecasting", | |
| "short_label": "Future obj", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "micro_f1", | |
| "metric_name": "micro-F1", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.17656983343047333, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.17656983343047333, | |
| "raw_text": "0.1766", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.17418550827844048, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.17418550827844048, | |
| "raw_text": "0.1742", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.06469493412657774, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.06469493412657774, | |
| "raw_text": "0.0647", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.17523098630012288, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.17523098630012288, | |
| "raw_text": "0.1752", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.1659483964851402, | |
| "metric_key": "object_set_forecast_micro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.1659483964851402, | |
| "raw_text": "0.1659", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 0.01781970649895178, | |
| "metric_key": "object_set_forecast_micro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/object_set_forecast/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.01781970649895178, | |
| "raw_text": "0.0178", | |
| "status_label": "scored" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "label": "IMU-to-Hand Pose Reconstruction", | |
| "axis_label": "18 IMU-to-Hand Pose Reconstruction", | |
| "short_label": "IMU->hand", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "mae", | |
| "metric_name": "MAE", | |
| "metric_direction": "lower", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 0.2294670194387436, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.18324815505876868, | |
| "raw_text": "0.2295", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 0.2555866539478302, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.16452114110609004, | |
| "raw_text": "0.2556", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.22941437363624573, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.1832902066792771, | |
| "raw_text": "0.2294", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.252998411655426, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.1662042369509182, | |
| "raw_text": "0.2530", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.9641651902471952, | |
| "metric_key": "imu_to_hand_pose_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.043612244441436056, | |
| "raw_text": "0.9642", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "label": "Camera-View Synchronization Retrieval", | |
| "axis_label": "19 Camera-View Synchronization Retrieval", | |
| "short_label": "Cam sync", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "mrr", | |
| "metric_name": "MRR", | |
| "metric_direction": "higher", | |
| "raw128_proxy_axis": true, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": null, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "unsupported_without_required_target", | |
| "reason": "requires paired camera-view feature blocks, which are not in the public 128 JSONL metadata package", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "unsupported" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "not_supported_by_metadata_only_package", | |
| "reason": "the 128-episode aligned rerun did not produce this task target; raw interaction text, paired camera-view embeddings, or a task-specific target builder is required", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not supported" | |
| }, | |
| "raw128_simple": { | |
| "raw": 0.0026625150348991156, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "proxy_scored", | |
| "reason": "documented compact proxy completion for this raw128 task axis", | |
| "normalized_score": 0.0026625150348991156, | |
| "raw_text": "0.0027", | |
| "status_label": "proxy scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 0.0025448438245803118, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "proxy_scored", | |
| "reason": "documented compact proxy completion for this raw128 task axis", | |
| "normalized_score": 0.0025448438245803118, | |
| "raw_text": "0.0025", | |
| "status_label": "proxy scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 0.6587714947089998, | |
| "metric_key": "camera_view_sync_retrieval_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_camera_view_sync_mosaic_tile_a100_20260619T0305Z/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.6587714947089998, | |
| "raw_text": "0.6588", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "not_evaluated_in_verified_package", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", | |
| "normalized_score": null, | |
| "raw_text": "n/a", | |
| "status_label": "not evaluated" | |
| } | |
| } | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "label": "Time-to-Next-Transition Regression", | |
| "axis_label": "20 Time-to-Next-Transition Regression", | |
| "short_label": "Time2bdry", | |
| "origin": "additional_public_sample_tasks", | |
| "metric_key": "mae", | |
| "metric_name": "MAE frames", | |
| "metric_direction": "lower", | |
| "raw128_proxy_axis": false, | |
| "values": { | |
| "metadata128_simple": { | |
| "raw": 624.8108520507812, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.016864874132806403, | |
| "raw_text": "624.81", | |
| "status_label": "scored" | |
| }, | |
| "metadata128_neural_mlp": { | |
| "raw": 41.4664421081543, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.25411768748242325, | |
| "raw_text": "41.47", | |
| "status_label": "scored" | |
| }, | |
| "raw128_simple": { | |
| "raw": 52.32759475708008, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.20137284019197565, | |
| "raw_text": "52.33", | |
| "status_label": "scored" | |
| }, | |
| "raw128_neural_mlp": { | |
| "raw": 42.374061584472656, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.24867468405504953, | |
| "raw_text": "42.37", | |
| "status_label": "scored" | |
| }, | |
| "qwen3_omni_v6_lora": { | |
| "raw": 134.0687422166874, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.07859666766782253, | |
| "raw_text": "134.07", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_super_reasoner": { | |
| "raw": 52.94642857142857, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.19901920981190058, | |
| "raw_text": "52.95", | |
| "status_label": "scored" | |
| }, | |
| "cosmos3_nano_future_window": { | |
| "raw": 33.80952380952381, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "status": "scored", | |
| "reason": null, | |
| "normalized_score": 0.3116682871966295, | |
| "raw_text": "33.81", | |
| "status_label": "scored" | |
| } | |
| } | |
| } | |
| ], | |
| "task_method_result_matrix": [ | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.008252821966746326, | |
| "raw_text": "0.0083", | |
| "normalized_score": 0.008252821966746326, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004175793689174209, | |
| "raw_text": "0.0042", | |
| "normalized_score": 0.004175793689174209, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002915061325704321, | |
| "raw_text": "0.0029", | |
| "normalized_score": 0.002915061325704321, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0014955083181204041, | |
| "raw_text": "0.0015", | |
| "normalized_score": 0.0014955083181204041, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0028830723979596335, | |
| "raw_text": "0.0029", | |
| "normalized_score": 0.0028830723979596335, | |
| "metric_key": "action_macro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0008284021201089245, | |
| "raw_text": "0.0008", | |
| "normalized_score": 0.0008284021201089245, | |
| "metric_key": "action_macro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 1, | |
| "task_id": "timeline_action", | |
| "task_label": "Action Recognition", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.007936507936507936, | |
| "raw_text": "0.0079", | |
| "normalized_score": 0.007936507936507936, | |
| "metric_key": "action_accuracy_from_retrieved_future", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.00019512195121951218, | |
| "raw_text": "0.0002", | |
| "normalized_score": 0.00019512195121951218, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 7.207207207207208e-05, | |
| "raw_text": "0.0001", | |
| "normalized_score": 7.207207207207208e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 7.35632183908046e-05, | |
| "raw_text": "0.0001", | |
| "normalized_score": 7.35632183908046e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/timeline_subtask/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0037313432835820895, | |
| "raw_text": "0.0037", | |
| "normalized_score": 0.0037313432835820895, | |
| "metric_key": "subtask_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "subtask_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 2, | |
| "task_id": "timeline_subtask", | |
| "task_label": "Procedure Step Recognition", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.29652162550029315, | |
| "raw_text": "0.2965", | |
| "normalized_score": 0.29652162550029315, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4841733292368365, | |
| "raw_text": "0.4842", | |
| "normalized_score": 0.4841733292368365, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4203613574238283, | |
| "raw_text": "0.4204", | |
| "normalized_score": 0.4203613574238283, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4902206914147213, | |
| "raw_text": "0.4902", | |
| "normalized_score": 0.4902206914147213, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/transition_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.9898313492063492, | |
| "raw_text": "0.9898", | |
| "normalized_score": 0.9898313492063492, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.36830357142857145, | |
| "raw_text": "0.3683", | |
| "normalized_score": 0.36830357142857145, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 3, | |
| "task_id": "transition_detection", | |
| "task_label": "Action Boundary Detection", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.9682539682539683, | |
| "raw_text": "0.9683", | |
| "normalized_score": 0.9682539682539683, | |
| "metric_key": "transition_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.006514774539765508, | |
| "raw_text": "0.0065", | |
| "normalized_score": 0.006514774539765508, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004910507980164745, | |
| "raw_text": "0.0049", | |
| "normalized_score": 0.004910507980164745, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.003285273363482094, | |
| "raw_text": "0.0033", | |
| "normalized_score": 0.003285273363482094, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0018477984371755407, | |
| "raw_text": "0.0018", | |
| "normalized_score": 0.0018477984371755407, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.04305335446381405, | |
| "raw_text": "0.0431", | |
| "normalized_score": 0.04305335446381405, | |
| "metric_key": "next_action_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.013392857142857142, | |
| "raw_text": "0.0134", | |
| "normalized_score": 0.013392857142857142, | |
| "metric_key": "next_action_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 4, | |
| "task_id": "next_action", | |
| "task_label": "Next-Action Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.007936507936507936, | |
| "raw_text": "0.0079", | |
| "normalized_score": 0.007936507936507936, | |
| "metric_key": "action_accuracy_from_retrieved_future", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 8.817333221435547, | |
| "raw_text": "8.817", | |
| "normalized_score": 0.012231610603598841, | |
| "metric_key": "mpjpe", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.429434210062027, | |
| "raw_text": "0.4294", | |
| "normalized_score": 0.25114484128127007, | |
| "metric_key": "mpjpe", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.2729249894618988, | |
| "raw_text": "0.2729", | |
| "normalized_score": 0.39516420515180267, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.18475216627120972, | |
| "raw_text": "0.1848", | |
| "normalized_score": 0.5837560051580399, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.7216105627267382, | |
| "raw_text": "0.7216", | |
| "normalized_score": 0.149457605109387, | |
| "metric_key": "hand_trajectory_forecast_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/hand_trajectory_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 5, | |
| "task_id": "hand_trajectory_forecast", | |
| "task_label": "Hand Trajectory Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mpjpe", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4381481308057444, | |
| "raw_text": "0.4381", | |
| "normalized_score": 0.4381481308057444, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.5682695682695682, | |
| "raw_text": "0.5683", | |
| "normalized_score": 0.5682695682695682, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.886990707397193, | |
| "raw_text": "0.8870", | |
| "normalized_score": 0.886990707397193, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 1.0, | |
| "raw_text": "1.000", | |
| "normalized_score": 1.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/contact_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8177083333333334, | |
| "raw_text": "0.8177", | |
| "normalized_score": 0.8177083333333334, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.32142857142857145, | |
| "raw_text": "0.3214", | |
| "normalized_score": 0.32142857142857145, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 6, | |
| "task_id": "contact_prediction", | |
| "task_label": "Contact State Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.7433862433862434, | |
| "raw_text": "0.7434", | |
| "normalized_score": 0.7433862433862434, | |
| "metric_key": "contact_accuracy", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.17764578833693304, | |
| "raw_text": "0.1776", | |
| "normalized_score": 0.17764578833693304, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.18662723837686876, | |
| "raw_text": "0.1866", | |
| "normalized_score": 0.18662723837686876, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0655376369662084, | |
| "raw_text": "0.0655", | |
| "normalized_score": 0.0655376369662084, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.1765890386972509, | |
| "raw_text": "0.1766", | |
| "normalized_score": 0.1765890386972509, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_relevance/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.3064982378331287, | |
| "raw_text": "0.3065", | |
| "normalized_score": 0.3064982378331287, | |
| "metric_key": "object_micro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.13704276146316333, | |
| "raw_text": "0.1370", | |
| "normalized_score": 0.13704276146316333, | |
| "metric_key": "object_micro_f1", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 7, | |
| "task_id": "object_relevance", | |
| "task_label": "Object Relevance Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002332374220713973, | |
| "raw_text": "0.0023", | |
| "normalized_score": 0.002332374220713973, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.008236799389123917, | |
| "raw_text": "0.0082", | |
| "normalized_score": 0.008236799389123917, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.011138836853206158, | |
| "raw_text": "0.0111", | |
| "normalized_score": 0.011138836853206158, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0063402121886610985, | |
| "raw_text": "0.0063", | |
| "normalized_score": 0.0063402121886610985, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8764467592592605, | |
| "raw_text": "0.8764", | |
| "normalized_score": 0.8764467592592605, | |
| "metric_key": "caption_grounding_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_retrieval_task_probes_a100_20260617T175919Z/caption_grounding/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.30639899644580487, | |
| "raw_text": "0.3064", | |
| "normalized_score": 0.30639899644580487, | |
| "metric_key": "caption_grounding_iou", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/caption_grounding/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 8, | |
| "task_id": "caption_grounding", | |
| "task_label": "Language Grounding", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002587692579254508, | |
| "raw_text": "0.0026", | |
| "normalized_score": 0.002587692579254508, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0026067993603646755, | |
| "raw_text": "0.0026", | |
| "normalized_score": 0.0026067993603646755, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.003459817497059703, | |
| "raw_text": "0.0035", | |
| "normalized_score": 0.003459817497059703, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002535284962505102, | |
| "raw_text": "0.0025", | |
| "normalized_score": 0.002535284962505102, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.5080191798941732, | |
| "raw_text": "0.5080", | |
| "normalized_score": 0.5080191798941732, | |
| "metric_key": "cross_modal_retrieval_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_cross_modal_retrieval_probe_a100_20260618T000000Z/cross_modal_retrieval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 9, | |
| "task_id": "cross_modal_retrieval", | |
| "task_label": "Cross-Modal Retrieval", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.022138720585222767, | |
| "raw_text": "0.0221", | |
| "normalized_score": 0.022138720585222767, | |
| "metric_key": "future_retrieval_mrr", | |
| "source": "results/omni_finetune/verified_public/xperience10m_cosmos3_nano_128ep_future_window_h5_compat_adapter_eval_test_full/eval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -190.66106203944798, | |
| "raw_text": "-190.66", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -0.43481132003942147, | |
| "raw_text": "-0.4348", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -1.3450960391924882, | |
| "raw_text": "-1.345", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": -1.3974418160502369, | |
| "raw_text": "-1.397", | |
| "normalized_score": 0.0, | |
| "metric_key": "r2", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.9670547540707002, | |
| "raw_text": "0.9671", | |
| "normalized_score": 0.9670547540707002, | |
| "metric_key": "modality_reconstruction_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/modality_reconstruction/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "r2", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 10, | |
| "task_id": "modality_reconstruction", | |
| "task_label": "Cross-Modal Reconstruction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0002873382957286892, | |
| "raw_text": "0.0003", | |
| "normalized_score": 0.0002873382957286892, | |
| "metric_key": "feature_reconstruction_quality", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/modality_reconstruction/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4198864140782312, | |
| "raw_text": "0.4199", | |
| "normalized_score": 0.4198864140782312, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8252408266656923, | |
| "raw_text": "0.8252", | |
| "normalized_score": 0.8252408266656923, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.49824413370686593, | |
| "raw_text": "0.4982", | |
| "normalized_score": 0.49824413370686593, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8030047098504103, | |
| "raw_text": "0.8030", | |
| "normalized_score": 0.8030047098504103, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.40984631701404173, | |
| "raw_text": "0.4098", | |
| "normalized_score": 0.40984631701404173, | |
| "metric_key": "temporal_order_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/temporal_order/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 11, | |
| "task_id": "temporal_order", | |
| "task_label": "Temporal Order Verification", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.49980060227663614, | |
| "raw_text": "0.4998", | |
| "normalized_score": 0.49980060227663614, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.7773773780941162, | |
| "raw_text": "0.7774", | |
| "normalized_score": 0.7773773780941162, | |
| "metric_key": "f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.4958867673901769, | |
| "raw_text": "0.4959", | |
| "normalized_score": 0.4958867673901769, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.8272709077974252, | |
| "raw_text": "0.8273", | |
| "normalized_score": 0.8272709077974252, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.3344936184319576, | |
| "raw_text": "0.3345", | |
| "normalized_score": 0.3344936184319576, | |
| "metric_key": "misalignment_detection_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/misalignment_detection/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 12, | |
| "task_id": "misalignment_detection", | |
| "task_label": "Multimodal Synchronization Detection", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004579592783699693, | |
| "raw_text": "0.0046", | |
| "normalized_score": 0.004579592783699693, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0029821307969142615, | |
| "raw_text": "0.0030", | |
| "normalized_score": 0.0029821307969142615, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0024280172369056294, | |
| "raw_text": "0.0024", | |
| "normalized_score": 0.0024280172369056294, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.001063859887389299, | |
| "raw_text": "0.0011", | |
| "normalized_score": 0.001063859887389299, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0023356666867101906, | |
| "raw_text": "0.0023", | |
| "normalized_score": 0.0023356666867101906, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/long_horizon_next_action/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.008807588075880758, | |
| "raw_text": "0.0088", | |
| "normalized_score": 0.008807588075880758, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 13, | |
| "task_id": "long_horizon_next_action", | |
| "task_label": "Long-Horizon Next-Action Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0024906600249066007, | |
| "raw_text": "0.0025", | |
| "normalized_score": 0.0024906600249066007, | |
| "metric_key": "long_horizon_next_action_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/long_horizon_next_action/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0001206030150753769, | |
| "raw_text": "0.0001", | |
| "normalized_score": 0.0001206030150753769, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 2.086049543676662e-05, | |
| "raw_text": "0.0000", | |
| "normalized_score": 2.086049543676662e-05, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.004206715978529301, | |
| "raw_text": "0.0042", | |
| "normalized_score": 0.004206715978529301, | |
| "metric_key": "next_subtask_forecast_macro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/next_subtask_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 14, | |
| "task_id": "next_subtask_forecast", | |
| "task_label": "Long-Horizon Next-Subtask Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.006614876224708678, | |
| "raw_text": "0.0066", | |
| "normalized_score": 0.006614876224708678, | |
| "metric_key": "next_subtask_forecast_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/next_subtask_forecast/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "unsupported_without_required_target", | |
| "status_label": "unsupported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": "requires raw annotation.hdf5 caption interaction text; the public 128 JSONL keeps only structured labels and derived metadata" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": "the 128-episode aligned rerun did not produce this task target; raw interaction text, paired camera-view embeddings, or a task-specific target builder is required" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.012611998261547169, | |
| "raw_text": "0.0126", | |
| "normalized_score": 0.012611998261547169, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.009791421280985521, | |
| "raw_text": "0.0098", | |
| "normalized_score": 0.009791421280985521, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 15, | |
| "task_id": "interaction_text_prediction", | |
| "task_label": "Interaction Text Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "macro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "macro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/action_object_relation/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0002220083079671497, | |
| "raw_text": "0.0002", | |
| "normalized_score": 0.0002220083079671497, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/qwen3_omni_v6_lora/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.0, | |
| "raw_text": "0.0000", | |
| "normalized_score": 0.0, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 16, | |
| "task_id": "action_object_relation", | |
| "task_label": "Action-Object Relation Prediction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.002794157670325683, | |
| "raw_text": "0.0028", | |
| "normalized_score": 0.002794157670325683, | |
| "metric_key": "action_object_relation_macro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/action_object_relation/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.17656983343047333, | |
| "raw_text": "0.1766", | |
| "normalized_score": 0.17656983343047333, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.17418550827844048, | |
| "raw_text": "0.1742", | |
| "normalized_score": 0.17418550827844048, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.06469493412657774, | |
| "raw_text": "0.0647", | |
| "normalized_score": 0.06469493412657774, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.17523098630012288, | |
| "raw_text": "0.1752", | |
| "normalized_score": 0.17523098630012288, | |
| "metric_key": "micro_f1", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.1659483964851402, | |
| "raw_text": "0.1659", | |
| "normalized_score": 0.1659483964851402, | |
| "metric_key": "object_set_forecast_micro_f1", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_future_task_probes_a100_20260616T143608Z/object_set_forecast/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "micro_f1", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 17, | |
| "task_id": "object_set_forecast", | |
| "task_label": "Future Object-Set Forecasting", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.01781970649895178, | |
| "raw_text": "0.0178", | |
| "normalized_score": 0.01781970649895178, | |
| "metric_key": "object_set_forecast_micro_f1", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/object_set_forecast/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.2294670194387436, | |
| "raw_text": "0.2295", | |
| "normalized_score": 0.18324815505876868, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.2555866539478302, | |
| "raw_text": "0.2556", | |
| "normalized_score": 0.16452114110609004, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_aligned_sensor_block_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.22941437363624573, | |
| "raw_text": "0.2294", | |
| "normalized_score": 0.1832902066792771, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.252998411655426, | |
| "raw_text": "0.2530", | |
| "normalized_score": 0.1662042369509182, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.9641651902471952, | |
| "raw_text": "0.9642", | |
| "normalized_score": 0.043612244441436056, | |
| "metric_key": "imu_to_hand_pose_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_sensor_target_probes_a100_20260619T000000Z/imu_to_hand_pose/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 18, | |
| "task_id": "imu_to_hand_pose", | |
| "task_label": "IMU-to-Hand Pose Reconstruction", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mae", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "unsupported_without_required_target", | |
| "status_label": "unsupported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": "requires paired camera-view feature blocks, which are not in the public 128 JSONL metadata package" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "not_supported_by_metadata_only_package", | |
| "status_label": "not supported", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": "the 128-episode aligned rerun did not produce this task target; raw interaction text, paired camera-view embeddings, or a task-specific target builder is required" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.0026625150348991156, | |
| "raw_text": "0.0027", | |
| "normalized_score": 0.0026625150348991156, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "proxy_scored", | |
| "status_label": "proxy scored", | |
| "scored": true, | |
| "proxy_scored": true, | |
| "raw": 0.0025448438245803118, | |
| "raw_text": "0.0025", | |
| "normalized_score": 0.0025448438245803118, | |
| "metric_key": "mrr", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": "documented compact proxy completion for this raw128 task axis" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 0.6587714947089998, | |
| "raw_text": "0.6588", | |
| "normalized_score": 0.6587714947089998, | |
| "metric_key": "camera_view_sync_retrieval_mrr", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_camera_view_sync_mosaic_tile_a100_20260619T0305Z/camera_view_sync_retrieval/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 19, | |
| "task_id": "camera_view_sync_retrieval", | |
| "task_label": "Camera-View Synchronization Retrieval", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "not_evaluated_in_verified_package", | |
| "status_label": "not evaluated", | |
| "scored": false, | |
| "proxy_scored": false, | |
| "raw": null, | |
| "raw_text": "n/a", | |
| "normalized_score": null, | |
| "metric_key": "mrr", | |
| "source": null, | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score" | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "metadata128_simple", | |
| "method": "128ep Aligned Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 624.8108520507812, | |
| "raw_text": "624.81", | |
| "normalized_score": 0.016864874132806403, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "metadata128_neural_mlp", | |
| "method": "128ep Aligned NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 41.4664421081543, | |
| "raw_text": "41.47", | |
| "normalized_score": 0.25411768748242325, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_aligned_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "raw128_simple", | |
| "method": "128ep Raw Simple", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 52.32759475708008, | |
| "raw_text": "52.33", | |
| "normalized_score": 0.20137284019197565, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "raw128_neural_mlp", | |
| "method": "128ep Raw NN", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 42.374061584472656, | |
| "raw_text": "42.37", | |
| "normalized_score": 0.24867468405504953, | |
| "metric_key": "mae", | |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_raw_sensor_feature_baseline", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "qwen3_omni_v6_lora", | |
| "method": "Qwen3-Omni v6 LoRA", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 134.0687422166874, | |
| "raw_text": "134.07", | |
| "normalized_score": 0.07859666766782253, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/xperience10m_qwen3_omni_v6_order_sync_time_probes_a100_20260617T132500Z/time_to_transition/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "cosmos3_super_reasoner", | |
| "method": "Cosmos3-Super Reasoner", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 52.94642857142857, | |
| "raw_text": "52.95", | |
| "normalized_score": 0.19901920981190058, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_super_reasoner/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| }, | |
| { | |
| "task_number": 20, | |
| "task_id": "time_to_transition", | |
| "task_label": "Time-to-Next-Transition Regression", | |
| "series_id": "cosmos3_nano_future_window", | |
| "method": "Cosmos3-Nano Future Window", | |
| "status": "scored", | |
| "status_label": "scored", | |
| "scored": true, | |
| "proxy_scored": false, | |
| "raw": 33.80952380952381, | |
| "raw_text": "33.81", | |
| "normalized_score": 0.3116682871966295, | |
| "metric_key": "time_to_transition_mae", | |
| "source": "results/omni_finetune/model_output_task_probes_20260616/time_to_transition/cosmos3_nano_future_window/metrics.json", | |
| "scope": "multi_episode_128_partial_model_overlay", | |
| "reason": null | |
| } | |
| ] | |
| } | |