| { |
| "generated_at_utc": "2026-06-22T09:56:30+00:00", |
| "interpretation_rule": "Read the 1-episode line as the inspectable task lab. Read the 128-episode line as the selected comparison surface for metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano.", |
| "lines": [ |
| { |
| "artifact_entry_points": [ |
| "docs/data/single_episode_task_model_radar.json", |
| "docs/data/two_evidence_line_result_summary.json", |
| "results/episode_task_suite/summary_report.json", |
| "results/episode_task_suite/feature_manifest.json", |
| "docs/single_episode_explorer.html" |
| ], |
| "best_read_as": "Inspect the raw sample, understand file organization, reproduce the 20 task targets, and compare Minimal vs Neural MLP behavior inside one episode.", |
| "data_unit": "One public Xperience-10M sample episode", |
| "direct_scored_method_task_count": 40, |
| "id": "single_public_sample_episode", |
| "label": "1 sample episode", |
| "method_count": 2, |
| "method_task_record_count": 40, |
| "methods": [ |
| { |
| "direct_scored_task_count": 20, |
| "id": "minimal", |
| "label": "Minimal", |
| "method_detail": "Single-episode simple heads over the public sample split.", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "1 public sample episode", |
| "scored_task_count": 20, |
| "status_counts": { |
| "scored": 20 |
| } |
| }, |
| { |
| "direct_scored_task_count": 20, |
| "id": "neural_mlp", |
| "label": "Neural MLP", |
| "method_detail": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "1 public sample episode", |
| "scored_task_count": 20, |
| "status_counts": { |
| "scored": 20 |
| } |
| } |
| ], |
| "primary_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.", |
| "primary_visuals": [ |
| "docs/assets/charts/two_evidence_line_map.svg", |
| "docs/assets/charts/single_episode_task_model_radar.svg" |
| ], |
| "proxy_scored_method_task_count": 0, |
| "read_separately_from": "The selected-128 comparison rows and broader held-out model behavior.", |
| "result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.", |
| "scored_method_task_count": 40, |
| "short_label": "Line 1", |
| "task_count": 20 |
| }, |
| { |
| "artifact_entry_points": [ |
| "docs/data/episode128_task_model_radar.json", |
| "docs/data/two_evidence_line_result_summary.json", |
| "docs/data/xperience10m_128_episode_feature_index.json", |
| "docs/data/omni_model_comparison.json", |
| "docs/data/qwen3_omni_run_lineage.json", |
| "docs/data/task_method_20_gap_audit.json" |
| ], |
| "best_read_as": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano while keeping the 6 compact-proxy cells visible.", |
| "data_unit": "Selected held-out 96/16/16 split with public-safe processed features linked to official gated episode paths", |
| "direct_scored_method_task_count": 134, |
| "id": "selected_128_episode_surface", |
| "label": "128 selected episodes", |
| "method_count": 7, |
| "method_task_record_count": 140, |
| "methods": [ |
| { |
| "direct_scored_task_count": 19, |
| "id": "metadata128_simple", |
| "label": "128ep Aligned Simple", |
| "method_detail": "128-episode aligned simple baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists; task 19 uses a compact proxy.", |
| "proxy_scored_task_count": 1, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available", |
| "scored_task_count": 20, |
| "status_counts": { |
| "proxy_scored": 1, |
| "scored": 19 |
| } |
| }, |
| { |
| "direct_scored_task_count": 19, |
| "id": "metadata128_neural_mlp", |
| "label": "128ep Aligned NN", |
| "method_detail": "128-episode aligned MLP baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists; task 19 uses a compact proxy.", |
| "proxy_scored_task_count": 1, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available", |
| "scored_task_count": 20, |
| "status_counts": { |
| "proxy_scored": 1, |
| "scored": 19 |
| } |
| }, |
| { |
| "direct_scored_task_count": 18, |
| "id": "raw128_simple", |
| "label": "128ep Raw Simple", |
| "method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.", |
| "proxy_scored_task_count": 2, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", |
| "scored_task_count": 20, |
| "status_counts": { |
| "proxy_scored": 2, |
| "scored": 18 |
| } |
| }, |
| { |
| "direct_scored_task_count": 18, |
| "id": "raw128_neural_mlp", |
| "label": "128ep Raw NN", |
| "method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.", |
| "proxy_scored_task_count": 2, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", |
| "scored_task_count": 20, |
| "status_counts": { |
| "proxy_scored": 2, |
| "scored": 18 |
| } |
| }, |
| { |
| "direct_scored_task_count": 20, |
| "id": "qwen3_omni_v6_lora", |
| "label": "Qwen3-Omni v6 LoRA", |
| "method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future/retrieval/sensor-target probes scored from task-specific JSON.", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, held-out test", |
| "scored_task_count": 20, |
| "status_counts": { |
| "scored": 20 |
| } |
| }, |
| { |
| "direct_scored_task_count": 20, |
| "id": "cosmos3_super_reasoner", |
| "label": "Cosmos3-Super Reasoner", |
| "method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus probes for tasks 5/8/9/10/11/12/13/14/16/17/18/19/20 where public metrics exist.", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, held-out test", |
| "scored_task_count": 20, |
| "status_counts": { |
| "scored": 20 |
| } |
| }, |
| { |
| "direct_scored_task_count": 20, |
| "id": "cosmos3_nano_future_window", |
| "label": "Cosmos3-Nano Future Window", |
| "method_detail": "Verified Cosmos3-Nano future-window compatibility metrics, plus model-output probes for tasks 2/5/7/8/10/11/12/13/14/15/16/17/18/19 and a derived task-20 boundary timing probe scored from held-out future-window artifacts.", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, held-out test", |
| "scored_task_count": 20, |
| "status_counts": { |
| "scored": 20 |
| } |
| } |
| ], |
| "primary_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.", |
| "primary_visuals": [ |
| "docs/assets/charts/two_evidence_line_map.svg", |
| "docs/assets/charts/episode128_task_model_radar.svg", |
| "docs/assets/charts/unified_task_model_radar.svg" |
| ], |
| "proxy_scored_method_task_count": 6, |
| "read_separately_from": "Direct raw-target interpretation for the proxy-marked cells.", |
| "result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.", |
| "scored_method_task_count": 140, |
| "short_label": "Line 2", |
| "task_count": 20 |
| } |
| ], |
| "method_blocks": [ |
| { |
| "block": "Task-head baselines", |
| "direct_scored_method_task_count": 40, |
| "evidence_type": "Direct target metrics on the public sample windows.", |
| "line_id": "single_public_sample_episode", |
| "line_label": "1 sample episode", |
| "method_ids": [ |
| "minimal", |
| "neural_mlp" |
| ], |
| "method_task_record_count": 40, |
| "methods": [ |
| "Minimal", |
| "Neural MLP" |
| ], |
| "proxy_scored_method_task_count": 0, |
| "read_as": "Task construction, local reproducibility, and Minimal-vs-Neural behavior.", |
| "scored_method_task_count": 40 |
| }, |
| { |
| "block": "Aligned baseline heads", |
| "direct_scored_method_task_count": 74, |
| "evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.", |
| "line_id": "selected_128_episode_surface", |
| "line_label": "128 selected episodes", |
| "method_ids": [ |
| "metadata128_simple", |
| "metadata128_neural_mlp", |
| "raw128_simple", |
| "raw128_neural_mlp" |
| ], |
| "method_task_record_count": 80, |
| "methods": [ |
| "128ep Aligned Simple", |
| "128ep Aligned NN", |
| "128ep Raw Simple", |
| "128ep Raw NN" |
| ], |
| "proxy_scored_method_task_count": 6, |
| "read_as": "Same-split metadata/raw-feature baseline comparison.", |
| "scored_method_task_count": 80 |
| }, |
| { |
| "block": "Qwen3-Omni series", |
| "direct_scored_method_task_count": 20, |
| "evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.", |
| "line_id": "selected_128_episode_surface", |
| "line_label": "128 selected episodes", |
| "method_ids": [ |
| "qwen3_omni_v6_lora" |
| ], |
| "method_task_record_count": 20, |
| "methods": [ |
| "Qwen3-Omni v6 LoRA" |
| ], |
| "proxy_scored_method_task_count": 0, |
| "read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface.", |
| "scored_method_task_count": 20 |
| }, |
| { |
| "block": "Cosmos3 series", |
| "direct_scored_method_task_count": 40, |
| "evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.", |
| "line_id": "selected_128_episode_surface", |
| "line_label": "128 selected episodes", |
| "method_ids": [ |
| "cosmos3_super_reasoner", |
| "cosmos3_nano_future_window" |
| ], |
| "method_task_record_count": 40, |
| "methods": [ |
| "Cosmos3-Super Reasoner", |
| "Cosmos3-Nano Future Window" |
| ], |
| "proxy_scored_method_task_count": 0, |
| "read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface.", |
| "scored_method_task_count": 40 |
| } |
| ], |
| "proxy_records": [ |
| { |
| "line_id": "selected_128_episode_surface", |
| "method": "128ep Raw Simple", |
| "metric_key": "macro_f1", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_simple", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "method": "128ep Raw NN", |
| "metric_key": "macro_f1", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_neural_mlp", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "method": "128ep Aligned Simple", |
| "metric_key": "mrr", |
| "reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy", |
| "series_id": "metadata128_simple", |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "method": "128ep Aligned NN", |
| "metric_key": "mrr", |
| "reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy", |
| "series_id": "metadata128_neural_mlp", |
| "source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/camera_view_sync_retrieval/metrics.json", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "method": "128ep Raw Simple", |
| "metric_key": "mrr", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_simple", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "method": "128ep Raw NN", |
| "metric_key": "mrr", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_neural_mlp", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| } |
| ], |
| "reader_policy": { |
| "proxy_policy": "Proxy-scored cells stay numeric only when the source artifact and reason are attached; they should not be read as direct raw-target measurements.", |
| "selected_128_episode_surface": "Use for held-out comparison, metadata/raw-feature baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, Cosmos3-Nano Future Window, and scale-up decisions.", |
| "single_public_sample_episode": "Use for task construction, raw-file inspection, local reproducibility, and controlled Minimal-vs-Neural baseline behavior." |
| }, |
| "reader_summary": "The suite has two public evidence lines. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Compare scores within the same line first.", |
| "reading_order": [ |
| { |
| "reason": "Line 1 answers task-lab and reproducibility questions; line 2 answers selected-128 comparison questions.", |
| "step": "Choose the evidence line" |
| }, |
| { |
| "reason": "Use the 1-episode radar for Minimal-vs-Neural behavior and the 128-episode radar for metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano.", |
| "step": "Open the matching radar" |
| }, |
| { |
| "reason": "Every numeric score is tied to a method, task, metric key, source artifact, and proxy flag.", |
| "step": "Inspect the matrix row" |
| }, |
| { |
| "reason": "The six compact-proxy cells are numeric but are not direct raw-target measurements.", |
| "step": "Check proxy cells before interpreting totals" |
| } |
| ], |
| "related_model_artifacts": [ |
| { |
| "name": "Qwen3-Omni v1-v6 run lineage", |
| "repo": "docs/data/qwen3_omni_run_lineage.json", |
| "role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence." |
| }, |
| { |
| "name": "Cosmos3-Super Forward-Dynamics LoRA", |
| "repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep", |
| "role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row." |
| } |
| ], |
| "score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.", |
| "source_lines": "docs/data/two_evidence_lines.json", |
| "source_matrix": "docs/data/task_method_20_result_matrix.json", |
| "status": "pass", |
| "summary": { |
| "direct_scored_method_task_count": 174, |
| "line_count": 2, |
| "method_count": 9, |
| "method_task_record_count": 180, |
| "proxy_scored_method_task_count": 6, |
| "scored_method_task_count": 180, |
| "task_count": 20 |
| }, |
| "title": "Two Evidence-Line Result Summary" |
| } |
|
|