| { |
| "status": "current", |
| "updated_utc": "2026-06-21T00:00:00Z", |
| "interpretation_rule": "Read the 1-episode line as the inspectable task lab. Read the 128-episode line as the selected comparison surface for metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano.", |
| "reader_summary": "The suite has two public evidence lines. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Compare scores within the same line first.", |
| "score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.", |
| "lines": [ |
| { |
| "id": "single_public_sample_episode", |
| "label": "1 sample episode", |
| "short_label": "Line 1", |
| "data_unit": "One public Xperience-10M sample episode", |
| "result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.", |
| "best_read_as": "Inspect the raw sample, understand file organization, reproduce the 20 task targets, and compare Minimal vs Neural MLP behavior inside one episode.", |
| "read_separately_from": "The selected-128 comparison rows and broader held-out model behavior.", |
| "frames": 5821, |
| "windows": 1161, |
| "window_definition": "20-frame aligned windows with 5-frame stride", |
| "feature_dimensions": 8546, |
| "methods": [ |
| "Minimal heads", |
| "Neural MLP heads" |
| ], |
| "task_axes": 20, |
| "method_task_records": 40, |
| "scored_records": 40, |
| "direct_scored_records": 40, |
| "proxy_scored_records": 0, |
| "best_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.", |
| "primary_visuals": [ |
| "docs/assets/charts/two_evidence_line_map.svg", |
| "docs/assets/charts/single_episode_task_model_radar.svg" |
| ], |
| "primary_artifacts": [ |
| "docs/data/single_episode_task_model_radar.json", |
| "docs/data/two_evidence_line_result_summary.json", |
| "results/episode_task_suite/summary_report.json", |
| "results/episode_task_suite/feature_manifest.json", |
| "docs/single_episode_explorer.html" |
| ] |
| }, |
| { |
| "id": "selected_128_episode_surface", |
| "label": "128 selected episodes", |
| "short_label": "Line 2", |
| "data_unit": "Selected held-out 96/16/16 train/validation/test split with public-safe processed features linked to official gated episode paths", |
| "result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.", |
| "best_read_as": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano while keeping the 6 compact-proxy cells visible.", |
| "read_separately_from": "Direct raw-target interpretation for the proxy-marked cells.", |
| "episodes": 128, |
| "split": { |
| "train": 96, |
| "validation": 16, |
| "test": 16 |
| }, |
| "exported_windows": 34269, |
| "methods": [ |
| "Metadata simple", |
| "Metadata NN", |
| "Raw-feature simple", |
| "Raw-feature NN", |
| "Qwen3-Omni", |
| "Cosmos3-Super", |
| "Cosmos3-Nano" |
| ], |
| "task_axes": 20, |
| "method_task_records": 140, |
| "scored_records": 140, |
| "direct_scored_records": 134, |
| "proxy_scored_records": 6, |
| "proxy_policy": "Proxy flags remain visible where the public export lacks a direct raw target.", |
| "best_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.", |
| "primary_visuals": [ |
| "docs/assets/charts/two_evidence_line_map.svg", |
| "docs/assets/charts/episode128_task_model_radar.svg", |
| "docs/assets/charts/unified_task_model_radar.svg" |
| ], |
| "primary_artifacts": [ |
| "docs/data/episode128_task_model_radar.json", |
| "docs/data/two_evidence_line_result_summary.json", |
| "docs/data/xperience10m_128_episode_feature_index.json", |
| "docs/data/omni_model_comparison.json", |
| "docs/data/qwen3_omni_run_lineage.json", |
| "docs/data/task_method_20_gap_audit.json" |
| ] |
| } |
| ], |
| "method_blocks": [ |
| { |
| "line_id": "single_public_sample_episode", |
| "line_label": "1 sample episode", |
| "block": "Task-head baselines", |
| "methods": [ |
| "Minimal", |
| "Neural MLP" |
| ], |
| "scored_records": 40, |
| "direct_scored_records": 40, |
| "proxy_scored_records": 0, |
| "evidence_type": "Direct target metrics on the public sample windows.", |
| "read_as": "Task-lab reproducibility and simple-vs-neural behavior." |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "line_label": "128 selected episodes", |
| "block": "Aligned baseline heads", |
| "methods": [ |
| "Metadata simple", |
| "Metadata NN", |
| "Raw-feature simple", |
| "Raw-feature NN" |
| ], |
| "scored_records": 80, |
| "direct_scored_records": 74, |
| "proxy_scored_records": 6, |
| "evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.", |
| "read_as": "Same-split metadata/raw-feature baseline comparison." |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "line_label": "128 selected episodes", |
| "block": "Qwen3-Omni series", |
| "methods": [ |
| "Qwen3-Omni v6 LoRA" |
| ], |
| "scored_records": 20, |
| "direct_scored_records": 20, |
| "proxy_scored_records": 0, |
| "evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.", |
| "read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface." |
| }, |
| { |
| "line_id": "selected_128_episode_surface", |
| "line_label": "128 selected episodes", |
| "block": "Cosmos3 series", |
| "methods": [ |
| "Cosmos3-Super Reasoner", |
| "Cosmos3-Nano Future Window" |
| ], |
| "scored_records": 40, |
| "direct_scored_records": 40, |
| "proxy_scored_records": 0, |
| "evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.", |
| "read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface." |
| } |
| ], |
| "related_model_artifacts": [ |
| { |
| "name": "Qwen3-Omni v1-v6 run lineage", |
| "role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence.", |
| "repo": "docs/data/qwen3_omni_run_lineage.json" |
| }, |
| { |
| "name": "Cosmos3-Super Forward-Dynamics LoRA", |
| "role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row.", |
| "repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep" |
| } |
| ], |
| "combined_public_matrix": { |
| "task_axes": 20, |
| "methods": 9, |
| "method_task_records": 180, |
| "scored_records": 180, |
| "direct_scored_records": 174, |
| "proxy_scored_records": 6, |
| "summary_artifact": "docs/data/two_evidence_line_result_summary.json", |
| "artifact": "docs/data/task_method_20_result_matrix.json" |
| } |
| } |
|
|