ropedia-xperience-10m-task-baselines / docs /data /two_evidence_lines.json
cy0307's picture
Refine reader-facing scope wording (2/4)
2600a90 verified
Raw
History Blame Contribute Delete
7.39 kB
{
"status": "current",
"updated_utc": "2026-06-21T00:00:00Z",
"interpretation_rule": "Read the 1-episode line as the inspectable task lab. Read the 128-episode line as the selected comparison surface for metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano.",
"reader_summary": "The suite has two public reading lanes. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Compare scores within the same lane first.",
"score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.",
"lines": [
{
"id": "single_public_sample_episode",
"label": "1 sample episode",
"short_label": "Line 1",
"data_unit": "One public Xperience-10M sample episode",
"result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.",
"best_read_as": "Inspect the raw sample, understand file organization, reproduce the 20 task targets, and compare Minimal vs Neural MLP behavior inside one episode.",
"read_separately_from": "The selected-128 comparison rows and broader held-out model behavior.",
"frames": 5821,
"windows": 1161,
"window_definition": "20-frame aligned windows with 5-frame stride",
"feature_dimensions": 8546,
"methods": [
"Minimal heads",
"Neural MLP heads"
],
"task_axes": 20,
"method_task_records": 40,
"scored_records": 40,
"direct_scored_records": 40,
"proxy_scored_records": 0,
"best_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.",
"primary_visuals": [
"docs/assets/charts/two_evidence_line_map.svg",
"docs/assets/charts/single_episode_task_model_radar.svg"
],
"primary_artifacts": [
"docs/data/single_episode_task_model_radar.json",
"docs/data/two_evidence_line_result_summary.json",
"results/episode_task_suite/summary_report.json",
"results/episode_task_suite/feature_manifest.json",
"docs/single_episode_explorer.html"
]
},
{
"id": "selected_128_episode_surface",
"label": "128 selected episodes",
"short_label": "Line 2",
"data_unit": "Selected held-out 96/16/16 (train/validation/test) split with public-safe processed features linked to official gated episode paths",
"result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.",
"best_read_as": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super, and Cosmos3-Nano while keeping the 6 compact-proxy cells visible.",
"read_separately_from": "Direct raw-target interpretation for the proxy-marked cells.",
"episodes": 128,
"split": {
"train": 96,
"validation": 16,
"test": 16
},
"exported_windows": 34269,
"methods": [
"Metadata simple",
"Metadata NN",
"Raw-feature simple",
"Raw-feature NN",
"Qwen3-Omni",
"Cosmos3-Super",
"Cosmos3-Nano"
],
"task_axes": 20,
"method_task_records": 140,
"scored_records": 140,
"direct_scored_records": 134,
"proxy_scored_records": 6,
"proxy_policy": "Proxy flags remain visible where the public export lacks a direct raw target.",
"best_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.",
"primary_visuals": [
"docs/assets/charts/two_evidence_line_map.svg",
"docs/assets/charts/episode128_task_model_radar.svg",
"docs/assets/charts/unified_task_model_radar.svg"
],
"primary_artifacts": [
"docs/data/episode128_task_model_radar.json",
"docs/data/two_evidence_line_result_summary.json",
"docs/data/xperience10m_128_episode_feature_index.json",
"docs/data/omni_model_comparison.json",
"docs/data/qwen3_omni_run_lineage.json",
"docs/data/task_method_20_gap_audit.json"
]
}
],
"method_blocks": [
{
"line_id": "single_public_sample_episode",
"line_label": "1 sample episode",
"block": "Task-head baselines",
"methods": [
"Minimal",
"Neural MLP"
],
"scored_records": 40,
"direct_scored_records": 40,
"proxy_scored_records": 0,
"evidence_type": "Direct target metrics on the public sample windows.",
"read_as": "Task-lab reproducibility and simple-vs-neural behavior."
},
{
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"block": "Aligned baseline heads",
"methods": [
"Metadata simple",
"Metadata NN",
"Raw-feature simple",
"Raw-feature NN"
],
"scored_records": 80,
"direct_scored_records": 74,
"proxy_scored_records": 6,
"evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
"read_as": "Same-split metadata/raw-feature baseline comparison."
},
{
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"block": "Qwen3-Omni series",
"methods": [
"Qwen3-Omni v6 LoRA"
],
"scored_records": 20,
"direct_scored_records": 20,
"proxy_scored_records": 0,
"evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
"read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface."
},
{
"line_id": "selected_128_episode_surface",
"line_label": "128 selected episodes",
"block": "Cosmos3 series",
"methods": [
"Cosmos3-Super Reasoner",
"Cosmos3-Nano Future Window"
],
"scored_records": 40,
"direct_scored_records": 40,
"proxy_scored_records": 0,
"evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
"read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface."
}
],
"related_model_artifacts": [
{
"name": "Qwen3-Omni v1-v6 run lineage",
"role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence.",
"repo": "docs/data/qwen3_omni_run_lineage.json"
},
{
"name": "Cosmos3-Super Forward-Dynamics LoRA",
"role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row.",
"repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep"
}
],
"combined_public_matrix": {
"task_axes": 20,
"methods": 9,
"method_task_records": 180,
"scored_records": 180,
"direct_scored_records": 174,
"proxy_scored_records": 6,
"summary_artifact": "docs/data/two_evidence_line_result_summary.json",
"artifact": "docs/data/task_method_20_result_matrix.json"
}
}