ropedia-xperience-10m-task-baselines / docs /data /two_evidence_line_result_summary.json

Add files using upload-large-folder tool

f590137 verified 8 days ago

17.4 kB

	{
	"generated_at_utc": "2026-06-21T11:49:06+00:00",
	"interpretation_rule": "Use the 1-episode line for task construction and reproducibility claims. Use the 128-episode line for same-split metadata/raw baselines, Qwen3-Omni v6 LoRA diagnostics, and Cosmos3 diagnostics.",
	"lines": [
	{
	"artifact_entry_points": [
	"docs/data/single_episode_task_model_radar.json",
	"docs/data/two_evidence_line_result_summary.json",
	"results/episode_task_suite/summary_report.json",
	"results/episode_task_suite/feature_manifest.json",
	"docs/single_episode_explorer.html"
	],
	"claim_boundary": "Supports task construction, file inspection, local reproducibility, and controlled single-episode baseline claims.",
	"data_unit": "One public Xperience-10M sample episode",
	"direct_scored_method_task_count": 40,
	"id": "single_public_sample_episode",
	"label": "1 sample episode",
	"method_count": 2,
	"method_task_record_count": 40,
	"methods": [
	{
	"direct_scored_task_count": 20,
	"id": "minimal",
	"label": "Minimal",
	"method_detail": "Single-episode simple heads over the public sample split.",
	"proxy_scored_task_count": 0,
	"result_record_count": 20,
	"scope": "1 public sample episode",
	"scored_task_count": 20,
	"status_counts": {
	"scored": 20
	}
	},
	{
	"direct_scored_task_count": 20,
	"id": "neural_mlp",
	"label": "Neural MLP",
	"method_detail": "Single-episode compact PyTorch MLP heads on the same 20 task contracts.",
	"proxy_scored_task_count": 0,
	"result_record_count": 20,
	"scope": "1 public sample episode",
	"scored_task_count": 20,
	"status_counts": {
	"scored": 20
	}
	}
	],
	"not_for": "Do not use this line as evidence of multi-episode generalization.",
	"primary_use": "Inspect raw files, understand each task, rerun local baselines, and debug task quality.",
	"primary_visuals": [
	"docs/assets/charts/two_evidence_line_map.svg",
	"docs/assets/charts/single_episode_task_model_radar.svg"
	],
	"proxy_scored_method_task_count": 0,
	"result_statement": "40/40 direct scores from Minimal and Neural MLP heads on the same 20 task contracts.",
	"scored_method_task_count": 40,
	"short_label": "Line 1",
	"task_count": 20
	},
	{
	"artifact_entry_points": [
	"docs/data/episode128_task_model_radar.json",
	"docs/data/two_evidence_line_result_summary.json",
	"docs/data/xperience10m_128_episode_feature_index.json",
	"docs/data/omni_model_comparison.json",
	"docs/data/qwen3_omni_run_lineage.json",
	"docs/data/task_method_20_gap_audit.json"
	],
	"claim_boundary": "Supports same-split metadata/raw baseline comparison, Qwen3-Omni v6 diagnostics, Cosmos3 diagnostics, and scale-up planning on public-safe processed artifacts.",
	"data_unit": "Selected held-out 96/16/16 split with public-safe processed features linked to official gated episode paths",
	"direct_scored_method_task_count": 134,
	"id": "selected_128_episode_surface",
	"label": "128 selected episodes",
	"method_count": 7,
	"method_task_record_count": 140,
	"methods": [
	{
	"direct_scored_task_count": 19,
	"id": "metadata128_simple",
	"label": "128ep Aligned Simple",
	"method_detail": "128-episode aligned simple baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.",
	"proxy_scored_task_count": 1,
	"result_record_count": 20,
	"scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
	"scored_task_count": 20,
	"status_counts": {
	"proxy_scored": 1,
	"scored": 19
	}
	},
	{
	"direct_scored_task_count": 19,
	"id": "metadata128_neural_mlp",
	"label": "128ep Aligned NN",
	"method_detail": "128-episode aligned MLP baselines: JSONL metadata/text tasks plus staged sensor-block tasks where the processed target exists.",
	"proxy_scored_task_count": 1,
	"result_record_count": 20,
	"scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
	"scored_task_count": 20,
	"status_counts": {
	"proxy_scored": 1,
	"scored": 19
	}
	},
	{
	"direct_scored_task_count": 18,
	"id": "raw128_simple",
	"label": "128ep Raw Simple",
	"method_detail": "128-episode 4430-dim sensor NPZ simple heads; tasks 15/19 use compact proxies.",
	"proxy_scored_task_count": 2,
	"result_record_count": 20,
	"scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
	"scored_task_count": 20,
	"status_counts": {
	"proxy_scored": 2,
	"scored": 18
	}
	},
	{
	"direct_scored_task_count": 18,
	"id": "raw128_neural_mlp",
	"label": "128ep Raw NN",
	"method_detail": "128-episode 4430-dim sensor NPZ MLP heads; tasks 15/19 use compact proxies.",
	"proxy_scored_task_count": 2,
	"result_record_count": 20,
	"scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
	"scored_task_count": 20,
	"status_counts": {
	"proxy_scored": 2,
	"scored": 18
	}
	},
	{
	"direct_scored_task_count": 20,
	"id": "qwen3_omni_v6_lora",
	"label": "Qwen3-Omni v6 LoRA",
	"method_detail": "Verified held-out Qwen3-Omni v6 LoRA metrics, plus task 16 and any completed private-GPU future/retrieval/sensor-target probes scored from task-specific JSON.",
	"proxy_scored_task_count": 0,
	"result_record_count": 20,
	"scope": "128 selected episodes, held-out test",
	"scored_task_count": 20,
	"status_counts": {
	"scored": 20
	}
	},
	{
	"direct_scored_task_count": 20,
	"id": "cosmos3_super_reasoner",
	"label": "Cosmos3-Super Reasoner",
	"method_detail": "Verified Cosmos3-Super base-weight Reasoner JSON-task evaluation, plus task 5/8/9/10/11/12/13/14/16/17/18/19/20 probes where public metrics exist.",
	"proxy_scored_task_count": 0,
	"result_record_count": 20,
	"scope": "128 selected episodes, held-out test",
	"scored_task_count": 20,
	"status_counts": {
	"scored": 20
	}
	},
	{
	"direct_scored_task_count": 20,
	"id": "cosmos3_nano_future_window",
	"label": "Cosmos3-Nano Future Window",
	"method_detail": "Verified Cosmos3-Nano future-window compatibility metrics, plus model-output probes for tasks 2/5/7/8/10/11/12/13/14/15/16/17/18/19 and a derived task-20 boundary timing probe scored from held-out future-window artifacts.",
	"proxy_scored_task_count": 0,
	"result_record_count": 20,
	"scope": "128 selected episodes, held-out test",
	"scored_task_count": 20,
	"status_counts": {
	"scored": 20
	}
	}
	],
	"not_for": "Do not read compact-proxy cells as direct raw-target measurements.",
	"primary_use": "Compare same-split metadata/raw baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, and Cosmos3-Nano Future Window while keeping evidence type explicit.",
	"primary_visuals": [
	"docs/assets/charts/two_evidence_line_map.svg",
	"docs/assets/charts/episode128_task_model_radar.svg",
	"docs/assets/charts/unified_task_model_radar.svg"
	],
	"proxy_scored_method_task_count": 6,
	"result_statement": "140/140 selected-128 scores across seven methods: 134 direct scores plus 6 documented compact-proxy scores.",
	"scored_method_task_count": 140,
	"short_label": "Line 2",
	"task_count": 20
	}
	],
	"method_blocks": [
	{
	"block": "Task-head baselines",
	"direct_scored_method_task_count": 40,
	"evidence_type": "Direct target metrics on the public sample windows.",
	"line_id": "single_public_sample_episode",
	"line_label": "1 sample episode",
	"method_ids": [
	"minimal",
	"neural_mlp"
	],
	"method_task_record_count": 40,
	"methods": [
	"Minimal",
	"Neural MLP"
	],
	"proxy_scored_method_task_count": 0,
	"read_as": "Task construction, local reproducibility, and Minimal-vs-Neural behavior.",
	"scored_method_task_count": 40
	},
	{
	"block": "Aligned baseline heads",
	"direct_scored_method_task_count": 74,
	"evidence_type": "Direct processed-target metrics where available; compact proxies for documented raw-target gaps.",
	"line_id": "selected_128_episode_surface",
	"line_label": "128 selected episodes",
	"method_ids": [
	"metadata128_simple",
	"metadata128_neural_mlp",
	"raw128_simple",
	"raw128_neural_mlp"
	],
	"method_task_record_count": 80,
	"methods": [
	"128ep Aligned Simple",
	"128ep Aligned NN",
	"128ep Raw Simple",
	"128ep Raw NN"
	],
	"proxy_scored_method_task_count": 6,
	"read_as": "Same-split metadata/raw-feature baseline comparison.",
	"scored_method_task_count": 80
	},
	{
	"block": "Qwen3-Omni series",
	"direct_scored_method_task_count": 20,
	"evidence_type": "Verified selected-128 Qwen3-Omni v6 LoRA plus source-linked task-specific probes.",
	"line_id": "selected_128_episode_surface",
	"line_label": "128 selected episodes",
	"method_ids": [
	"qwen3_omni_v6_lora"
	],
	"method_task_record_count": 20,
	"methods": [
	"Qwen3-Omni v6 LoRA"
	],
	"proxy_scored_method_task_count": 0,
	"read_as": "Trainable Qwen3-Omni diagnostic baseline on the selected-128 surface.",
	"scored_method_task_count": 20
	},
	{
	"block": "Cosmos3 series",
	"direct_scored_method_task_count": 40,
	"evidence_type": "Verified Cosmos3-Super Reasoner and Cosmos3-Nano Future Window public-safe artifacts.",
	"line_id": "selected_128_episode_surface",
	"line_label": "128 selected episodes",
	"method_ids": [
	"cosmos3_super_reasoner",
	"cosmos3_nano_future_window"
	],
	"method_task_record_count": 40,
	"methods": [
	"Cosmos3-Super Reasoner",
	"Cosmos3-Nano Future Window"
	],
	"proxy_scored_method_task_count": 0,
	"read_as": "Cosmos3 reasoner and future-window diagnostics on the selected-128 surface.",
	"scored_method_task_count": 40
	}
	],
	"proxy_records": [
	{
	"line_id": "selected_128_episode_surface",
	"method": "128ep Raw Simple",
	"metric_key": "macro_f1",
	"reason": "documented compact proxy completion for this raw128 task axis",
	"series_id": "raw128_simple",
	"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json",
	"task_id": "interaction_text_prediction",
	"task_label": "Interaction Text Prediction",
	"task_number": 15
	},
	{
	"line_id": "selected_128_episode_surface",
	"method": "128ep Raw NN",
	"metric_key": "macro_f1",
	"reason": "documented compact proxy completion for this raw128 task axis",
	"series_id": "raw128_neural_mlp",
	"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json",
	"task_id": "interaction_text_prediction",
	"task_label": "Interaction Text Prediction",
	"task_number": 15
	},
	{
	"line_id": "selected_128_episode_surface",
	"method": "128ep Aligned Simple",
	"metric_key": "mrr",
	"reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
	"series_id": "metadata128_simple",
	"source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/camera_view_sync_retrieval/metrics.json",
	"task_id": "camera_view_sync_retrieval",
	"task_label": "Camera-View Synchronization Retrieval",
	"task_number": 19
	},
	{
	"line_id": "selected_128_episode_surface",
	"method": "128ep Aligned NN",
	"metric_key": "mrr",
	"reason": "paired camera-view embeddings are absent from the 128 JSONL/feature export; metadata features retrieve the synchronized same-window depth/audio block as a documented compact synchronization proxy",
	"series_id": "metadata128_neural_mlp",
	"source": "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/neural_mlp/camera_view_sync_retrieval/metrics.json",
	"task_id": "camera_view_sync_retrieval",
	"task_label": "Camera-View Synchronization Retrieval",
	"task_number": 19
	},
	{
	"line_id": "selected_128_episode_surface",
	"method": "128ep Raw Simple",
	"metric_key": "mrr",
	"reason": "documented compact proxy completion for this raw128 task axis",
	"series_id": "raw128_simple",
	"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json",
	"task_id": "camera_view_sync_retrieval",
	"task_label": "Camera-View Synchronization Retrieval",
	"task_number": 19
	},
	{
	"line_id": "selected_128_episode_surface",
	"method": "128ep Raw NN",
	"metric_key": "mrr",
	"reason": "documented compact proxy completion for this raw128 task axis",
	"series_id": "raw128_neural_mlp",
	"source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json",
	"task_id": "camera_view_sync_retrieval",
	"task_label": "Camera-View Synchronization Retrieval",
	"task_number": 19
	}
	],
	"reader_policy": {
	"proxy_policy": "Proxy-scored cells stay numeric only when the source artifact and reason are attached; they should not be read as direct raw-target measurements.",
	"selected_128_episode_surface": "Use for held-out comparison, metadata/raw-feature baselines, Qwen3-Omni v6 LoRA, Cosmos3-Super Reasoner, Cosmos3-Nano Future Window, and scale-up decisions.",
	"single_public_sample_episode": "Use for task construction, raw-file inspection, local reproducibility, and controlled Minimal-vs-Neural baseline behavior."
	},
	"reader_summary": "The suite has two public evidence lines. Line 1 is the fully inspectable one-episode task lab. Line 2 is the 128-episode comparison surface for aligned baselines, the Qwen3-Omni series, and the Cosmos3 series. Do not mix the two when reading scores.",
	"reading_order": [
	{
	"reason": "Line 1 answers task-lab and reproducibility questions; line 2 answers selected-128 comparison questions.",
	"step": "Choose the evidence line"
	},
	{
	"reason": "Use the 1-episode radar for Minimal-vs-Neural behavior and the 128-episode radar for metadata/raw baselines, Qwen3-Omni v6, Cosmos3-Super, and Cosmos3-Nano.",
	"step": "Open the matching radar"
	},
	{
	"reason": "Every numeric score is tied to a method, task, metric key, source artifact, and proxy flag.",
	"step": "Inspect the matrix row"
	},
	{
	"reason": "The six compact-proxy cells are numeric but are not direct raw-target measurements.",
	"step": "Check proxy cells before interpreting totals"
	}
	],
	"related_model_artifacts": [
	{
	"name": "Qwen3-Omni v1-v6 run lineage",
	"repo": "docs/data/qwen3_omni_run_lineage.json",
	"role": "Explains the LoRA/evaluation version ladder; v6 is the current 20-task matrix row, v5 remains the pinned prior release, and v1-v4 are lineage/ablation evidence."
	},
	{
	"name": "Cosmos3-Super Forward-Dynamics LoRA",
	"repo": "https://huggingface.co/cy0307/ropedia-cosmos3-super-forward-dynamics-lora-128ep",
	"role": "Separate fine-tuned adapter artifact for forward-dynamics loss metrics; published with weights/results but not counted as a 20-task matrix method row."
	}
	],
	"score_formula": "2 single-episode methods x 20 tasks = 40 records; 7 selected-128 methods x 20 tasks = 140 records; total public matrix = 180/180 scored records.",
	"source_lines": "docs/data/two_evidence_lines.json",
	"source_matrix": "docs/data/task_method_20_result_matrix.json",
	"status": "pass",
	"summary": {
	"direct_scored_method_task_count": 174,
	"line_count": 2,
	"method_count": 9,
	"method_task_record_count": 180,
	"proxy_scored_method_task_count": 6,
	"scored_method_task_count": 180,
	"task_count": 20
	},
	"title": "Two Evidence-Line Result Summary"
	}