File size: 20,037 Bytes

{
  "generated_at_utc": "2026-06-20T13:58:05+00:00",
  "immediate_actions": [
    {
      "artifact": "docs/data/task_method_20_gap_audit.json",
      "id": "gap_audit",
      "purpose": "Keep the 16 scoreless cells visible and reproducible."
    },
    {
      "artifact": "scripts/omni/score_model_output_probes.py",
      "id": "model_output_probe",
      "purpose": "Check whether train/validation/test model outputs exist before attempting all-task Qwen3/Cosmos scoring."
    },
    {
      "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh",
      "id": "guarded_gpu_launcher",
      "purpose": "Start a user-provided all-task scoring command only after enough private GPU capacity is idle."
    }
  ],
  "methods": {
    "cosmos3_nano_future_window": {
      "kind": "partial_128_episode_world_model_overlay",
      "label": "Cosmos3-Nano Future Window",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, held-out test",
      "scored_task_count": 11,
      "scoreless_task_count": 9,
      "status_counts": {
        "not_evaluated_in_verified_package": 9,
        "scored": 11
      }
    },
    "cosmos3_super_reasoner": {
      "kind": "partial_128_episode_foundation_model_overlay",
      "label": "Cosmos3-Super Reasoner",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, held-out test",
      "scored_task_count": 15,
      "scoreless_task_count": 5,
      "status_counts": {
        "not_evaluated_in_verified_package": 5,
        "scored": 15
      }
    },
    "metadata128_neural_mlp": {
      "kind": "partial_128_episode_aligned_baseline",
      "label": "128ep Aligned NN",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
      "scored_task_count": 19,
      "scoreless_task_count": 1,
      "status_counts": {
        "not_supported_by_metadata_only_package": 1,
        "scored": 19
      }
    },
    "metadata128_simple": {
      "kind": "partial_128_episode_aligned_baseline",
      "label": "128ep Aligned Simple",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, JSONL metadata/text plus staged sensor-block targets where available",
      "scored_task_count": 19,
      "scoreless_task_count": 1,
      "status_counts": {
        "scored": 19,
        "unsupported_without_required_target": 1
      }
    },
    "minimal": {
      "kind": "full_20_task_baseline",
      "label": "Minimal",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "1 public sample episode",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "neural_mlp": {
      "kind": "full_20_task_baseline",
      "label": "Neural MLP",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "1 public sample episode",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "qwen3_omni_v6_lora": {
      "kind": "partial_128_episode_foundation_model_overlay",
      "label": "Qwen3-Omni v6 LoRA",
      "proxy_scored_task_count": 0,
      "result_record_count": 20,
      "scope": "128 selected episodes, held-out test",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "scored": 20
      }
    },
    "raw128_neural_mlp": {
      "kind": "complete_128_episode_raw_feature_baseline",
      "label": "128ep Raw NN",
      "proxy_scored_task_count": 2,
      "result_record_count": 20,
      "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "proxy_scored": 2,
        "scored": 18
      }
    },
    "raw128_simple": {
      "kind": "complete_128_episode_raw_feature_baseline",
      "label": "128ep Raw Simple",
      "proxy_scored_task_count": 2,
      "result_record_count": 20,
      "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes",
      "scored_task_count": 20,
      "scoreless_task_count": 0,
      "status_counts": {
        "proxy_scored": 2,
        "scored": 18
      }
    }
  },
  "missing_by_method": {
    "cosmos3_nano_future_window": 9,
    "cosmos3_super_reasoner": 5,
    "metadata128_neural_mlp": 1,
    "metadata128_simple": 1
  },
  "missing_by_status": {
    "not_evaluated_in_verified_package": 14,
    "not_supported_by_metadata_only_package": 1,
    "unsupported_without_required_target": 1
  },
  "missing_by_task": {
    "02 Procedure Step Recognition": [
      "cosmos3_nano_future_window"
    ],
    "05 Hand Trajectory Forecasting": [
      "cosmos3_nano_future_window"
    ],
    "07 Object Relevance Prediction": [
      "cosmos3_nano_future_window"
    ],
    "08 Language Grounding": [
      "cosmos3_nano_future_window"
    ],
    "11 Temporal Order Verification": [
      "cosmos3_nano_future_window",
      "cosmos3_super_reasoner"
    ],
    "12 Multimodal Synchronization Detection": [
      "cosmos3_nano_future_window",
      "cosmos3_super_reasoner"
    ],
    "14 Long-Horizon Next-Subtask Forecasting": [
      "cosmos3_super_reasoner"
    ],
    "15 Interaction Text Prediction": [
      "cosmos3_nano_future_window",
      "cosmos3_super_reasoner"
    ],
    "17 Future Object-Set Forecasting": [
      "cosmos3_super_reasoner"
    ],
    "18 IMU-to-Hand Pose Reconstruction": [
      "cosmos3_nano_future_window"
    ],
    "19 Camera-View Synchronization Retrieval": [
      "cosmos3_nano_future_window",
      "metadata128_neural_mlp",
      "metadata128_simple"
    ]
  },
  "missing_records": [
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "macro_f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "timeline_subtask",
      "task_label": "Procedure Step Recognition",
      "task_number": 2
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "mpjpe",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "hand_trajectory_forecast",
      "task_label": "Hand Trajectory Forecasting",
      "task_number": 5
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "micro_f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "object_relevance",
      "task_label": "Object Relevance Prediction",
      "task_number": 7
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "mrr",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "caption_grounding",
      "task_label": "Language Grounding",
      "task_number": 8
    },
    {
      "method": "Cosmos3-Super Reasoner",
      "metric_key": "f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_super_reasoner",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "temporal_order",
      "task_label": "Temporal Order Verification",
      "task_number": 11
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "temporal_order",
      "task_label": "Temporal Order Verification",
      "task_number": 11
    },
    {
      "method": "Cosmos3-Super Reasoner",
      "metric_key": "f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_super_reasoner",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "misalignment_detection",
      "task_label": "Multimodal Synchronization Detection",
      "task_number": 12
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "misalignment_detection",
      "task_label": "Multimodal Synchronization Detection",
      "task_number": 12
    },
    {
      "method": "Cosmos3-Super Reasoner",
      "metric_key": "macro_f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_super_reasoner",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "next_subtask_forecast",
      "task_label": "Long-Horizon Next-Subtask Forecasting",
      "task_number": 14
    },
    {
      "method": "Cosmos3-Super Reasoner",
      "metric_key": "macro_f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_super_reasoner",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "macro_f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "method": "Cosmos3-Super Reasoner",
      "metric_key": "micro_f1",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_super_reasoner",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "object_set_forecast",
      "task_label": "Future Object-Set Forecasting",
      "task_number": 17
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "mae",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "imu_to_hand_pose",
      "task_label": "IMU-to-Hand Pose Reconstruction",
      "task_number": 18
    },
    {
      "method": "128ep Aligned Simple",
      "metric_key": "mrr",
      "reason": "requires paired camera-view feature blocks, which are not in the public 128 JSONL metadata package",
      "recommended_next_step": "Export the missing target field for this 128-episode method, then rerun the same train/validation/test split.",
      "scope": "multi_episode_128_aligned_baseline",
      "series_id": "metadata128_simple",
      "status": "unsupported_without_required_target",
      "status_label": "unsupported",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "method": "128ep Aligned NN",
      "metric_key": "mrr",
      "reason": "the 128-episode aligned rerun did not produce this task target; raw interaction text, paired camera-view embeddings, or a task-specific target builder is required",
      "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.",
      "scope": "multi_episode_128_aligned_baseline",
      "series_id": "metadata128_neural_mlp",
      "status": "not_supported_by_metadata_only_package",
      "status_label": "not supported",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "method": "Cosmos3-Nano Future Window",
      "metric_key": "mrr",
      "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score",
      "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.",
      "scope": "multi_episode_128_partial_model_overlay",
      "series_id": "cosmos3_nano_future_window",
      "status": "not_evaluated_in_verified_package",
      "status_label": "not evaluated",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    }
  ],
  "proxy_records": [
    {
      "method": "128ep Raw Simple",
      "metric_key": "macro_f1",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_simple",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "method": "128ep Raw NN",
      "metric_key": "macro_f1",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_neural_mlp",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json",
      "task_id": "interaction_text_prediction",
      "task_label": "Interaction Text Prediction",
      "task_number": 15
    },
    {
      "method": "128ep Raw Simple",
      "metric_key": "mrr",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_simple",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    },
    {
      "method": "128ep Raw NN",
      "metric_key": "mrr",
      "reason": "documented compact proxy completion for this raw128 task axis",
      "series_id": "raw128_neural_mlp",
      "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json",
      "task_id": "camera_view_sync_retrieval",
      "task_label": "Camera-View Synchronization Retrieval",
      "task_number": 19
    }
  ],
  "score_summary": {
    "method_count": 9,
    "method_task_record_count": 180,
    "proxy_scored_method_task_count": 4,
    "scored_method_task_count": 164,
    "scoreless_method_task_count": 16,
    "task_count": 20
  },
  "source_matrix": "docs/data/task_method_20_result_matrix.json",
  "status": "pass",
  "target_policy": {
    "numeric_score_gate": "A method-task cell is numeric only when a runner or verified package emits that exact task target and metric.",
    "proxy_policy": "Proxy scores are allowed only when the matrix marks them as proxy_scored and keeps the reason/source attached.",
    "scoreless_cell_policy": "Unsupported and not-evaluated cells stay explicit in the public matrix instead of being hidden or backfilled with proxy model claims."
  },
  "title": "Task Method 20-Result Gap Audit"
}