| { |
| "generated_at_utc": "2026-06-17T21:17:51+00:00", |
| "immediate_actions": [ |
| { |
| "artifact": "docs/data/task_method_20_gap_audit.json", |
| "id": "gap_audit", |
| "purpose": "Keep the 61 scoreless cells visible and reproducible." |
| }, |
| { |
| "artifact": "scripts/omni/score_model_output_probes.py", |
| "id": "model_output_probe", |
| "purpose": "Check whether train/validation/test model outputs exist before attempting all-task Qwen3/Cosmos scoring." |
| }, |
| { |
| "artifact": "scripts/omni/launch_all_task_model_scoring_when_free.sh", |
| "id": "guarded_gpu_launcher", |
| "purpose": "Start a user-provided all-task scoring command only after enough private GPU capacity is idle." |
| } |
| ], |
| "methods": { |
| "cosmos3_nano_future_window": { |
| "kind": "partial_128_episode_world_model_overlay", |
| "label": "Cosmos3-Nano Future Window", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, held-out test", |
| "scored_task_count": 5, |
| "scoreless_task_count": 15, |
| "status_counts": { |
| "not_evaluated_in_verified_package": 15, |
| "scored": 5 |
| } |
| }, |
| "cosmos3_super_reasoner": { |
| "kind": "partial_128_episode_foundation_model_overlay", |
| "label": "Cosmos3-Super Reasoner", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, held-out test", |
| "scored_task_count": 7, |
| "scoreless_task_count": 13, |
| "status_counts": { |
| "not_evaluated_in_verified_package": 13, |
| "scored": 7 |
| } |
| }, |
| "metadata128_neural_mlp": { |
| "kind": "partial_128_episode_metadata_baseline", |
| "label": "128ep Metadata NN", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, JSONL metadata/text only", |
| "scored_task_count": 6, |
| "scoreless_task_count": 14, |
| "status_counts": { |
| "not_supported_by_metadata_only_package": 14, |
| "scored": 6 |
| } |
| }, |
| "metadata128_simple": { |
| "kind": "partial_128_episode_metadata_baseline", |
| "label": "128ep Metadata Simple", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, JSONL metadata/text only", |
| "scored_task_count": 8, |
| "scoreless_task_count": 12, |
| "status_counts": { |
| "not_supported_by_metadata_only_package": 8, |
| "scored": 8, |
| "unsupported_without_required_target": 4 |
| } |
| }, |
| "minimal": { |
| "kind": "full_20_task_baseline", |
| "label": "Minimal", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "1 public sample episode", |
| "scored_task_count": 20, |
| "scoreless_task_count": 0, |
| "status_counts": { |
| "scored": 20 |
| } |
| }, |
| "neural_mlp": { |
| "kind": "full_20_task_baseline", |
| "label": "Neural MLP", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "1 public sample episode", |
| "scored_task_count": 20, |
| "scoreless_task_count": 0, |
| "status_counts": { |
| "scored": 20 |
| } |
| }, |
| "qwen3_omni_v6_lora": { |
| "kind": "partial_128_episode_foundation_model_overlay", |
| "label": "Qwen3-Omni v6 LoRA", |
| "proxy_scored_task_count": 0, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, held-out test", |
| "scored_task_count": 13, |
| "scoreless_task_count": 7, |
| "status_counts": { |
| "not_evaluated_in_verified_package": 7, |
| "scored": 13 |
| } |
| }, |
| "raw128_neural_mlp": { |
| "kind": "complete_128_episode_raw_feature_baseline", |
| "label": "128ep Raw NN", |
| "proxy_scored_task_count": 2, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", |
| "scored_task_count": 20, |
| "scoreless_task_count": 0, |
| "status_counts": { |
| "proxy_scored": 2, |
| "scored": 18 |
| } |
| }, |
| "raw128_simple": { |
| "kind": "complete_128_episode_raw_feature_baseline", |
| "label": "128ep Raw Simple", |
| "proxy_scored_task_count": 2, |
| "result_record_count": 20, |
| "scope": "128 selected episodes, staged 4430-dim sensor NPZ features; 2 compact proxy axes", |
| "scored_task_count": 20, |
| "scoreless_task_count": 0, |
| "status_counts": { |
| "proxy_scored": 2, |
| "scored": 18 |
| } |
| } |
| }, |
| "missing_by_method": { |
| "cosmos3_nano_future_window": 15, |
| "cosmos3_super_reasoner": 13, |
| "metadata128_neural_mlp": 14, |
| "metadata128_simple": 12, |
| "qwen3_omni_v6_lora": 7 |
| }, |
| "missing_by_status": { |
| "not_evaluated_in_verified_package": 35, |
| "not_supported_by_metadata_only_package": 22, |
| "unsupported_without_required_target": 4 |
| }, |
| "missing_by_task": { |
| "02 Procedure Step Recognition": [ |
| "cosmos3_nano_future_window" |
| ], |
| "05 Hand Trajectory Forecasting": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple", |
| "qwen3_omni_v6_lora" |
| ], |
| "07 Object Relevance Prediction": [ |
| "cosmos3_nano_future_window" |
| ], |
| "08 Language Grounding": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "qwen3_omni_v6_lora" |
| ], |
| "09 Cross-Modal Retrieval": [ |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple", |
| "qwen3_omni_v6_lora" |
| ], |
| "10 Cross-Modal Reconstruction": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple", |
| "qwen3_omni_v6_lora" |
| ], |
| "11 Temporal Order Verification": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp" |
| ], |
| "12 Multimodal Synchronization Detection": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple" |
| ], |
| "13 Long-Horizon Next-Action Forecasting": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple" |
| ], |
| "14 Long-Horizon Next-Subtask Forecasting": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple" |
| ], |
| "15 Interaction Text Prediction": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple", |
| "qwen3_omni_v6_lora" |
| ], |
| "16 Action-Object Relation Prediction": [ |
| "cosmos3_nano_future_window", |
| "metadata128_neural_mlp", |
| "metadata128_simple" |
| ], |
| "17 Future Object-Set Forecasting": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple" |
| ], |
| "18 IMU-to-Hand Pose Reconstruction": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple", |
| "qwen3_omni_v6_lora" |
| ], |
| "19 Camera-View Synchronization Retrieval": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple", |
| "qwen3_omni_v6_lora" |
| ], |
| "20 Time-to-Next-Transition Regression": [ |
| "cosmos3_nano_future_window", |
| "cosmos3_super_reasoner", |
| "metadata128_neural_mlp", |
| "metadata128_simple" |
| ] |
| }, |
| "missing_records": [ |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "timeline_subtask", |
| "task_label": "Procedure Step Recognition", |
| "task_number": 2 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "mpjpe", |
| "reason": "requires future hand-joint trajectories from raw sensor feature NPZ blocks, which are not in the public 128 package", |
| "recommended_next_step": "Export the missing target field for this 128-episode method, then rerun the same train/validation/test split.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "unsupported_without_required_target", |
| "status_label": "unsupported", |
| "task_id": "hand_trajectory_forecast", |
| "task_label": "Hand Trajectory Forecasting", |
| "task_number": 5 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "mpjpe", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "hand_trajectory_forecast", |
| "task_label": "Hand Trajectory Forecasting", |
| "task_number": 5 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "mpjpe", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "hand_trajectory_forecast", |
| "task_label": "Hand Trajectory Forecasting", |
| "task_number": 5 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "mpjpe", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "hand_trajectory_forecast", |
| "task_label": "Hand Trajectory Forecasting", |
| "task_number": 5 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "mpjpe", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "hand_trajectory_forecast", |
| "task_label": "Hand Trajectory Forecasting", |
| "task_number": 5 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "micro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "object_relevance", |
| "task_label": "Object Relevance Prediction", |
| "task_number": 7 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "mrr", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "caption_grounding", |
| "task_label": "Language Grounding", |
| "task_number": 8 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "caption_grounding", |
| "task_label": "Language Grounding", |
| "task_number": 8 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "caption_grounding", |
| "task_label": "Language Grounding", |
| "task_number": 8 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "caption_grounding", |
| "task_label": "Language Grounding", |
| "task_number": 8 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "mrr", |
| "reason": "requires paired motion/IMU/camera/audio/depth feature blocks, which are not in the public 128 package", |
| "recommended_next_step": "Export the missing target field for this 128-episode method, then rerun the same train/validation/test split.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "unsupported_without_required_target", |
| "status_label": "unsupported", |
| "task_id": "cross_modal_retrieval", |
| "task_label": "Cross-Modal Retrieval", |
| "task_number": 9 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "mrr", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "cross_modal_retrieval", |
| "task_label": "Cross-Modal Retrieval", |
| "task_number": 9 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "cross_modal_retrieval", |
| "task_label": "Cross-Modal Retrieval", |
| "task_number": 9 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "cross_modal_retrieval", |
| "task_label": "Cross-Modal Retrieval", |
| "task_number": 9 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "r2", |
| "reason": "requires source and target modality feature blocks such as depth/video vectors, which are not in the public 128 package", |
| "recommended_next_step": "Export the missing target field for this 128-episode method, then rerun the same train/validation/test split.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "unsupported_without_required_target", |
| "status_label": "unsupported", |
| "task_id": "modality_reconstruction", |
| "task_label": "Cross-Modal Reconstruction", |
| "task_number": 10 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "r2", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "modality_reconstruction", |
| "task_label": "Cross-Modal Reconstruction", |
| "task_number": 10 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "r2", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "modality_reconstruction", |
| "task_label": "Cross-Modal Reconstruction", |
| "task_number": 10 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "r2", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "modality_reconstruction", |
| "task_label": "Cross-Modal Reconstruction", |
| "task_number": 10 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "r2", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "modality_reconstruction", |
| "task_label": "Cross-Modal Reconstruction", |
| "task_number": 10 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "temporal_order", |
| "task_label": "Temporal Order Verification", |
| "task_number": 11 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "temporal_order", |
| "task_label": "Temporal Order Verification", |
| "task_number": 11 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "temporal_order", |
| "task_label": "Temporal Order Verification", |
| "task_number": 11 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "f1", |
| "reason": "requires deliberately shifted cross-modal feature pairs, which cannot be reconstructed from the public JSONL labels alone", |
| "recommended_next_step": "Export the missing target field for this 128-episode method, then rerun the same train/validation/test split.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "unsupported_without_required_target", |
| "status_label": "unsupported", |
| "task_id": "misalignment_detection", |
| "task_label": "Multimodal Synchronization Detection", |
| "task_number": 12 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "misalignment_detection", |
| "task_label": "Multimodal Synchronization Detection", |
| "task_number": 12 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "misalignment_detection", |
| "task_label": "Multimodal Synchronization Detection", |
| "task_number": 12 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "misalignment_detection", |
| "task_label": "Multimodal Synchronization Detection", |
| "task_number": 12 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "long_horizon_next_action", |
| "task_label": "Long-Horizon Next-Action Forecasting", |
| "task_number": 13 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "long_horizon_next_action", |
| "task_label": "Long-Horizon Next-Action Forecasting", |
| "task_number": 13 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "long_horizon_next_action", |
| "task_label": "Long-Horizon Next-Action Forecasting", |
| "task_number": 13 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "long_horizon_next_action", |
| "task_label": "Long-Horizon Next-Action Forecasting", |
| "task_number": 13 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "next_subtask_forecast", |
| "task_label": "Long-Horizon Next-Subtask Forecasting", |
| "task_number": 14 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "next_subtask_forecast", |
| "task_label": "Long-Horizon Next-Subtask Forecasting", |
| "task_number": 14 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "next_subtask_forecast", |
| "task_label": "Long-Horizon Next-Subtask Forecasting", |
| "task_number": 14 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "next_subtask_forecast", |
| "task_label": "Long-Horizon Next-Subtask Forecasting", |
| "task_number": 14 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "action_object_relation", |
| "task_label": "Action-Object Relation Prediction", |
| "task_number": 16 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "macro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "action_object_relation", |
| "task_label": "Action-Object Relation Prediction", |
| "task_number": 16 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "macro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "action_object_relation", |
| "task_label": "Action-Object Relation Prediction", |
| "task_number": 16 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "micro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "object_set_forecast", |
| "task_label": "Future Object-Set Forecasting", |
| "task_number": 17 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "micro_f1", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "object_set_forecast", |
| "task_label": "Future Object-Set Forecasting", |
| "task_number": 17 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "micro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "object_set_forecast", |
| "task_label": "Future Object-Set Forecasting", |
| "task_number": 17 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "micro_f1", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "object_set_forecast", |
| "task_label": "Future Object-Set Forecasting", |
| "task_number": 17 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "mae", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "imu_to_hand_pose", |
| "task_label": "IMU-to-Hand Pose Reconstruction", |
| "task_number": 18 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "mae", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "imu_to_hand_pose", |
| "task_label": "IMU-to-Hand Pose Reconstruction", |
| "task_number": 18 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "mae", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "imu_to_hand_pose", |
| "task_label": "IMU-to-Hand Pose Reconstruction", |
| "task_number": 18 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "mae", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "imu_to_hand_pose", |
| "task_label": "IMU-to-Hand Pose Reconstruction", |
| "task_number": 18 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "mae", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "imu_to_hand_pose", |
| "task_label": "IMU-to-Hand Pose Reconstruction", |
| "task_number": 18 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "mrr", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "mrr", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "method": "Qwen3-Omni v6 LoRA", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "qwen3_omni_v6_lora", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "mrr", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "method": "128ep Metadata Simple", |
| "metric_key": "mae", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_simple", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "time_to_transition", |
| "task_label": "Time-to-Next-Transition Regression", |
| "task_number": 20 |
| }, |
| { |
| "method": "128ep Metadata NN", |
| "metric_key": "mae", |
| "reason": "the 128-episode metadata/text rerun did not produce this task target; raw sensor blocks or a task-specific metadata target builder are required", |
| "recommended_next_step": "Run the task with raw sensor-feature blocks or add a task-specific metadata target builder before assigning a numeric score.", |
| "scope": "multi_episode_128_metadata_baseline", |
| "series_id": "metadata128_neural_mlp", |
| "status": "not_supported_by_metadata_only_package", |
| "status_label": "not supported", |
| "task_id": "time_to_transition", |
| "task_label": "Time-to-Next-Transition Regression", |
| "task_number": 20 |
| }, |
| { |
| "method": "Cosmos3-Super Reasoner", |
| "metric_key": "mae", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_super_reasoner", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "time_to_transition", |
| "task_label": "Time-to-Next-Transition Regression", |
| "task_number": 20 |
| }, |
| { |
| "method": "Cosmos3-Nano Future Window", |
| "metric_key": "mae", |
| "reason": "the verified public model package did not ask this branch to emit that task target; a new task-specific evaluation package is required for a numeric score", |
| "recommended_next_step": "Generate verified model outputs for this task contract and score them against the held-out labels.", |
| "scope": "multi_episode_128_partial_model_overlay", |
| "series_id": "cosmos3_nano_future_window", |
| "status": "not_evaluated_in_verified_package", |
| "status_label": "not evaluated", |
| "task_id": "time_to_transition", |
| "task_label": "Time-to-Next-Transition Regression", |
| "task_number": 20 |
| } |
| ], |
| "proxy_records": [ |
| { |
| "method": "128ep Raw Simple", |
| "metric_key": "macro_f1", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_simple", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/interaction_text_prediction/metrics.json", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "128ep Raw NN", |
| "metric_key": "macro_f1", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_neural_mlp", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/interaction_text_prediction/metrics.json", |
| "task_id": "interaction_text_prediction", |
| "task_label": "Interaction Text Prediction", |
| "task_number": 15 |
| }, |
| { |
| "method": "128ep Raw Simple", |
| "metric_key": "mrr", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_simple", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/simple_raw128/camera_view_sync_retrieval/metrics.json", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| }, |
| { |
| "method": "128ep Raw NN", |
| "metric_key": "mrr", |
| "reason": "documented compact proxy completion for this raw128 task axis", |
| "series_id": "raw128_neural_mlp", |
| "source": "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z/neural_mlp_raw128/camera_view_sync_retrieval/metrics.json", |
| "task_id": "camera_view_sync_retrieval", |
| "task_label": "Camera-View Synchronization Retrieval", |
| "task_number": 19 |
| } |
| ], |
| "score_summary": { |
| "method_count": 9, |
| "method_task_record_count": 180, |
| "proxy_scored_method_task_count": 4, |
| "scored_method_task_count": 119, |
| "scoreless_method_task_count": 61, |
| "task_count": 20 |
| }, |
| "source_matrix": "docs/data/task_method_20_result_matrix.json", |
| "status": "pass", |
| "target_policy": { |
| "numeric_score_gate": "A method-task cell is numeric only when a runner or verified package emits that exact task target and metric.", |
| "proxy_policy": "Proxy scores are allowed only when the matrix marks them as proxy_scored and keeps the reason/source attached.", |
| "scoreless_cell_policy": "Unsupported and not-evaluated cells stay explicit in the public matrix instead of being hidden or backfilled with proxy model claims." |
| }, |
| "title": "Task Method 20-Result Gap Audit" |
| } |
|
|