{ "id": "cosmos_world_model", "display_name": "Cosmos3-Nano Future-Window World Model", "status": "implemented", "model_family": "Cosmos / physical-world foundation models", "default_model_id": "nvidia/Cosmos3-Nano", "local_model_env": "COSMOS_MODEL_DIR", "dataset_contract": "xperience10m_future_window_world_model_v0", "training_objective": "future_window_and_action_conditioned_world_modeling", "split_policy": { "unit": "episode", "default_counts": { "train": 96, "val": 16, "test": 16 }, "leakage_guard": "future windows must remain inside the same episode and test episodes must never condition training" }, "modalities": { "direct_inputs": [ "camera video streams or rendered mosaics", "language task context" ], "conditioning_inputs": [ "pose and SLAM trajectory", "depth and confidence", "mocap or action labels", "IMU acceleration and gyro", "audio event cues" ], "targets": [ "future visual window", "future latent state", "future sensor-feature window", "transition or contact event" ], "excluded_inputs": [ "visualization.rrd" ] }, "entrypoints": { "selection_manifest": "scripts/omni/build_selection_episode_manifest.py", "neutral_index": "scripts/omni/export_model_neutral_window_index.py", "export": "scripts/omni/export_cosmos3_future_window_dataset.py", "train": "scripts/omni/eval_cosmos3_future_window_retrieval.py", "eval": "scripts/omni/eval_cosmos3_future_window_retrieval.py", "launcher": "scripts/omni/run_cosmos3_nano_future_window_compat.sh", "validate": "scripts/omni/validate_omni_finetune_run.py" }, "primary_metrics": [ "future_retrieval_mrr", "future_retrieval_recall_at_5", "temporal_consistency", "feature_reconstruction_error", "transition_accuracy", "contact_accuracy", "held_out_episode_count" ], "artifact_contract": { "checkpoint_gate": "world_model_checkpoint_and_generation_config", "required_eval_files": [ "metrics.json", "future_predictions.jsonl", "retrieval_rankings.csv", "temporal_consistency.csv", "qualitative_examples.json", "RUN_REPORT.md" ], "required_training_files": [ "training_metadata.json", "progress.jsonl", "model_config.json", "checkpoint_manifest.json" ], "public_package_allowed": [ "metrics", "future-window prediction summaries", "retrieval rankings", "temporal consistency tables", "qualitative example metadata", "episode and dataset manifests", "validation summaries" ], "public_package_forbidden": [ "raw MP4", "annotation HDF5", "Rerun RRD", "generated raw video unless explicitly licensed and size-bounded", "base-model weights", "full checkpoints", "large archives" ] }, "extension_requirements": [ "Current implementation starts with Cosmos3-Nano compatibility over same-split future sensor-feature retrieval; it does not fine-tune Cosmos diffusion weights yet.", "Install a Cosmos3 Diffusers training stack before replacing the compatibility adapter with LoRA or diffusion post-training.", "Keep target windows inside the same episode and never train on held-out test episodes.", "Record generated or retrieved qualitative examples separately from task-classification metrics." ] }