| <svg xmlns="http://www.w3.org/2000/svg" width="1180" height="700" viewBox="0 0 1180 700" role="img" aria-label="Xperience-10M task coverage across four research directions"> |
|   <!-- Coverage chart: page background, framed card, heading, four panels (A to D), colour legend, provenance footer. --> |
|   <!-- Ampersands in the panel titles are escaped so the file parses as standalone XML. --> |
|   <!-- NOTE(review): each "Tasks:" line is a single unwrapped text run at font-size 14 and is probably wider than its 515px panel; confirm in a renderer and consider wrapping in the generator. --> |
|  |
|   <!-- Page background and outer card --> |
|   <rect width="100%" height="100%" fill="#020502"/> |
|   <rect x="24" y="24" width="1132" height="652" rx="20" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/> |
|  |
|   <!-- Heading and subtitle --> |
|   <text x="58" y="64" font-size="30" font-weight="800" fill="#f4f8ef">Xperience-10M 20-Task Suite: Four Research Directions</text> |
|   <text x="58" y="96" font-size="16" font-weight="500" fill="#a5afa2">One public sample episode, two baseline families, Qwen3-Omni/Cosmos3 diagnostics, and explicit direct/proxy coverage.</text> |
|  |
|   <!-- Panel A (top left): title, status, task list, coverage bar, counts --> |
|   <rect x="58" y="130" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/> |
|   <text x="82" y="172" font-size="21" font-weight="700" fill="#f4f8ef">A. Human Modeling &amp; Motion Understanding</text> |
|   <text x="82" y="205" font-size="15" font-weight="700" fill="#ccffa0">partially implemented</text> |
|   <text x="82" y="238" font-size="14" font-weight="500" fill="#dce8d7">Tasks: Action Recognition, Hand Trajectory Forecasting, Contact State Prediction, Object Relevance Prediction, Interaction Text Prediction, +1</text> |
|   <!-- Bar segments: direct, proxy (no diagnostic segment because the count is 0) --> |
|   <rect x="82" y="262" width="234" height="16" rx="8" fill="#ccffa0"/> |
|   <rect x="316" y="262" width="234" height="16" rx="8" fill="#7ae5c3"/> |
|   <text x="82" y="304" font-size="14" font-weight="700" fill="#ccffa0">Direct 3</text> |
|   <text x="208" y="304" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 3</text> |
|   <text x="328" y="304" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 0</text> |
|  |
|   <!-- Panel B (top right) --> |
|   <rect x="607" y="130" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/> |
|   <text x="631" y="172" font-size="21" font-weight="700" fill="#f4f8ef">B. 3D/4D Reconstruction &amp; Neural Rendering</text> |
|   <text x="631" y="205" font-size="15" font-weight="700" fill="#ccffa0">proxy tasks only</text> |
|   <text x="631" y="238" font-size="14" font-weight="500" fill="#dce8d7">Tasks: Cross-Modal Retrieval, Cross-Modal Reconstruction, Multimodal Synchronization Detection, IMU-to-Hand Pose Reconstruction, Camera-View Synchronization Retrieval</text> |
|   <!-- Bar segments: direct, proxy, diagnostic --> |
|   <rect x="631" y="262" width="93" height="16" rx="8" fill="#ccffa0"/> |
|   <rect x="724" y="262" width="280" height="16" rx="8" fill="#7ae5c3"/> |
|   <rect x="1004" y="262" width="93" height="16" rx="8" fill="#d8f4a5"/> |
|   <text x="631" y="304" font-size="14" font-weight="700" fill="#ccffa0">Direct 1</text> |
|   <text x="757" y="304" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 3</text> |
|   <text x="877" y="304" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 1</text> |
|  |
|   <!-- Panel C (bottom left) --> |
|   <rect x="58" y="384" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/> |
|   <text x="82" y="426" font-size="21" font-weight="700" fill="#f4f8ef">C. Egocentric Vision &amp; Interaction</text> |
|   <text x="82" y="459" font-size="15" font-weight="700" fill="#ccffa0">strongest implemented track</text> |
|   <text x="82" y="492" font-size="14" font-weight="500" fill="#dce8d7">Tasks: Action Recognition, Procedure Step Recognition, Action Boundary Detection, Next-Action Prediction, Hand Trajectory Forecasting, +12</text> |
|   <!-- Bar segments: direct, proxy, diagnostic --> |
|   <rect x="82" y="516" width="275" height="16" rx="8" fill="#ccffa0"/> |
|   <rect x="357" y="516" width="82" height="16" rx="8" fill="#7ae5c3"/> |
|   <rect x="439" y="516" width="110" height="16" rx="8" fill="#d8f4a5"/> |
|   <text x="82" y="558" font-size="14" font-weight="700" fill="#ccffa0">Direct 10</text> |
|   <text x="208" y="558" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 3</text> |
|   <text x="328" y="558" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 4</text> |
|  |
|   <!-- Panel D (bottom right) --> |
|   <rect x="607" y="384" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/> |
|   <text x="631" y="426" font-size="21" font-weight="700" fill="#f4f8ef">D. Scene Reconstruction &amp; World Modeling</text> |
|   <text x="631" y="459" font-size="15" font-weight="700" fill="#ccffa0">early proxy tasks</text> |
|   <text x="631" y="492" font-size="14" font-weight="500" fill="#dce8d7">Tasks: Procedure Step Recognition, Action Boundary Detection, Next-Action Prediction, Object Relevance Prediction, Language Grounding, +10</text> |
|   <!-- Bar segments: direct, proxy, diagnostic --> |
|   <rect x="631" y="516" width="31" height="16" rx="8" fill="#ccffa0"/> |
|   <rect x="662" y="516" width="311" height="16" rx="8" fill="#7ae5c3"/> |
|   <rect x="973" y="516" width="125" height="16" rx="8" fill="#d8f4a5"/> |
|   <text x="631" y="558" font-size="14" font-weight="700" fill="#ccffa0">Direct 1</text> |
|   <text x="757" y="558" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 10</text> |
|   <text x="877" y="558" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 4</text> |
|  |
|   <!-- Legend: one colour swatch and label per segment category --> |
|   <rect x="58" y="622" width="16" height="16" rx="4" fill="#ccffa0"/> |
|   <text x="82" y="636" font-size="14" font-weight="600" fill="#dce8d7">Direct task</text> |
|   <rect x="258" y="622" width="16" height="16" rx="4" fill="#7ae5c3"/> |
|   <text x="282" y="636" font-size="14" font-weight="600" fill="#dce8d7">Proxy / prerequisite</text> |
|   <rect x="458" y="622" width="16" height="16" rx="4" fill="#d8f4a5"/> |
|   <text x="482" y="636" font-size="14" font-weight="600" fill="#dce8d7">Diagnostic probe</text> |
|  |
|   <!-- Provenance footer --> |
|   <text x="58" y="670" font-size="13" font-weight="500" fill="#a5afa2">Generated from docs/data/task_suite_20.json, committed metrics, and scripts/research_direction_taxonomy.py</text> |
| </svg> |
|
|