File size: 5,078 Bytes
477807f
b7334ff
367c357
b7334ff
 
367c357
b7334ff
367c357
1e05f01
367c357
b7334ff
367c357
b7334ff
367c357
b7334ff
367c357
1e05f01
b7334ff
 
367c357
b7334ff
367c357
b7334ff
367c357
1e05f01
367c357
b7334ff
 
367c357
b7334ff
367c357
b7334ff
367c357
1e05f01
b7334ff
 
367c357
b7334ff
 
367c357
b7334ff
477807f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
<svg xmlns="http://www.w3.org/2000/svg" width="1180" height="700" viewBox="0 0 1180 700" role="img" aria-label="Xperience-10M task coverage across four research directions">
  <!--
    Task-coverage dashboard: a 2x2 grid of panels (A, B on the top row; C, D on the bottom row),
    each showing a status line, a task list, a stacked bar and Direct / Proxy / Diagnostic counts.
    SVG text does not wrap on its own, so every task list is broken into explicit tspan lines
    (18px apart) that stay inside the 515px-wide panel frame. The bar and count rows sit below
    the third possible task line so all four panels share the same vertical rhythm.
  -->
  <title>Xperience-10M task coverage across four research directions</title>
  <desc>Four panels, one per research direction, with task counts by coverage type. A. Human Modeling &amp; Motion Understanding: 2 direct, 2 proxy, 0 diagnostic. B. 3D/4D Reconstruction &amp; Neural Rendering: 0 direct, 2 proxy, 1 diagnostic. C. Egocentric Vision &amp; Interaction: 6 direct, 2 proxy, 3 diagnostic. D. Scene Reconstruction &amp; World Modeling: 0 direct, 6 proxy, 3 diagnostic.</desc>

  <!-- Page background, outer card, heading and sub-heading. -->
  <rect width="100%" height="100%" fill="#020502"/>
  <rect x="24" y="24" width="1132" height="652" rx="20" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>
  <text x="58" y="64" font-size="30" font-weight="800" fill="#f4f8ef">Xperience-10M 12-Task Suite: Four Research Directions</text>
  <text x="58" y="96" font-size="16" font-weight="500" fill="#a5afa2">One public sample episode, two baseline families, explicit direct/proxy/diagnostic coverage.</text>

  <!-- Panel A (top-left). Bar: Direct 234px + Proxy 234px, matching the 2 : 2 : 0 counts. -->
  <g>
    <rect x="58" y="130" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>
    <text x="82" y="172" font-size="21" font-weight="700" fill="#f4f8ef">A. Human Modeling &amp; Motion Understanding</text>
    <text x="82" y="205" font-size="15" font-weight="700" fill="#ccffa0">partially implemented</text>
    <text x="82" y="238" font-size="14" font-weight="500" fill="#dce8d7">
      <tspan x="82">Tasks: Action Recognition, Hand Trajectory Forecasting,</tspan>
      <tspan x="82" dy="18">Contact State Prediction, Object Relevance Prediction</tspan>
    </text>
    <rect x="82" y="290" width="234" height="16" rx="8" fill="#ccffa0"/>
    <rect x="316" y="290" width="234" height="16" rx="8" fill="#7ae5c3"/>
    <text x="82" y="330" font-size="14" font-weight="700" fill="#ccffa0">Direct 2</text>
    <text x="208" y="330" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 2</text>
    <text x="328" y="330" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 0</text>
  </g>

  <!-- Panel B (top-right). Bar: Proxy 311px + Diagnostic 156px, matching the 0 : 2 : 1 counts. -->
  <g>
    <rect x="607" y="130" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>
    <text x="631" y="172" font-size="21" font-weight="700" fill="#f4f8ef">B. 3D/4D Reconstruction &amp; Neural Rendering</text>
    <text x="631" y="205" font-size="15" font-weight="700" fill="#ccffa0">proxy tasks only</text>
    <text x="631" y="238" font-size="14" font-weight="500" fill="#dce8d7">
      <tspan x="631">Tasks: Cross-Modal Retrieval,</tspan>
      <tspan x="631" dy="18">Cross-Modal Reconstruction,</tspan>
      <tspan x="631" dy="18">Multimodal Synchronization Detection</tspan>
    </text>
    <rect x="631" y="290" width="311" height="16" rx="8" fill="#7ae5c3"/>
    <rect x="942" y="290" width="156" height="16" rx="8" fill="#d8f4a5"/>
    <text x="631" y="330" font-size="14" font-weight="700" fill="#ccffa0">Direct 0</text>
    <text x="757" y="330" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 2</text>
    <text x="877" y="330" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 1</text>
  </g>

  <!-- Panel C (bottom-left). Bar: 255px + 85px + 127px, matching the 6 : 2 : 3 counts. -->
  <g>
    <rect x="58" y="384" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>
    <text x="82" y="426" font-size="21" font-weight="700" fill="#f4f8ef">C. Egocentric Vision &amp; Interaction</text>
    <text x="82" y="459" font-size="15" font-weight="700" fill="#ccffa0">strongest implemented track</text>
    <text x="82" y="492" font-size="14" font-weight="500" fill="#dce8d7">
      <tspan x="82">Tasks: Action Recognition, Procedure Step Recognition,</tspan>
      <tspan x="82" dy="18">Action Boundary Detection, Next-Action Prediction,</tspan>
      <tspan x="82" dy="18">Hand Trajectory Forecasting, +6</tspan>
    </text>
    <rect x="82" y="544" width="255" height="16" rx="8" fill="#ccffa0"/>
    <rect x="337" y="544" width="85" height="16" rx="8" fill="#7ae5c3"/>
    <rect x="422" y="544" width="127" height="16" rx="8" fill="#d8f4a5"/>
    <text x="82" y="584" font-size="14" font-weight="700" fill="#ccffa0">Direct 6</text>
    <text x="208" y="584" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 2</text>
    <text x="328" y="584" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 3</text>
  </g>

  <!-- Panel D (bottom-right). Bar: Proxy 311px + Diagnostic 156px, matching the 0 : 6 : 3 counts. -->
  <g>
    <rect x="607" y="384" width="515" height="220" rx="8" fill="#050905" stroke="#ccffa0" stroke-opacity="0.24"/>
    <text x="631" y="426" font-size="21" font-weight="700" fill="#f4f8ef">D. Scene Reconstruction &amp; World Modeling</text>
    <text x="631" y="459" font-size="15" font-weight="700" fill="#ccffa0">early proxy tasks</text>
    <text x="631" y="492" font-size="14" font-weight="500" fill="#dce8d7">
      <tspan x="631">Tasks: Procedure Step Recognition,</tspan>
      <tspan x="631" dy="18">Action Boundary Detection, Next-Action Prediction,</tspan>
      <tspan x="631" dy="18">Object Relevance Prediction, Language Grounding, +4</tspan>
    </text>
    <rect x="631" y="544" width="311" height="16" rx="8" fill="#7ae5c3"/>
    <rect x="942" y="544" width="156" height="16" rx="8" fill="#d8f4a5"/>
    <text x="631" y="584" font-size="14" font-weight="700" fill="#ccffa0">Direct 0</text>
    <text x="757" y="584" font-size="14" font-weight="700" fill="#7ae5c3">Proxy 6</text>
    <text x="877" y="584" font-size="14" font-weight="700" fill="#d8f4a5">Diagnostic 3</text>
  </g>

  <!-- Legend: one colour swatch plus a text label per coverage type, so colour is never the only cue. -->
  <g>
    <rect x="58" y="622" width="16" height="16" rx="4" fill="#ccffa0"/>
    <text x="82" y="636" font-size="14" font-weight="600" fill="#dce8d7">Direct task</text>
    <rect x="258" y="622" width="16" height="16" rx="4" fill="#7ae5c3"/>
    <text x="282" y="636" font-size="14" font-weight="600" fill="#dce8d7">Proxy / prerequisite</text>
    <rect x="458" y="622" width="16" height="16" rx="4" fill="#d8f4a5"/>
    <text x="482" y="636" font-size="14" font-weight="600" fill="#dce8d7">Diagnostic probe</text>
  </g>

  <!-- Provenance footer. -->
  <text x="58" y="670" font-size="13" font-weight="500" fill="#a5afa2">Generated from results/episode_task_suite/summary_report.json and scripts/research_direction_taxonomy.py</text>
</svg>