File size: 5,294 Bytes
7faed79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a07660e
 
 
 
 
 
7faed79
 
 
 
 
 
 
 
 
 
 
 
45c1706
7faed79
a07660e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2c5b88c
7faed79
2c5b88c
 
7faed79
 
 
 
 
4173e02
 
0f9a8e2
7faed79
 
45c1706
2d80be0
a07660e
2d80be0
eeac43c
7faed79
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
{
  "title": "Ropedia Xperience-10M Task Suite Reproducibility Matrix",
  "version": "2026-06-01",
  "scope": "one public Xperience-10M sample episode",
  "python_target": "3.12",
  "public_raw_data_redistributed": false,
  "last_exact_metric_audit": {
    "date": "2026-05-30",
    "timezone": "Asia/Singapore",
    "evidence": "notes/reproducibility_audit.md",
    "status": "pass",
    "matched_artifacts": [
      "results/min_action_model/metrics.json",
      "results/min_subtask_model/metrics.json",
      "results/min_all_modalities_action_model/metrics.json",
      "results/min_all_modalities_subtask_model/metrics.json",
      "results/episode_task_suite/summary_report.json",
      "results/episode_task_suite/feature_manifest.json",
      "results/episode_task_suite/available_modalities.json",
      "results/episode_task_suite/*/metrics.json"
    ]
  },
  "steps": [
    {
      "id": "download_sample",
      "status": "reproducible",
      "command": "hf download ropedia-ai/xperience-10m-sample --repo-type dataset --local-dir data/sample/xperience-10m-sample",
      "expected": "annotation.hdf5 plus public sample MP4 streams under data/sample/xperience-10m-sample; optional visualization.rrd can be inspected with Rerun 0.29.0",
      "boundary": "sample card lists cc-by-nc-4.0; raw sample data is downloaded from upstream, not redistributed here"
    },
    {
      "id": "minimal_baselines",
      "status": "reproducible",
      "command": "python scripts/train_min_action_model.py --workspace $WORKSPACE && python scripts/train_all_modalities_model.py --workspace $WORKSPACE",
      "expected": "minimal baseline metrics and model weights under results/min_*",
      "boundary": "single-episode chronological split"
    },
    {
      "id": "twelve_task_suite",
      "status": "reproducible",
      "command": "python scripts/episode_task_suite.py --workspace $WORKSPACE --include-neural",
      "expected": "12 task metrics, predictions, manifests, and neural_mlp task-head artifacts",
      "boundary": "8,546-dimensional multimodal window contract"
    },
    {
      "id": "research_direction_outputs",
      "status": "reproducible",
      "command": "python scripts/research_direction_taxonomy.py && python scripts/research_direction_extension_tasks.py && python scripts/task_walkthroughs.py",
      "expected": "research-direction taxonomy, extension probes, and task walkthrough artifacts",
      "boundary": "single-episode probes, not full research-direction solutions"
    },
    {
      "id": "source_alignment_audit",
      "status": "reproducible",
      "command": "python scripts/validate_source_alignment.py",
      "expected": "SOURCE_ALIGNMENT_AUDIT.md and docs/data/source_alignment_audit.json",
      "boundary": "offline committed-fact audit; does not fetch private gated data"
    },
    {
      "id": "evaluation_protocol",
      "status": "reproducible",
      "command": "python scripts/build_evaluation_protocol.py",
      "expected": "EVALUATION_PROTOCOL.md and docs/data/evaluation_protocol.json",
      "boundary": "defines single-episode task evaluation rules; does not add cross-episode model quality"
    },
    {
      "id": "figures_and_dashboard_data",
      "status": "reproducible",
      "command": "python scripts/generate_visualizations.py && python scripts/render_overview_figures.py && python scripts/render_task_suite_infographic.py && python scripts/export_modality_atlas_assets.py && python scripts/build_brand_assets.py && python scripts/build_figure_index.py",
      "expected": "website JSON bundles, charts, overview figures, task-suite infographic, responsive modality atlas assets, brand logo derivatives, FIGURE_INDEX.md, docs/data/brand_assets.json, and docs/data/figure_index.json",
      "boundary": "figures are generated presentation layers over committed metrics and sample thumbnails"
    },
    {
      "id": "publication_validation",
      "status": "reproducible",
      "command": "python scripts/validate_website_integrity.py && python scripts/validate_task_surface.py && python scripts/validate_scope_claims.py && python scripts/build_artifact_index.py && python scripts/validate_mirror_parity.py && python scripts/validate_publication_package.py",
      "expected": "docs/data/website_integrity.json, docs/data/task_surface_integrity.json, docs/data/scope_claims_audit.json, docs/data/artifact_index.json, docs/data/mirror_parity.json, and docs/data/publication_audit.json",
      "boundary": "checks local website integrity plus public repo, prepared HF bundles, and prepared mirror parity"
    },
    {
      "id": "qwen3_omni_multi_episode_pilot",
      "status": "verified_final_diagnostic_result_not_publicly_rerunnable_without_gated_data",
      "command": "scripts/omni/build_qwen3_omni_dataset.py and scripts/omni/train_qwen3_omni_lora.py on the selected gated episodes",
      "expected": "verified final diagnostic LoRA package with 3,808 exported windows, 2,848 train windows, and 448 held-out test predictions",
      "boundary": "the public package records metrics and manifests, but rerunning requires gated Xperience-10M episode access and base-model weights; current strict-label JSON validity is 100.00%, meeting the 98% target, while action/subtask metrics remain weak"
    }
  ]
}