{
  "title": "Ropedia Xperience-10M Task Suite Reproducibility Matrix",
  "version": "2026-06-14",
  "scope": "one public Xperience-10M sample episode plus owner-side private staged Qwen3-Omni v6 reproduction",
  "python_target": "3.12",
  "public_raw_data_redistributed": false,
  "last_exact_metric_audit": {
    "date": "2026-05-30",
    "timezone": "Asia/Singapore",
    "evidence": "notes/reproducibility_audit.md",
    "status": "pass",
    "matched_artifacts": [
      "results/min_action_model/metrics.json",
      "results/min_subtask_model/metrics.json",
      "results/min_all_modalities_action_model/metrics.json",
      "results/min_all_modalities_subtask_model/metrics.json",
      "results/episode_task_suite/summary_report.json",
      "results/episode_task_suite/feature_manifest.json",
      "results/episode_task_suite/available_modalities.json",
      "results/episode_task_suite/*/metrics.json"
    ]
  },
  "steps": [
    {
      "id": "download_sample",
      "status": "reproducible",
      "command": "hf download ropedia-ai/xperience-10m-sample --repo-type dataset --local-dir data/sample/xperience-10m-sample",
      "expected": "annotation.hdf5 plus public sample MP4 streams under data/sample/xperience-10m-sample; optional visualization.rrd can be inspected with Rerun 0.29.0",
      "boundary": "sample card lists cc-by-nc-4.0; raw sample data is downloaded from upstream, not redistributed here"
    },
    {
      "id": "minimal_baselines",
      "status": "reproducible",
      "command": "python scripts/train_min_action_model.py --workspace $WORKSPACE && python scripts/train_all_modalities_model.py --workspace $WORKSPACE",
      "expected": "minimal baseline metrics and model weights under results/min_*",
      "boundary": "single-episode chronological split"
    },
    {
      "id": "original_task_suite",
      "status": "reproducible",
      "command": "python scripts/episode_task_suite.py --workspace $WORKSPACE --include-neural",
      "expected": "walkthrough-backed task metrics, predictions, manifests, and neural_mlp task-head artifacts",
      "boundary": "8,546-dimensional multimodal window contract"
    },
    {
      "id": "research_direction_outputs",
      "status": "reproducible",
      "command": "python scripts/research_direction_taxonomy.py && python scripts/research_direction_extension_tasks.py && python scripts/task_walkthroughs.py",
      "expected": "research-direction taxonomy, extension probes, and task walkthrough artifacts",
      "boundary": "single-episode probes, not full research-direction solutions"
    },
    {
      "id": "unified_20_task_index",
      "status": "reproducible",
      "command": "python scripts/tier2_task_suite.py && python scripts/build_unified_task_suite.py && python scripts/build_unified_task_model_radar.py",
      "expected": "unified 20-task metrics, prediction/rank artifacts, TASK_SUITE_20.md, docs/data/task_suite_20.json, docs/data/tier2_task_suite.json, docs/assets/charts/tier2_task_suite.svg, docs/data/unified_task_model_radar.json, and docs/assets/charts/unified_task_model_radar.svg",
      "boundary": "requires local public-sample annotation.hdf5 plus HOMIE Toolkit or h5py for full public-task regeneration; raw HDF5 and MP4 files are not redistributed"
    },
    {
      "id": "source_alignment_audit",
      "status": "reproducible",
      "command": "python scripts/validate_source_alignment.py",
      "expected": "SOURCE_ALIGNMENT_AUDIT.md and docs/data/source_alignment_audit.json",
      "boundary": "offline committed-fact audit; does not fetch private gated data"
    },
    {
      "id": "evaluation_protocol",
      "status": "reproducible",
      "command": "python scripts/build_evaluation_protocol.py",
      "expected": "EVALUATION_PROTOCOL.md and docs/data/evaluation_protocol.json",
      "boundary": "defines single-episode task evaluation rules; does not add cross-episode model quality"
    },
    {
      "id": "figures_and_dashboard_data",
      "status": "reproducible",
      "command": "python scripts/generate_visualizations.py && python scripts/render_overview_figures.py && python scripts/render_task_suite_infographic.py && python scripts/export_modality_atlas_assets.py && python scripts/build_brand_assets.py && python scripts/build_figure_index.py",
      "expected": "website JSON bundles, charts, overview figures, task-suite infographic, responsive modality atlas assets, brand logo derivatives, FIGURE_INDEX.md, docs/data/brand_assets.json, and docs/data/figure_index.json",
      "boundary": "figures are generated presentation layers over committed metrics and sample thumbnails"
    },
    {
      "id": "publication_validation",
      "status": "reproducible",
      "command": "python scripts/validate_website_integrity.py && python scripts/validate_task_surface.py && python scripts/validate_scope_claims.py && python scripts/build_artifact_index.py && python scripts/validate_mirror_parity.py && python scripts/validate_publication_package.py",
      "expected": "docs/data/website_integrity.json, docs/data/task_surface_integrity.json, docs/data/scope_claims_audit.json, docs/data/artifact_index.json, docs/data/mirror_parity.json, and docs/data/publication_audit.json",
      "boundary": "checks local website integrity plus public repo, prepared HF bundles, and prepared mirror parity"
    },
    {
      "id": "qwen3_omni_multi_episode_pilot",
      "status": "verified_final_diagnostic_result_not_publicly_rerunnable_without_gated_data",
      "command": "scripts/omni/run_dense_multiscale_qwen_v5_h20.sh, scripts/omni/train_qwen3_omni_lora.py, and scripts/omni/run_qwen3_omni_lora_eval_sharded.sh on the selected gated episodes",
      "expected": "verified v5/v6 diagnostic LoRA packages; the latest v6 package records 34,269 exported multiscale windows and 4,032 held-out test predictions",
      "boundary": "the public package records metrics and manifests, but rerunning requires gated Xperience-10M episode access and base-model weights; v6 improves some structured metrics over v5 but remains a diagnostic baseline rather than a strong action/subtask model"
    },
    {
      "id": "owner_gpu_qwen3_v6_reproduction",
      "status": "reproducible_on_private_gpu_staging",
      "command": "cd <staged-repo-root> && CUDA_VISIBLE_DEVICES=0,1,2,3 RUN_ID=a100_repro_qwen_v6_eval_smoke1_manual SAMPLE_LIMIT=1 MAX_NEW_TOKENS=1 scripts/omni/run_private_gpu_qwen3_v6_repro_smoke.sh",
      "expected": "one-sample Qwen3-Omni v6 eval smoke run writes progress.jsonl, predictions, metrics.json, and a zero exit code recorded in exit_code.txt",
      "boundary": "owner-side private staging only; depends on exported media cache, path-rewritten dataset_a100_eval.jsonl, Qwen3-Omni base model cache, v6 LoRA adapter, and the compatible Transformers video-feature patch"
    }
  ]
}