| { |
| "title": "Ropedia Xperience-10M Task Suite Reproducibility Matrix", |
| "version": "2026-06-14", |
| "scope": "one public Xperience-10M sample episode plus owner-side private staged Qwen3-Omni v6 reproduction", |
| "python_target": "3.12", |
| "public_raw_data_redistributed": false, |
| "last_exact_metric_audit": { |
| "date": "2026-05-30", |
| "timezone": "Asia/Singapore", |
| "evidence": "notes/reproducibility_audit.md", |
| "status": "pass", |
| "matched_artifacts": [ |
| "results/min_action_model/metrics.json", |
| "results/min_subtask_model/metrics.json", |
| "results/min_all_modalities_action_model/metrics.json", |
| "results/min_all_modalities_subtask_model/metrics.json", |
| "results/episode_task_suite/summary_report.json", |
| "results/episode_task_suite/feature_manifest.json", |
| "results/episode_task_suite/available_modalities.json", |
| "results/episode_task_suite/*/metrics.json" |
| ] |
| }, |
| "steps": [ |
| { |
| "id": "download_sample", |
| "status": "reproducible", |
| "command": "hf download ropedia-ai/xperience-10m-sample --repo-type dataset --local-dir data/sample/xperience-10m-sample", |
| "expected": "annotation.hdf5 plus public sample MP4 streams under data/sample/xperience-10m-sample; optional visualization.rrd can be inspected with Rerun 0.29.0", |
| "boundary": "sample dataset card lists the license as cc-by-nc-4.0; raw sample data is downloaded from upstream, not redistributed here" |
| }, |
| { |
| "id": "minimal_baselines", |
| "status": "reproducible", |
| "command": "python scripts/train_min_action_model.py --workspace $WORKSPACE && python scripts/train_all_modalities_model.py --workspace $WORKSPACE", |
| "expected": "minimal baseline metrics and model weights under results/min_*", |
| "boundary": "single-episode chronological split" |
| }, |
| { |
| "id": "original_task_suite", |
| "status": "reproducible", |
| "command": "python scripts/episode_task_suite.py --workspace $WORKSPACE --include-neural", |
| "expected": "original task metrics, predictions, manifests, and neural_mlp task-head artifacts", |
| "boundary": "8,546-dimensional multimodal window contract" |
| }, |
| { |
| "id": "research_direction_outputs", |
| "status": "reproducible", |
| "command": "python scripts/research_direction_taxonomy.py && python scripts/research_direction_extension_tasks.py && python scripts/task_walkthroughs.py", |
| "expected": "research-direction taxonomy, extension probes, and task walkthrough artifacts", |
| "boundary": "single-episode probes, not full research-direction solutions" |
| }, |
| { |
| "id": "tasks_13_to_20_and_unified_index", |
| "status": "reproducible", |
| "command": "python scripts/tier2_task_suite.py && python scripts/build_unified_task_suite.py && python scripts/build_unified_task_model_radar.py", |
| "expected": "tasks 13-20 metrics, prediction/rank artifacts, TASK_SUITE_20.md, docs/data/task_suite_20.json, docs/data/tier2_task_suite.json, docs/assets/charts/tier2_task_suite.svg, docs/data/unified_task_model_radar.json, and docs/assets/charts/unified_task_model_radar.svg", |
| "boundary": "requires local public-sample annotation.hdf5 plus HOMIE Toolkit or h5py for tasks 13-20; raw HDF5 and MP4 files are not redistributed" |
| }, |
| { |
| "id": "source_alignment_audit", |
| "status": "reproducible", |
| "command": "python scripts/validate_source_alignment.py", |
| "expected": "SOURCE_ALIGNMENT_AUDIT.md and docs/data/source_alignment_audit.json", |
| "boundary": "offline committed-fact audit; does not fetch private gated data" |
| }, |
| { |
| "id": "evaluation_protocol", |
| "status": "reproducible", |
| "command": "python scripts/build_evaluation_protocol.py", |
| "expected": "EVALUATION_PROTOCOL.md and docs/data/evaluation_protocol.json", |
| "boundary": "defines single-episode task evaluation rules; does not establish cross-episode model quality" |
| }, |
| { |
| "id": "figures_and_dashboard_data", |
| "status": "reproducible", |
| "command": "python scripts/generate_visualizations.py && python scripts/render_overview_figures.py && python scripts/render_task_suite_infographic.py && python scripts/export_modality_atlas_assets.py && python scripts/build_brand_assets.py && python scripts/build_figure_index.py", |
| "expected": "website JSON bundles, charts, overview figures, task-suite infographic, responsive modality atlas assets, brand logo derivatives, FIGURE_INDEX.md, docs/data/brand_assets.json, and docs/data/figure_index.json", |
| "boundary": "figures are generated presentation layers over committed metrics and sample thumbnails" |
| }, |
| { |
| "id": "publication_validation", |
| "status": "reproducible", |
| "command": "python scripts/validate_website_integrity.py && python scripts/validate_task_surface.py && python scripts/validate_scope_claims.py && python scripts/build_artifact_index.py && python scripts/validate_mirror_parity.py && python scripts/validate_publication_package.py", |
| "expected": "docs/data/website_integrity.json, docs/data/task_surface_integrity.json, docs/data/scope_claims_audit.json, docs/data/artifact_index.json, docs/data/mirror_parity.json, and docs/data/publication_audit.json", |
| "boundary": "checks local website integrity plus public repo, prepared HF bundles, and prepared mirror parity" |
| }, |
| { |
| "id": "qwen3_omni_multi_episode_pilot", |
| "status": "verified_final_diagnostic_result_not_publicly_rerunnable_without_gated_data", |
| "command": "scripts/omni/run_dense_multiscale_qwen_v5_h20.sh, scripts/omni/train_qwen3_omni_lora.py, and scripts/omni/run_qwen3_omni_lora_eval_sharded.sh on the selected gated episodes", |
| "expected": "verified v5/v6 diagnostic LoRA packages; the latest v6 package records 34,269 exported multiscale windows and 4,032 held-out test predictions", |
| "boundary": "the public package records metrics and manifests, but rerunning requires gated Xperience-10M episode access and base-model weights; v6 improves some structured metrics over v5 but remains a diagnostic baseline rather than a strong action/subtask model" |
| }, |
| { |
| "id": "owner_gpu_qwen3_v6_reproduction", |
| "status": "reproducible_on_private_gpu_staging", |
| "command": "cd <staged-repo-root> && CUDA_VISIBLE_DEVICES=0,1,2,3 RUN_ID=a100_repro_qwen_v6_eval_smoke1_manual SAMPLE_LIMIT=1 MAX_NEW_TOKENS=1 scripts/omni/run_private_gpu_qwen3_v6_repro_smoke.sh", |
| "expected": "one-sample Qwen3-Omni v6 eval smoke writes progress.jsonl, predictions, metrics.json, and exit_code.txt with exit_code 0", |
| "boundary": "owner-side private staging only; depends on exported media cache, path-rewritten dataset_a100_eval.jsonl, Qwen3-Omni base model cache, v6 LoRA adapter, and the compatible Transformers video-feature patch" |
| } |
| ] |
| } |
|
|