cy0307 committed on
Commit
6fdcae7
·
verified ·
1 Parent(s): 989ae80

Refresh model mirror project status

Browse files
Files changed (1) hide show
  1. metrics/project_status.json +19 -5
metrics/project_status.json CHANGED
@@ -31,6 +31,9 @@
31
  "qwen3_omni_lora_adapter_repo": "https://huggingface.co/cy0307/ropedia-qwen3-omni-lora-128ep",
32
  "cosmos3_nano_future_window_compatibility_verified": true,
33
  "cosmos3_nano_future_window_test_predictions": 378,
 
 
 
34
  "omni_model_comparison_available": true,
35
  "multi_episode_128_aligned_baselines": true,
36
  "multi_episode_128_baseline_window_counts": {
@@ -116,7 +119,7 @@
116
  "FOUNDATION_MODEL_PLAN.md",
117
  "docs/data/foundation_model_plan.json"
118
  ],
119
- "readout": "Qwen3-Omni remains the first trainable held-out LoRA baseline; Cosmos 3 is now represented by a verified Cosmos3-Nano future-window compatibility package and remains the first world-model/action-generation branch; OpenVLA/openpi/GR00T are policy candidates after action targets are explicit."
120
  },
121
  {
122
  "area": "Omni model extension contract",
@@ -204,7 +207,7 @@
204
  "results/omni_finetune/OMNI_MODEL_COMPARISON.md",
205
  "scripts/omni/build_omni_model_comparison.py"
206
  ],
207
- "readout": "The public comparison separates three layers: the single-episode raw-feature task suite, the selected 128-episode simple/NN metadata baselines, and verified foundation-model branch packages for Qwen3-Omni and Cosmos3-Nano future-window compatibility."
208
  },
209
  {
210
  "area": "Qwen3-Omni fine-tuning",
@@ -230,6 +233,17 @@
230
  ],
231
  "readout": "The Cosmos3-Nano branch now has a public-safe verified future-window compatibility package with 3,213 future-window samples, 378 held-out test predictions, future retrieval MRR 0.0221, temporal consistency 0.0952, transition accuracy 0.9683, and contact accuracy 0.7434. It is a compatibility adapter result, not a full Cosmos diffusion-weight fine-tune."
232
  },
 
 
 
 
 
 
 
 
 
 
 
233
  {
234
  "area": "Raw Xperience-10M redistribution",
235
  "status": "not_included",
@@ -254,15 +268,15 @@
254
  "Inspect SOURCE_ALIGNMENT_AUDIT.md before judging source-card consistency across public surfaces.",
255
  "Inspect XPERIENCE10M_DATASET_CARD_ALIGNMENT.md before judging dataset wording.",
256
  "Inspect results/omni_finetune/multi_episode_128_task_baselines/BASELINE_ALIGNMENT_REPORT.md before comparing simple/NN baselines to the selected 128-episode setup.",
257
- "Inspect docs/data/omni_model_comparison.json before comparing the current three result versions.",
258
  "Inspect docs/data/omni_finetune_verified_result.json before judging the Qwen3-Omni diagnostic pilot."
259
  ],
260
  "current_reading_notes": [
261
  "The final Qwen3-Omni diagnostic result is verified and meets the strict-JSON target, but action/subtask held-out quality is still weak.",
262
- "Use docs/data/omni_model_comparison.json to compare the single-episode task suite, 128-episode aligned baselines, and verified Qwen3/Cosmos branch packages without mixing incompatible metric targets.",
263
  "Use docs/data/omni_finetune_verified_result.json and the latest verified_public final Qwen package for current held-out results.",
264
  "The 128-episode aligned simple/NN baselines use metadata/text features from the derived Qwen JSONL export; they align the split and task ids but do not replace raw-modality baselines for trajectory, retrieval, reconstruction, or misalignment tasks.",
265
- "The Cosmos3-Nano future-window branch is verified as a compatibility adapter result; full Cosmos diffusion-weight fine-tuning remains pending.",
266
  "The current reconstruction task reconstructs feature vectors, not pixel-depth, mesh, NeRF, or Gaussian reconstruction.",
267
  "Audio is one of the synchronized source modalities in the current task representation.",
268
  "The audio ablation report compares audio/no-audio variants across all 12 task contracts in results/audio_ablation/.",
 
31
  "qwen3_omni_lora_adapter_repo": "https://huggingface.co/cy0307/ropedia-qwen3-omni-lora-128ep",
32
  "cosmos3_nano_future_window_compatibility_verified": true,
33
  "cosmos3_nano_future_window_test_predictions": 378,
34
+ "cosmos3_super_reasoner_verified": true,
35
+ "cosmos3_super_reasoner_test_predictions": 448,
36
+ "cosmos3_super_reasoner_json_validity_rate": 0.5111607142857143,
37
  "omni_model_comparison_available": true,
38
  "multi_episode_128_aligned_baselines": true,
39
  "multi_episode_128_baseline_window_counts": {
 
119
  "FOUNDATION_MODEL_PLAN.md",
120
  "docs/data/foundation_model_plan.json"
121
  ],
122
+ "readout": "Qwen3-Omni remains the first trainable held-out LoRA baseline; Cosmos 3 is now represented by a verified Cosmos3-Nano future-window compatibility package plus a verified Cosmos3-Super base-weight Reasoner evaluation; OpenVLA/openpi/GR00T are policy candidates after action targets are explicit."
123
  },
124
  {
125
  "area": "Omni model extension contract",
 
207
  "results/omni_finetune/OMNI_MODEL_COMPARISON.md",
208
  "scripts/omni/build_omni_model_comparison.py"
209
  ],
210
+ "readout": "The public comparison now has two views: the three result layers and a model-family grouping. The model grouping pairs 1-episode and 128-episode entries for task-head baselines, separates Qwen3-Omni sensor-adapter smoke from 128-episode LoRA diagnostics, and separates Cosmos3-Nano future-window compatibility from Cosmos3-Super base-weight Reasoner evaluation."
211
  },
212
  {
213
  "area": "Qwen3-Omni fine-tuning",
 
233
  ],
234
  "readout": "The Cosmos3-Nano branch now has a public-safe verified future-window compatibility package with 3,213 future-window samples, 378 held-out test predictions, future retrieval MRR 0.0221, temporal consistency 0.0952, transition accuracy 0.9683, and contact accuracy 0.7434. It is a compatibility adapter result, not a full Cosmos diffusion-weight fine-tune."
235
  },
236
+ {
237
+ "area": "Cosmos3-Super Reasoner branch",
238
+ "status": "verified_base_weight_result",
239
+ "evidence": [
240
+ "configs/omni_backbones/cosmos3_super_reasoner.json",
241
+ "scripts/omni/eval_cosmos3_super_reasoner.py",
242
+ "scripts/omni/run_cosmos3_super_reasoner_eval.sh",
243
+ "results/omni_finetune/verified_public/xperience10m_cosmos3_super_reasoner_128ep_test_full_20260607/verified_result_summary.json"
244
+ ],
245
+ "readout": "Cosmos3-Super Reasoner now has a public-safe verified 448-window held-out evaluation on the same structured JSON task as Qwen3. It uses staged nv-community/Cosmos3-Super base weights through an 8-GPU vLLM server, not fine-tuned weights: JSON validity 0.5112, action macro-F1 0.0008, transition accuracy 0.3683, contact accuracy 0.3214, and object micro-F1 0.1370."
246
+ },
247
  {
248
  "area": "Raw Xperience-10M redistribution",
249
  "status": "not_included",
 
268
  "Inspect SOURCE_ALIGNMENT_AUDIT.md before judging source-card consistency across public surfaces.",
269
  "Inspect XPERIENCE10M_DATASET_CARD_ALIGNMENT.md before judging dataset wording.",
270
  "Inspect results/omni_finetune/multi_episode_128_task_baselines/BASELINE_ALIGNMENT_REPORT.md before comparing simple/NN baselines to the selected 128-episode setup.",
271
+ "Inspect docs/data/omni_model_comparison.json before comparing the current three result versions or the model-family 1-episode versus 128-episode groupings.",
272
  "Inspect docs/data/omni_finetune_verified_result.json before judging the Qwen3-Omni diagnostic pilot."
273
  ],
274
  "current_reading_notes": [
275
  "The final Qwen3-Omni diagnostic result is verified and meets the strict-JSON target, but action/subtask held-out quality is still weak.",
276
+ "Use docs/data/omni_model_comparison.json to compare both views: the single-episode/128-baseline/model-branch result layers and the model-family grouping for task heads, Qwen3-Omni LoRA, Cosmos3-Nano, and Cosmos3-Super.",
277
  "Use docs/data/omni_finetune_verified_result.json and the latest verified_public final Qwen package for current held-out results.",
278
  "The 128-episode aligned simple/NN baselines use metadata/text features from the derived Qwen JSONL export; they align the split and task ids but do not replace raw-modality baselines for trajectory, retrieval, reconstruction, or misalignment tasks.",
279
+ "The Cosmos3-Nano future-window branch is verified as a compatibility adapter result, and Cosmos3-Super Reasoner is verified as a base-weight evaluation; one-episode Cosmos fine-tuning and full Cosmos adapter/diffusion-weight fine-tuning remain pending, so no Cosmos weight repo should be published yet.",
280
  "The current reconstruction task reconstructs feature vectors, not pixel-depth, mesh, NeRF, or Gaussian reconstruction.",
281
  "Audio is one of the synchronized source modalities in the current task representation.",
282
  "The audio ablation report compares audio/no-audio variants across all 12 task contracts in results/audio_ablation/.",