ropedia-xperience-10m-task-baselines / scripts /normalize_task_display_names.py
cy0307's picture
Update final Qwen model scripts
627e5d7 verified
Raw
History Blame
6.13 kB
#!/usr/bin/env python3
"""Add reader-facing task names to generated public artifacts."""
from __future__ import annotations
import csv
import json
from pathlib import Path
from typing import Any
from task_display import TASK_DISPLAY_NAMES, task_display_name
# Directory anchors: ROOT is the parent of the directory that holds this
# script, and PARENT is the directory that contains ROOT.
_SCRIPT_PATH = Path(__file__).resolve()
ROOT = _SCRIPT_PATH.parents[1]
PARENT = ROOT.parent

# Path fragments shared by several of the lists below.
_BASELINE_DIR = "results/omni_finetune/multi_episode_128_task_baselines"
_HF_BASELINE_DIR = f"hf_publish/model/{_BASELINE_DIR}"
_DATA_JSON_NAMES = (
    "summary_metrics.json",
    "evaluation_protocol.json",
    "audio_ablation_summary.json",
    "single_episode_explorer.json",
)

# Glob patterns that main() expands relative to ROOT.
JSON_GLOBS = [
    "results/episode_task_suite/**/*.json",
    f"{_BASELINE_DIR}/**/*.json",
    *(f"docs/data/{name}" for name in _DATA_JSON_NAMES),
]

# Glob patterns that main() expands relative to PARENT.
HF_JSON_GLOBS = [
    *(
        f"hf_publish/{target}/data/{name}"
        for target in ("space", "artifacts")
        for name in _DATA_JSON_NAMES
    ),
    f"{_HF_BASELINE_DIR}/**/*.json",
]

# Task-metrics CSV files passed to update_task_metrics_csv().
TASK_METRICS_CSV = [
    ROOT / f"{_BASELINE_DIR}/task_metrics.csv",
    PARENT / f"{_HF_BASELINE_DIR}/task_metrics.csv",
]

# Markdown reports passed to update_baseline_report().
BASELINE_REPORTS = [
    ROOT / f"{_BASELINE_DIR}/BASELINE_ALIGNMENT_REPORT.md",
    PARENT / f"{_HF_BASELINE_DIR}/BASELINE_ALIGNMENT_REPORT.md",
]
def update_json_obj(value: Any) -> bool:
    """Add display-name fields to a decoded JSON structure, in place.

    Every dict found while walking ``value`` is inspected:

    * when its ``"task"`` entry is a string present in ``TASK_DISPLAY_NAMES``,
      ``"task_display_name"`` is set to ``task_display_name(task)``;
    * when its ``"tasks"`` entry is a dict or a list, ``"task_display_names"``
      is set to a mapping built from every id in ``TASK_DISPLAY_NAMES``.

    Lists and dict values are descended into recursively; any other value is
    left alone.

    Returns:
        True when at least one dict was modified, otherwise False.
    """
    if isinstance(value, dict):
        dirty = False

        task_id = value.get("task")
        if isinstance(task_id, str) and task_id in TASK_DISPLAY_NAMES:
            label = task_display_name(task_id)
            if value.get("task_display_name") != label:
                value["task_display_name"] = label
                dirty = True

        if isinstance(value.get("tasks"), (dict, list)):
            labels_by_id = {}
            for known_id in TASK_DISPLAY_NAMES:
                labels_by_id[known_id] = task_display_name(known_id)
            if value.get("task_display_names") != labels_by_id:
                value["task_display_names"] = labels_by_id
                dirty = True

        children = value.values()
    elif isinstance(value, list):
        dirty = False
        children = value
    else:
        # Scalars (str, numbers, bool, None) carry nothing to annotate.
        return False

    # Visit every child unconditionally; a hit in one must not skip the rest.
    for child in children:
        if update_json_obj(child):
            dirty = True
    return dirty
def update_json_file(path: Path) -> bool:
    """Rewrite one JSON file with display-name fields added.

    The file is skipped (returning False) when it is named
    ``object_vocab.json`` or ``history.json``, when it does not exist, when it
    is not valid JSON, or when ``update_json_obj`` reports no modification.
    Otherwise the payload is written back with two-space indentation,
    non-ASCII characters preserved, and a trailing newline.

    Returns:
        True when the file was rewritten, otherwise False.
    """
    skipped_names = ("object_vocab.json", "history.json")
    if path.name in skipped_names or not path.exists():
        return False

    raw_text = path.read_text(encoding="utf-8")
    try:
        payload = json.loads(raw_text)
    except json.JSONDecodeError:
        # Unparseable files are left exactly as they are.
        return False

    if update_json_obj(payload):
        serialized = json.dumps(payload, indent=2, ensure_ascii=False)
        path.write_text(serialized + "\n", encoding="utf-8")
        return True
    return False
def update_task_metrics_csv(path: Path) -> bool:
    """Ensure a task-metrics CSV has an up-to-date ``task_display_name`` column.

    The column is inserted directly after ``task`` when the header lacks it.
    Each row whose ``task`` value is a key of ``TASK_DISPLAY_NAMES`` gets the
    value returned by ``task_display_name``; other rows keep whatever they had
    (an empty cell when the column is new).

    The file is rewritten, with ``"\\n"`` line endings, when the column had to
    be added or when any row value changed.

    Args:
        path: CSV file to update in place.

    Returns:
        True when the file was rewritten. False when the file is missing, has
        no data rows, has no ``task`` column, or is already up to date.
    """
    if not path.exists():
        return False

    with path.open(newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle)
        rows = list(reader)
        fieldnames = list(reader.fieldnames or [])
    if not rows or "task" not in fieldnames:
        return False

    # Capture the header state BEFORE inserting the column. Testing
    # membership after the insert is always true, which previously meant a
    # freshly added column was only persisted if some row also changed.
    column_added = "task_display_name" not in fieldnames
    if column_added:
        fieldnames.insert(fieldnames.index("task") + 1, "task_display_name")

    changed = column_added
    for row in rows:
        task = row.get("task", "")
        if task in TASK_DISPLAY_NAMES:
            display = task_display_name(task)
            if row.get("task_display_name") != display:
                row["task_display_name"] = display
                changed = True

    if not changed:
        return False

    with path.open("w", newline="", encoding="utf-8") as handle:
        # Rows without the key are written with DictWriter's default restval
        # (an empty string), so unknown tasks get an empty cell.
        writer = csv.DictWriter(handle, fieldnames=fieldnames, lineterminator="\n")
        writer.writeheader()
        writer.writerows(rows)
    return True
def update_baseline_report(path: Path) -> bool:
    """Convert the baseline table of a Markdown report to the display-name layout.

    Three kinds of lines are rewritten; everything else is copied verbatim:

    * the five-column header row gains an ``artifact id`` column;
    * the matching five-column alignment row gains one ``---`` column;
    * a data row with exactly five cells whose first cell is a key of
      ``TASK_DISPLAY_NAMES`` becomes
      ``| <display name> | `<task id>` | <remaining four cells> |``.

    Rows with any other cell count are left alone. That keeps already
    converted six-cell rows and rows of unrelated wider tables from being
    truncated to five cells, which also makes re-running safe when a display
    name happens to equal a task id.

    Args:
        path: Markdown report to update in place.

    Returns:
        True when the file was rewritten, False when it is missing or no line
        needed changing.
    """
    if not path.exists():
        return False

    old_header = "| task | simple status | simple primary | neural status | neural primary |"
    new_header = "| task | artifact id | simple status | simple primary | neural status | neural primary |"
    old_rule = "| --- | --- | ---: | --- | ---: |"
    new_rule = "| --- | --- | --- | ---: | --- | ---: |"

    lines = path.read_text(encoding="utf-8").splitlines()
    changed = False
    out: list[str] = []
    for line in lines:
        if line == old_header:
            out.append(new_header)
            changed = True
            continue
        if line == old_rule:
            out.append(new_rule)
            changed = True
            continue
        parts = [part.strip() for part in line.split("|")]
        # "| a | b | c | d | e |" splits into 7 pieces: five cells plus the
        # empty strings outside the leading and trailing pipes.
        if len(parts) == 7 and parts[1] in TASK_DISPLAY_NAMES:
            task_id = parts[1]
            out.append(
                f"| {task_display_name(task_id)} | `{task_id}` | {parts[2]} | {parts[3]} | {parts[4]} | {parts[5]} |"
            )
            changed = True
            continue
        out.append(line)

    if changed:
        path.write_text("\n".join(out) + "\n", encoding="utf-8")
    return changed
def iter_paths(patterns: list[str], base: Path) -> list[Path]:
    """Expand glob patterns under ``base`` into a sorted list of unique paths.

    Args:
        patterns: Glob patterns, each evaluated with ``base.glob``.
        base: Directory the patterns are relative to.

    Returns:
        Every matched path exactly once, in sorted order.
    """
    unique_matches = {
        match
        for pattern in patterns
        for match in base.glob(pattern)
    }
    return sorted(unique_matches)
def main() -> int:
    """Run every updater and print a JSON summary of the files that changed.

    JSON files matched under ROOT and PARENT are reported relative to their
    base directory; CSV and Markdown targets are reported as the configured
    path. The summary is printed to stdout as
    ``{"status": "pass", "changed_files": [...]}``.

    Returns:
        Always 0.
    """
    touched: list[str] = []

    # JSON artifacts: expand each pattern list against its own base directory.
    for base, patterns in ((ROOT, JSON_GLOBS), (PARENT, HF_JSON_GLOBS)):
        for candidate in iter_paths(patterns, base):
            if update_json_file(candidate):
                touched.append(str(candidate.relative_to(base)))

    # Fixed-location artifacts: CSV tables first, then Markdown reports.
    fixed_targets = (
        (update_task_metrics_csv, TASK_METRICS_CSV),
        (update_baseline_report, BASELINE_REPORTS),
    )
    for updater, targets in fixed_targets:
        for target in targets:
            if updater(target):
                touched.append(str(target))

    summary = {"status": "pass", "changed_files": touched}
    print(json.dumps(summary, indent=2))
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())