#!/usr/bin/env python3
"""Build the public 128-episode source and processed-feature index.

The index links every selected Xperience-10M episode back to the official
gated dataset path, then lists the public-safe derived feature artifacts that
are mirrored in this repository and the Hugging Face bundles.
"""

from __future__ import annotations

import csv
import hashlib
import json
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

try:
    import numpy as np
except Exception:  # pragma: no cover - the index can still be built without np
    np = None


# Repository root: two directories above the directory that contains this script.
ROOT = Path(__file__).resolve().parents[2]
# Files written by main(): the machine-readable index and its Markdown rendering.
OUTPUT_JSON = ROOT / "docs/data/xperience10m_128_episode_feature_index.json"
OUTPUT_MD = ROOT / "XPERIENCE10M_128_EPISODE_FEATURE_INDEX.md"

# Official dataset on Hugging Face; the two bases are prefixed to episode and
# file paths to form browse ("tree") and download ("resolve") URLs.
OFFICIAL_REPO_ID = "ropedia-ai/xperience-10m"
OFFICIAL_TREE_BASE = f"https://huggingface.co/datasets/{OFFICIAL_REPO_ID}/tree/main"
OFFICIAL_RESOLVE_BASE = f"https://huggingface.co/datasets/{OFFICIAL_REPO_ID}/resolve/main"
# Project-owned Hugging Face repos used to build per-artifact mirror URLs.
PROJECT_ARTIFACT_REPO = "cy0307/ropedia-xperience-10m-task-suite-artifacts"
PROJECT_MODEL_REPO = "cy0307/ropedia-xperience-10m-task-baselines"

# Inputs under ROOT. The JSON manifests are parsed by build_index(); every path
# below that is passed to artifact_record() is also listed as an artifact.
SELECTION_JSON = ROOT / "results/omni_finetune/xperience10m_128_episode_selection.json"
SELECTION_CSV = ROOT / "results/omni_finetune/xperience10m_128_episode_selection.csv"
DOWNLOAD_LIST = ROOT / "results/omni_finetune/xperience10m_128_episode_download_files.txt"
EPISODE_MANIFEST = ROOT / "results/omni_finetune/episode_manifest.json"
SPARSE_DATASET_MANIFEST = ROOT / "results/omni_finetune/dataset_manifest.json"
QWEN_V6_DATASET_MANIFEST = (
    ROOT
    / "results/omni_finetune/verified_public/"
    / "xperience10m_qwen3_omni_128ep_multiscale_cap96_v6_rank64_lr5e5_full8gpu_lora_eval_test_full/"
    / "dataset/dataset_manifest.json"
)
# Dense multiscale export directory and the individual files read from it.
DENSE_DIR = ROOT / "results/omni_finetune/xperience10m_128ep_dense_multiscale_hierarchical_v1_20260608"
DENSE_DATASET = DENSE_DIR / "dense_multiscale_windows.jsonl"
DENSE_MANIFEST = DENSE_DIR / "dataset_manifest.json"
DENSE_LABEL_STATS = DENSE_DIR / "hierarchical_label_stats.json"
DENSE_SPLIT_SCALE_COUNTS = DENSE_DIR / "split_scale_counts.csv"
# Metadata feature matrices (.npz) summarised with npz_summary().
METADATA_MATRIX_V2 = (
    ROOT / "results/omni_finetune/a100_128_metadata_task_baselines_20260616_v2/metadata_feature_matrix.npz"
)
METADATA_MATRIX_SPARSE = (
    ROOT / "results/omni_finetune/multi_episode_128_task_baselines/metadata_feature_matrix.npz"
)
# Raw20 baseline run directory and its summary JSON.
RAW20_DIR = ROOT / "results/omni_finetune/a100_128_raw20_task_baselines_complete20_proxy_20260616T091500Z"
RAW20_SUMMARY = RAW20_DIR / "run_summary_all.json"


def load_json(path: Path) -> Any:
    """Decode the UTF-8 JSON document stored at ``path`` and return its value."""
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)


def rel(path: Path) -> str:
    """Express ``path`` relative to ``ROOT`` as a forward-slash string.

    ``Path.relative_to`` raises ``ValueError`` when ``path`` is not located
    under ``ROOT``.
    """
    inside_repo = path.relative_to(ROOT)
    return inside_repo.as_posix()


def sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is consumed in 1 MiB reads, so its full content is never held in
    memory at once.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            hasher.update(block)
    return hasher.hexdigest()


def line_count(path: Path) -> int | None:
    """Count the lines in the file at ``path``.

    A final line that is not terminated by a newline is still counted, so a
    JSONL file whose last record lacks a trailing ``\\n`` is not under-reported
    by one.

    Args:
        path: File to inspect.

    Returns:
        The number of lines, ``0`` for an empty file, or ``None`` when ``path``
        does not exist.
    """
    if not path.exists():
        return None
    count = 0
    last_byte = b""
    with path.open("rb") as handle:
        # Stream in 1 MiB chunks so large exports are never loaded whole.
        for chunk in iter(lambda: handle.read(1024 * 1024), b""):
            count += chunk.count(b"\n")
            last_byte = chunk[-1:]
    # Non-empty content that does not end in a newline has one more line than
    # it has newline bytes.
    if last_byte and last_byte != b"\n":
        count += 1
    return count


def npz_summary(path: Path) -> dict[str, Any]:
    """Summarise a NumPy ``.npz`` archive for the public index.

    Args:
        path: Location of the archive; it does not have to exist.

    Returns:
        A dict that always contains ``available``. When the file exists it also
        contains ``bytes`` and ``sha256``. When NumPy could be imported it
        additionally contains ``arrays`` (shape and dtype of every stored
        array), ``row_count`` and ``feature_dim`` if an ``X`` array is stored,
        and ``split_counts`` if a ``split`` array is stored.
    """
    summary: dict[str, Any] = {"available": path.exists()}
    if not summary["available"]:
        return summary
    summary.update({"bytes": path.stat().st_size, "sha256": sha256(path)})
    if np is None:
        # Without NumPy only the file-level facts above can be reported.
        return summary

    x_shape = None
    split_array = None
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
    # can execute arbitrary code. Presumably these archives are produced by
    # this project; confirm before pointing this at untrusted files.
    with np.load(path, allow_pickle=True) as data:
        arrays: dict[str, Any] = {}
        for key in data.files:
            # Each data[key] access reads the array from the archive again, so
            # every array is loaded exactly once here and reused below.
            arr = data[key]
            arrays[key] = {"shape": list(arr.shape), "dtype": str(arr.dtype)}
            if key == "X":
                x_shape = tuple(int(dim) for dim in arr.shape)
            elif key == "split":
                split_array = arr
        summary["arrays"] = arrays
        if x_shape is not None:
            # A 0-d X has no rows or columns to report.
            summary["row_count"] = x_shape[0] if x_shape else None
            summary["feature_dim"] = x_shape[1] if len(x_shape) > 1 else None
        if split_array is not None:
            # atleast_1d keeps a 0-d split from being iterated as a bare scalar.
            labels = np.atleast_1d(split_array).tolist()
            summary["split_counts"] = dict(Counter(str(x) for x in labels))
    return summary


def artifact_record(path: Path, title: str, description: str, *, kind: str) -> dict[str, Any]:
    """Describe one mirrored artifact for the index.

    The record carries the caller-supplied title, kind and description, the
    repo-relative path, GitHub and Hugging Face URLs derived from that path,
    and an ``available`` flag. For an existing regular file it also carries
    the size in bytes and SHA-256 digest, plus a line count for ``.jsonl``
    files and an ``npz_summary`` for ``.npz`` files.
    """
    repo_path = rel(path)
    record: dict[str, Any] = {
        "title": title,
        "kind": kind,
        "description": description,
        "repo_path": repo_path,
        "github_url": f"https://github.com/ChaoYue0307/ropedia-xperience-10m-task-suite/blob/main/{repo_path}",
        "hf_artifact_url": f"https://huggingface.co/datasets/{PROJECT_ARTIFACT_REPO}/resolve/main/{repo_path}",
        "hf_model_url": f"https://huggingface.co/{PROJECT_MODEL_REPO}/resolve/main/{repo_path}",
        "available": path.exists(),
    }
    # Missing paths and directories get no file-level details.
    if not path.is_file():
        return record

    record["bytes"] = path.stat().st_size
    record["sha256"] = sha256(path)
    suffix = path.suffix
    if suffix == ".jsonl":
        record["line_count"] = line_count(path)
    if suffix == ".npz":
        record["npz"] = npz_summary(path)
    return record


def split_counts(rows: list[dict[str, Any]]) -> dict[str, int]:
    """Tally rows by their stringified ``split`` value.

    Rows without a ``split`` key are counted under ``"unknown"``.
    """
    tally: Counter = Counter()
    for row in rows:
        label = str(row.get("split", "unknown"))
        tally[label] += 1
    return dict(tally)


def selected_episode_records(selection: dict[str, Any], episode_manifest: dict[str, Any]) -> list[dict[str, Any]]:
    """Build one index row per entry of ``selection["selected_episodes"]``.

    Each selected episode is matched to the ``episode_manifest`` entry that has
    the same ``episode_path``; when there is no match the manifest-derived
    fields fall back to ``None`` or empty dicts. Every path in the episode's
    ``download_files`` becomes an ``official_files`` record whose size and
    existence flag come from the manifest file entry with the same base name.
    """

    def first_named(entries: list[dict[str, Any]], wanted: str) -> dict[str, Any]:
        # First manifest file entry called ``wanted``; empty dict if there is none.
        for candidate in entries:
            if candidate.get("name") == wanted:
                return candidate
        return {}

    inspected: dict[Any, dict[str, Any]] = {}
    for entry in episode_manifest.get("episodes", []):
        inspected[entry.get("episode_path")] = entry

    records = []
    for chosen in selection.get("selected_episodes", []):
        episode_path = chosen["episode_path"]
        details = inspected.get(episode_path, {})

        official_files = []
        for repo_file in chosen.get("download_files", []):
            base_name = Path(repo_file).name
            listed = first_named(details.get("files", []), base_name)
            official_files.append(
                {
                    "name": base_name,
                    "official_repo_path": repo_file,
                    "gated_download_url": f"{OFFICIAL_RESOLVE_BASE}/{repo_file}",
                    "bytes": listed.get("bytes"),
                    "exists_in_selected_manifest": listed.get("exists"),
                }
            )

        prefix = f"{chosen['top_level_session']}__{chosen['episode_id']}"
        label_stats = details.get("label_stats", {})
        record = {
            "selection_rank": chosen.get("selection_rank"),
            "split": chosen.get("split"),
            "size_band": chosen.get("size_band"),
            "official_episode_path": episode_path,
            "top_level_session": chosen.get("top_level_session"),
            "source_episode_id": chosen.get("episode_id"),
            # Prefer the manifest's own id; otherwise use session__episode.
            "canonical_episode_id": details.get("episode_id", prefix),
            "row_id_prefix": prefix,
            "official_tree_url": f"{OFFICIAL_TREE_BASE}/{episode_path}",
            "official_files": official_files,
            "main_task": details.get("main_task"),
            "frame_count": details.get("frame_count"),
            "window_counts": label_stats.get("num_labeled_windows", {}),
            "hdf5_modalities": details.get("hdf5_modalities", {}),
            "annotation_bytes": chosen.get("annotation_bytes"),
            "training_bytes_excluding_visualization_rrd": chosen.get(
                "training_bytes_excluding_visualization_rrd"
            ),
            "has_all_six_videos": chosen.get("has_all_six_videos"),
            "has_visualization_rrd": chosen.get("has_visualization_rrd"),
        }
        records.append(record)
    return records


def csv_split_scale_counts(path: Path) -> list[dict[str, str]]:
    """Read the CSV at ``path`` into a list of row dicts keyed by the header row.

    Returns an empty list when ``path`` does not exist.
    """
    if path.exists():
        with path.open(newline="", encoding="utf-8") as stream:
            return [row for row in csv.DictReader(stream)]
    return []


def build_index() -> dict[str, Any]:
    """Assemble the complete feature index as a JSON-serialisable dict.

    Loads the selection, episode, sparse, Qwen v6, dense and raw20 JSON
    manifests, builds the per-episode rows and one record per mirrored
    artifact, and returns them together with summary counts, the official
    dataset coordinates and the non-redistribution policy.
    """
    (
        selection,
        episode_manifest,
        sparse_manifest,
        qwen_v6_manifest,
        dense_manifest,
        raw20_summary,
    ) = (
        load_json(source)
        for source in (
            SELECTION_JSON,
            EPISODE_MANIFEST,
            SPARSE_DATASET_MANIFEST,
            QWEN_V6_DATASET_MANIFEST,
            DENSE_MANIFEST,
            RAW20_SUMMARY,
        )
    )

    episodes = selected_episode_records(selection, episode_manifest)

    # One (path, title, description, kind) tuple per artifact, in output order.
    artifact_specs = [
        (
            SELECTION_JSON,
            "128-episode selected source manifest",
            "Public-safe selection record with official episode paths, split labels, size bands, and file lists.",
            "source_index",
        ),
        (
            SELECTION_CSV,
            "128-episode selected source table",
            "CSV table for quick scanning of rank, split, official episode path, source session, and size band.",
            "source_index",
        ),
        (
            DOWNLOAD_LIST,
            "Official raw-file download list",
            "Seven official gated raw files per selected episode: annotation HDF5 plus six synchronized MP4 streams.",
            "source_index",
        ),
        (
            EPISODE_MANIFEST,
            "Inspected 128-episode manifest",
            "Per-episode file sizes, frame counts, task labels, HDF5 modality availability, and selected split metadata.",
            "episode_manifest",
        ),
        (
            SPARSE_DATASET_MANIFEST,
            "Sparse 20-frame JSONL export manifest",
            "Manifest for the sparse Qwen-style 20-frame window export after export-time filtering.",
            "processed_manifest",
        ),
        (
            QWEN_V6_DATASET_MANIFEST,
            "Qwen3-Omni v6 multiscale dataset manifest",
            "Verified package manifest for the current 34,269-window Qwen3-Omni v6 dataset branch.",
            "processed_manifest",
        ),
        (
            DENSE_DATASET,
            "Dense multiscale public-safe windows JSONL",
            "Compact public-safe rows for dense/medium/long windows, labels, object sets, and sparse-window provenance.",
            "processed_feature_table",
        ),
        (
            DENSE_MANIFEST,
            "Dense multiscale manifest",
            "Counts and provenance for the dense multiscale public-safe feature table.",
            "processed_manifest",
        ),
        (
            DENSE_LABEL_STATS,
            "Dense hierarchical label stats",
            "Action/subtask family and object-label statistics for the dense multiscale feature table.",
            "processed_summary",
        ),
        (
            DENSE_SPLIT_SCALE_COUNTS,
            "Dense split-by-scale counts",
            "CSV counts by split and scale id for the dense multiscale feature table.",
            "processed_summary",
        ),
        (
            METADATA_MATRIX_V2,
            "128-episode metadata feature matrix v2",
            "Public-safe 34,269 x 394 metadata/text feature matrix used by the aligned metadata baselines.",
            "processed_feature_matrix",
        ),
        (
            METADATA_MATRIX_SPARSE,
            "128-episode sparse metadata feature matrix",
            "Earlier 3,808 x 394 metadata/text feature matrix for the sparse 20-frame export.",
            "processed_feature_matrix",
        ),
        (
            RAW20_SUMMARY,
            "128-episode raw20 baseline summary",
            "All 40 simple/raw and neural/raw result records for the 20-task raw-feature baseline run.",
            "result_summary",
        ),
    ]
    artifacts = [
        artifact_record(location, title, description, kind=kind)
        for location, title, description, kind in artifact_specs
    ]

    def pick(manifest: dict[str, Any], *keys: str) -> dict[str, Any]:
        # Copy the named keys out of a manifest; absent keys become None.
        return {key: manifest.get(key) for key in keys}

    selected = selection.get("selected_episodes", [])
    sessions = {entry.get("top_level_session") for entry in selected}
    download_bytes = sum(
        int(entry.get("training_bytes_excluding_visualization_rrd") or 0) for entry in selected
    )

    official_dataset = {
        "repo_id": selection.get("repo_id", OFFICIAL_REPO_ID),
        "repo_sha": selection.get("repo_sha"),
        "tree_base_url": OFFICIAL_TREE_BASE,
        "gated_resolve_base_url": OFFICIAL_RESOLVE_BASE,
        "access_note": (
            "Episode directory pages are browsable on Hugging Face. Raw annotation/video file "
            "downloads require access to the gated official dataset and are not redistributed here."
        ),
    }

    selection_summary = {
        "selected_episode_count": len(selected),
        "selected_split_counts": split_counts(selected),
        "size_band_counts": dict(Counter(entry.get("size_band") for entry in selected)),
        # True only when no top-level session contributes more than one episode.
        "one_episode_per_top_level_session": len(sessions) == len(selected),
        "selected_download_size_excluding_visualization_rrd_bytes": download_bytes,
    }

    dense_export = pick(dense_manifest, "num_episodes", "num_samples", "split_counts", "scale_counts")
    dense_export["split_scale_counts"] = csv_split_scale_counts(DENSE_SPLIT_SCALE_COUNTS)

    processed_summary = {
        "inspected_episode_manifest_count": len(episode_manifest.get("episodes", [])),
        "sparse_export": pick(sparse_manifest, "num_episodes", "num_samples", "split_counts"),
        "qwen_v6_multiscale_export": pick(qwen_v6_manifest, "num_episodes", "num_samples", "split_counts"),
        "dense_multiscale_compact_export": dense_export,
        "metadata_matrix_v2": npz_summary(METADATA_MATRIX_V2),
        "metadata_matrix_sparse": npz_summary(METADATA_MATRIX_SPARSE),
        "raw20_result_records": raw20_summary.get("num_result_records"),
        "raw20_status_counts": raw20_summary.get("status_counts"),
        "raw20_proxy_tasks": raw20_summary.get("proxy_tasks"),
    }

    non_redistribution_policy = {
        "raw_not_included": ["annotation.hdf5", "fisheye_cam*.mp4", "stereo_*.mp4", "visualization.rrd"],
        "reason": "Raw Xperience-10M files remain in the official gated dataset.",
        "included_public_safe": [
            "episode/source ids",
            "file sizes and modality availability",
            "compact dense window rows",
            "metadata feature matrices",
            "baseline metrics and predictions",
            "verified public model-package summaries",
        ],
    }

    return {
        "status": "pass",
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "official_dataset": official_dataset,
        "selection_summary": selection_summary,
        "processed_summary": processed_summary,
        "processed_feature_artifacts": artifacts,
        "episodes": episodes,
        "non_redistribution_policy": non_redistribution_policy,
    }


def write_markdown(index: dict[str, Any]) -> None:
    """Render ``index`` as a Markdown report and write it to ``OUTPUT_MD``.

    The report has an introduction, a bullet summary, a table with one row per
    entry of ``processed_feature_artifacts`` and a table with one row per entry
    of ``episodes``. The file is written as UTF-8 and ends with a newline.
    """
    official = index["official_dataset"]
    selection = index["selection_summary"]
    processed = index["processed_summary"]
    matrix_v2 = processed["metadata_matrix_v2"]

    # (label, value) pairs rendered as "- label: `value`" bullets.
    summary_items = [
        ("official_dataset", official["repo_id"]),
        ("official_repo_sha", official.get("repo_sha")),
        ("selected_episode_count", selection["selected_episode_count"]),
        ("selected_split_counts", json.dumps(selection["selected_split_counts"], sort_keys=True)),
        ("qwen_v6_multiscale_windows", processed["qwen_v6_multiscale_export"]["num_samples"]),
        ("dense_multiscale_compact_windows", processed["dense_multiscale_compact_export"]["num_samples"]),
        (
            "metadata_matrix_v2_shape",
            # Accept either a bare npz summary or a record that nests it under "npz".
            matrix_v2.get("npz", matrix_v2).get("arrays", {}).get("X", {}).get("shape"),
        ),
    ]

    document = [
        "# Xperience-10M 128-Episode Feature Index",
        "",
        "This file links the selected 128-episode split to the official Xperience-10M episode paths and to the public-safe processed feature artifacts mirrored by this project.",
        "",
        "Raw Xperience-10M annotation/video files are not redistributed. Each episode row includes the official Hugging Face tree URL and gated raw-file URLs so a reader with access can resolve the source data.",
        "",
        "## Summary",
        "",
    ]
    document.extend(f"- {label}: `{value}`" for label, value in summary_items)
    document.extend(
        [
            "",
            "## Processed Feature Artifacts",
            "",
            "| Artifact | What it represents | Repo path |",
            "| --- | --- | --- |",
        ]
    )
    document.extend(
        f"| {artifact['title']} | {artifact['description']} | `{artifact['repo_path']}` |"
        for artifact in index["processed_feature_artifacts"]
    )
    document.extend(
        [
            "",
            "## Selected Episodes",
            "",
            "| Rank | Split | Official episode path | Canonical id | Frames | Action windows | Official tree |",
            "| ---: | --- | --- | --- | ---: | ---: | --- |",
        ]
    )

    for episode in index["episodes"]:
        windows = episode.get("window_counts", {}).get("action", "")
        rank = episode.get("selection_rank")
        split = episode.get("split")
        path = episode.get("official_episode_path")
        canonical = episode.get("canonical_episode_id")
        # A missing or zero frame count is shown as an empty cell.
        frames = episode.get("frame_count") or ""
        url = episode.get("official_tree_url")
        document.append(
            f"| {rank} | {split} | `{path}` | `{canonical}` | {frames} | {windows} | [HF tree]({url}) |"
        )

    # A trailing empty entry makes the joined text end with a newline.
    document.append("")
    OUTPUT_MD.write_text("\n".join(document), encoding="utf-8")


def main() -> int:
    """Build the index, write the JSON and Markdown outputs, and return ``0``.

    The directory that holds ``OUTPUT_JSON`` is created when it is missing.
    The path of each written file is printed to standard output.
    """
    index = build_index()
    OUTPUT_JSON.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(index, indent=2)
    OUTPUT_JSON.write_text(f"{serialized}\n", encoding="utf-8")
    write_markdown(index)
    for written in (OUTPUT_JSON, OUTPUT_MD):
        print(f"WROTE {written}")
    return 0


# Run the generator when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())