#!/usr/bin/env python3
"""Build a compact source-of-truth artifact index for reviewers.

The index is intentionally selective. It lists the files that prove the public
claims, not every prediction array or checkpoint in the repository.
"""

from __future__ import annotations

import hashlib
import json
from datetime import datetime, timezone
from pathlib import Path


# Directory one level above the folder that contains this script; every
# catalog path below is resolved against it.
ROOT = Path(__file__).resolve().parents[1]
# Location of the generated JSON index, expressed relative to ROOT.
OUTPUT = ROOT.joinpath("docs", "data", "artifact_index.json")

# Hand-maintained catalog of the artifacts that end up in the index.
#
# Keys used by each entry:
#   id       - short identifier for the artifact.
#   title    - human-readable name.
#   path     - location relative to ROOT; artifact_entry() joins it onto ROOT
#              and accepts either a regular file or a directory.
#   kind     - category label; main() tallies entries per kind in "by_kind".
#   surface  - label naming where the artifact is published (not interpreted
#              by this script; it is copied through to the output unchanged).
#   proves   - one-sentence statement of the claim the artifact supports.
#   volatile - optional; when truthy, artifact_entry() records only existence
#              and size for the file and skips the SHA-256 digest.
ARTIFACTS = [
    {
        "id": "evidence_contract",
        "title": "Evidence contract",
        "path": "EVIDENCE_CONTRACT.md",
        "kind": "claim_boundary",
        "surface": "repo",
        "proves": "Defines what is verified, what is smoke-only, and what must not be inferred.",
    },
    {
        "id": "reviewer_packet",
        "title": "Reviewer packet",
        "path": "docs/data/reviewer_packet.json",
        "kind": "review_path",
        "surface": "website_hf",
        "proves": "Gives a short audit path with scope status and public surfaces.",
    },
    {
        "id": "artifact_guide",
        "title": "Artifact guide",
        "path": "ARTIFACT_GUIDE.md",
        "kind": "review_path",
        "surface": "repo_hf",
        "proves": "Gives the human-readable map from proof boundary to data, tasks, platform mirrors, and scale-up status.",
    },
    {
        "id": "reproducibility_contract",
        "title": "Reproducibility contract",
        "path": "REPRODUCIBILITY.md",
        "kind": "reproducibility",
        "surface": "repo_hf",
        "proves": "Defines public reproduction commands, expected outputs, and non-reproducible scale-up boundaries.",
    },
    {
        "id": "reproducibility_matrix",
        "title": "Reproducibility matrix",
        "path": "docs/data/reproducibility_matrix.json",
        "kind": "reproducibility",
        "surface": "website_hf",
        "proves": "Machine-readable reproduction steps with expected artifacts and public boundaries.",
    },
    {
        "id": "artifact_index_builder",
        "title": "Artifact index builder",
        "path": "scripts/build_artifact_index.py",
        "kind": "review_path",
        "surface": "repo_hf",
        "proves": "Generates the selective proof-artifact catalog from local files.",
    },
    {
        "id": "publication_audit",
        "title": "Publication audit",
        "path": "docs/data/publication_audit.json",
        "kind": "hygiene_report",
        "surface": "website_hf",
        "volatile": True,
        "proves": "Confirms public bundles pass raw-data, cache, archive, and token-string checks.",
    },
    {
        "id": "scope_claims_audit",
        "title": "Scope claims audit",
        "path": "docs/data/scope_claims_audit.json",
        "kind": "scope_guard",
        "surface": "website_hf",
        "volatile": True,
        "proves": "Confirms historical 32ep path strings are not presented as real 32-episode results.",
    },
    {
        "id": "mirror_parity",
        "title": "Prepared mirror parity report",
        "path": "docs/data/mirror_parity.json",
        "kind": "mirror_parity",
        "surface": "website_hf",
        "volatile": True,
        "proves": "Confirms prepared GitHub/HF Space/artifact/model mirrors share the same critical data, figure, website HTML, and validator files.",
    },
    {
        "id": "website_integrity",
        "title": "Website integrity report",
        "path": "docs/data/website_integrity.json",
        "kind": "integrity_report",
        "surface": "website_hf",
        "volatile": True,
        "proves": "Confirms local website links, anchors, JSON data files, and referenced images resolve.",
    },
    {
        "id": "project_manifest",
        "title": "Project manifest",
        "path": "docs/data/project_manifest.json",
        "kind": "metadata",
        "surface": "website_hf",
        "proves": "Lists public URLs, upstream sources, and machine-readable project metadata.",
    },
    {
        "id": "task_summary",
        "title": "12-task summary report",
        "path": "results/episode_task_suite/summary_report.json",
        "kind": "metrics_source",
        "surface": "repo_hf",
        "proves": "Stores the task definitions, splits, feature dimension, and minimal/neural metrics.",
    },
    {
        "id": "website_metrics_bundle",
        "title": "Website metrics bundle",
        "path": "docs/data/summary_metrics.json",
        "kind": "website_data",
        "surface": "website_hf",
        "proves": "Mirrors task metrics for the static dashboard.",
    },
    {
        "id": "feature_manifest",
        "title": "Feature manifest",
        "path": "results/episode_task_suite/feature_manifest.json",
        "kind": "data_contract",
        "surface": "repo_hf",
        "proves": "Maps the 8,378-dimensional window vector back to source feature blocks.",
    },
    {
        "id": "available_modalities",
        "title": "Available modalities",
        "path": "results/episode_task_suite/available_modalities.json",
        "kind": "data_contract",
        "surface": "repo_hf",
        "proves": "Documents which sample modalities entered the current extracted feature contract.",
    },
    {
        "id": "windows_table",
        "title": "Aligned windows table",
        "path": "results/episode_task_suite/windows.csv",
        "kind": "data_contract",
        "surface": "repo_hf",
        "proves": "Lists the 1,161 aligned windows and their frame/action/subtask labels.",
    },
    {
        "id": "neural_mlp_directory",
        "title": "Neural MLP task-head results",
        "path": "results/episode_task_suite/neural_mlp",
        "kind": "result_directory",
        "surface": "repo_hf_model",
        "proves": "Stores matching PyTorch MLP results for the 12 task contracts.",
    },
    {
        "id": "research_direction_taxonomy",
        "title": "Research direction taxonomy",
        "path": "results/episode_task_suite/research_directions/research_direction_taxonomy.json",
        "kind": "taxonomy",
        "surface": "repo_hf",
        "proves": "Maps the 12 tasks to the four Ropedia research directions as direct/proxy/diagnostic.",
    },
    {
        "id": "research_direction_extensions",
        "title": "Research direction extension probes",
        "path": "results/episode_task_suite/research_direction_extensions/research_direction_extension_results.json",
        "kind": "metrics_source",
        "surface": "repo_hf",
        "proves": "Stores one coded extension probe per research direction with minimal and neural metrics.",
    },
    {
        "id": "task_walkthroughs",
        "title": "Task walkthroughs",
        "path": "results/episode_task_suite/task_walkthroughs/TASK_WALKTHROUGHS.md",
        "kind": "onboarding_doc",
        "surface": "repo_hf",
        "proves": "Explains every task with case study, input, process modules, output, and limitation.",
    },
    {
        "id": "task_suite_infographic",
        "title": "12-task suite infographic",
        "path": "docs/assets/task_suite_infographic.png",
        "kind": "generated_figure",
        "surface": "website_hf",
        "proves": "Presents the task suite and sample modality thumbnails with metrics generated from committed files.",
    },
    {
        "id": "modality_atlas",
        "title": "Responsive modality atlas",
        "path": "docs/data/modality_atlas.json",
        "kind": "website_data",
        "surface": "website_hf",
        "proves": "Documents the seven public-sample modality cards and their derived thumbnail assets.",
    },
    {
        "id": "modality_thumbnails",
        "title": "Standalone modality thumbnails",
        "path": "docs/assets/modalities",
        "kind": "generated_figure_assets",
        "surface": "website_hf",
        "proves": "Stores small derived thumbnails for readable website modality cards without raw data redistribution.",
    },
    {
        "id": "pipeline_figure",
        "title": "Pipeline figure",
        "path": "docs/assets/pipeline_diagram.png",
        "kind": "generated_figure",
        "surface": "website_hf",
        "proves": "Shows the raw-episode to artifact pipeline with verified labels.",
    },
    {
        "id": "architecture_figure",
        "title": "Architecture figure",
        "path": "docs/assets/task_architectures.png",
        "kind": "generated_figure",
        "surface": "website_hf",
        "proves": "Shows the shared feature pipeline and minimal/neural head families.",
    },
    {
        "id": "qwen_data_blocker",
        "title": "Qwen3-Omni data blocker report",
        "path": "results/omni_finetune/DATA_BLOCKER_REPORT.md",
        "kind": "blocker_report",
        "surface": "repo_hf",
        "proves": "Documents why no 32-episode Qwen3-Omni result is claimed yet.",
    },
    {
        "id": "a100_relay_status",
        "title": "A100 relay status",
        "path": "results/omni_finetune/A100_HF_RELAY_STATUS.md",
        "kind": "scaleup_status",
        "surface": "repo_hf",
        "proves": "Documents the pending A100-to-H20 data relay and 32-session pilot selection.",
    },
    {
        "id": "citation",
        "title": "Citation metadata",
        "path": "CITATION.cff",
        "kind": "citation",
        "surface": "repo_hf",
        "proves": "Makes the project externally citable.",
    },
    {
        "id": "license",
        "title": "License and data terms",
        "path": "LICENSE",
        "kind": "license",
        "surface": "repo_hf",
        "proves": "Separates MIT-scoped code from original Xperience-10M data terms.",
    },
]


def sha256(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is read in binary mode in 1 MiB pieces so the whole content is
    never held in memory at once.
    """
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            piece = stream.read(block_size)
            if not piece:
                # An empty read marks end of file.
                break
            hasher.update(piece)
    return hasher.hexdigest()


def directory_stats(path: Path) -> dict:
    """Summarize the regular files found anywhere beneath *path*.

    Returns a dict with ``file_count`` (number of files found by a recursive
    walk) and ``bytes`` (the sum of their sizes). Directories themselves are
    not counted.
    """
    count = 0
    total_size = 0
    for child in path.rglob("*"):
        if not child.is_file():
            continue
        count += 1
        total_size += child.stat().st_size
    return {"file_count": count, "bytes": total_size}


def artifact_entry(item: dict) -> dict:
    """Expand one catalog item into its index record.

    The record is a copy of *item* plus an ``exists`` flag and measurements
    that depend on what ``ROOT / item["path"]`` points at:

    * regular file: ``bytes`` and either ``sha256`` or, when the item is
      marked ``volatile``, a ``hash_policy`` of ``existence_and_size_only``;
    * directory: the ``file_count`` and ``bytes`` from ``directory_stats``;
    * anything else (including a missing path): ``bytes`` set to 0.
    """
    target = ROOT / item["path"]
    record = dict(item)
    record["exists"] = target.exists()

    if target.is_file():
        record["bytes"] = target.stat().st_size
        if item.get("volatile"):
            # Volatile files are not hashed; only presence and size are kept.
            record["hash_policy"] = "existence_and_size_only"
        else:
            record["sha256"] = sha256(target)
        return record

    if target.is_dir():
        record.update(directory_stats(target))
        return record

    record["bytes"] = 0
    return record


def main() -> int:
    """Build the artifact index, write it to ``OUTPUT``, and report status.

    Every item in ``ARTIFACTS`` is expanded with ``artifact_entry``. The
    resulting report is written as indented JSON (the parent directory is
    created if needed) even when artifacts are missing. A one-line status
    is printed, followed by one line per missing path.

    Returns:
        0 when every cataloged path exists, 1 otherwise.
    """
    records = list(map(artifact_entry, ARTIFACTS))

    absent = []
    kind_totals: dict[str, int] = {}
    for record in records:
        if not record["exists"]:
            absent.append(record["path"])
        kind = record["kind"]
        kind_totals.setdefault(kind, 0)
        kind_totals[kind] += 1

    if absent:
        status = "fail"
    else:
        status = "pass"
    stamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    report = {
        "title": "Ropedia Xperience-10M Task Suite Artifact Index",
        "generated_at_utc": stamp,
        "status": status,
        "artifact_count": len(records),
        "missing": absent,
        "by_kind": kind_totals,
        "artifacts": records,
    }

    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(report, indent=2) + "\n"
    OUTPUT.write_text(serialized, encoding="utf-8")

    print(f"{report['status'].upper()}: wrote {OUTPUT}")
    for lost in absent:
        print(f"- missing: {lost}")
    return 1 if absent else 0


# Run only when executed as a script; main()'s return value (0 when nothing is
# missing, 1 otherwise) becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())