#!/usr/bin/env python3
"""Validate public publication hygiene for the repo and HF bundles.

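This check is intentionally conservative: it scans the GitHub repo plus the
prepared Hugging Face Space/artifact/model folders for generated Python caches,
raw Xperience-10M data, heavyweight checkpoint formats that should not be
published here, accidental Hugging Face token strings, and stale presentation
copy. It also verifies that the required publication assets exist in the repo
and that each public README card contains the current presentation markers.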
"""

from __future__ import annotations

import argparse
import json
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path


# Parent of the directory that contains this script (resolved to an absolute
# path); used as the root of the GitHub repo surface.
ROOT = Path(__file__).resolve().parents[1]
# Default location of the prepared Hugging Face bundles: a sibling directory
# of ROOT named "hf_publish". Can be overridden with --hf-root.
DEFAULT_HF_ROOT = ROOT.parent / "hf_publish"

# Directory names, file names and suffixes that scan() reports as
# "generated_cache_dir" / "generated_cache_file" violations.
BANNED_DIR_NAMES = {"__pycache__"}
BANNED_FILE_NAMES = {".DS_Store"}
BANNED_SUFFIXES = {".pyc", ".pyo"}
# Suffixes reported by scan() as "raw_xperience10m_data".
RAW_DATA_SUFFIXES = {".mp4", ".hdf5", ".h5", ".rrd"}
# Suffixes reported by scan() as "heavy_model_or_archive".
HEAVY_MODEL_SUFFIXES = {".safetensors", ".bin", ".tar"}
# Files whose lower-cased suffix is in this set are read as UTF-8 text and
# searched for tokens and stale copy. The empty string matches files that
# have no suffix at all.
TEXT_SUFFIXES = {
    "",
    ".cff",
    ".csv",
    ".html",
    ".json",
    ".md",
    ".py",
    ".sh",
    ".svg",
    ".txt",
    ".xml",
    ".yaml",
    ".yml",
}
# Matches "hf_" followed by at least 20 ASCII letters or digits; any hit is
# reported as "possible_hf_token".
TOKEN_PATTERN = re.compile(r"hf_[A-Za-z0-9]{20,}")
# Maps an outdated substring to the reason reported in the violation's
# "detail" field. The keys are assembled by concatenation, presumably so that
# this script never contains the full needle and cannot flag itself.
STALE_PRESENTATION_STRINGS = {
    "xperience10m-" + "modalities-v9-large-atlas": "old task-suite infographic cache key",
    "xperience10m-" + "taskfirst-v10": "older task-suite infographic cache key",
    "Start with the large native " + "modality atlas": "old suite-section hierarchy copy",
}
# Marker substrings that each public card must contain, consumed by
# public_card_freshness(). Every entry has:
#   "surface":       key into the roots mapping built in build_report()
#   "relative_path": card file, relative to that surface's root directory
#   "required":      substrings that must ALL occur in the card's text
CARD_FRESHNESS_EXPECTATIONS = [
    {
        "surface": "github_repo",
        "relative_path": "README.md",
        "required": [
            "xperience10m-taskfirst-v11-modality-spread",
            "all 12 task families before the",
            "Public-sample modality thumbnails remain enlarged below",
        ],
    },
    {
        "surface": "hf_space_bundle",
        "relative_path": "README.md",
        "required": [
            "xperience10m-taskfirst-v11-modality-spread",
            "task-first 12-task infographic",
            "native responsive modality atlas",
            "website HTML",
        ],
    },
    {
        "surface": "hf_artifact_bundle",
        "relative_path": "README.md",
        "required": [
            "xperience10m-taskfirst-v11-modality-spread",
            "task-first 12-task map",
            "including critical website HTML",
        ],
    },
    {
        "surface": "hf_artifact_bundle",
        "relative_path": "PROJECT_README.md",
        "required": [
            "xperience10m-taskfirst-v11-modality-spread",
            "all 12 task families before the",
            "Public-sample modality thumbnails remain enlarged below",
        ],
    },
    {
        "surface": "hf_model_bundle",
        "relative_path": "README.md",
        "required": [
            "xperience10m-taskfirst-v11-modality-spread",
            "task-first 12-head",
            "responsive modality atlas",
            "website HTML",
        ],
    },
]


def rel(path: Path, base: Path) -> str:
    """Return *path* as a POSIX-style string, relative to *base* when possible.

    When *path* is not located under *base* (``Path.relative_to`` raises
    ``ValueError``), the POSIX form of *path* itself is returned unchanged.
    """
    try:
        shown = path.relative_to(base)
    except ValueError:
        shown = path
    return shown.as_posix()


def git_public_paths(root: Path) -> list[Path] | None:
    try:
        result = subprocess.run(
            ["git", "-C", str(root), "ls-files", "--cached", "--others", "--exclude-standard"],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except (OSError, subprocess.CalledProcessError):
        return None
    return [root / line for line in result.stdout.splitlines() if line.strip()]


def iter_public_files(root: Path, paths: list[Path] | None = None):
    if paths is not None:
        for path in paths:
            if path.exists():
                yield path
        return
    if not root.exists():
        return
    for path in root.rglob("*"):
        parts = set(path.parts)
        if ".git" in parts or ".venv" in parts or "venv" in parts:
            continue
        yield path


def scan(root: Path, *, paths: list[Path] | None = None) -> dict:
    """Audit one publication root and collect hygiene violations.

    Args:
        root: Directory being audited; violation paths are reported relative
            to it whenever possible.
        paths: Optional explicit listing forwarded to ``iter_public_files``.
            When ``None`` the directory tree under *root* is walked instead.

    Returns:
        A dict with the keys ``root``, ``exists``, ``file_count``,
        ``text_file_count``, ``largest_file`` (``{"path", "bytes"}``) and
        ``violations``. Each violation is a dict with ``kind`` and ``path``,
        plus ``detail`` for ``stale_presentation_copy`` entries.
    """
    violations: list[dict] = []
    text_files = 0
    total_files = 0
    largest_file = {"path": None, "bytes": 0}

    for path in iter_public_files(root, paths):
        path_rel = rel(path, root)
        if path.is_dir():
            # Directories are only checked by name; they are not counted.
            if path.name in BANNED_DIR_NAMES:
                violations.append({"kind": "generated_cache_dir", "path": path_rel})
            continue

        total_files += 1
        try:
            size = path.stat().st_size
        except OSError:
            # A dangling symlink, or a file removed while the scan is running,
            # cannot be stat'ed. Treat it as empty instead of aborting the
            # whole audit; the name-based checks below still apply.
            size = 0
        if size > largest_file["bytes"]:
            largest_file = {"path": path_rel, "bytes": size}

        # Suffix checks are case-insensitive; file-name checks are exact.
        suffix = path.suffix.lower()
        if path.name in BANNED_FILE_NAMES or suffix in BANNED_SUFFIXES:
            violations.append({"kind": "generated_cache_file", "path": path_rel})
        if suffix in RAW_DATA_SUFFIXES:
            violations.append({"kind": "raw_xperience10m_data", "path": path_rel})
        if suffix in HEAVY_MODEL_SUFFIXES:
            violations.append({"kind": "heavy_model_or_archive", "path": path_rel})

        if suffix in TEXT_SUFFIXES:
            text_files += 1
            try:
                # Undecodable bytes are dropped so that a partly binary file
                # can still be searched.
                text = path.read_text(encoding="utf-8", errors="ignore")
            except OSError:
                # Best effort: an unreadable file is counted but not searched.
                continue
            if TOKEN_PATTERN.search(text):
                violations.append({"kind": "possible_hf_token", "path": path_rel})
            for needle, reason in STALE_PRESENTATION_STRINGS.items():
                if needle in text:
                    violations.append({
                        "kind": "stale_presentation_copy",
                        "path": path_rel,
                        "detail": reason,
                    })

    return {
        "root": str(root),
        "exists": root.exists(),
        "file_count": total_files,
        "text_file_count": text_files,
        "largest_file": largest_file,
        "violations": violations,
    }


def required_assets(root: Path) -> dict[str, bool]:
    """Report which required publication assets exist under *root*.

    Returns:
        A mapping from each required relative path (in the order listed
        below) to ``True`` when ``root / path`` exists, else ``False``.
    """
    expected = (
        # Top-level project documents.
        "README.md",
        "CITATION.cff",
        "LICENSE",
        "codemeta.json",
        "ARTIFACT_GUIDE.md",
        "REPRODUCIBILITY.md",
        "EVIDENCE_CONTRACT.md",
        "DATA_NOTICE.md",
        # Website pages.
        "docs/404.html",
        "docs/favicon.svg",
        "docs/index.html",
        "docs/robots.txt",
        "docs/sitemap.xml",
        # Website data files.
        "docs/data/evidence_contract.json",
        "docs/data/artifact_index.json",
        "docs/data/project_manifest.json",
        "docs/data/reviewer_packet.json",
        "docs/data/reproducibility_matrix.json",
        "docs/data/modality_atlas.json",
        "docs/data/mirror_parity.json",
        "docs/data/scope_claims_audit.json",
        "docs/data/website_integrity.json",
        "docs/data/summary_metrics.json",
        # Website images.
        "docs/assets/modalities/video.jpg",
        "docs/assets/modalities/audio.png",
        "docs/assets/modalities/depth.jpg",
        "docs/assets/modalities/pose_slam.png",
        "docs/assets/modalities/motion_capture.png",
        "docs/assets/modalities/inertial.png",
        "docs/assets/modalities/language.png",
        "docs/assets/task_suite_infographic.png",
        "docs/assets/pipeline_diagram.png",
        "docs/assets/task_architectures.png",
        # Result files.
        "results/episode_task_suite/summary_report.json",
        "results/episode_task_suite/feature_manifest.json",
        "results/episode_task_suite/neural_mlp/timeline_action/metrics.json",
        "results/omni_finetune/DATA_BLOCKER_REPORT.md",
        "results/omni_finetune/A100_HF_RELAY_STATUS.md",
        # Scripts.
        "scripts/episode_task_suite.py",
        "scripts/neural_task_models.py",
        "scripts/build_artifact_index.py",
        "scripts/validate_mirror_parity.py",
        "scripts/validate_scope_claims.py",
        "scripts/validate_website_integrity.py",
        "scripts/omni/train_qwen3_omni_lora.py",
    )
    presence: dict[str, bool] = {}
    for relative_path in expected:
        presence[relative_path] = (root / relative_path).exists()
    return presence


def public_card_freshness(roots: dict[str, Path]) -> list[dict]:
    """Check every public card for its required presentation markers.

    Args:
        roots: Mapping from surface name to that surface's root directory. It
            must contain every surface named in CARD_FRESHNESS_EXPECTATIONS.

    Returns:
        One record per expectation, in declaration order. A record passes
        only when the card file exists and none of its markers are missing;
        a missing card is treated as empty text, so all markers are missing.
    """

    def evaluate(expectation: dict) -> dict:
        surface = expectation["surface"]
        relative_path = expectation["relative_path"]
        markers = expectation["required"]

        card = roots[surface] / relative_path
        present = card.exists()
        if present:
            text = card.read_text(encoding="utf-8", errors="ignore")
        else:
            text = ""

        absent = [marker for marker in markers if marker not in text]
        if present and not absent:
            status = "pass"
        else:
            status = "fail"
        return {
            "surface": surface,
            "path": relative_path,
            "exists": present,
            "required_marker_count": len(markers),
            "missing_markers": absent,
            "status": status,
        }

    return [evaluate(expectation) for expectation in CARD_FRESHNESS_EXPECTATIONS]


def build_report(hf_root: Path) -> dict:
    """Run every audit and assemble the publication report.

    Args:
        hf_root: Directory holding the ``space``, ``artifacts`` and ``model``
            Hugging Face bundle folders.

    Returns:
        A dict with the overall ``status`` ("pass" or "fail"), a UTC
        timestamp, the individual ``checks``, the required-asset map, the
        card-freshness records, the per-root scans, and the flattened list of
        violations (each tagged with the name of the root it came from).
    """
    roots = {
        "github_repo": ROOT,
        "hf_space_bundle": hf_root / "space",
        "hf_artifact_bundle": hf_root / "artifacts",
        "hf_model_bundle": hf_root / "model",
    }

    # Only the GitHub repo is restricted to the git listing; the Hugging Face
    # bundles are always walked on disk.
    scans = {}
    for name, path in roots.items():
        if name == "github_repo":
            scans[name] = scan(path, paths=git_public_paths(path))
        else:
            scans[name] = scan(path, paths=None)

    assets = required_assets(ROOT)
    card_freshness = public_card_freshness(roots)
    missing_assets = [asset for asset, present in assets.items() if not present]

    violations = []
    for name, result in scans.items():
        for violation in result["violations"]:
            violations.append({"root": name, **violation})

    def verdict(ok: bool) -> str:
        return "pass" if ok else "fail"

    def kind_check(check_name: str, matches) -> dict:
        # A kind-based check passes exactly when no violation kind matches.
        hits = sum(1 for violation in violations if matches(violation["kind"]))
        return {"name": check_name, "status": verdict(hits == 0), "count": hits}

    stale_cards = [record for record in card_freshness if record["status"] != "pass"]

    checks = [
        {
            "name": "required_publication_assets_present",
            "status": verdict(not missing_assets),
            "missing": missing_assets,
        },
        kind_check(
            "no_generated_python_caches",
            lambda kind: kind.startswith("generated_cache"),
        ),
        kind_check(
            "no_raw_xperience10m_data",
            lambda kind: kind == "raw_xperience10m_data",
        ),
        kind_check(
            "no_heavy_model_archives",
            lambda kind: kind == "heavy_model_or_archive",
        ),
        kind_check(
            "no_hf_tokens_in_public_text",
            lambda kind: kind == "possible_hf_token",
        ),
        kind_check(
            "no_stale_task_suite_presentation_copy",
            lambda kind: kind == "stale_presentation_copy",
        ),
        {
            "name": "public_cards_reference_taskfirst_figure",
            "status": verdict(not stale_cards),
            "failures": stale_cards,
        },
    ]

    failed = [check for check in checks if check["status"] != "pass"]
    status = verdict(not failed)
    return {
        "status": status,
        "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
        "checks": checks,
        "required_assets": assets,
        "public_card_freshness": card_freshness,
        "scans": scans,
        "violations": violations,
    }


def main() -> int:
    """Command-line entry point: build the audit report and write it as JSON.

    Options:
        --hf-root: directory containing the Hugging Face bundle folders
            (defaults to DEFAULT_HF_ROOT).
        --output: destination of the JSON report (defaults to
            ``docs/data/publication_audit.json`` under ROOT).

    Returns:
        ``0`` when every check passes, ``1`` otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--hf-root", type=Path, default=DEFAULT_HF_ROOT)
    parser.add_argument("--output", type=Path, default=ROOT / "docs/data/publication_audit.json")
    args = parser.parse_args()

    report = build_report(args.hf_root.resolve())
    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps(report, indent=2) + "\n", encoding="utf-8")
    print(f"{report['status'].upper()}: wrote {args.output}")
    if report["status"] == "pass":
        return 0

    # Name every failing check. Missing assets and stale cards produce no
    # entries in report["violations"], so without this a failure caused only
    # by them would print "FAIL" with no indication of the reason.
    for check in report["checks"]:
        if check["status"] != "pass":
            print(f"- failed check: {check['name']}")

    # Cap the per-violation listing so a badly polluted tree stays readable.
    for violation in report["violations"][:40]:
        print(f"- {violation['root']}: {violation['kind']} {violation['path']}")
    if len(report["violations"]) > 40:
        print(f"- ... {len(report['violations']) - 40} more violations")
    return 1


if __name__ == "__main__":
    raise SystemExit(main())