"""
Upload Kirana Detective Claude Code build sessions to HuggingFace Hub.

These are the raw Claude Code JSONL session files recorded while building
the project — showing every prompt, tool call, and agent action used to
design, code, train, and debug the full pipeline.

HF Hub natively renders these in the Data Studio trace viewer.

Usage:
    python finetune/upload_build_traces.py

Requires:
    HF_TOKEN env var with write access to build-small-hackathon/ (the namespace of HF_REPO)

Uploads to:
    build-small-hackathon/kirana-detective-build-traces  (the HF_REPO constant; already under the target org)

Security:
    Secrets (HF tokens, API keys) are redacted from session content before upload.
    Run with --dry-run to print per-file redaction counts without uploading anything.
"""

from __future__ import annotations

import io
import os
import re
import sys
from pathlib import Path

# ── Config ────────────────────────────────────────────────────────────────────

# Destination dataset repository on the HuggingFace Hub, as "<namespace>/<name>".
HF_REPO = "build-small-hackathon/kirana-detective-build-traces"

# Claude Code keeps each project's session logs in its own folder under
# ~/.claude/projects/, named after the encoded project path. The folder name
# below is the one produced on the original author's machine.
SESSIONS_DIR = Path.home().joinpath(
    ".claude",
    "projects",
    "C--Users-naazi-Downloads-New-Projects-Kirana-Detective",
)

# ── Secret scrubbing patterns ─────────────────────────────────────────────────
# Ordered from most-specific to least-specific to avoid partial matches.

# Each entry is (pattern, replacement). Patterns with capture groups use r'\g<1>[REDACTED]'
# so the key name is preserved and only the secret value is replaced.
# Leading anchor for the bare-token patterns. The scrubber runs over raw JSONL
# text, where a newline/tab inside a string is spelled as two characters
# (backslash + letter). A token that starts a line is therefore preceded by a
# word character ("n"), and a plain \b would fail to match it.
_TOKEN_START = r'(?:\b|(?<=\\[nrt]))'

# Optional quote around a key or value, possibly backslash-escaped the way it
# appears inside a JSON string (\" or, when nested, \\\").
_QUOTE = r'(?:\\*["\'])?'

_SECRET_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # HuggingFace tokens: hf_<alphanumeric, 10-60 chars>
    (re.compile(_TOKEN_START + r'hf_[A-Za-z0-9]{10,60}\b'), "[HF_TOKEN_REDACTED]"),
    # Modal credentials: token id (ak-<alphanumeric>) and token secret (as-<alphanumeric>)
    (re.compile(_TOKEN_START + r'a[ks]-[A-Za-z0-9]{20,60}\b'), "[MODAL_TOKEN_REDACTED]"),
    # Roboflow API keys after an env-var / JSON-key assignment
    # (group 1 keeps the key name, separator and opening quote)
    (
        re.compile(r'(ROBOFLOW_API_KEY' + _QUOTE + r'\s*[=:]\s*' + _QUOTE + r')([A-Za-z0-9]{30,50})'),
        r'\g<1>[ROBOFLOW_KEY_REDACTED]',
    ),
    # Generic "Bearer <token>" headers
    (re.compile(r'(?i)(Bearer\s+)([A-Za-z0-9\-_\.]{20,})'), r'\g<1>[BEARER_TOKEN_REDACTED]'),
    # AWS access key IDs
    (re.compile(_TOKEN_START + r'(?:AKIA|ASIA|AROA)[A-Z0-9]{16}\b'), "[AWS_KEY_REDACTED]"),
    # AWS secret keys after an env-var / JSON-key assignment
    (
        re.compile(r'(AWS_SECRET_ACCESS_KEY' + _QUOTE + r'\s*[=:]\s*' + _QUOTE + r')([A-Za-z0-9+/]{40})'),
        r'\g<1>[AWS_SECRET_REDACTED]',
    ),
]


def scrub_secrets(text: str) -> tuple[str, int]:
    """Redact known secret formats from *text*.

    Every pattern in ``_SECRET_PATTERNS`` is applied in order, each one to the
    output of the previous one, so a value already replaced by a placeholder
    is not matched again by a later, more generic pattern.

    Args:
        text: Raw text to clean (here: the full contents of a JSONL session).

    Returns:
        ``(scrubbed_text, total_replacements_made)``.
    """
    total = 0
    for pattern, replacement in _SECRET_PATTERNS:
        text, n = pattern.subn(replacement, text)
        total += n
    return text, total


# Dataset card that main() commits as README.md next to the session files.
# It opens with a YAML front-matter block (license, language, tags, format)
# followed by the Markdown body.
# NOTE(review): the session count in the "What's Inside" table is hard-coded,
# while main() uploads however many .jsonl files it finds — keep them in sync.
DATASET_README = """\
---
license: mit
language:
  - en
tags:
  - agent-traces
  - claude-code
  - kirana-detective
  - indian-fmcg
  - invoice-audit
  - hackathon
format: agent-traces
---

# Kirana Detective — Claude Code Build Sessions

Raw Claude Code (claude-sonnet-4-6) session traces recorded while building
**Kirana Detective AI** for the HuggingFace Build Small Hackathon 2026.

Each `.jsonl` file is one coding session. Together they cover the entire
build — from first commit to final submission.

## What's Inside

| Sessions | Agent | Coverage |
|---|---|---|
| 11 JSONL files | Claude Code (Sonnet 4.6) | Full project build |

### Sessions include

- Designing the six-agent audit pipeline
- Writing and debugging all training scripts (`train_minicpm_v.py`, `train_minicpm5_1b.py`, `train_yolo26n.py`)
- Fixing QLoRA + MiniCPM-V processor shape mismatches
- Building YOLO26n ONNX inference + NMS post-processing
- Wiring all six agents into `pipeline.py`
- Writing push/export scripts and model cards
- Updating `README.md`, `MODEL_CARD.md`, `finetune/README.md`

## View in Data Studio

Open any `.jsonl` file in HuggingFace Data Studio to see the full
session timeline — prompts, tool calls, file reads/writes, bash output,
and assistant responses in the native trace viewer.

## Privacy & Security

Secrets (HF tokens, API keys, bearer tokens) were redacted from all session
files before upload using regex-based scrubbing. Local file paths are present
but contain no personal data beyond the project directory name.

## Project

- **Space**: [build-small-hackathon/kirana-detective](https://huggingface.co/spaces/build-small-hackathon/kirana-detective)
- **Code**: [github.com/naazimsnh02/kirana-detective](https://github.com/naazimsnh02/kirana-detective)
- **Runtime audit traces**: [build-small-hackathon/kirana-detective-traces](https://huggingface.co/datasets/build-small-hackathon/kirana-detective-traces)

## Citation

```bibtex
@misc{kirana_detective_traces_2026,
  author    = {Hussain, Syed Naazim},
  title     = {Kirana Detective — Claude Code Build Sessions},
  year      = {2026},
  publisher = {HuggingFace},
  howpublished = {\\url{https://huggingface.co/datasets/build-small-hackathon/kirana-detective-build-traces}},
}
```
"""


def main() -> None:
    """Scrub every local session file and publish the set to ``HF_REPO``.

    Steps: locate ``*.jsonl`` files in ``SESSIONS_DIR``, redact secrets in
    memory, then (unless ``--dry-run`` is on the command line) create the
    dataset repo if needed and push all sessions plus the dataset card in a
    single commit.

    Exits via ``sys.exit`` with a message when ``HF_TOKEN`` is unset on a real
    run, when ``SESSIONS_DIR`` does not exist, or when it contains no
    ``.jsonl`` files.
    """
    dry_run = "--dry-run" in sys.argv

    # A dry run never contacts the Hub, so the token is only mandatory for real uploads.
    token = os.environ.get("HF_TOKEN")
    if not token and not dry_run:
        sys.exit("Error: set HF_TOKEN environment variable before running.")

    if not SESSIONS_DIR.exists():
        sys.exit(
            f"Sessions directory not found: {SESSIONS_DIR}\n"
            "Update SESSIONS_DIR in this script to match your machine's path."
        )

    # Sorted so the listing, the staging order and the commit are deterministic.
    jsonl_files = sorted(SESSIONS_DIR.glob("*.jsonl"))
    if not jsonl_files:
        sys.exit(f"No .jsonl files found in {SESSIONS_DIR}")

    print(f"Found {len(jsonl_files)} session files:")
    for f in jsonl_files:
        size_kb = f.stat().st_size // 1024
        print(f"  {f.name}  ({size_kb} KB)")

    # ── Scrub secrets from every file in-memory ───────────────────────────────
    print("\nScrubbing secrets...")
    scrubbed, total_hits = _scrub_session_files(jsonl_files)
    print(f"Total secrets redacted: {total_hits}")

    if dry_run:
        print("\n[dry-run] No files uploaded.")
        return

    _upload_sessions(token, scrubbed)


def _scrub_session_files(paths: list[Path]) -> tuple[dict[str, bytes], int]:
    """Read each session file, redact secrets, and report one status line per file.

    The files on disk are never modified; only the in-memory copies are cleaned.

    Returns:
        ``(scrubbed, total_hits)`` where ``scrubbed`` maps file name to the
        cleaned UTF-8 bytes (in the order of *paths*) and ``total_hits`` is the
        number of replacements across all files.
    """
    scrubbed: dict[str, bytes] = {}
    total_hits = 0
    for path in paths:
        # errors="replace": undecodable bytes become U+FFFD instead of aborting the run.
        raw = path.read_text(encoding="utf-8", errors="replace")
        clean, hits = scrub_secrets(raw)
        scrubbed[path.name] = clean.encode("utf-8")
        print(f"  {path.name}: {hits} replacement(s)" if hits else f"  {path.name}: clean")
        total_hits += hits
    return scrubbed, total_hits


def _upload_sessions(token: str, scrubbed: dict[str, bytes]) -> None:
    """Create ``HF_REPO`` if needed and push sessions + dataset card in one commit."""
    # Imported here rather than at module top so --dry-run does not need huggingface_hub.
    from huggingface_hub import CommitOperationAdd, HfApi

    api = HfApi(token=token)

    print(f"\nCreating dataset repo: {HF_REPO}")
    api.create_repo(repo_id=HF_REPO, repo_type="dataset", exist_ok=True, private=False)

    # Build all commit operations in one go (single commit = clean history)
    operations: list[CommitOperationAdd] = []
    for name, payload in scrubbed.items():
        operations.append(
            CommitOperationAdd(
                path_in_repo=f"sessions/{name}",
                path_or_fileobj=io.BytesIO(payload),
            )
        )
        print(f"  Staged: sessions/{name}")

    operations.append(
        CommitOperationAdd(
            path_in_repo="README.md",
            path_or_fileobj=DATASET_README.encode("utf-8"),
        )
    )
    print("  Staged: README.md (dataset card)")

    print(f"\nCommitting {len(operations)} files to {HF_REPO} ...")
    commit = api.create_commit(
        repo_id=HF_REPO,
        repo_type="dataset",
        operations=operations,
        commit_message=f"Add {len(scrubbed)} Claude Code build sessions for Kirana Detective",
    )
    print(f"\nDone — {commit.commit_url}")
    print(f"Dataset: https://huggingface.co/datasets/{HF_REPO}")
    print("\nView traces: open any .jsonl in HF Data Studio")
    # The transfer reminder only applies while the repo lives outside the
    # hackathon org; printing it for a repo already there is misleading.
    if not HF_REPO.startswith("build-small-hackathon/"):
        print("Remember to transfer to build-small-hackathon/ org when ready.")


# Script entry point: run the upload only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()