""" Upload Kirana Detective Claude Code build sessions to HuggingFace Hub. These are the raw Claude Code JSONL session files recorded while building the project — showing every prompt, tool call, and agent action used to design, code, train, and debug the full pipeline. HF Hub natively renders these in the Data Studio trace viewer. Usage: python finetune/upload_build_traces.py Requires: HF_TOKEN env var with write access to naazimsnh02/ Uploads to: build-small-hackathon/kirana-detective-build-traces (then transfer to build-small-hackathon/ manually) Security: Secrets (HF tokens, API keys) are redacted from session content before upload. Run with --dry-run to preview what gets scrubbed without uploading. """ from __future__ import annotations import io import os import re import sys from pathlib import Path # ── Config ──────────────────────────────────────────────────────────────────── HF_REPO = "build-small-hackathon/kirana-detective-build-traces" # Claude Code stores sessions at ~/.claude/projects// # On this machine the project path encodes to: SESSIONS_DIR = Path.home() / ".claude" / "projects" / "C--Users-naazi-Downloads-New-Projects-Kirana-Detective" # ── Secret scrubbing patterns ───────────────────────────────────────────────── # Ordered from most-specific to least-specific to avoid partial matches. # Each entry is (pattern, replacement). Patterns with capture groups use r'\g<1>[REDACTED]' # so the key name is preserved and only the secret value is replaced. 
# Boundary that opens a bare-token match. The scrubber runs on raw JSONL text,
# where a token at the start of a line shows up as ``\nhf_...`` (a literal
# backslash followed by ``n``). ``n`` is a word character, so a plain ``\b``
# never fires there and the token would leak. Accept either "not preceded by a
# word character" or "preceded by a JSON escape such as \n, \r, \t".
_TOKEN_START = r"(?:(?<!\w)|(?<=\\[nrtbf]))"

# Optional quote character, possibly JSON-escaped one or more levels deep
# (", ', \", \\\").
_OPT_QUOTE = r"""(?:\\*["'])?"""

# Separator between a key name and its value. Covers KEY=value, KEY: value,
# "KEY": "value", os.environ["KEY"] = "value", and their JSON-escaped forms.
_ASSIGN = _OPT_QUOTE + r"\]?\s*[=:]\s*" + _OPT_QUOTE

_SECRET_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # HuggingFace tokens: hf_<token>
    (
        re.compile(_TOKEN_START + r"hf_[A-Za-z0-9]{10,60}\b"),
        "[HF_TOKEN_REDACTED]",
    ),
    # Modal tokens: ak-<token>
    (
        re.compile(_TOKEN_START + r"ak-[A-Za-z0-9]{20,60}\b"),
        "[MODAL_TOKEN_REDACTED]",
    ),
    # Roboflow API keys after an assignment (group 1 preserves the key name)
    (
        re.compile(r"(ROBOFLOW_API_KEY" + _ASSIGN + r")([A-Za-z0-9]{30,50})"),
        r"\g<1>[ROBOFLOW_KEY_REDACTED]",
    ),
    # Generic "Bearer <token>" headers
    (
        re.compile(r"(?i)(Bearer\s+)([A-Za-z0-9\-_\.]{20,})"),
        r"\g<1>[BEARER_TOKEN_REDACTED]",
    ),
    # AWS access key IDs
    (
        re.compile(_TOKEN_START + r"(AKIA|ASIA|AROA)[A-Z0-9]{16}\b"),
        "[AWS_KEY_REDACTED]",
    ),
    # AWS secret keys after an assignment (group 1 preserves the key name)
    (
        re.compile(r"(AWS_SECRET_ACCESS_KEY" + _ASSIGN + r")([A-Za-z0-9+/]{40})"),
        r"\g<1>[AWS_SECRET_REDACTED]",
    ),
]


def scrub_secrets(text: str) -> tuple[str, int]:
    """Redact known secret formats from *text*.

    Every entry of ``_SECRET_PATTERNS`` is applied in list order, each one to
    the output of the previous one.

    Args:
        text: Raw text to scrub (here: the full content of a JSONL session
            file, so secrets may sit next to JSON escapes such as ``\\n``
            or ``\\"``).

    Returns:
        ``(scrubbed_text, total_replacements_made)``.
    """
    total = 0
    for pattern, replacement in _SECRET_PATTERNS:
        text, n = pattern.subn(replacement, text)
        total += n
    return text, total


DATASET_README = """\
---
license: mit
language:
- en
tags:
- agent-traces
- claude-code
- kirana-detective
- indian-fmcg
- invoice-audit
- hackathon
format: agent-traces
---

# Kirana Detective — Claude Code Build Sessions

Raw Claude Code (claude-sonnet-4-6) session traces recorded while building
**Kirana Detective AI** for the HuggingFace Build Small Hackathon 2026.

Each `.jsonl` file is one coding session. Together they cover the entire
build — from first commit to final submission.

## What's Inside

| Sessions | Agent | Coverage |
|---|---|---|
| 11 JSONL files | Claude Code (Sonnet 4.6) | Full project build |

### Sessions include

- Designing the six-agent audit pipeline
- Writing and debugging all training scripts (`train_minicpm_v.py`, `train_minicpm5_1b.py`, `train_yolo26n.py`)
- Fixing QLoRA + MiniCPM-V processor shape mismatches
- Building YOLO26n ONNX inference + NMS post-processing
- Wiring all six agents into `pipeline.py`
- Writing push/export scripts and model cards
- Updating `README.md`, `MODEL_CARD.md`, `finetune/README.md`

## View in Data Studio

Open any `.jsonl` file in HuggingFace Data Studio to see the full session
timeline — prompts, tool calls, file reads/writes, bash output, and assistant
responses in the native trace viewer.

## Privacy & Security

Secrets (HF tokens, API keys, bearer tokens) were redacted from all session
files before upload using regex-based scrubbing. Local file paths are present
but contain no personal data beyond the project directory name.

## Project

- **Space**: [build-small-hackathon/kirana-detective](https://huggingface.co/spaces/build-small-hackathon/kirana-detective)
- **Code**: [github.com/naazimsnh02/kirana-detective](https://github.com/naazimsnh02/kirana-detective)
- **Runtime audit traces**: [build-small-hackathon/kirana-detective-traces](https://huggingface.co/datasets/build-small-hackathon/kirana-detective-traces)

## Citation

```bibtex
@misc{kirana_detective_traces_2026,
  author = {Hussain, Syed Naazim},
  title  = {Kirana Detective — Claude Code Build Sessions},
  year   = {2026},
  publisher = {HuggingFace},
  howpublished = {\\url{https://huggingface.co/datasets/build-small-hackathon/kirana-detective-build-traces}},
}
```
"""


def main() -> None:
    """Scrub every session file in SESSIONS_DIR and commit them to HF_REPO.

    Steps: list ``*.jsonl`` files, scrub each one in memory with
    ``scrub_secrets``, then (unless ``--dry-run`` is on the command line)
    create the dataset repo if needed and push all scrubbed sessions plus the
    dataset card in a single commit.

    Exits via ``sys.exit`` with a message when HF_TOKEN is missing (and this
    is not a dry run), when SESSIONS_DIR does not exist, or when it contains
    no ``.jsonl`` files.
    """
    dry_run = "--dry-run" in sys.argv

    # A dry run never talks to the Hub, so it does not need a token.
    token = os.environ.get("HF_TOKEN")
    if not token and not dry_run:
        sys.exit("Error: set HF_TOKEN environment variable before running.")

    if not SESSIONS_DIR.exists():
        sys.exit(
            f"Sessions directory not found: {SESSIONS_DIR}\n"
            "Update SESSIONS_DIR in this script to match your machine's path."
        )

    jsonl_files = sorted(SESSIONS_DIR.glob("*.jsonl"))
    if not jsonl_files:
        sys.exit(f"No .jsonl files found in {SESSIONS_DIR}")

    print(f"Found {len(jsonl_files)} session files:")
    for f in jsonl_files:
        size_kb = f.stat().st_size // 1024
        print(f"  {f.name}  ({size_kb} KB)")

    # ── Scrub secrets from every file in-memory ───────────────────────────────
    # Files on disk are never modified; only the scrubbed bytes are uploaded.
    print("\nScrubbing secrets...")
    scrubbed: dict[str, bytes] = {}
    total_hits = 0
    for f in jsonl_files:
        raw = f.read_text(encoding="utf-8", errors="replace")
        clean, hits = scrub_secrets(raw)
        scrubbed[f.name] = clean.encode("utf-8")
        status = f"  {f.name}: {hits} replacement(s)" if hits else f"  {f.name}: clean"
        print(status)
        total_hits += hits
    print(f"Total secrets redacted: {total_hits}")

    if dry_run:
        print("\n[dry-run] No files uploaded.")
        return

    # Imported late so that --dry-run works without huggingface_hub installed.
    from huggingface_hub import HfApi, CommitOperationAdd

    api = HfApi(token=token)

    print(f"\nCreating dataset repo: {HF_REPO}")
    api.create_repo(repo_id=HF_REPO, repo_type="dataset", exist_ok=True, private=False)

    # Build all commit operations in one go (single commit = clean history)
    operations: list[CommitOperationAdd] = []
    for f in jsonl_files:
        operations.append(
            CommitOperationAdd(
                path_in_repo=f"sessions/{f.name}",
                path_or_fileobj=io.BytesIO(scrubbed[f.name]),
            )
        )
        print(f"  Staged: sessions/{f.name}")

    # The card template hard-codes the session count; keep the published card
    # in step with the number of files actually uploaded in this commit.
    readme = DATASET_README.replace(
        "| 11 JSONL files |", f"| {len(jsonl_files)} JSONL files |"
    )
    operations.append(
        CommitOperationAdd(
            path_in_repo="README.md",
            path_or_fileobj=readme.encode("utf-8"),
        )
    )
    print("  Staged: README.md (dataset card)")

    print(f"\nCommitting {len(operations)} files to {HF_REPO} ...")
    commit = api.create_commit(
        repo_id=HF_REPO,
        repo_type="dataset",
        operations=operations,
        commit_message=f"Add {len(jsonl_files)} Claude Code build sessions for Kirana Detective",
    )

    print(f"\nDone — {commit.commit_url}")
    print(f"Dataset: https://huggingface.co/datasets/{HF_REPO}")
    print("\nView traces: open any .jsonl in HF Data Studio")
    # Only remind about the transfer when the repo is not already in the org.
    if not HF_REPO.startswith("build-small-hackathon/"):
        print("Remember to transfer to build-small-hackathon/ org when ready.")


if __name__ == "__main__":
    main()