"""Re-run `normalize_category` over every product and update `canonical_category`. Used after a fix to `taste/categorize.py` to migrate the existing DB to the new bucketing without re-scraping. Idempotent — running twice changes nothing the second time. Usage:: TASTE_DATA_DIR=~/.taste-massimo uv run --no-project --python 3.13 \\ python scripts/recategorize.py [--dry-run] Prints: - Per-bucket distribution before / after - A sorted move matrix (old → new) showing how many rows shifted - Total rows updated """ import argparse import sqlite3 import sys from collections import Counter from pathlib import Path REPO = Path(__file__).resolve().parents[1] sys.path.insert(0, str(REPO)) from taste.categorize import normalize_category def _data_dir() -> Path: import os env = os.environ.get("TASTE_DATA_DIR") return Path(env).expanduser() if env else Path.home() / ".taste" def _print_dist(label: str, counts: Counter) -> None: print(f"\n{label}") print("-" * 32) total = sum(counts.values()) for bucket, n in sorted(counts.items(), key=lambda x: (-x[1], x[0])): pct = 100.0 * n / total if total else 0.0 print(f" {bucket:12s} {n:5d} {pct:5.1f}%") print(f" {'TOTAL':12s} {total:5d}") def main() -> int: ap = argparse.ArgumentParser() ap.add_argument("--dry-run", action="store_true", help="Compute and report changes without writing to DB.") args = ap.parse_args() db_path = _data_dir() / "taste.db" if not db_path.exists(): print(f"DB not found: {db_path}", file=sys.stderr) return 1 print(f"DB: {db_path}") print(f"Mode: {'DRY RUN' if args.dry_run else 'WRITE'}") conn = sqlite3.connect(str(db_path)) conn.row_factory = sqlite3.Row rows = conn.execute( "SELECT url, category, name, canonical_category FROM products" ).fetchall() before = Counter(r["canonical_category"] or "unknown" for r in rows) moves: Counter[tuple[str, str]] = Counter() updates: list[tuple[str, str]] = [] for r in rows: old = r["canonical_category"] or "unknown" new = normalize_category(r["category"] or "", r["name"] or "") if new != old: moves[(old, new)] += 1 updates.append((new, r["url"])) after = Counter(before) for (old, new), n in moves.items(): after[old] -= n after[new] += n after = Counter({k: v for k, v in after.items() if v > 0}) _print_dist("Before", before) _print_dist("After", after) print(f"\nMove matrix ({sum(moves.values())} rows shift)") print("-" * 40) for (old, new), n in sorted(moves.items(), key=lambda x: -x[1]): print(f" {old:12s} -> {new:12s} {n:5d}") if not updates: print("\nNo changes needed.") return 0 if args.dry_run: print(f"\n[dry-run] would update {len(updates)} rows.") return 0 conn.executemany( "UPDATE products SET canonical_category = ? WHERE url = ?", updates ) conn.commit() print(f"\nUpdated {len(updates)} rows.") return 0 if __name__ == "__main__": sys.exit(main())