"""Backfill taste_embedding column from on-disk .npy cache. Reads each scored product's image_path, recomputes the FashionSigLIP cache key (blake2b of `fashionsiglip:{abs_path}:{size}:{mtime_ns}`), loads the cached embedding, and stores it as a BLOB on the product row. Embeddings are pure image (mean-pooled across multi-image products), unit-norm, no text fusion — that's what the outfit composer wants for visual harmony. """ # /// script # requires-python = ">=3.11" # dependencies = ["numpy"] # /// import hashlib import json import sqlite3 import sys from pathlib import Path import numpy as np DB_PATH = Path.home() / ".taste/taste.db" EMB_DIR = Path.home() / ".taste/embeddings" def cache_key(path: Path) -> str | None: if not path.exists(): return None st = path.stat() raw = f"fashionsiglip:{path.resolve()}:{st.st_size}:{st.st_mtime_ns}" return hashlib.blake2b(raw.encode(), digest_size=16).hexdigest() def load_emb(path: Path) -> np.ndarray | None: key = cache_key(path) if not key: return None p = EMB_DIR / f"{key}.npy" if not p.exists(): return None return np.load(p) def main() -> int: if not DB_PATH.exists(): print(f"DB not found: {DB_PATH}", file=sys.stderr) return 1 conn = sqlite3.connect(str(DB_PATH)) conn.row_factory = sqlite3.Row # Ensure column exists (idempotent) cols = {r[1] for r in conn.execute("PRAGMA table_info(products)").fetchall()} if "taste_embedding" not in cols: conn.execute("ALTER TABLE products ADD COLUMN taste_embedding BLOB") conn.commit() rows = conn.execute( """ SELECT url, image_path, image_paths FROM products WHERE score IS NOT NULL AND image_path != '' AND taste_embedding IS NULL """ ).fetchall() total = len(rows) print(f"Backfilling {total} products...") ok = miss = err = 0 batch: list[tuple[bytes, str]] = [] BATCH = 500 for i, r in enumerate(rows, 1): try: paths_json = r["image_paths"] or "[]" paths = json.loads(paths_json) if paths_json else [] if not isinstance(paths, list): paths = [] paths = [Path(p) for p in paths if p] if not paths: paths = [Path(r["image_path"])] valid = [p for p in paths if p.exists()] if not valid: miss += 1 continue embs = [load_emb(p) for p in valid] embs = [e for e in embs if e is not None] if not embs: miss += 1 continue mean = np.mean(np.stack(embs), axis=0) mean = mean / np.linalg.norm(mean) batch.append((np.asarray(mean, dtype=np.float32).tobytes(), r["url"])) ok += 1 except Exception as e: err += 1 if err < 5: print(f" err on {r['url']}: {e}", file=sys.stderr) if len(batch) >= BATCH: conn.executemany( "UPDATE products SET taste_embedding = ? WHERE url = ?", batch ) conn.commit() batch.clear() print(f" {i}/{total} (ok={ok} miss={miss} err={err})", flush=True) if batch: conn.executemany( "UPDATE products SET taste_embedding = ? WHERE url = ?", batch ) conn.commit() print(f"\nDone. ok={ok} miss={miss} err={err}") # Sanity report n_with = conn.execute( "SELECT COUNT(*) FROM products WHERE taste_embedding IS NOT NULL" ).fetchone()[0] print(f"products.taste_embedding now populated for {n_with} rows") return 0 if __name__ == "__main__": sys.exit(main())