Spaces:
Running
Running
"""
Rebuild (pre-warm) embeddings for ALL organisations in platform.db.

Run this after any bulk import, or whenever you want to pre-compute
embeddings so that the first chatbot query is instant.

Usage:
    python rebuild_embeddings.py
    python rebuild_embeddings.py --org-id 2    (single org)
"""
import argparse
import contextlib
import json
import os
import re
import sqlite3
import time
import unicodedata

import torch
from sentence_transformers import SentenceTransformer
# Filesystem layout: the SQLite database and the embeddings directory both
# live next to this script, so every path is derived from the script's own
# directory.
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
DB_PATH = os.path.join(BASE_DIR, "platform.db")
EMB_DIR = os.path.join(BASE_DIR, "embeddings")

# Create the output directory up front; exist_ok makes re-runs harmless.
os.makedirs(EMB_DIR, exist_ok=True)
# Command-line interface. The only option is --org-id: an integer that
# restricts the rebuild to one organisation. When it is omitted the value
# stays at its default of None.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--org-id",
    type=int,
    default=None,
    help="Rebuild only this org (default: all)",
)
args = parser.parse_args()
# ── Text helpers (must match app.py) ─────────────────────────────────────────
def normalise(text: str) -> str:
    """Return *text* lower-cased, without accents or punctuation.

    Steps, in order:
      1. lower-case and strip surrounding whitespace;
      2. decompose to NFD and drop every nonspacing mark (category "Mn"),
         which removes accents while keeping the base letters;
      3. replace each character that is neither a word character nor
         whitespace with a single space;
      4. collapse whitespace runs to one space and strip the ends.

    The section comment in this file says the text helpers must match the
    ones in app.py, so keep the two implementations in sync.
    """
    decomposed = unicodedata.normalize("NFD", text.lower().strip())
    without_marks = "".join(
        ch for ch in decomposed if unicodedata.category(ch) != "Mn"
    )
    spaced = re.sub(r"[^\w\s]", " ", without_marks)
    return re.sub(r"\s+", " ", spaced).strip()
def clean_question(q: str) -> str:
    """Strip *q* and remove a trailing, whitespace-separated run of digits.

    A falsy *q* (``None`` or ``""``) yields ``""``. Digits that are glued to
    the preceding word (no whitespace before them) are left in place.
    """
    stripped = (q or "").strip()
    return re.sub(r"\s+\d+\s*$", "", stripped)
def build_index_text(row) -> str:
    """Build the text that gets embedded for one entry.

    *row* must support lookup by the keys "question", "processus" and
    "procedure". The cleaned question always comes first (even when it is
    empty); the stripped "processus" and "procedure" values are appended
    only when non-empty. The pieces are joined with " | ".
    """
    question = clean_question(row["question"] or "")
    extras = ((row[column] or "").strip() for column in ("processus", "procedure"))
    return " | ".join([question, *(value for value in extras if value)])
# ── Load model ───────────────────────────────────────────────────────────────
# Load the sentence-embedding model on CPU: try the primary model first and
# fall back to a generic multilingual model if anything goes wrong.
#
# NOTE(review): the user-facing strings in this file look mis-encoded
# (e.g. "modΓ¨le" is presumably "modèle"); they are kept as-is here and
# should be restored from the original source.
_PRIMARY_MODEL = "OrdalieTech/Solon-embeddings-mini-beta-1.1"
_FALLBACK_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

print("π Chargement du modΓ¨leβ¦")
try:
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run at load time; confirm that repository is trusted.
    model = SentenceTransformer(
        _PRIMARY_MODEL,
        device="cpu",
        trust_remote_code=True,
    )
    print("β ModΓ¨le principal (Solon) chargΓ©")
except Exception as exc:
    # Deliberately broad: any failure to load the primary model switches to
    # the fallback instead of aborting the rebuild.
    # NOTE(review): nothing written later records which model was used;
    # presumably app.py must load the same one for the vectors to be
    # comparable - confirm.
    print(f"β ModΓ¨le principal Γ©chouΓ© ({exc}), bascule sur secoursβ¦")
    model = SentenceTransformer(_FALLBACK_MODEL, device="cpu")
    print("β ModΓ¨le de secours chargΓ©")
# ── DB helpers ───────────────────────────────────────────────────────────────
# Open the platform database. sqlite3.Row lets the rest of the script read
# columns by name (row["id"], row["name"], ...).
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row

# Select the organisations to rebuild. The test is "is not None" rather than
# plain truthiness so that an explicit "--org-id 0" is treated as a request
# for organisation 0 instead of silently meaning "rebuild everything".
if args.org_id is not None:
    orgs = conn.execute(
        "SELECT id, name FROM organisations WHERE id=?", (args.org_id,)
    ).fetchall()
else:
    orgs = conn.execute("SELECT id, name FROM organisations").fetchall()

if not orgs:
    print("Aucune organisation trouvΓ©e dans la base de donnΓ©es.")
    # Nothing to rebuild: release the connection before the early exit.
    conn.close()
    raise SystemExit(0)
# Rebuild the embeddings of every selected organisation.
#
# For each organisation with at least one entry, two files are written in
# EMB_DIR: "org_<id>.pt" (the embedding tensor) and "org_<id>_meta.json"
# (row ids, organisation id/name and a format version).
#
# The previous files are no longer deleted before the new ones are computed.
# The new files are written under temporary names and moved into place with
# os.replace() only once both were written successfully, so a failure while
# encoding or saving leaves the previous embeddings untouched.
total_built = 0
try:
    for org in orgs:
        oid = org["id"]
        name = org["name"]
        rows = conn.execute(
            "SELECT id, question, processus, procedure, intent, sub_intent, response "
            "FROM entries WHERE org_id=? ORDER BY id", (oid,)
        ).fetchall()
        if not rows:
            print(f" [{name}] 0 entrΓ©es β ignorΓ©")
            continue

        # row_ids[i] is the database id of the entry embedded at position i.
        row_ids = [r["id"] for r in rows]
        texts = [build_index_text(r) for r in rows]

        emb_file = os.path.join(EMB_DIR, f"org_{oid}.pt")
        meta_file = os.path.join(EMB_DIR, f"org_{oid}_meta.json")
        tmp_emb_file = emb_file + ".tmp"
        tmp_meta_file = meta_file + ".tmp"

        print(f" [{name}] {len(texts)} questions β calcul embeddingsβ¦", end=" ", flush=True)
        t0 = time.time()
        emb = model.encode(texts, convert_to_tensor=True, normalize_embeddings=True,
                           batch_size=64, show_progress_bar=False)
        try:
            torch.save(emb, tmp_emb_file)
            with open(tmp_meta_file, "w", encoding="utf-8") as mf:
                json.dump({"row_ids": row_ids, "org_id": oid, "org_name": name, "v": 4}, mf)
            # Both files are complete: swap them in, tensor first and
            # metadata second (same order in which they were written).
            os.replace(tmp_emb_file, emb_file)
            os.replace(tmp_meta_file, meta_file)
        finally:
            # After a successful swap the temporary names no longer exist;
            # after a failure this removes any partially written file.
            for leftover in (tmp_emb_file, tmp_meta_file):
                with contextlib.suppress(FileNotFoundError):
                    os.remove(leftover)
        elapsed = time.time() - t0
        print(f"β ({elapsed:.1f}s) β org_{oid}.pt")
        total_built += 1
finally:
    # Close the connection even when an organisation fails part-way through.
    conn.close()
print(f"\nβ {total_built} organisation(s) reconstruite(s). RedΓ©marrez Flask pour appliquer.")