FinalOrganisations / rebuild_embeddings.py
Hakim18's picture
feat: React frontend + Docker deploy
6364501 verified
Raw
History Blame Contribute Delete
4.31 kB
"""
Rebuild (pre-warm) embeddings for ALL organisations in platform.db.
Run this after any bulk import or whenever you want to pre-compute
embeddings so the first chatbot query is instant.
Usage:
python rebuild_embeddings.py
python rebuild_embeddings.py --org-id 2 (single org)
"""
import argparse, json, os, sqlite3, time, re, unicodedata
import torch
from sentence_transformers import SentenceTransformer
BASE_DIR = os.path.abspath(os.path.dirname(__file__))
DB_PATH = os.path.join(BASE_DIR, "platform.db")
EMB_DIR = os.path.join(BASE_DIR, "embeddings")
os.makedirs(EMB_DIR, exist_ok=True)
parser = argparse.ArgumentParser()
parser.add_argument("--org-id", type=int, default=None, help="Rebuild only this org (default: all)")
args = parser.parse_args()
# ── Text helpers (must match app.py) ─────────────────────────────────────────
def normalise(text: str) -> str:
text = text.lower().strip()
text = unicodedata.normalize("NFD", text)
text = "".join(c for c in text if unicodedata.category(c) != "Mn")
text = re.sub(r"[^\w\s]", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text
def clean_question(q: str) -> str:
return re.sub(r'\s+\d+\s*$', '', (q or '').strip())
def build_index_text(row) -> str:
q = clean_question(row["question"] or "")
proc = (row["processus"] or "").strip()
procd = (row["procedure"] or "").strip()
parts = [q]
if proc:
parts.append(proc)
if procd:
parts.append(procd)
return " | ".join(parts)
# ── Load model ───────────────────────────────────────────────────────────────
print("πŸ”„ Chargement du modΓ¨le…")
try:
model = SentenceTransformer(
"OrdalieTech/Solon-embeddings-mini-beta-1.1",
device="cpu", trust_remote_code=True
)
print("βœ“ ModΓ¨le principal (Solon) chargΓ©")
except Exception as e:
print(f"⚠ ModΓ¨le principal Γ©chouΓ© ({e}), bascule sur secours…")
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
print("βœ“ ModΓ¨le de secours chargΓ©")
# ── DB helpers ────────────────────────────────────────────────────────────────
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row
if args.org_id:
orgs = conn.execute(
"SELECT id, name FROM organisations WHERE id=?", (args.org_id,)
).fetchall()
else:
orgs = conn.execute("SELECT id, name FROM organisations").fetchall()
if not orgs:
print("Aucune organisation trouvΓ©e dans la base de donnΓ©es.")
raise SystemExit(0)
total_built = 0
for org in orgs:
oid = org["id"]
name = org["name"]
rows = conn.execute(
"SELECT id, question, processus, procedure, intent, sub_intent, response "
"FROM entries WHERE org_id=? ORDER BY id", (oid,)
).fetchall()
if not rows:
print(f" [{name}] 0 entrΓ©es β€” ignorΓ©")
continue
row_ids = [r["id"] for r in rows]
texts = [build_index_text(r) for r in rows]
emb_file = os.path.join(EMB_DIR, f"org_{oid}.pt")
meta_file = os.path.join(EMB_DIR, f"org_{oid}_meta.json")
# Remove stale files
for f in [emb_file, meta_file]:
if os.path.exists(f):
os.remove(f)
print(f" [{name}] {len(texts)} questions β†’ calcul embeddings…", end=" ", flush=True)
t0 = time.time()
emb = model.encode(texts, convert_to_tensor=True, normalize_embeddings=True,
batch_size=64, show_progress_bar=False)
torch.save(emb, emb_file)
with open(meta_file, "w") as mf:
json.dump({"row_ids": row_ids, "org_id": oid, "org_name": name, "v": 4}, mf)
elapsed = time.time() - t0
print(f"βœ“ ({elapsed:.1f}s) β†’ org_{oid}.pt")
total_built += 1
conn.close()
print(f"\nβœ… {total_built} organisation(s) reconstruite(s). RedΓ©marrez Flask pour appliquer.")