Spaces:

Hakim18
/

FinalOrganisations

Running

App Files Files Community

FinalOrganisations / rebuild_embeddings.py

Hakim18

feat: React frontend + Docker deploy

6364501 verified about 1 month ago

Raw

History Blame Contribute Delete

4.31 kB

	"""
	Rebuild (pre-warm) embeddings for ALL organisations in platform.db.

	Run this after any bulk import or whenever you want to pre-compute
	embeddings so the first chatbot query is instant.

	Usage:
	python rebuild_embeddings.py
	python rebuild_embeddings.py --org-id 2 (single org)
	"""
	import argparse, json, os, sqlite3, time, re, unicodedata
	import torch
	from sentence_transformers import SentenceTransformer

	BASE_DIR = os.path.abspath(os.path.dirname(__file__))
	DB_PATH = os.path.join(BASE_DIR, "platform.db")
	EMB_DIR = os.path.join(BASE_DIR, "embeddings")
	os.makedirs(EMB_DIR, exist_ok=True)

	parser = argparse.ArgumentParser()
	parser.add_argument("--org-id", type=int, default=None, help="Rebuild only this org (default: all)")
	args = parser.parse_args()

	# ── Text helpers (must match app.py) ─────────────────────────────────────────
	def normalise(text: str) -> str:
	text = text.lower().strip()
	text = unicodedata.normalize("NFD", text)
	text = "".join(c for c in text if unicodedata.category(c) != "Mn")
	text = re.sub(r"[^\w\s]", " ", text)
	text = re.sub(r"\s+", " ", text).strip()
	return text

	def clean_question(q: str) -> str:
	return re.sub(r'\s+\d+\s*$', '', (q or '').strip())

	def build_index_text(row) -> str:
	q = clean_question(row["question"] or "")
	proc = (row["processus"] or "").strip()
	procd = (row["procedure"] or "").strip()
	parts = [q]
	if proc:
	parts.append(proc)
	if procd:
	parts.append(procd)
	return " \| ".join(parts)

	# ── Load model ───────────────────────────────────────────────────────────────
	print("🔄 Chargement du modèle…")
	try:
	model = SentenceTransformer(
	"OrdalieTech/Solon-embeddings-mini-beta-1.1",
	device="cpu", trust_remote_code=True
	)
	print("✓ Modèle principal (Solon) chargé")
	except Exception as e:
	print(f"⚠ Modèle principal échoué ({e}), bascule sur secours…")
	model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
	print("✓ Modèle de secours chargé")

	# ── DB helpers ────────────────────────────────────────────────────────────────
	conn = sqlite3.connect(DB_PATH)
	conn.row_factory = sqlite3.Row

	if args.org_id:
	orgs = conn.execute(
	"SELECT id, name FROM organisations WHERE id=?", (args.org_id,)
	).fetchall()
	else:
	orgs = conn.execute("SELECT id, name FROM organisations").fetchall()

	if not orgs:
	print("Aucune organisation trouvée dans la base de données.")
	raise SystemExit(0)

	total_built = 0

	for org in orgs:
	oid = org["id"]
	name = org["name"]

	rows = conn.execute(
	"SELECT id, question, processus, procedure, intent, sub_intent, response "
	"FROM entries WHERE org_id=? ORDER BY id", (oid,)
	).fetchall()

	if not rows:
	print(f" [{name}] 0 entrées — ignoré")
	continue

	row_ids = [r["id"] for r in rows]
	texts = [build_index_text(r) for r in rows]

	emb_file = os.path.join(EMB_DIR, f"org_{oid}.pt")
	meta_file = os.path.join(EMB_DIR, f"org_{oid}_meta.json")

	# Remove stale files
	for f in [emb_file, meta_file]:
	if os.path.exists(f):
	os.remove(f)

	print(f" [{name}] {len(texts)} questions → calcul embeddings…", end=" ", flush=True)
	t0 = time.time()
	emb = model.encode(texts, convert_to_tensor=True, normalize_embeddings=True,
	batch_size=64, show_progress_bar=False)
	torch.save(emb, emb_file)
	with open(meta_file, "w") as mf:
	json.dump({"row_ids": row_ids, "org_id": oid, "org_name": name, "v": 4}, mf)

	elapsed = time.time() - t0
	print(f"✓ ({elapsed:.1f}s) → org_{oid}.pt")
	total_built += 1

	conn.close()
	print(f"\n✅ {total_built} organisation(s) reconstruite(s). Redémarrez Flask pour appliquer.")