FinalOrganisations / embeddings.py
Hakim18's picture
feat: React frontend + Docker deploy
6364501 verified
Raw
History Blame Contribute Delete
1.72 kB
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
# Input / output files.
# NOTE(review): CSV_DATA is opened with pd.read_excel first and only then with
# pd.read_csv, so the ".csv" file may really be an Excel workbook — confirm.
CSV_DATA = "dataset_2026.csv"
# Destination path handed to torch.save for the computed embeddings.
EMB_FILE = "embeddings_questions.pt"
# ✅ Stable model that supports Arabic / French / English (per the original note).
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Load the dataset into `df`.
# The file is first tried as an Excel workbook (openpyxl); if that fails it is
# re-read as delimited text with the separator auto-detected.
print("📥 Chargement du dataset...")
try:
    df = pd.read_excel(CSV_DATA, engine="openpyxl")
except FileNotFoundError:
    # The CSV fallback cannot rescue a missing file; surface the original
    # error instead of a confusing chained traceback.
    raise
except Exception as excel_error:
    # Deliberately broad: openpyxl signals "not a workbook" with several
    # unrelated exception types. Report the reason so that a real problem
    # (e.g. openpyxl not installed) is not hidden by the fallback.
    print(
        "⚠️ Lecture Excel impossible "
        f"({type(excel_error).__name__}: {excel_error}) — lecture en CSV..."
    )
    # Fallback to CSV if it's genuinely a CSV.
    # NOTE: on_bad_lines="skip" silently drops malformed rows.
    df = pd.read_csv(
        CSV_DATA,
        sep=None,
        engine="python",
        encoding="utf-8",
        on_bad_lines="skip",
    )

# ✅ Header cleanup (strip BOM + surrounding whitespace).
# astype(str) keeps the .str accessor from raising on non-string headers
# (e.g. numeric column labels); regex=False because "\ufeff" is a literal.
df.columns = (
    df.columns.astype(str)
    .str.replace("\ufeff", "", regex=False)
    .str.strip()
)
print("📊 Colonnes détectées :", df.columns.tolist())
# Fail fast when the dataset lacks any column the embedding text is built from.
# All missing columns are reported together, and the available columns are
# shown as a plain list rather than as a pandas Index repr.
required_cols = ["Intent", "SubIntent", "Question"]
missing_cols = [name for name in required_cols if name not in df.columns]
if missing_cols:
    label = "Column" if len(missing_cols) == 1 else "Columns"
    quoted = ", ".join(f"'{name}'" for name in missing_cols)
    raise ValueError(
        f"❌ {label} {quoted} not found. Found: {df.columns.tolist()}"
    )
print("🧠 Construction des phrases enrichies...")


def _column_text(name):
    """Return column *name* of ``df`` as strings, with missing cells as ""."""
    column = df[name]
    # A bare astype(str) would turn NaN/None into the literal word "nan",
    # which would then be embedded as if it were real text.
    return column.astype(str).where(column.notna(), "")


# One enriched sentence per dataset row: "<Intent> <SubIntent> <Question>".
# No row is dropped, so texts[i] still corresponds to the i-th row of df.
texts = (
    _column_text("Intent") + " " +
    _column_text("SubIntent") + " " +
    _column_text("Question")
).str.strip().tolist()
print(f"✅ {len(texts)} entrées chargées")
print("🧠 Calcul des embeddings...")
# ✅ Encode in batches to limit memory use and speed up the run.
embeddings = model.encode(
    texts,
    batch_size=32,  # lower this if RAM is limited
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
print("💾 Sauvegarde des embeddings...")
# ✅ Save the embeddings.
# With convert_to_tensor=True the result lives on the model's device. Saving a
# GPU tensor as-is produces a file that cannot be loaded on a CPU-only host
# without map_location, so a CPU copy is written (a no-op when already on CPU).
torch.save(embeddings.cpu(), EMB_FILE)
print("✅ Terminé :", embeddings.shape)