"""Build and save sentence embeddings for the questions dataset.

Steps performed when the script runs:

1. Load the dataset (tried as an Excel workbook first, then as CSV).
2. Clean the column names and check that the required columns exist.
3. Concatenate Intent / SubIntent / Question into one text per row.
4. Encode the texts with a multilingual SentenceTransformer model.
5. Save the resulting tensor with ``torch.save``.
"""

import pandas as pd
import torch
from sentence_transformers import SentenceTransformer

# Files: input dataset and output embeddings.
CSV_DATA = "dataset_2026.csv"
EMB_FILE = "embeddings_questions.pt"


def _load_dataset(path):
    """Read the dataset at *path*, trying Excel first and CSV second.

    Args:
        path: Location of the dataset file.

    Returns:
        A ``pandas.DataFrame`` with the file's contents.

    Raises:
        FileNotFoundError: If *path* does not exist.
        Exception: Whatever ``pandas.read_csv`` raises when the CSV
            fallback also fails.
    """
    try:
        return pd.read_excel(path, engine="openpyxl")
    except FileNotFoundError:
        # A missing file cannot be rescued by the CSV fallback; fail right away.
        raise
    except Exception as exc:
        # Deliberate best-effort fallback: the file may genuinely be a CSV.
        # Report why the Excel attempt failed instead of hiding it.
        print(f"ℹ️ Lecture Excel impossible ({type(exc).__name__}), lecture en CSV...")
        return pd.read_csv(
            path,
            sep=None,  # let the python engine sniff the delimiter
            engine="python",
            encoding="utf-8",
            # "warn" still skips malformed rows but reports them, so rows are
            # not lost silently.
            on_bad_lines="warn",
        )


def _build_texts(frame, columns):
    """Join the given columns into one space-separated text per row.

    Missing values are treated as empty (a plain ``astype(str)`` would turn
    them into the literal string "nan") and empty parts are left out of the
    join. One text is produced per row, in row order, so the number and order
    of texts always match the rows of *frame*.

    Args:
        frame: DataFrame containing every column listed in *columns*.
        columns: Column names to concatenate, in order.

    Returns:
        A list of strings, one per row of *frame*.
    """
    cleaned = []
    for name in columns:
        series = frame[name]
        # Convert to text first, then blank out the cells that were missing.
        cleaned.append(series.astype(str).where(series.notna(), "").str.strip())
    return [" ".join(part for part in parts if part) for parts in zip(*cleaned)]


print("📥 Chargement du dataset...")
df = _load_dataset(CSV_DATA)

# Column clean-up: remove a stray BOM and surrounding whitespace. Headers are
# coerced to str first because the .str accessor rejects non-string labels.
df.columns = (
    df.columns.astype(str).str.replace("\ufeff", "", regex=False).str.strip()
)

print("📊 Colonnes détectées :", df.columns.tolist())

required_cols = ["Intent", "SubIntent", "Question"]

for col in required_cols:
    if col not in df.columns:
        raise ValueError(
            f"❌ Column '{col}' not found. Found: {df.columns.tolist()}"
        )

print("🧠 Construction des phrases enrichies...")

texts = _build_texts(df, required_cols)

if not texts:
    # Nothing to encode: stop with a clear message rather than failing
    # somewhere inside the encoder.
    raise ValueError("❌ Dataset is empty: nothing to embed.")

print(f"✅ {len(texts)} entrées chargées")

# Stable model supporting Arabic/French/English (per the original author's
# note). Loaded only after the dataset has been validated, so an invalid
# dataset fails before the model is loaded.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

print("🧠 Calcul des embeddings...")

# Batching to avoid memory problems and speed up the process.
embeddings = model.encode(
    texts,
    batch_size=32,  # can be lowered if RAM is limited
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

print("💾 Sauvegarde des embeddings...")

# Save the embeddings tensor.
torch.save(embeddings, EMB_FILE)

print("✅ Terminé :", embeddings.shape)