Spaces:
Running
Running
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer

# Build-time script: reads the question dataset, turns each row into one
# "Intent SubIntent Question" sentence, encodes all sentences with a
# multilingual sentence-transformers model and saves the resulting tensor.

# Files
CSV_DATA = "dataset_2026.csv"
EMB_FILE = "embeddings_questions.pt"

print("📥 Chargement du dataset...")

try:
    # The file is tried as an Excel workbook first, despite its .csv name.
    df = pd.read_excel(CSV_DATA, engine="openpyxl")
except Exception as excel_error:
    # Best-effort fallback kept broad on purpose, but no longer silent:
    # say why the Excel reader was abandoned before trying the CSV reader.
    print(f"⚠️ Lecture Excel impossible ({type(excel_error).__name__}) — lecture en CSV...")
    # sep=None lets the python engine sniff the delimiter; malformed lines
    # are skipped rather than aborting the load.
    df = pd.read_csv(
        CSV_DATA,
        sep=None,
        engine="python",
        encoding="utf-8",
        on_bad_lines="skip",
    )

# Column clean-up: drop a stray BOM and surrounding spaces.
# astype(str) keeps the .str accessor usable when a header is not a string;
# regex=False because the BOM is a literal character, not a pattern.
df.columns = (
    df.columns.astype(str)
    .str.replace("\ufeff", "", regex=False)
    .str.strip()
)

print("📊 Colonnes détectées :", df.columns.tolist())

required_cols = ["Intent", "SubIntent", "Question"]

# Report every missing column at once instead of stopping at the first one.
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise ValueError(
        f"❌ Column(s) {missing_cols} not found. Found: {df.columns.tolist()}"
    )

print("🧠 Construction des phrases enrichies...")

# Missing cells must not be embedded as the literal text "nan": blank them
# out, then join only the non-empty parts. One text is still produced per
# row, so embedding index i keeps matching dataframe row i.
selected = df[required_cols]
parts = selected.astype(str).where(selected.notna(), "")
texts = [
    " ".join(piece for piece in row if piece)
    for row in parts.itertuples(index=False, name=None)
]

print(f"✅ {len(texts)} entrées chargées")

# Stable model supporting Arabic / French / English. Loaded only after the
# dataset has been validated, so a bad input file fails before the model
# is loaded.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

print("🧠 Calcul des embeddings...")

# Batching avoids memory problems and speeds up encoding.
embeddings = model.encode(
    texts,
    batch_size=32,  # lower this if RAM is limited
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
)

print("💾 Sauvegarde des embeddings...")

# Save the embeddings.
# NOTE(review): the tensor is saved on whatever device encode() returned it
# on; if that can be a GPU, loaders on CPU-only hosts presumably need
# torch.load(..., map_location="cpu") — confirm against the consumer.
torch.save(embeddings, EMB_FILE)

print("✅ Terminé :", embeddings.shape)