Spaces:
Running
Running
File size: 1,716 Bytes
6364501 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
# Files: input dataset and output embeddings.
CSV_DATA = "dataset_2026.csv"
EMB_FILE = "embeddings_questions.pt"

# Stable model that supports Arabic / French / English.
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(MODEL_NAME)
print("📥 Chargement du dataset...")
# Despite the .csv file name, the Excel reader is tried first; if it fails for
# any reason the file is parsed as CSV instead.
try:
    df = pd.read_excel(CSV_DATA, engine="openpyxl")
except Exception:
    # CSV fallback: sep=None asks the python engine to detect the delimiter,
    # and on_bad_lines="skip" drops malformed rows instead of raising.
    df = pd.read_csv(
        CSV_DATA,
        sep=None,
        engine="python",
        encoding="utf-8",
        on_bad_lines="skip",
    )

# Header cleanup: remove any byte-order mark, then trim surrounding whitespace.
headers_without_bom = df.columns.str.replace('\ufeff', '', regex=True)
df.columns = headers_without_bom.str.strip()
print("📊 Colonnes détectées :", df.columns.tolist())

# Stop at the first required column that is absent from the dataset.
required_cols = ["Intent", "SubIntent", "Question"]
for col in required_cols:
    if col in df.columns:
        continue
    raise ValueError(f"❌ Column '{col}' not found. Found: {df.columns}")
print("🧠 Construction des phrases enrichies...")
# Each sentence to embed is "<Intent> <SubIntent> <Question>".
# astype(str) renders a missing cell (NaN/None) as the literal word "nan",
# which would then be embedded as if it were real text. Missing cells are
# therefore blanked after the cast; cells that are present are left exactly
# as before. No row is dropped, so texts[i] still lines up with df row i.
text_parts = {
    name: df[name].astype(str).where(df[name].notna(), "")
    for name in ("Intent", "SubIntent", "Question")
}
texts = (
    text_parts["Intent"] + " " +
    text_parts["SubIntent"] + " " +
    text_parts["Question"]
).tolist()
print(f"✅ {len(texts)} entrées chargées")
print("🧠 Calcul des embeddings...")
# Encode in batches to avoid memory problems and speed the process up.
encode_options = {
    "batch_size": 32,  # can be lowered if RAM is limited
    "show_progress_bar": True,
    "convert_to_tensor": True,
    "normalize_embeddings": True,
}
embeddings = model.encode(texts, **encode_options)

print("💾 Sauvegarde des embeddings...")
# Save the embeddings tensor to EMB_FILE.
torch.save(embeddings, EMB_FILE)
print("✅ Terminé :", embeddings.shape)
|