"""Ad-hoc diagnostic script for the ``entries`` table and one org's embeddings.

Prints three reports to stdout:

1. Entries whose response is flagged as suspicious (see the predicate below).
2. Entries whose response is shorter than 30 characters.
3. The best semantic matches for a fixed query against the pre-computed
   embeddings of one organisation.
"""

import json
import os
import sqlite3
from contextlib import closing

import torch
from sentence_transformers import SentenceTransformer, util

BASE_DIR = os.path.abspath(os.path.dirname(__file__))
DB_PATH = os.path.join(BASE_DIR, "platform.db")
EMB_DIR = os.path.join(BASE_DIR, "embeddings")

# Parameters of the semantic-search probe (previously repeated as literals).
ORG_ID = 2
QUERY = "wxx"
TOP_K = 5
MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

# sqlite3's own context manager only commits/rolls back; ``closing`` makes
# sure the connection is actually released, even if a report below raises.
with closing(sqlite3.connect(DB_PATH)) as conn:
    conn.row_factory = sqlite3.Row

    # Check for short/weird responses
    print("=== SUSPICIOUS ENTRIES (response < 20 chars or contains repetition) ===")
    rows = conn.execute(
        "SELECT id, org_id, question, response FROM entries ORDER BY id"
    ).fetchall()
    for r in rows:
        resp = r["response"] or ""
        # ``and`` binds tighter than ``or``; the parentheses only make the
        # original grouping explicit.
        # NOTE(review): the second clause skips responses that are an exact
        # doubled string, which looks at odds with the "contains repetition"
        # heading -- confirm the intended predicate before relying on it.
        if len(resp) < 20 or (
            resp != resp[: len(resp) // 2] * 2 and "vous" * 3 in resp.lower()
        ):
            print(f"[id={r['id']} org={r['org_id']}] Q: {r['question']}")
            print(f" R: {resp[:120]!r}")

    print()
    print("=== ALL SHORT RESPONSES ===")
    for r in rows:
        resp = r["response"] or ""
        if len(resp) < 30:
            print(f"[id={r['id']} org={r['org_id']}] Q: {r['question']} | R: {resp!r}")

    print()
    print(f"=== SEARCH: '{QUERY}' ===")
    model = SentenceTransformer(MODEL_NAME, device="cpu")
    emb_file = os.path.join(EMB_DIR, f"org_{ORG_ID}.pt")
    meta_file = os.path.join(EMB_DIR, f"org_{ORG_ID}_meta.json")
    emb_base = torch.load(emb_file, map_location="cpu", weights_only=True)
    # Explicit encoding: do not depend on the platform's default codec.
    with open(meta_file, encoding="utf-8") as f:
        meta = json.load(f)
    # row_ids[i] is used below as the ``entries.id`` for embedding row i.
    row_ids = meta["row_ids"]

    emb_q = model.encode(QUERY, convert_to_tensor=True, normalize_embeddings=True)
    scores = util.pytorch_cos_sim(emb_q, emb_base)[0]
    # torch.topk raises if k exceeds the number of scores, so clamp it for
    # organisations that have fewer than TOP_K embedded entries.
    k = min(TOP_K, scores.numel())
    top5 = torch.topk(scores, k).indices.tolist()

    print(f"Top {TOP_K} matches for '{QUERY}':")
    for idx in top5:
        eid = row_ids[idx]
        r = conn.execute(
            "SELECT question, response FROM entries WHERE id=?", (eid,)
        ).fetchone()
        pct = float(scores[idx]) * 100
        if r is None:
            # The metadata can reference an entry that is no longer in the
            # database; report it instead of crashing on ``None[...]``.
            print(f" [{pct:.1f}%] id={eid} | <no matching row in entries>")
            continue
        print(f" [{pct:.1f}%] id={eid} | Q: {r['question']}")
        print(f" R: {(r['response'] or '')[:100]!r}")