import sqlite3, os, torch, json
from sentence_transformers import SentenceTransformer, util

# Anchor every artefact path to the directory containing this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
EMB_DIR = os.path.join(BASE_DIR, "embeddings")
DB_PATH = os.path.join(BASE_DIR, "platform.db")

# sqlite3.Row lets the code below read columns by name (row["response"]).
conn = sqlite3.connect(DB_PATH)
conn.row_factory = sqlite3.Row

# Flag responses that are very short or that look like degenerate repetition.
print("=== SUSPICIOUS ENTRIES (response < 20 chars or contains repetition) ===")
rows = conn.execute("SELECT id, org_id, question, response FROM entries ORDER BY id").fetchall()
for r in rows:
    resp = (r["response"] or "")  # a NULL response is treated as empty text
    # Exact doubling: the second half of the text is a copy of the first half.
    is_doubled = resp == resp[:len(resp) // 2] * 2
    # Stutter: "vous" three times in a row, compared case-insensitively.
    has_stutter = "vous" * 3 in resp.lower()
    # Each signal is sufficient on its own; chaining them with `and` would
    # hide entries that match only one of the repetition patterns.
    if len(resp) < 20 or is_doubled or has_stutter:
        print(f"[id={r['id']} org={r['org_id']}] Q: {r['question']}")
        print(f"  R: {repr(resp[:120])}")

print()
print("=== ALL SHORT RESPONSES ===")
# Second pass over the already-fetched rows: list every response shorter
# than 30 characters (a NULL response counts as empty).
for entry in rows:
    text = entry["response"] or ""
    if len(text) >= 30:
        continue
    print(f"[id={entry['id']} org={entry['org_id']}] Q: {entry['question']} | R: {text!r}")

# Semantic search of one probe query against the stored embeddings of org 2.
query = "wxx"

print()
print(f"=== SEARCH: {query!r} ===")
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device="cpu")
emb_file = os.path.join(EMB_DIR, "org_2.pt")
meta_file = os.path.join(EMB_DIR, "org_2_meta.json")
# weights_only=True restricts unpickling to tensor data.
emb_base = torch.load(emb_file, map_location="cpu", weights_only=True)
# JSON is read as UTF-8 explicitly so the result does not depend on the locale.
with open(meta_file, encoding="utf-8") as f:
    meta = json.load(f)
# row_ids[i] is used below as the entries.id for the i-th embedding.
row_ids = meta["row_ids"]

emb_q = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)
scores = util.pytorch_cos_sim(emb_q, emb_base)[0]
# torch.topk raises if k exceeds the number of candidates, so clamp it.
k = min(5, scores.numel())
top_hits = torch.topk(scores, k).indices.tolist()

print(f"Top {k} matches for {query!r}:")
for idx in top_hits:
    eid = row_ids[idx]
    pct = float(scores[idx]) * 100
    r = conn.execute("SELECT question, response FROM entries WHERE id=?", (eid,)).fetchone()
    if r is None:
        # The embedding refers to an id that has no row in `entries`;
        # report it instead of failing on the missing row.
        print(f"  [{pct:.1f}%] id={eid} | (no matching row in entries)")
        continue
    print(f"  [{pct:.1f}%] id={eid} | Q: {r['question']}")
    print(f"    R: {repr((r['response'] or '')[:100])}")