# database/qdrant.py
"""Qdrant helpers: collection setup, vector storage and similarity search."""

import hashlib
import json

from qdrant_client import QdrantClient
from qdrant_client.http import models

from utils.config import get_qdrant_api_key, get_qdrant_uri
from api.openai import generate_embeddings

# Single source of truth for the collection every helper below reads/writes.
DEFAULT_COLLECTION_NAME = "podcast_vectors"


def get_qdrant_client():
    """
    Get a Qdrant client instance

    The URL and API key are read from the project configuration helpers on
    every call; a new client object is built each time.

    Returns:
        QdrantClient: Qdrant client
    """
    uri = get_qdrant_uri()
    api_key = get_qdrant_api_key()
    return QdrantClient(url=uri, api_key=api_key)


def create_collection_if_not_exists(collection_name=DEFAULT_COLLECTION_NAME, vector_size=1536):
    """
    Create a Qdrant collection if it doesn't exist

    The collection is created with cosine distance. A message is printed
    stating whether the collection was created or was already present.

    Args:
        collection_name: Name of the collection
        vector_size: Size of the vectors
    """
    client = get_qdrant_client()

    # Check if collection exists
    collections = client.get_collections()
    collection_names = [c.name for c in collections.collections]

    if collection_name not in collection_names:
        # Create the collection
        client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE,
            ),
        )
        print(f"Created collection: {collection_name}")
    else:
        print(f"Collection {collection_name} already exists")


def store_vectors(podcast_data, document_id):
    """
    Store podcast vectors in Qdrant for semantic search

    Embeds ``podcast_data["summary"]`` and upserts a single point whose
    payload is the JSON-serializable copy of ``podcast_data``.

    Args:
        podcast_data: Dictionary containing the podcast data; must have a
            "summary" key holding the text to embed
        document_id: MongoDB document ID as a reference; used to derive the
            Qdrant point ID

    Returns:
        bool: True if successful, False if any step raised (the error is
        printed and swallowed so callers can continue without vector storage)
    """
    try:
        # Create collection if it doesn't exist
        create_collection_if_not_exists()

        # Generate embedding for the summary text
        summary_text = podcast_data["summary"]
        embedding = generate_embeddings(summary_text)

        # Convert MongoDB document to a JSON-serializable dict
        # (any MongoDB ObjectId becomes a string)
        cleaned_data = json_serialize_podcast_data(podcast_data)

        # Derive the numeric point ID deterministically. The built-in hash()
        # is salted per process for strings, so it would give the same
        # document a different ID on every run and upsert would create
        # duplicates instead of overwriting the existing point.
        point_id = generate_consistent_id(document_id)

        # Store in Qdrant
        client = get_qdrant_client()
        client.upsert(
            collection_name=DEFAULT_COLLECTION_NAME,
            points=[
                models.PointStruct(
                    id=point_id,  # Use a numeric ID
                    vector=embedding,
                    payload=cleaned_data,  # Use cleaned data
                )
            ],
        )

        return True
    except Exception as e:
        print(f"Error storing vectors in Qdrant: {e}")
        # Continue without vector storage for testing
        return False


def json_serialize_podcast_data(podcast_data):
    """
    Convert MongoDB document with ObjectId to JSON-serializable dict

    The data is round-tripped through JSON, so the result is a new structure
    and the input is not modified. Values that are neither natively JSON
    serializable nor ObjectId instances raise TypeError.

    Args:
        podcast_data: Dictionary that may contain ObjectId

    Returns:
        dict: JSON-serializable dictionary
    """
    # Imported lazily so bson is only needed when this function is called.
    from bson.objectid import ObjectId

    # Define a custom JSON encoder to handle ObjectId
    class MongoJSONEncoder(json.JSONEncoder):
        def default(self, obj):
            if isinstance(obj, ObjectId):
                return str(obj)
            return super().default(obj)

    # Convert to JSON and back to dict to ensure all values are serializable
    json_str = json.dumps(podcast_data, cls=MongoJSONEncoder)
    return json.loads(json_str)


def generate_consistent_id(document_id):
    """
    Generate a consistent numeric ID from a string ID

    The same input always yields the same ID, across processes and runs.

    Args:
        document_id: String ID (any other object is converted with str())

    Returns:
        int: Consistent numeric ID in the range [0, 2**64)
    """
    # Convert document_id to string if it's not already
    doc_id_str = str(document_id)

    # Use MD5 to get a consistent hash (used for stability, not security)
    hash_object = hashlib.md5(doc_id_str.encode())

    # Convert first 8 bytes of hash to integer
    numeric_id = int.from_bytes(hash_object.digest()[:8], byteorder='big')

    return numeric_id


def search_similar_content(query_text, limit=3):
    """
    Search for podcast content similar to the query

    Args:
        query_text: Query text to search for
        limit: Maximum number of results to return

    Returns:
        list: List of search results as returned by the Qdrant client, or an
        empty list if embedding or searching raised (the error is printed)
    """
    try:
        # Generate embedding for the query
        query_embedding = generate_embeddings(query_text)

        # Search Qdrant
        # NOTE(review): newer qdrant-client releases appear to deprecate
        # `search` in favour of `query_points` - confirm against the
        # installed version before upgrading.
        client = get_qdrant_client()
        search_results = client.search(
            collection_name=DEFAULT_COLLECTION_NAME,
            query_vector=query_embedding,
            limit=limit,
        )

        return search_results
    except Exception as e:
        print(f"Error searching vectors: {e}")
        return []