Movie-Recommendation-Bot

Sleeping

App Files Files Community

kanad13 committed on Aug 12, 2024

Commit

35e476e

verified ·

1 Parent(s): a965c00

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -120

app.py CHANGED Viewed

@@ -1,189 +1,144 @@
-# Import necessary libraries
-import gradio as gr  # Gradio is used to create a web interface to interact with the model.
-import pandas as pd  # Pandas is used for data manipulation and analysis.
-from datasets import load_dataset  # This function loads datasets from Hugging Face.
-from sentence_transformers import SentenceTransformer  # Used for generating text embeddings.
-from transformers import AutoTokenizer, AutoModelForCausalLM  # Transformers are used for natural language processing tasks.
-import pymongo  # Pymongo is used to interact with MongoDB.
-import os  # Used for accessing environment variables.
-# Load Dataset from Hugging Face and convert it to a pandas DataFrame
-# The dataset contains movie information; only the first 80% of the train split is loaded.
-dataset = load_dataset("MongoDB/embedded_movies", split='train[:80%]') # AIatMongoDB/embedded_movies
 dataset_df = pd.DataFrame(dataset)
-# Remove rows where the 'fullplot' column is empty
-# It's crucial to ensure that every movie entry has a complete plot description.
-# The 'fullplot' column is necessary for generating embeddings, which are numerical representations of the text.
-# If this data is missing, the embeddings will be incomplete or nonsensical, reducing the accuracy of recommendations.
 dataset_df = dataset_df.dropna(subset=["fullplot"])
-# Drop the 'plot_embedding' column as we will generate new embeddings
-# We drop the existing 'plot_embedding' column to create new embeddings using a different, potentially more effective model.
-# This step ensures consistency and accuracy in the embeddings used for similarity searches.
 dataset_df = dataset_df.drop(columns=["plot_embedding"])
-# Load a pre-trained embedding model
-# We use a pre-trained model from Sentence Transformers to convert movie plots into numerical embeddings.
-# These embeddings capture the semantic content of the plots, allowing us to perform efficient and meaningful similarity searches.
-embedding_model = SentenceTransformer("thenlper/gte-large")
-# Define a function to generate embeddings for a given text
-# Embeddings are numerical representations of text that capture its semantic meaning.
-# This function checks if the text is not empty and then generates an embedding using the loaded model.
 def get_embedding(text: str) -> list:
-    if not text.strip():  # Check if the text is not empty
-        # If the text is empty, return an empty list as it does not make sense to generate embeddings for empty text.
-        # This ensures that we avoid errors and meaningless embeddings.
         print("Attempted to get embedding for empty text.")
         return []
-    embedding = embedding_model.encode(text)  # Generate the embedding
-    return embedding.tolist()  # Convert embedding to a list for storage and manipulation
-# Apply the embedding function to the 'fullplot' column in the DataFrame
-# This step generates embeddings for each movie plot in the dataset, storing them in the DataFrame for later use in similarity searches.
-dataset_df["embedding"] = dataset_df["fullplot"].apply(get_embedding)
-# Function to connect to MongoDB
-# MongoDB is a NoSQL database used to store and retrieve large datasets efficiently.
-# This function attempts to create a MongoDB client to connect to the database.
 def get_mongo_client(mongo_uri):
     try:
-        client = pymongo.MongoClient(mongo_uri)  # Create a MongoDB client
         print("Connection to MongoDB successful")
         return client
     except pymongo.errors.ConnectionFailure as e:
-        # Handle potential connection failures to provide feedback in case of issues with the MongoDB URI or network problems.
         print(f"Connection failed: {e}")
         return None
-# Get the MongoDB URI from environment variables
-# The MongoDB URI is required to connect to the database. It should be stored securely in environment variables to protect sensitive information.
 mongo_uri = os.getenv("MONGO_URI")
 if not mongo_uri:
     print("MONGO_URI not set in environment variables")
-# Connect to MongoDB using the URI
-# The client connects to the 'movies' database and accesses the 'movie_collection_2' collection.
-# This collection will store the movie data with their respective embeddings.
 mongo_client = get_mongo_client(mongo_uri)
-db = mongo_client["movies"]  # Access the 'movies' database
-collection = db["movie_collection_2"]  # Access the 'movie_collection_2' collection
-# Clear the collection and insert the new data
-# Clearing the collection to avoid duplication of records and ensure we start with a fresh set of data.
-# This step ensures that the collection only contains the most recent data with newly generated embeddings.
-collection.delete_many({})  # Delete any existing records in the collection
-documents = dataset_df.to_dict("records")  # Convert DataFrame to list of dictionaries
-collection.insert_many(documents)  # Insert documents into the collection
 print("Data ingestion into MongoDB completed")
-# Function to perform a vector search on the user query
-# This function generates an embedding for the user's query and uses it to search for similar movie plots in the MongoDB collection.
-# Vector search allows us to find movies with plots that are semantically similar to the query.
 def vector_search(user_query, collection):
-    query_embedding = get_embedding(user_query)  # Generate embedding for the user query
     if query_embedding is None:
-        # Return an error message if the embedding generation fails, ensuring graceful handling of invalid queries.
         return "Invalid query or embedding generation failed."
-    # Define the MongoDB aggregation pipeline for vector search
-    # This pipeline uses the generated query embedding to search for similar embeddings in the collection.
     pipeline = [
         {
             "$vectorSearch": {
-                "index": "vector_index",  # Name of the vector index
-                "queryVector": query_embedding,  # Embedding of the user query
-                "path": "embedding",  # Path to the embedding field in the documents
-                "numCandidates": 150,  # Number of candidate matches to consider for broad retrieval
-                "limit": 4,  # Return top 4 matches to keep results concise and relevant
             }
         },
         {
             "$project": {
-                "_id": 0,  # Exclude the '_id' field from the results for cleaner output
-                "fullplot": 1,  # Include the 'fullplot' field in the results for detailed descriptions
-                "title": 1,  # Include the 'title' field in the results to identify movies
-                "genres": 1,  # Include the 'genres' field in the results for additional context
-                "score": {"$meta": "vectorSearchScore"},  # Include the search score to assess relevance
             }
         },
     ]
-    results = collection.aggregate(pipeline)  # Execute the aggregation pipeline
-    return list(results)  # Return the results as a list
-# Function to format search results
-# This function formats the search results into a user-friendly format, making it easier for users to read and understand the recommendations.
 def get_search_result(query):
-    get_knowledge = vector_search(query, collection)  # Perform vector search on the query
     search_result = ""
-    for result in get_knowledge:  # Iterate through search results
-        # Format the search results to be user-friendly, including only the first 200 characters of the plot for brevity.
-        search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:200]}...\n\n"
     return search_result
-# Load a pre-trained language model for generating responses
-# Using GPT-2 to generate human-like responses based on the search results.
-# The tokenizer converts text to a format that the model can understand, and the model generates responses.
-tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
-model = AutoModelForCausalLM.from_pretrained("gpt2-medium")
-# Function to generate a response based on the user's query
-# This function combines the search results with the user's query and generates a response using the GPT-2 model.
 def generate_response(query):
-    source_information = get_search_result(query)  # Get search results for the query
-    combined_information = (
-        f"Answer the question '{query}' based on these movie details:\n\n{source_information}"
-    )
-    # Prepare input for the language model
-    # Ensures the input does not exceed the model's maximum token capacity.
-    max_length = tokenizer.model_max_length  # Get the maximum token length to ensure input does not exceed model capacity
     input_ids = tokenizer(combined_information, return_tensors="pt", max_length=max_length, truncation=True)
     try:
         response = model.generate(
             **input_ids,
-            max_new_tokens=150,  # Limit the number of tokens to generate to control response length
-            num_return_sequences=1,  # Generate a single response sequence
-            no_repeat_ngram_size=2,  # Avoid repeating n-grams to improve response quality
-            top_k=50,  # Use top-k sampling for diversity in responses
-            top_p=0.95,  # Use nucleus sampling to focus on high-probability words
-            temperature=0.7,  # Control the randomness of predictions to balance between creativity and coherence
-            do_sample=True  # Enable sampling
         )
-        return tokenizer.decode(response[0], skip_special_tokens=True)  # Decode and return the response
     except Exception as e:
-        # Handle potential errors during generation and provide a meaningful error message.
         return f"An error occurred: {str(e)}"
-# Function to handle user queries and generate responses
-# This function ties together the query handling and response generation processes.
 def query_movie_db(user_query):
     return generate_response(user_query)
-# Create the Gradio interface
-# Gradio provides a simple interface to interact with the model, allowing users to enter queries and receive responses.
-import gradio as gr
 description_and_article = """
 Ask this bot to recommend you a movie.
 Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
 Note that the bot truncates replies due to token limitations in the free tier of Hugging Face resources.
-This is not a coding issue but a result of operating within the token limitations of the free tier of Hugging Face resources.
-To enhance response quality, better models and more resources could be used, but these come with higher costs, which I want to avoid as this is a hobby project.
 """
 iface = gr.Interface(
-    fn=query_movie_db,  # Function to handle user queries
-    inputs=gr.Textbox(lines=2, placeholder="Enter your movie query here..."),  # Textbox input for user queries
-    outputs="text",  # Text output for responses
-    title="Movie Recommendation Bot",  # Title of the interface
-    description=description_and_article,  # Combined description and article content
-    examples=[["Suggest me a scary movie?"], ["What action movie can I watch?"]]  # Example queries
 )
-# Launch the interface
 if __name__ == "__main__":
-    iface.launch()  # This launches the Gradio interface.

+import gradio as gr
+import pandas as pd
+from datasets import load_dataset
+from sentence_transformers import SentenceTransformer
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import pymongo
+import os
+# Load a smaller portion of the dataset
+dataset = load_dataset("MongoDB/embedded_movies", split='train[:70%]')
 dataset_df = pd.DataFrame(dataset)
+# Data cleaning and preprocessing
 dataset_df = dataset_df.dropna(subset=["fullplot"])
 dataset_df = dataset_df.drop(columns=["plot_embedding"])
+# Load a smaller embedding model
+embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
 def get_embedding(text: str) -> list:
     """Return the embedding of ``text`` as a plain Python list.

     Empty, whitespace-only, or non-string input (for example ``None``) is
     never sent to the model: a notice is printed and ``[]`` is returned, so
     callers can test the result for truthiness.
     """
     # The isinstance check keeps None/NaN from raising AttributeError on .strip().
     if not isinstance(text, str) or not text.strip():
         print("Attempted to get embedding for empty text.")
         return []
     # tolist() turns the encoder output into a list that can be stored in a
     # document or passed as a query vector.
     return embedding_model.encode(text).tolist()
+# Process embeddings in batches
+batch_size = 100
+for i in range(0, len(dataset_df), batch_size):
+    batch = dataset_df.iloc[i:i+batch_size]
+    batch["embedding"] = batch["fullplot"].apply(get_embedding)
+    dataset_df.iloc[i:i+batch_size] = batch
 def get_mongo_client(mongo_uri):
     """Create a MongoDB client for ``mongo_uri`` and verify it can reach the server.

     Returns the connected ``pymongo.MongoClient``, or ``None`` when the URI is
     rejected or the server cannot be reached (the reason is printed).
     """
     client = None
     try:
         client = pymongo.MongoClient(mongo_uri)
         # MongoClient() connects lazily, so constructing it proves nothing.
         # A ping forces a round trip and surfaces connection problems here.
         client.admin.command("ping")
     except (pymongo.errors.ConnectionFailure, pymongo.errors.ConfigurationError) as e:
         print(f"Connection failed: {e}")
         if client is not None:
             # Release the background resources of the unusable client.
             client.close()
         return None
     print("Connection to MongoDB successful")
     return client
 # The connection string is read from the environment rather than hard-coded.
 mongo_uri = os.getenv("MONGO_URI")
 if not mongo_uri:
     print("MONGO_URI not set in environment variables")
 mongo_client = get_mongo_client(mongo_uri)
 if mongo_client is None:
     # Fail with an explicit message instead of the opaque TypeError that
     # subscripting None would raise on the next statement.
     raise RuntimeError("Could not connect to MongoDB; check the MONGO_URI environment variable")
 db = mongo_client["movies"]
 collection = db["movie_collection_2"]
 # Replace the collection contents with the freshly embedded records.
 collection.delete_many({})
 documents = dataset_df.to_dict("records")
 if documents:
     # insert_many() rejects an empty list, so skip it when nothing survived cleaning.
     collection.insert_many(documents)
 print("Data ingestion into MongoDB completed")
 def vector_search(user_query, collection):
     """Return the stored movies whose plot embeddings are closest to ``user_query``.

     Args:
         user_query: Free-text query; it is embedded with ``get_embedding``.
         collection: Collection object on which the aggregation pipeline is run.

     Returns:
         A list of result documents with ``title``, ``genres``, ``fullplot`` and
         ``score`` (at most 3). The list is empty when the query yields no
         embedding, so callers can always iterate over the result.
     """
     query_embedding = get_embedding(user_query)
     # get_embedding() signals failure with an empty list, never None, so test
     # truthiness; an empty query vector must not be sent to $vectorSearch.
     if not query_embedding:
         return []
     pipeline = [
         {
             "$vectorSearch": {
                 "index": "vector_index",
                 "queryVector": query_embedding,
                 "path": "embedding",
                 "numCandidates": 100,
                 "limit": 3,
             }
         },
         {
             "$project": {
                 "_id": 0,
                 "fullplot": 1,
                 "title": 1,
                 "genres": 1,
                 "score": {"$meta": "vectorSearchScore"},
             }
         },
     ]
     # aggregate() returns a cursor; materialize it so callers get a plain list.
     return list(collection.aggregate(pipeline))
 def get_search_result(query):
     """Format the vector-search matches for ``query`` as readable text.

     Each match contributes a ``Title`` / ``Genres`` / ``Plot`` entry, with the
     plot cut to its first 150 characters and followed by ``...``. Returns an
     empty string when there are no matches.
     """
     matches = vector_search(query, collection)
     entries = []
     for match in matches:
         # A field stored as null bypasses the .get() default, so fall back
         # explicitly for missing, null, or empty values.
         genres = match.get("genres") or ["N/A"]
         plot = match.get("fullplot") or "N/A"
         entries.append(
             f"Title: {match.get('title', 'N/A')}\n"
             f"Genres: {', '.join(genres)}\n"
             f"Plot: {plot[:150]}...\n\n"
         )
     return "".join(entries)
+# Lazy loading of the language model
+model = None
+tokenizer = None
+def load_language_model():
+    global model, tokenizer
+    if model is None or tokenizer is None:
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        model = AutoModelForCausalLM.from_pretrained("gpt2")
 def generate_response(query):
     """Generate a text answer to ``query`` from the retrieved movie details.

     The retrieved movies are placed in a prompt, which the language model then
     continues. The decoded output (prompt plus continuation) is returned; if
     generation raises, an ``"An error occurred: ..."`` message is returned
     instead.
     """
     load_language_model()
     source_information = get_search_result(query)
     combined_information = f"Answer the question '{query}' based on these movie details:\n\n{source_information}"
     max_new_tokens = 100
     # Leave room for the generated tokens: truncating the prompt to the full
     # context window would push prompt + continuation past the model's limit.
     max_length = max(1, tokenizer.model_max_length - max_new_tokens)
     model_inputs = tokenizer(combined_information, return_tensors="pt", max_length=max_length, truncation=True)
     try:
         response = model.generate(
             **model_inputs,
             max_new_tokens=max_new_tokens,
             num_return_sequences=1,
             no_repeat_ngram_size=2,
             top_k=40,
             top_p=0.9,
             temperature=0.7,
             do_sample=True,
             # The "gpt2" tokenizer defines no padding token; use end-of-sequence.
             pad_token_id=tokenizer.eos_token_id,
         )
         return tokenizer.decode(response[0], skip_special_tokens=True)
     except Exception as e:
         # UI boundary: report the failure as text instead of raising.
         return f"An error occurred: {str(e)}"
 def query_movie_db(user_query):
     """Interface callback: return the generated answer for ``user_query``."""
     answer = generate_response(user_query)
     return answer
 # Text shown with the interface (passed as its description).
 description_and_article = """
 Ask this bot to recommend you a movie.
 Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
 Note that the bot truncates replies due to token limitations in the free tier of Hugging Face resources.
 """
 # Two-line input box and the example prompts offered by the interface.
 query_box = gr.Textbox(lines=2, placeholder="Enter your movie query here...")
 example_queries = [["Suggest me a scary movie?"], ["What action movie can I watch?"]]
 iface = gr.Interface(
     fn=query_movie_db,
     inputs=query_box,
     outputs="text",
     title="Movie Recommendation Bot",
     description=description_and_article,
     examples=example_queries,
 )
 # Start the web interface only when the file is run as a script.
 if __name__ == "__main__":
     iface.launch()