Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,12 +2,13 @@ import gradio as gr
|
|
| 2 |
import pandas as pd
|
| 3 |
from datasets import load_dataset
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
-
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 6 |
import pymongo
|
| 7 |
import os
|
|
|
|
|
|
|
| 8 |
|
| 9 |
# Load a smaller portion of the dataset
|
| 10 |
-
dataset = load_dataset("MongoDB/embedded_movies", split='train[:
|
| 11 |
dataset_df = pd.DataFrame(dataset)
|
| 12 |
|
| 13 |
# Data cleaning and preprocessing
|
|
@@ -54,73 +55,40 @@ documents = dataset_df.to_dict("records")
|
|
| 54 |
collection.insert_many(documents)
|
| 55 |
print("Data ingestion into MongoDB completed")
|
| 56 |
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
query_embedding = get_embedding(user_query)
|
| 59 |
-
if
|
| 60 |
return "Invalid query or embedding generation failed."
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
"fullplot": 1,
|
| 76 |
-
"title": 1,
|
| 77 |
-
"genres": 1,
|
| 78 |
-
"score": {"$meta": "vectorSearchScore"},
|
| 79 |
-
}
|
| 80 |
-
},
|
| 81 |
-
]
|
| 82 |
-
results = collection.aggregate(pipeline)
|
| 83 |
-
return list(results)
|
| 84 |
|
| 85 |
def get_search_result(query):
|
| 86 |
-
get_knowledge = vector_search(query
|
| 87 |
search_result = ""
|
| 88 |
for result in get_knowledge:
|
| 89 |
search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n"
|
| 90 |
return search_result
|
| 91 |
|
| 92 |
-
# Lazy loading of the language model
|
| 93 |
-
model = None
|
| 94 |
-
tokenizer = None
|
| 95 |
-
|
| 96 |
-
def load_language_model():
|
| 97 |
-
global model, tokenizer
|
| 98 |
-
if model is None or tokenizer is None:
|
| 99 |
-
tokenizer = AutoTokenizer.from_pretrained("gpt2")
|
| 100 |
-
model = AutoModelForCausalLM.from_pretrained("gpt2")
|
| 101 |
-
|
| 102 |
def generate_response(query):
|
| 103 |
-
load_language_model()
|
| 104 |
source_information = get_search_result(query)
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
max_length = tokenizer.model_max_length
|
| 108 |
-
input_ids = tokenizer(combined_information, return_tensors="pt", max_length=max_length, truncation=True)
|
| 109 |
-
|
| 110 |
-
try:
|
| 111 |
-
response = model.generate(
|
| 112 |
-
**input_ids,
|
| 113 |
-
max_new_tokens=100,
|
| 114 |
-
num_return_sequences=1,
|
| 115 |
-
no_repeat_ngram_size=2,
|
| 116 |
-
top_k=40,
|
| 117 |
-
top_p=0.9,
|
| 118 |
-
temperature=0.7,
|
| 119 |
-
do_sample=True
|
| 120 |
-
)
|
| 121 |
-
return tokenizer.decode(response[0], skip_special_tokens=True)
|
| 122 |
-
except Exception as e:
|
| 123 |
-
return f"An error occurred: {str(e)}"
|
| 124 |
|
| 125 |
def query_movie_db(user_query):
|
| 126 |
return generate_response(user_query)
|
|
@@ -128,7 +96,7 @@ def query_movie_db(user_query):
|
|
| 128 |
description_and_article = """
|
| 129 |
Ask this bot to recommend you a movie.
|
| 130 |
Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
|
| 131 |
-
Note that the bot
|
| 132 |
"""
|
| 133 |
|
| 134 |
iface = gr.Interface(
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
from datasets import load_dataset
|
| 4 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 5 |
import pymongo
|
| 6 |
import os
|
| 7 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 8 |
+
import numpy as np
|
| 9 |
|
| 10 |
# Load a smaller portion of the dataset
|
| 11 |
+
dataset = load_dataset("MongoDB/embedded_movies", split='train[:10%]')
|
| 12 |
dataset_df = pd.DataFrame(dataset)
|
| 13 |
|
| 14 |
# Data cleaning and preprocessing
|
|
|
|
| 55 |
collection.insert_many(documents)
|
| 56 |
print("Data ingestion into MongoDB completed")
|
| 57 |
|
| 58 |
+
# Load all embeddings into memory for faster similarity search
|
| 59 |
+
all_embeddings = np.array(dataset_df["embedding"].tolist())
|
| 60 |
+
all_titles = dataset_df["title"].tolist()
|
| 61 |
+
|
| 62 |
+
def vector_search(user_query, top_k=3):
    """Find the movies whose stored embeddings are closest to the query.

    The query is embedded with ``get_embedding`` and compared against every
    row of the in-memory ``all_embeddings`` matrix using cosine similarity.

    Args:
        user_query: Free-text query to embed and search for.
        top_k: Maximum number of hits to return. Defaults to 3; values
            less than or equal to zero yield an empty list.

    Returns:
        A list of dicts with the keys ``"title"``, ``"fullplot"``,
        ``"genres"`` and ``"score"``, ordered from most to least similar.
        If the embedding comes back falsy, the message string
        ``"Invalid query or embedding generation failed."`` is returned
        instead of a list, so callers must check for a ``str`` result.
    """
    query_embedding = get_embedding(user_query)
    # NOTE(review): assumes get_embedding returns a plain list (empty on
    # failure); confirm, since a NumPy array would make this truth test
    # ambiguous.
    if not query_embedding:
        return "Invalid query or embedding generation failed."

    similarities = cosine_similarity([query_embedding], all_embeddings)[0]
    # argsort is ascending: reverse it, then keep the leading top_k indices.
    # Slicing after the reversal also behaves for top_k == 0, where a
    # "[-top_k:]" slice would return every row.
    top_indices = similarities.argsort()[::-1][:max(top_k, 0)]

    results = []
    for idx in top_indices:
        # Fetch the row once instead of once per column.
        row = dataset_df.iloc[idx]
        results.append({
            "title": all_titles[idx],
            "fullplot": row["fullplot"],
            "genres": row["genres"],
            "score": similarities[idx],
        })

    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
def get_search_result(query):
    """Format the vector-search hits for ``query`` as a plain-text listing.

    Args:
        query: Free-text query passed straight to ``vector_search``.

    Returns:
        One "Title / Genres / Plot" paragraph per hit, each followed by a
        blank line; the plot is cut to its first 150 characters. When
        ``vector_search`` returns a message string instead of a list of
        hits, that message is returned unchanged.
    """
    get_knowledge = vector_search(query)

    # vector_search reports a failed embedding with a plain string. Looping
    # over it would walk single characters and crash on ``.get``.
    if isinstance(get_knowledge, str):
        return get_knowledge

    return "".join(
        f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n"
        for result in get_knowledge
    )
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
def generate_response(query):
    """Build the recommendation message for ``query``.

    The formatted search results from ``get_search_result`` are placed
    between a fixed introduction that echoes the query and a fixed closing
    sentence.

    Args:
        query: The user's free-text request.

    Returns:
        The complete reply as a single string.
    """
    recommendations = get_search_result(query)
    return (
        f"Based on your query '{query}', here are some movie recommendations:\n\n"
        f"{recommendations}\n"
        "These movies match your query based on their plot summaries and genres. "
        "Let me know if you'd like more information about any of them!"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
| 93 |
def query_movie_db(user_query):
    """Answer a movie request by delegating to ``generate_response``.

    Args:
        user_query: The user's free-text request.

    Returns:
        Whatever ``generate_response`` produces for the query.
    """
    # NOTE(review): presumably the callback handed to the Gradio interface;
    # the gr.Interface(...) arguments are not visible here, so confirm.
    answer = generate_response(user_query)
    return answer
|
|
|
|
| 96 |
description_and_article = """
|
| 97 |
Ask this bot to recommend you a movie.
|
| 98 |
Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
|
| 99 |
+
Note that the bot provides concise recommendations based on a limited dataset to ensure optimal performance.
|
| 100 |
"""
|
| 101 |
|
| 102 |
iface = gr.Interface(
|