kanad13 committed on
Commit
b811610
·
verified ·
1 Parent(s): 35e476e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -58
app.py CHANGED
@@ -2,12 +2,13 @@ import gradio as gr
2
  import pandas as pd
3
  from datasets import load_dataset
4
  from sentence_transformers import SentenceTransformer
5
- from transformers import AutoTokenizer, AutoModelForCausalLM
6
  import pymongo
7
  import os
 
 
8
 
9
  # Load a smaller portion of the dataset
10
- dataset = load_dataset("MongoDB/embedded_movies", split='train[:70%]')
11
  dataset_df = pd.DataFrame(dataset)
12
 
13
  # Data cleaning and preprocessing
@@ -54,73 +55,40 @@ documents = dataset_df.to_dict("records")
54
  collection.insert_many(documents)
55
  print("Data ingestion into MongoDB completed")
56
 
57
def vector_search(user_query, collection):
    """Run a MongoDB ``$vectorSearch`` aggregation for *user_query*.

    Args:
        user_query: Query text; it is embedded with ``get_embedding``.
        collection: Object whose ``aggregate`` method executes the pipeline.

    Returns:
        A list with the projected documents (``fullplot``, ``title``,
        ``genres``, ``score``), or an error-message string when
        ``get_embedding`` returns ``None``.
    """
    embedded_query = get_embedding(user_query)
    if embedded_query is None:
        return "Invalid query or embedding generation failed."

    # Stage 1: nearest-neighbour lookup against the "embedding" field.
    search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "queryVector": embedded_query,
            "path": "embedding",
            "numCandidates": 100,
            "limit": 3,
        }
    }
    # Stage 2: keep only the fields used downstream, plus the similarity score.
    projection_stage = {
        "$project": {
            "_id": 0,
            "fullplot": 1,
            "title": 1,
            "genres": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    }

    cursor = collection.aggregate([search_stage, projection_stage])
    return list(cursor)
84
 
85
  def get_search_result(query):
86
- get_knowledge = vector_search(query, collection)
87
  search_result = ""
88
  for result in get_knowledge:
89
  search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n"
90
  return search_result
91
 
92
# The language model and tokenizer are created on first use, not at import time
model = None
tokenizer = None


def load_language_model():
    """Fill the module-level ``model`` and ``tokenizer`` with GPT-2.

    Does nothing when both are already set; if either one is still ``None``
    both are (re)loaded from the ``"gpt2"`` checkpoint.
    """
    global model, tokenizer
    if model is not None and tokenizer is not None:
        return
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")
101
-
102
  def generate_response(query):
103
- load_language_model()
104
  source_information = get_search_result(query)
105
- combined_information = f"Answer the question '{query}' based on these movie details:\n\n{source_information}"
106
-
107
- max_length = tokenizer.model_max_length
108
- input_ids = tokenizer(combined_information, return_tensors="pt", max_length=max_length, truncation=True)
109
-
110
- try:
111
- response = model.generate(
112
- **input_ids,
113
- max_new_tokens=100,
114
- num_return_sequences=1,
115
- no_repeat_ngram_size=2,
116
- top_k=40,
117
- top_p=0.9,
118
- temperature=0.7,
119
- do_sample=True
120
- )
121
- return tokenizer.decode(response[0], skip_special_tokens=True)
122
- except Exception as e:
123
- return f"An error occurred: {str(e)}"
124
 
125
  def query_movie_db(user_query):
126
  return generate_response(user_query)
@@ -128,7 +96,7 @@ def query_movie_db(user_query):
128
  description_and_article = """
129
  Ask this bot to recommend you a movie.
130
  Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
131
- Note that the bot truncates replies due to token limitations in the free tier of Hugging Face resources.
132
  """
133
 
134
  iface = gr.Interface(
 
2
  import pandas as pd
3
  from datasets import load_dataset
4
  from sentence_transformers import SentenceTransformer
 
5
  import pymongo
6
  import os
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ import numpy as np
9
 
10
  # Load a smaller portion of the dataset
11
+ dataset = load_dataset("MongoDB/embedded_movies", split='train[:10%]')
12
  dataset_df = pd.DataFrame(dataset)
13
 
14
  # Data cleaning and preprocessing
 
55
  collection.insert_many(documents)
56
  print("Data ingestion into MongoDB completed")
57
 
58
+ # Load all embeddings into memory for faster similarity search
59
+ all_embeddings = np.array(dataset_df["embedding"].tolist())
60
+ all_titles = dataset_df["title"].tolist()
61
+
62
def vector_search(user_query, top_k=3):
    """Return the movies whose embeddings are most similar to *user_query*.

    Args:
        user_query: Query text; it is embedded with ``get_embedding``.
        top_k: Maximum number of hits to return (default 3). Values below
            one yield an empty list.

    Returns:
        A list of dicts with ``title``, ``fullplot``, ``genres`` and
        ``score`` keys, ordered from most to least similar, or an
        error-message string when no embedding could be produced.
    """
    query_embedding = get_embedding(user_query)
    # Explicit None/length test: `not embedding` raises on a multi-element
    # NumPy array, while this form works for lists and arrays alike.
    if query_embedding is None or len(query_embedding) == 0:
        return "Invalid query or embedding generation failed."

    similarities = cosine_similarity([query_embedding], all_embeddings)[0]
    # Descending order of similarity, cut to the requested number of hits.
    top_indices = similarities.argsort()[::-1][:max(top_k, 0)]

    results = []
    for idx in top_indices:
        row = dataset_df.iloc[idx]  # one positional lookup per hit
        results.append({
            "title": all_titles[idx],
            "fullplot": row["fullplot"],
            "genres": row["genres"],
            "score": similarities[idx],
        })

    return results
 
 
 
 
 
 
 
 
 
80
 
81
  def get_search_result(query):
82
+ get_knowledge = vector_search(query)
83
  search_result = ""
84
  for result in get_knowledge:
85
  search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n"
86
  return search_result
87
 
 
 
 
 
 
 
 
 
 
 
88
  def generate_response(query):
 
89
  source_information = get_search_result(query)
90
+ response = f"Based on your query '{query}', here are some movie recommendations:\n\n{source_information}\nThese movies match your query based on their plot summaries and genres. Let me know if you'd like more information about any of them!"
91
+ return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  def query_movie_db(user_query):
94
  return generate_response(user_query)
 
96
  description_and_article = """
97
  Ask this bot to recommend you a movie.
98
  Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot.
99
+ Note that the bot provides concise recommendations based on a limited dataset to ensure optimal performance.
100
  """
101
 
102
  iface = gr.Interface(