Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| from datasets import load_dataset | |
| from sentence_transformers import SentenceTransformer | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import pymongo | |
| import os | |
| # Load a smaller portion of the dataset | |
| dataset = load_dataset("MongoDB/embedded_movies", split='train[:70%]') | |
| dataset_df = pd.DataFrame(dataset) | |
| # Data cleaning and preprocessing | |
| dataset_df = dataset_df.dropna(subset=["fullplot"]) | |
| dataset_df = dataset_df.drop(columns=["plot_embedding"]) | |
| # Load a smaller embedding model | |
| embedding_model = SentenceTransformer("all-MiniLM-L6-v2") | |
| def get_embedding(text: str) -> list: | |
| if not text.strip(): | |
| print("Attempted to get embedding for empty text.") | |
| return [] | |
| embedding = embedding_model.encode(text) | |
| return embedding.tolist() | |
| # Process embeddings in batches | |
| batch_size = 100 | |
| for i in range(0, len(dataset_df), batch_size): | |
| batch = dataset_df.iloc[i:i+batch_size] | |
| batch["embedding"] = batch["fullplot"].apply(get_embedding) | |
| dataset_df.iloc[i:i+batch_size] = batch | |
| def get_mongo_client(mongo_uri): | |
| try: | |
| client = pymongo.MongoClient(mongo_uri) | |
| print("Connection to MongoDB successful") | |
| return client | |
| except pymongo.errors.ConnectionFailure as e: | |
| print(f"Connection failed: {e}") | |
| return None | |
| mongo_uri = os.getenv("MONGO_URI") | |
| if not mongo_uri: | |
| print("MONGO_URI not set in environment variables") | |
| mongo_client = get_mongo_client(mongo_uri) | |
| db = mongo_client["movies"] | |
| collection = db["movie_collection_2"] | |
| # Clear the collection and insert new data in bulk | |
| collection.delete_many({}) | |
| documents = dataset_df.to_dict("records") | |
| collection.insert_many(documents) | |
| print("Data ingestion into MongoDB completed") | |
| def vector_search(user_query, collection): | |
| query_embedding = get_embedding(user_query) | |
| if query_embedding is None: | |
| return "Invalid query or embedding generation failed." | |
| pipeline = [ | |
| { | |
| "$vectorSearch": { | |
| "index": "vector_index", | |
| "queryVector": query_embedding, | |
| "path": "embedding", | |
| "numCandidates": 100, | |
| "limit": 3, | |
| } | |
| }, | |
| { | |
| "$project": { | |
| "_id": 0, | |
| "fullplot": 1, | |
| "title": 1, | |
| "genres": 1, | |
| "score": {"$meta": "vectorSearchScore"}, | |
| } | |
| }, | |
| ] | |
| results = collection.aggregate(pipeline) | |
| return list(results) | |
| def get_search_result(query): | |
| get_knowledge = vector_search(query, collection) | |
| search_result = "" | |
| for result in get_knowledge: | |
| search_result += f"Title: {result.get('title', 'N/A')}\nGenres: {', '.join(result.get('genres', ['N/A']))}\nPlot: {result.get('fullplot', 'N/A')[:150]}...\n\n" | |
| return search_result | |
| # Lazy loading of the language model | |
| model = None | |
| tokenizer = None | |
| def load_language_model(): | |
| global model, tokenizer | |
| if model is None or tokenizer is None: | |
| tokenizer = AutoTokenizer.from_pretrained("gpt2") | |
| model = AutoModelForCausalLM.from_pretrained("gpt2") | |
| def generate_response(query): | |
| load_language_model() | |
| source_information = get_search_result(query) | |
| combined_information = f"Answer the question '{query}' based on these movie details:\n\n{source_information}" | |
| max_length = tokenizer.model_max_length | |
| input_ids = tokenizer(combined_information, return_tensors="pt", max_length=max_length, truncation=True) | |
| try: | |
| response = model.generate( | |
| **input_ids, | |
| max_new_tokens=100, | |
| num_return_sequences=1, | |
| no_repeat_ngram_size=2, | |
| top_k=40, | |
| top_p=0.9, | |
| temperature=0.7, | |
| do_sample=True | |
| ) | |
| return tokenizer.decode(response[0], skip_special_tokens=True) | |
| except Exception as e: | |
| return f"An error occurred: {str(e)}" | |
| def query_movie_db(user_query): | |
| return generate_response(user_query) | |
| description_and_article = """ | |
| Ask this bot to recommend you a movie. | |
| Checkout [my github repo](https://github.com/kanad13/Movie-Recommendation-Bot) to look at the code that powers this bot. | |
| Note that the bot truncates replies due to token limitations in the free tier of Hugging Face resources. | |
| """ | |
| iface = gr.Interface( | |
| fn=query_movie_db, | |
| inputs=gr.Textbox(lines=2, placeholder="Enter your movie query here..."), | |
| outputs="text", | |
| title="Movie Recommendation Bot", | |
| description=description_and_article, | |
| examples=[["Suggest me a scary movie?"], ["What action movie can I watch?"]] | |
| ) | |
| if __name__ == "__main__": | |
| iface.launch() |