import gradio as gr import google.generativeai as genai import os import markdown import cv2 import numpy as np from tensorflow.keras.models import load_model import mediapipe as mp from dotenv import load_dotenv load_dotenv() genai.configure(api_key=os.environ.get("API_KEY")) # Setup the model generation_config = { "temperature": 1, "top_p": 0.95, "top_k": 0, "max_output_tokens": 8192, } safety_settings = [ { "category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE" }, { "category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE" }, { "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE" }, { "category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE" }, ] model = genai.GenerativeModel(model_name="gemini-1.5-pro-latest", generation_config=generation_config, safety_settings=safety_settings) convo = model.start_chat(history=[]) model = load_model('asl_landmark_mine_model_one.h5') def preprocess_image(img, target_size=(64, 64)): img = cv2.cvtColor(cv2.flip(img, 1), cv2.COLOR_BGR2RGB) img = cv2.resize(img, target_size) img = np.expand_dims(img, axis=0) img = img / 255.0 return img def predict_asl_letter(image, model): img = preprocess_image(image) predictions = model.predict(img) predicted_class = np.argmax(predictions) asl_letter = chr(predicted_class + ord('A')) return asl_letter def asl_video(): cap = cv2.VideoCapture(0) sentence = "" mp_hands = mp.solutions.hands mp_drawing = mp.solutions.drawing_utils hands = mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5) while cap.isOpened(): ret, frame = cap.read() if not ret: print("Ignoring empty camera frame.") continue frame.flags.writeable = False results = hands.process(frame) frame.flags.writeable = True frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) if results.multi_hand_landmarks: for hand_landmarks in results.multi_hand_landmarks: mp_drawing.draw_landmarks( frame, hand_landmarks, mp_hands.HAND_CONNECTIONS) asl_letter = predict_asl_letter(frame, model) cv2.putText(frame, "Predicted Letter: " + asl_letter, (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2) sentence += asl_letter cv2.imshow('MediaPipe Hands', frame) if cv2.waitKey(5) & 0xFF == ord('q'): break cap.release() cv2.destroyAllWindows() return sentence def greet(): global convo sentence = "Hello" expected = "Hello" questionsRight = 4 numberOfQuestions = 10 questionNumber = 5 sentence = asl_video() prompt = f"Analyze the user's sign for \"{expected}\" and provide feedback. Did they sign it correctly? If not, explain what went wrong and how to improve. If they got it right, offer encouragement. The user's sign was \"{sentence}\". This is question number {questionNumber} out of {numberOfQuestions}, and the user has gotten {questionsRight} questions right so far." convo.send_message(prompt) result = markdown.markdown(convo.last.text) return result if __name__ == "__main__": gr.Interface( fn=greet, inputs=None, outputs="html" ).launch()