File size: 3,581 Bytes
f853f39 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | import gradio as gr
import google.generativeai as genai
import os
import markdown
import cv2
import numpy as np
from tensorflow.keras.models import load_model
import mediapipe as mp
from dotenv import load_dotenv
# Read variables from a local .env file into the environment so the API key
# lookup below can find it.
load_dotenv()
genai.configure(api_key=os.environ.get("API_KEY"))

# Generation settings handed to the Gemini model.
generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 0,
    "max_output_tokens": 8192,
}

# Every harm category uses the same blocking threshold.
safety_settings = [
    {
        "category": "HARM_CATEGORY_HARASSMENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_HATE_SPEECH",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
    {
        "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
        "threshold": "BLOCK_MEDIUM_AND_ABOVE",
    },
]

# The Gemini model has its own name. It used to be bound to `model` and was
# then immediately overwritten by the Keras classifier, which made the Gemini
# handle unreachable and the name `model` ambiguous.
gemini_model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    generation_config=generation_config,
    safety_settings=safety_settings,
)
convo = gemini_model.start_chat(history=[])

# `model` is the Keras classifier that asl_video() passes to
# predict_asl_letter().
model = load_model('asl_landmark_mine_model_one.h5')
def preprocess_image(img, target_size=(64, 64)):
    """Convert a frame into a scaled, single-image batch.

    The frame is flipped with flip code 1, converted with ``COLOR_BGR2RGB``,
    resized to ``target_size``, given a new leading axis, and divided by
    255.0.

    Args:
        img: Image array accepted by ``cv2.flip``.
        target_size: Size tuple forwarded to ``cv2.resize``.

    Returns:
        The processed array with one extra leading axis.
    """
    mirrored = cv2.flip(img, 1)
    rgb = cv2.cvtColor(mirrored, cv2.COLOR_BGR2RGB)
    resized = cv2.resize(rgb, target_size)
    batch = np.expand_dims(resized, axis=0)
    return batch / 255.0
def predict_asl_letter(image, model):
    """Classify a frame and return the corresponding letter.

    The frame is run through ``preprocess_image`` and ``model.predict``; the
    index of the highest score is mapped to a character by offsetting from
    ``'A'`` (index 0 -> ``'A'``, index 1 -> ``'B'``, ...).

    Args:
        image: Frame to classify.
        model: Object exposing ``predict`` for the preprocessed batch.

    Returns:
        A one-character string.
    """
    scores = model.predict(preprocess_image(image))
    best_index = np.argmax(scores)
    return chr(best_index + ord('A'))
def asl_video():
    """Read webcam frames, classify detected hand signs, and collect letters.

    Frames are read from capture device 0 until the capture closes or the
    user presses ``q`` in the preview window. Whenever MediaPipe reports hand
    landmarks, they are drawn on the frame, the frame is classified with
    ``predict_asl_letter`` using the module-level ``model``, and the
    predicted letter is overlaid on the preview and appended to the result.

    Returns:
        str: The predicted letters concatenated in the order they were
        produced; an empty string if nothing was predicted.
    """
    mp_hands = mp.solutions.hands
    mp_drawing = mp.solutions.drawing_utils
    letters = []

    cap = cv2.VideoCapture(0)
    try:
        # The context manager closes the MediaPipe graph on exit.
        with mp_hands.Hands(min_detection_confidence=0.5,
                            min_tracking_confidence=0.5) as hands:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    print("Ignoring empty camera frame.")
                    continue

                # OpenCV delivers BGR but MediaPipe expects RGB. Convert a
                # copy for detection and keep `frame` in BGR so drawing,
                # display, and preprocess_image (which itself converts
                # BGR -> RGB) all see the channel order they expect.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                rgb_frame.flags.writeable = False
                results = hands.process(rgb_frame)

                if results.multi_hand_landmarks:
                    for hand_landmarks in results.multi_hand_landmarks:
                        mp_drawing.draw_landmarks(
                            frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
                        asl_letter = predict_asl_letter(frame, model)
                        cv2.putText(frame, "Predicted Letter: " + asl_letter,
                                    (20, 50), cv2.FONT_HERSHEY_SIMPLEX, 1,
                                    (0, 255, 0), 2)
                        letters.append(asl_letter)

                cv2.imshow('MediaPipe Hands', frame)
                if cv2.waitKey(5) & 0xFF == ord('q'):
                    break
    finally:
        # Release the camera and close the preview even if a frame fails.
        cap.release()
        cv2.destroyAllWindows()

    return "".join(letters)
def greet():
    """Capture a sign from the webcam and return feedback as HTML.

    Calls ``asl_video`` to obtain the recognised letters, asks the
    module-level chat session ``convo`` for feedback against the expected
    word, and renders the reply from Markdown to HTML.

    Returns:
        str: HTML feedback, or a short HTML notice when nothing was
        recognised.
    """
    expected = "Hello"
    questions_right = 4
    number_of_questions = 10
    question_number = 5

    sentence = asl_video()
    if not sentence:
        # Nothing was recognised, so there is nothing to analyse; skip the
        # chat request.
        return "<p>No sign was detected. Please try again.</p>"

    prompt = f"Analyze the user's sign for \"{expected}\" and provide feedback. Did they sign it correctly? If not, explain what went wrong and how to improve. If they got it right, offer encouragement. The user's sign was \"{sentence}\". This is question number {question_number} out of {number_of_questions}, and the user has gotten {questions_right} questions right so far."
    convo.send_message(prompt)
    return markdown.markdown(convo.last.text)
if __name__ == "__main__":
    # Wrap greet() in a Gradio interface with no inputs and HTML output,
    # then start it.
    demo = gr.Interface(fn=greet, inputs=None, outputs="html")
    demo.launch()
|