Spaces:

522H0134-NguyenNhatHuy
/

Vi-VQA-Animals

Running

App Files Files Community

Vi-VQA-Animals / app.py

522H0134-NguyenNhatHuy

Update app.py

343612e verified 2 months ago

raw

history blame

4.58 kB

	# Process-wide setup that runs before the heavier libraries are imported.
	# GRADIO_ALLOWED_PATHS is set to the absolute path of the current working
	# directory before gradio is imported further down.
	# NOTE(review): presumably this lets Gradio serve the example images stored
	# next to this script; confirm against the Gradio environment-variable docs.
	import os
	os.environ["GRADIO_ALLOWED_PATHS"] = os.path.abspath("./")

	# Suppress every Python warning for the whole process to keep the logs quiet.
	# This also hides warnings raised later by any other code in this file.
	import warnings
	warnings.filterwarnings("ignore")
	# Limit the transformers library's own logging to errors only.
	import transformers
	transformers.logging.set_verbosity_error()

	import gradio as gr
	import torch
	import torch.nn as nn
	import torchvision.transforms as transforms
	import torchvision.models as models
	from transformers import AutoTokenizer, AutoModel
	from PIL import Image
	from pyvi import ViTokenizer
	from safetensors.torch import load_file
	import pandas as pd
	from sklearn.preprocessing import LabelEncoder

	# 1. DATASET AND ANSWER LABEL SPACE
	# Read the annotation table, then fit the answer <-> class-id mapping on its
	# "answer" column (cast to str). The number of distinct answers fixes the
	# size of the classifier output.
	df = pd.read_csv("./animal_dataset_vi.csv")
	label_encoder = LabelEncoder().fit(df["answer"].astype(str))
	num_classes = len(label_encoder.classes_)

	# 2. PREPARE EXAMPLE (IMAGE, QUESTION) PAIRS
	# One fixed question per example row offered in the UI.
	custom_questions = [
	    "Con vật trong hình là con gì?",
	    "Màu sắc chủ đạo của con vật này là gì?",
	    "Con này sống ở đâu?",
	]

	# Draw one random row per question. The sample size is capped at the number
	# of rows because DataFrame.sample raises ValueError when n exceeds the
	# population (sampling without replacement), which would abort start-up on a
	# very small dataset.
	df_samples = df.sample(n=min(len(custom_questions), len(df)))

	examples_list = []
	# zip pairs each sampled row with its question directly, so the two
	# sequences cannot go out of step and no positional indexing is needed.
	for question, raw_path in zip(custom_questions, df_samples["image_path"]):
	    # Rewrite the dataset's "animal_dataset/animals/animals" prefix to the
	    # folder layout used next to this script. str() keeps a missing (NaN)
	    # or otherwise non-string path from raising AttributeError; such a
	    # value simply fails the existence check below.
	    img_path = str(raw_path).replace(
	        "animal_dataset/animals/animals",
	        "./animals/animals",
	    )
	    # Only offer examples whose image file is actually present.
	    if os.path.exists(img_path):
	        examples_list.append([img_path, question])

	# 3. INITIALIZE MODEL ARCHITECTURE
	class VQAModel(nn.Module):
	    """Image + question classifier that fuses ResNet-50 and PhoBERT features.

	    Both modalities are projected into a shared ``hidden_dim`` space,
	    multiplied element-wise, and passed through an MLP head that produces one
	    score per answer class.
	    """

	    def __init__(self, num_classes, hidden_dim=512, dropout=0.4):
	        """Build the two encoders, their projections and the classifier head.

	        Args:
	            num_classes: Number of answer classes (size of the output logits).
	            hidden_dim: Width of the shared fusion space. Defaults to 512,
	                the value that used to be hard-coded.
	            dropout: Dropout probability used in the head. Defaults to 0.4,
	                the value that used to be hard-coded.
	        """
	        super().__init__()

	        # ResNet-50 built without pretrained weights (weights=None); every
	        # child except the last one (the fc classification layer) is kept.
	        backbone = models.resnet50(weights=None)
	        self.image_encoder = nn.Sequential(*list(backbone.children())[:-1])
	        # Input width is read from the dropped fc layer (2048 for ResNet-50)
	        # instead of being repeated as a magic number.
	        self.img_proj = nn.Linear(backbone.fc.in_features, hidden_dim)

	        self.text_encoder = AutoModel.from_pretrained("vinai/phobert-base-v2")
	        # Input width follows the encoder's configured hidden size (768 for
	        # phobert-base-v2) instead of being repeated as a magic number.
	        self.text_proj = nn.Linear(
	            self.text_encoder.config.hidden_size, hidden_dim
	        )

	        # Layer order is kept exactly as before so the positional keys in a
	        # saved state dict (classifier.0, classifier.2, ...) still line up.
	        self.classifier = nn.Sequential(
	            nn.LayerNorm(hidden_dim),
	            nn.Dropout(dropout),
	            nn.Linear(hidden_dim, hidden_dim),
	            nn.ReLU(),
	            nn.Dropout(dropout),
	            nn.Linear(hidden_dim, num_classes),
	        )

	    def forward(self, images, input_ids, attention_mask):
	        """Return unnormalised class scores for a batch of image/question pairs.

	        Args:
	            images: Image batch passed to the image encoder.
	            input_ids: Token ids passed to the text encoder.
	            attention_mask: Attention mask passed to the text encoder.

	        Returns:
	            The classifier output for the fused features.
	        """
	        # Everything after the batch dimension is flattened before projection.
	        img_features = self.image_encoder(images).flatten(start_dim=1)
	        img_features = self.img_proj(img_features)

	        text_outputs = self.text_encoder(
	            input_ids=input_ids, attention_mask=attention_mask
	        )
	        text_features = self.text_proj(text_outputs.pooler_output)

	        # Element-wise product fuses the two modalities.
	        return self.classifier(img_features * text_features)

	# Pick the device, build the model and load the trained weights.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model = VQAModel(num_classes).to(device)

	model_path = './vqa_resnet50_phobert.safetensors'
	if os.path.exists(model_path):
	    model.load_state_dict(load_file(model_path))
	else:
	    # Previously a missing checkpoint was skipped silently and the app served
	    # answers from a model that never received its trained weights. The app
	    # still starts (best-effort), but the problem is now visible in the logs.
	    # print is used rather than warnings.warn because this file silences the
	    # warnings module at import time.
	    print(
	        f"WARNING: weights file '{model_path}' was not found; no trained "
	        "weights were loaded, so predictions will be unreliable.",
	        flush=True,
	    )
	# Inference only: switch dropout (and batch-norm) layers to eval behaviour.
	model.eval()

	# Text and image preprocessing objects shared by every request.
	tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

	# Per-channel normalisation statistics; these are the standard ImageNet
	# values commonly used with torchvision models.
	_CHANNEL_MEAN = [0.485, 0.456, 0.406]
	_CHANNEL_STD = [0.229, 0.224, 0.225]

	# Resize to 224x224, convert to a tensor, then normalise each channel.
	_preprocess_steps = [
	    transforms.Resize((224, 224)),
	    transforms.ToTensor(),
	    transforms.Normalize(mean=_CHANNEL_MEAN, std=_CHANNEL_STD),
	]
	transform = transforms.Compose(_preprocess_steps)

	# 4. INFERENCE FUNCTION
	def predict_vqa(image, question):
	    """Answer a question about an image with the loaded VQA model.

	    Args:
	        image: PIL image from the Gradio image input, or None when empty.
	        question: Question text. None, an empty string and whitespace-only
	            text are all treated as missing.

	    Returns:
	        The predicted answer label passed through ``str.capitalize``; a
	        prompt asking for both inputs when either one is missing; or a
	        string starting with "Error: " when inference raises.
	    """
	    # Guard against None as well as blank text: calling .strip() on None
	    # raised AttributeError here, outside the try block below.
	    if image is None or not question or not question.strip():
	        return "Please provide both an image and a question."
	    try:
	        # Single image -> batch of one on the model's device.
	        image_tensor = transform(image.convert('RGB')).unsqueeze(0).to(device)
	        # Word-segment the question with pyvi before tokenising it.
	        segmented_question = ViTokenizer.tokenize(question)
	        encoding = tokenizer(
	            segmented_question, truncation=True, padding='max_length',
	            max_length=64, return_tensors='pt'
	        )

	        with torch.no_grad():
	            outputs = model(
	                image_tensor,
	                encoding['input_ids'].to(device),
	                encoding['attention_mask'].to(device)
	            )
	        # Index of the highest-scoring class for the single batch item.
	        predicted_id = outputs.argmax(dim=1).item()

	        answer = label_encoder.inverse_transform([predicted_id])[0]
	        return answer.capitalize()
	    except Exception as e:
	        # UI boundary: report the failure as text instead of failing the
	        # request.
	        return f"Error: {str(e)}"

	# 5. GRADIO INTERFACE
	# The input and output components are created first, then wired to
	# predict_vqa together with the pre-selected example rows.
	image_input = gr.Image(type="pil", label="Image")
	question_input = gr.Textbox(lines=2, label="Question")
	answer_output = gr.Textbox(label="Answer")

	demo = gr.Interface(
	    fn=predict_vqa,
	    inputs=[image_input, question_input],
	    outputs=answer_output,
	    examples=examples_list,
	    cache_examples=False,
	    title="Vi-VQA Animal",
	    theme=gr.themes.Default(primary_hue="orange"),
	)

	# Start the server only when this file is run as a script. The absolute path
	# of the current working directory is passed explicitly as an allowed path.
	if __name__ == "__main__":
	    app_root = os.path.abspath("./")
	    demo.launch(allowed_paths=[app_root])