Spaces:

522H0134-NguyenNhatHuy
/

Vi-VQA-Animals

Running

App Files Files Community

522H0134-NguyenNhatHuy committed on Mar 15

Commit

9f08b12

verified ·

1 Parent(s): 825683b

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -10

app.py CHANGED Viewed

@@ -16,26 +16,45 @@ from safetensors.torch import load_file
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
 df = pd.read_csv('./animal_dataset_vi.csv')
 label_encoder = LabelEncoder()
 label_encoder.fit(df['answer'].astype(str))
 num_classes = len(label_encoder.classes_)
-examples_list = [
-    ["./animals/animals/zebra/zebra_18.jpg", "Bộ phận cơ thể riêng biệt của sinh vật này là gì?"],
-    ["./animals/animals/hare/hare_20.jpg", "Con vật này có ở trong tự nhiên không?"],
-    ["./animals/animals/hedgehog/hedgehog_56.jpg", "Con vật này có màu gì?"],
-    ["./animals/animals/lion/lion_37.jpg", "Động vật có ở môi trường sống tự nhiên của nó không?"],
-    ["./animals/animals/bat/bat_52.jpg", "Con vật này có màu sẫm hay nhạt?"]
 ]
 class VQAModel(nn.Module):
     def __init__(self, num_classes):
         super(VQAModel, self).__init__()
         self.image_encoder = nn.Sequential(*list(models.resnet50(weights=None).children())[:-1])
         self.img_proj = nn.Linear(2048, 512)
         self.text_encoder = AutoModel.from_pretrained("vinai/phobert-base-v2")
         self.text_proj = nn.Linear(768, 512)
         self.classifier = nn.Sequential(
             nn.LayerNorm(512),
             nn.Dropout(0.4),
@@ -48,11 +67,14 @@ class VQAModel(nn.Module):
     def forward(self, images, input_ids, attention_mask):
         img_features = self.image_encoder(images).flatten(start_dim=1)
         img_features = self.img_proj(img_features)
         text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
         text_features = self.text_proj(text_outputs.pooler_output)
         combined_features = img_features * text_features
         return self.classifier(combined_features)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = VQAModel(num_classes).to(device)
@@ -61,6 +83,7 @@ if os.path.exists(model_path):
     model.load_state_dict(load_file(model_path))
     model.eval()
 tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
@@ -68,6 +91,7 @@ transform = transforms.Compose([
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
 def predict_vqa(image, question):
     if image is None or question.strip() == "":
         return "Please provide both an image and a question."
@@ -78,6 +102,7 @@ def predict_vqa(image, question):
             segmented_question, truncation=True, padding='max_length',
             max_length=64, return_tensors='pt'
         )
         with torch.no_grad():
             outputs = model(
                 image_tensor,
@@ -85,20 +110,25 @@ def predict_vqa(image, question):
                 encoding['attention_mask'].to(device)
             )
             _, predicted_id = torch.max(outputs, 1)
         answer = label_encoder.inverse_transform([predicted_id.item()])[0]
         return answer.capitalize()
     except Exception as e:
         return f"Error: {str(e)}"
 demo = gr.Interface(
     fn=predict_vqa,
-    inputs=[gr.Image(type="pil", label="Image"), gr.Textbox(lines=2, label="Question")],
     outputs=gr.Textbox(label="Answer"),
     examples=examples_list,
     title="Vi-VQA Animal",
-    theme=gr.themes.Default(primary_hue="orange"),
-    allow_flagging="never"
 )
 if __name__ == "__main__":
-    demo.launch()

 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
+# 1. LOAD DATASET AND LABEL ENCODER
 df = pd.read_csv('./animal_dataset_vi.csv')
 label_encoder = LabelEncoder()
 label_encoder.fit(df['answer'].astype(str))
 num_classes = len(label_encoder.classes_)
+# 2. PREPARE 3 RANDOM SAMPLES
+custom_questions = [
+    "Con vật trong hình là con gì?",
+    "Màu sắc chủ đạo của con vật này là gì?",
+    "Con này sống ở đâu?"
 ]
+# Randomly select 3 images from the dataset
+df_samples = df.sample(n=3)
+examples_list = []
+for i, (_, row) in enumerate(df_samples.iterrows()):
+    # Rewrite the dataset's "animal_dataset/animals/animals" path prefix to the repo-relative "./animals/animals" directory
+    img_path = row['image_path'].replace(
+        "animal_dataset/animals/animals",
+        "./animals/animals"
+    )
+    # Pair the random image with a fixed question
+    examples_list.append([img_path, custom_questions[i]])
+# 3. INITIALIZE MODEL ARCHITECTURE
 class VQAModel(nn.Module):
     def __init__(self, num_classes):
         super(VQAModel, self).__init__()
+        # Image Feature Extractor (ResNet50)
         self.image_encoder = nn.Sequential(*list(models.resnet50(weights=None).children())[:-1])
         self.img_proj = nn.Linear(2048, 512)
+        # Text Feature Extractor (PhoBERT)
         self.text_encoder = AutoModel.from_pretrained("vinai/phobert-base-v2")
         self.text_proj = nn.Linear(768, 512)
+        # Classification Head
         self.classifier = nn.Sequential(
             nn.LayerNorm(512),
             nn.Dropout(0.4),
     def forward(self, images, input_ids, attention_mask):
         img_features = self.image_encoder(images).flatten(start_dim=1)
         img_features = self.img_proj(img_features)
         text_outputs = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
         text_features = self.text_proj(text_outputs.pooler_output)
         combined_features = img_features * text_features
         return self.classifier(combined_features)
+# Setup device and load weights
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = VQAModel(num_classes).to(device)
     model.load_state_dict(load_file(model_path))
     model.eval()
+# Initialize text tokenizer and image transformations
 tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
 transform = transforms.Compose([
     transforms.Resize((224, 224)),
     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
 ])
+# 4. INFERENCE FUNCTION
 def predict_vqa(image, question):
     if image is None or question.strip() == "":
         return "Please provide both an image and a question."
             segmented_question, truncation=True, padding='max_length',
             max_length=64, return_tensors='pt'
         )
         with torch.no_grad():
             outputs = model(
                 image_tensor,
                 encoding['attention_mask'].to(device)
             )
             _, predicted_id = torch.max(outputs, 1)
         answer = label_encoder.inverse_transform([predicted_id.item()])[0]
         return answer.capitalize()
     except Exception as e:
         return f"Error: {str(e)}"
+# 5. GRADIO INTERFACE
 demo = gr.Interface(
     fn=predict_vqa,
+    inputs=[
+        gr.Image(type="pil", label="Image"),
+        gr.Textbox(lines=2, label="Question")
+    ],
     outputs=gr.Textbox(label="Answer"),
     examples=examples_list,
     title="Vi-VQA Animal",
+    theme=gr.themes.Default(primary_hue="orange")
 )
+# Launch the web app
 if __name__ == "__main__":
+    demo.launch()