visual-narrator-llm / create_enhanced_dataset.py
Ytgetahun's picture
feat: Visual Narrator 3B - Clean repository with professional benchmarks
d6e97b5
Raw
History Blame
3.61 kB
import os
import json
import glob
import random
from pathlib import Path
def count_adjectives(text: str) -> int:
    """Count how many words from the descriptive lexicon occur in *text*.

    Matching is case-insensitive and operates on whole words, so a lexicon
    entry only counts when it appears as its own word ("golden" is not
    matched inside "goldenrod"). Each lexicon entry is counted at most once,
    regardless of how many times it is repeated in the text.

    Note that "richly" and "detailed" are separate lexicon entries, so the
    phrase "richly detailed" contributes 2 to the count.

    Args:
        text: Caption text to inspect.

    Returns:
        The number of distinct lexicon entries found in ``text``.
    """
    # Function-scope import: the file's import block does not bring in ``re``.
    import re

    adjectives = (
        'vivid', 'gleaming', 'rugged', 'tranquil', 'velvety', 'golden',
        'richly', 'detailed', 'cinematic', 'dramatic', 'vibrant', 'serene',
        'majestic', 'luminous', 'textured', 'atmospheric', 'expressive',
        'stunning', 'breathtaking', 'captivating', 'mesmerizing',
    )
    # Split into lowercase alphabetic words once; a set gives O(1) membership
    # and prevents substring false positives such as "melodramatic".
    words = set(re.findall(r"[a-z]+", text.lower()))
    return sum(1 for adj in adjectives if adj in words)
def adjective_augment(caption, adjective_ratio=0.9, min_adjs=2, max_adjs=5):
    """Randomly prefix *caption* with a comma-separated run of adjectives.

    With probability ``adjective_ratio`` a number of adjectives between
    ``min_adjs`` and ``max_adjs`` (inclusive) is drawn without replacement
    from a fixed pool and placed in front of the caption, separated from it
    by a single space. Otherwise the caption is returned untouched.

    Args:
        caption: Base caption text.
        adjective_ratio: Probability in [0, 1] that the caption is augmented.
        min_adjs: Smallest number of adjectives to prepend.
        max_adjs: Largest number of adjectives to prepend.

    Returns:
        The augmented caption, or ``caption`` itself when no augmentation is
        applied or zero adjectives were drawn.

    Raises:
        ValueError: If augmentation is attempted with ``min_adjs > max_adjs``.
    """
    adjectives_pool = [
        "vivid", "gleaming", "rugged", "tranquil", "velvety", "golden",
        "richly detailed", "cinematic", "dramatic", "vibrant", "serene",
        "majestic", "luminous", "textured", "atmospheric", "expressive",
        "stunning", "breathtaking", "captivating", "mesmerizing",
    ]

    if random.random() >= adjective_ratio:
        return caption

    if min_adjs > max_adjs:
        raise ValueError(
            f"min_adjs ({min_adjs}) must not exceed max_adjs ({max_adjs})"
        )

    # Sampling without replacement cannot return more items than the pool
    # holds (or fewer than zero), so clamp both bounds into that range.
    pool_size = len(adjectives_pool)
    lowest = min(max(min_adjs, 0), pool_size)
    highest = min(max(max_adjs, 0), pool_size)

    num_adjectives = random.randint(lowest, highest)
    if num_adjectives == 0:
        # Avoid emitting " caption" with a stray leading space.
        return caption

    selected_adjs = random.sample(adjectives_pool, num_adjectives)
    # Adjectives go in front of the caption.
    return f"{', '.join(selected_adjs)} {caption}"
def create_enhanced_dataset():
    """Build an adjective-enriched caption dataset and write it to JSON.

    Every ``*.jpg`` and ``*.png`` file found directly inside the known image
    directories gets a caption derived from its file name ("a photo of
    <stem>", with underscores and hyphens turned into spaces). The caption is
    then passed through ``adjective_augment`` and scored with
    ``count_adjectives``. The result is saved to
    ``phase7/synth_train_enhanced.json`` and summary statistics plus up to
    three sample captions are printed.

    Returns:
        A list of dicts with the keys ``"image"`` (file path), ``"caption"``
        (augmented caption) and ``"adjective_count"`` (int).
    """
    # Candidate image locations; directories that do not exist are skipped.
    image_dirs = [
        "/data/coco/train2017",
        "/home/ubuntu/data/coco/train2017",
        "dummy_images"
    ]

    all_images = []
    for dir_path in image_dirs:
        if not os.path.isdir(dir_path):
            continue
        # JPEGs first, then PNGs, for each directory.
        for pattern in ("*.jpg", "*.png"):
            all_images.extend(glob.glob(f"{dir_path}/{pattern}"))

    print(f"📊 Found {len(all_images)} total images")

    # Create enhanced dataset with high adjective density
    dataset = []
    for img_path in all_images:
        # Base caption from filename
        stem = Path(img_path).stem.replace('_', ' ').replace('-', ' ')
        base_caption = f"a photo of {stem}"

        augmented_caption = adjective_augment(
            base_caption,
            adjective_ratio=0.9,  # 90% get adjectives
            min_adjs=3,           # Minimum 3 adjectives
            max_adjs=5,           # Maximum 5 adjectives
        )
        dataset.append({
            "image": img_path,
            "caption": augmented_caption,
            "adjective_count": count_adjectives(augmented_caption),
        })

    # Save dataset; create the output directory first so a fresh checkout
    # does not fail with FileNotFoundError.
    output_path = "phase7/synth_train_enhanced.json"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    # Print stats
    total_images = len(dataset)
    total_adjectives = sum(item["adjective_count"] for item in dataset)
    # Guard against division by zero when no images were found.
    avg_adjectives = total_adjectives / total_images if total_images > 0 else 0

    print(f"✅ Enhanced dataset created: {output_path}")
    print("📊 Dataset Stats:")
    print(f" - Total samples: {total_images}")
    print(f" - Total adjectives: {total_adjectives}")
    print(f" - Average adjectives per caption: {avg_adjectives:.2f}")
    print(" - Target density: ≥3.0 adjectives/description")

    # Show sample captions
    print("📝 Sample captions:")
    for index, item in enumerate(dataset[:3], start=1):
        print(f" {index}: {item['caption']}")

    return dataset
# Script entry point: build the dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_enhanced_dataset()