"""Build a synthetic, adjective-dense image-caption training dataset."""

import os
import json
import glob
import random
from pathlib import Path

# Adjectives sampled when synthesising captions.
ADJECTIVE_POOL = [
    "vivid", "gleaming", "rugged", "tranquil", "velvety", "golden",
    "cinematic", "dramatic", "vibrant", "serene", "majestic", "luminous",
    "textured", "atmospheric", "expressive", "stunning", "breathtaking",
    "captivating", "mesmerizing", "radiant", "glowing", "sparkling",
]

# Words recognised by count_adjectives(): every adjective the generator can
# emit plus two extra descriptors.  Deriving this from ADJECTIVE_POOL keeps
# the reported adjective counts in sync with the captions that are produced.
COUNTED_ADJECTIVES = tuple(ADJECTIVE_POOL) + ("richly", "detailed")

SCENE_TYPES = [
    "landscape", "portrait", "street scene", "urban environment",
    "natural setting", "indoor space", "architectural detail",
    "still life", "action shot", "candid moment",
]

DEFAULT_SEARCH_PATHS = [
    "/data/coco/train2017",
    "/home/ubuntu/data/coco/train2017",
    "dummy_images",
    "/home/ubuntu/visual-narrator-llm",
]
DEFAULT_OUTPUT_PATH = "phase7/synth_train_proper.json"

SMALL_DATASET_THRESHOLD = 50  # below this many images a warning is printed
BATCH_SIZE = 8                # used only for the "expected steps" statistic
ACCESS_CHECK_LIMIT = 10       # how many leading samples are spot-checked


def count_adjectives(text):
    """Return how many distinct known adjectives occur in *text*.

    Matching is a case-insensitive substring test, so each adjective in
    ``COUNTED_ADJECTIVES`` contributes at most 1 no matter how often it
    appears.

    Args:
        text: The caption (or any string) to inspect.

    Returns:
        The number of recognised adjectives found in ``text``.
    """
    lowered = text.lower()  # lower-case once instead of once per adjective
    return sum(1 for adj in COUNTED_ADJECTIVES if adj in lowered)


def _find_unique_images(search_paths):
    """Collect ``*.jpg``/``*.png`` files directly under each existing path.

    Files are de-duplicated by base name; the first occurrence wins.
    """
    all_images = []
    for path in search_paths:
        if not os.path.exists(path):
            continue
        # Escape glob metacharacters in the directory name and sort the
        # matches so the resulting dataset order is reproducible.
        escaped = glob.escape(str(path))
        jpg_files = sorted(glob.glob(f"{escaped}/*.jpg"))
        png_files = sorted(glob.glob(f"{escaped}/*.png"))
        all_images.extend(jpg_files)
        all_images.extend(png_files)
        print(f"📁 Found {len(jpg_files) + len(png_files)} images in {path}")

    unique_images = []
    seen_names = set()
    for img_path in all_images:
        img_name = os.path.basename(img_path)
        if img_name not in seen_names and os.path.exists(img_path):
            unique_images.append(img_path)
            seen_names.add(img_name)
    return unique_images


def _build_samples(images, augmentations_per_image, rng):
    """Create ``augmentations_per_image`` synthetic captions for each image."""
    samples = []
    for img_path in images:
        subject = Path(img_path).stem.replace('_', ' ')
        for aug_version in range(augmentations_per_image):
            num_adjectives = rng.randint(3, 6)  # 3-6 adjectives per caption
            selected_adjs = rng.sample(ADJECTIVE_POOL, num_adjectives)
            scene_type = rng.choice(SCENE_TYPES)
            caption = (
                f"a {', '.join(selected_adjs)} {scene_type} "
                f"photograph of {subject}"
            )
            samples.append({
                "image": img_path,
                "caption": caption,
                "adjective_count": count_adjectives(caption),
                "augmentation": aug_version,
            })
    return samples


def create_proper_dataset(search_paths=None, output_path=DEFAULT_OUTPUT_PATH,
                          augmentations_per_image=3, seed=None):
    """Create a training dataset from all available images and save it as JSON.

    Every unique image found under ``search_paths`` receives
    ``augmentations_per_image`` randomly generated, adjective-rich captions.
    The samples are written to ``output_path`` (parent directories are
    created when missing) and summary statistics are printed.

    Args:
        search_paths: Directories to scan for ``*.jpg``/``*.png`` files.
            Defaults to ``DEFAULT_SEARCH_PATHS``.  Missing directories are
            skipped.
        output_path: Destination JSON file.
        augmentations_per_image: Number of captions generated per image.
        seed: Optional seed for reproducible captions.  When ``None`` the
            shared module-level ``random`` generator is used.

    Returns:
        A list of dicts with the keys ``image``, ``caption``,
        ``adjective_count`` and ``augmentation``.  The list is empty when no
        images are found.
    """
    if search_paths is None:
        search_paths = DEFAULT_SEARCH_PATHS
    rng = random if seed is None else random.Random(seed)

    print("🔄 Creating proper training dataset...")

    unique_images = _find_unique_images(search_paths)
    print(f"📊 Total unique images: {len(unique_images)}")

    if len(unique_images) < SMALL_DATASET_THRESHOLD:
        print("⚠️ Warning: Dataset is still small. We'll use data augmentation.")

    dataset = _build_samples(unique_images, augmentations_per_image, rng)
    print(f"📈 Created {len(dataset)} training samples (augmented)")

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding="utf-8") as f:
        json.dump(dataset, f, indent=2)

    # Guard the average against an empty dataset (no images found).
    total_adjectives = sum(item["adjective_count"] for item in dataset)
    avg_adjectives = total_adjectives / len(dataset) if dataset else 0.0

    print("✅ PROPER DATASET CREATED")
    print("📊 Final Stats:")
    print(f" - Total samples: {len(dataset)}")
    print(f" - Unique images: {len(unique_images)}")
    print(f" - Average adjectives: {avg_adjectives:.2f} per caption")
    print(f" - Augmentation factor: {augmentations_per_image}x")
    print(f" - Expected training steps: {len(dataset) // BATCH_SIZE} (batch_size={BATCH_SIZE})")

    # Spot-check that the first few referenced images can still be found.
    print("🔍 Verifying image accessibility...")
    checked = dataset[:ACCESS_CHECK_LIMIT]
    accessible = sum(1 for item in checked if os.path.exists(item["image"]))
    print(f" - Images accessible: {accessible}/{len(checked)} checked")

    return dataset


if __name__ == "__main__":
    create_proper_dataset()