"""Build the Phase 7.3 scaled caption dataset from local COCO image folders."""

import glob
import json
import os
import random
from pathlib import Path

# Vocabulary scored by count_adjectives().
# NOTE(review): this list is not the same as CAPTION_ADJECTIVES below --
# 'stunning' and 'breathtaking' can appear in generated captions but are
# never counted, while 'richly' and 'detailed' are counted but never
# generated. Kept as-is because the intended metric is not visible here;
# confirm whether the two vocabularies should be unified.
COUNTED_ADJECTIVES = (
    'vivid', 'gleaming', 'rugged', 'tranquil', 'velvety', 'golden',
    'richly', 'detailed', 'cinematic', 'dramatic', 'vibrant', 'serene',
    'majestic', 'luminous', 'textured', 'atmospheric', 'expressive',
)

# Adjectives randomly sampled into each generated caption.
CAPTION_ADJECTIVES = (
    "vivid", "gleaming", "rugged", "tranquil", "velvety", "golden",
    "cinematic", "dramatic", "vibrant", "serene", "majestic", "luminous",
    "textured", "atmospheric", "expressive", "stunning", "breathtaking",
)

# Scene descriptors randomly chosen for each generated caption.
SCENE_TYPES = (
    "landscape", "portrait", "street scene", "urban environment",
    "natural setting", "indoor space", "architectural detail",
    "still life", "action shot", "candid moment",
)

# Folders scanned for ``*.jpg`` files when the caller supplies none.
DEFAULT_IMAGE_DIRS = (
    "/home/ubuntu/data/coco/train2017_5k",
    "/data/coco/train2017",
    "/home/ubuntu/data/coco/train2017",
)

DEFAULT_OUTPUT_PATH = "phase7/phase7_3_dataset.json"


def count_adjectives(text):
    """Return how many entries of COUNTED_ADJECTIVES occur in *text*.

    Matching is case-insensitive and by substring, so each vocabulary
    entry contributes at most 1 no matter how often it appears, and an
    entry embedded in a longer word (e.g. 'vivid' in 'vividly') counts.
    """
    lowered = text.lower()  # lower-case once rather than once per adjective
    return sum(1 for adj in COUNTED_ADJECTIVES if adj in lowered)


def _find_images(image_dirs):
    """Return the sorted, de-duplicated ``*.jpg`` paths found in *image_dirs*."""
    found = set()
    for directory in image_dirs:
        # Escape the directory so only the trailing "*.jpg" acts as a pattern.
        pattern = os.path.join(glob.escape(str(directory)), "*.jpg")
        found.update(glob.glob(pattern))
    # Sorting makes the dataset order stable; raw set order depends on
    # per-process string hashing.
    return sorted(found)


def _make_caption(img_path, rng):
    """Build one randomized descriptive caption for the image at *img_path*."""
    num_adjectives = rng.randint(3, 6)
    selected_adjs = rng.sample(CAPTION_ADJECTIVES, num_adjectives)
    scene_type = rng.choice(SCENE_TYPES)

    filename = Path(img_path).stem
    if 'coco' in filename:
        subject = "scene"
    else:
        # Turn a file name such as "red_car" into the phrase "red car".
        subject = filename.replace('_', ' ').replace('-', ' ')

    return f"a {', '.join(selected_adjs)} {scene_type} photograph of {subject}"


def _print_stats(dataset, num_images, output_path, augmentations, batch_size):
    """Print summary statistics and up to three sample captions."""
    total_adjectives = sum(item["adjective_count"] for item in dataset)
    # Guard the division: no images found means an empty dataset.
    avg_adjectives = total_adjectives / len(dataset) if dataset else 0.0

    print(f"✅ Phase 7.3 dataset created: {output_path}")
    print("📊 Dataset Stats:")
    print(f" - Total samples: {len(dataset)}")
    print(f" - Unique images: {num_images}")
    print(f" - Average adjectives: {avg_adjectives:.2f}")
    print(f" - Augmentation factor: {augmentations}x")
    print(f" - Expected training steps: {len(dataset) // batch_size}")

    print("📝 Sample captions:")
    for index, item in enumerate(dataset[:3], start=1):
        print(f" {index}: {item['caption']}")


def create_scaled_dataset(
    *,
    image_dirs=None,
    output_path=DEFAULT_OUTPUT_PATH,
    augmentations=3,
    batch_size=8,
    seed=None,
):
    """Create scaled dataset for Phase 7.3.

    Collects ``*.jpg`` files from *image_dirs*, generates *augmentations*
    randomized captions per image, writes the records to *output_path* as
    JSON and prints summary statistics.

    Args:
        image_dirs: Iterable of directories to scan. ``None`` uses
            DEFAULT_IMAGE_DIRS.
        output_path: Destination JSON file; missing parent directories
            are created.
        augmentations: Number of captions generated per image.
        batch_size: Batch size used only for the "expected training
            steps" figure in the printed statistics.
        seed: Optional seed for a private random generator. ``None``
            draws from the module-level ``random`` state instead.

    Returns:
        A list of dicts with keys ``image``, ``caption``,
        ``adjective_count`` and ``augmentation``. The list is empty when
        no images are found.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    print("🔄 Creating Phase 7.3 scaled dataset...")

    rng = random.Random(seed) if seed is not None else random
    search_dirs = DEFAULT_IMAGE_DIRS if image_dirs is None else image_dirs
    all_images = _find_images(search_dirs)
    print(f"📊 Found {len(all_images)} total images for dataset creation")

    dataset = []
    for img_path in all_images:
        for aug_version in range(augmentations):
            caption = _make_caption(img_path, rng)
            dataset.append({
                "image": img_path,
                "caption": caption,
                "adjective_count": count_adjectives(caption),
                "augmentation": aug_version,
            })

    # Create the output folder up front so a fresh checkout does not fail.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, indent=2)

    _print_stats(dataset, len(all_images), output_path, augmentations, batch_size)
    return dataset


if __name__ == "__main__":
    create_scaled_dataset()