"""Benchmark how many descriptive adjectives a fine-tuned BLIP model emits per image category."""

import json
import os

import pandas as pd
import torch
from PIL import Image
from transformers import BlipForConditionalGeneration, BlipProcessor

# Descriptive vocabulary searched for in each generated caption.
ADJECTIVES = (
    'vivid', 'gleaming', 'rugged', 'tranquil', 'velvety', 'golden',
    'richly', 'detailed', 'cinematic', 'dramatic', 'vibrant', 'serene',
    'majestic', 'luminous', 'textured', 'atmospheric', 'expressive',
)

# Defaults used when benchmark_by_category() is called without arguments.
DEFAULT_MODEL_PATH = "outputs/phase7_3_large_scale/checkpoint-step-5000-1762322982"
DEFAULT_IMAGE_DIR = "/data/coco/train2017"
DEFAULT_OUTPUT_PATH = "benchmark_category_results.json"

# Category name -> image file names, resolved relative to the image directory.
CATEGORY_IMAGES = {
    "Landscapes": ["coco_downloaded_00000000.jpg", "coco_downloaded_00000017.jpg"],
    "Portraits": ["coco_downloaded_00000001.jpg", "coco_downloaded_00000002.jpg"],
    "Urban Scenes": ["coco_downloaded_00000003.jpg", "coco_downloaded_00000010.jpg"],
    "Objects": ["img_001.jpg", "img_015.jpg", "img_020.jpg"],
    "Indoor Scenes": ["img_024.jpg", "img_028.jpg"],
}


def count_adjectives(text: str) -> int:
    """Return how many distinct entries of ADJECTIVES occur in *text*.

    Matching is a case-insensitive substring test, so an entry also matches
    inside a longer word (e.g. 'vivid' matches "vividly"), and each entry is
    counted at most once no matter how often it appears.
    """
    lowered = text.lower()  # lower-case once instead of once per adjective
    return sum(1 for adj in ADJECTIVES if adj in lowered)


def _generate_caption(processor, model, image_path, device):
    """Return the caption *model* generates for the image stored at *image_path*."""
    # convert() forces the pixel data to load, so the file handle can be
    # released immediately instead of staying open until garbage collection.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(images=image, return_tensors="pt").to(device)
    # Mixed precision is only requested when the model actually runs on CUDA.
    with torch.amp.autocast(device_type=device, enabled=device == "cuda"):
        outputs = model.generate(**inputs, max_length=50)
    return processor.decode(outputs[0], skip_special_tokens=True)


def _print_category_analysis(results):
    """Print the categories ranked by average adjective count, plus best and worst."""
    df = pd.DataFrame(results).sort_values('avg_adjectives', ascending=False)

    print("\nšŸ“ˆ Performance by Category (Ranked):")
    # sort_values keeps the original index labels, so the rank has to come
    # from the iteration position rather than from the DataFrame index.
    for rank, (_, row) in enumerate(df.iterrows(), start=1):
        print(f"{rank}. {row['category']:15} | Avg Adjectives: {row['avg_adjectives']:5.2f} | "
              f"Samples: {row['samples']}")

    print("\nšŸŽØ Best Performing Categories:")
    best_cat = df.iloc[0]
    print(f" šŸ„‡ {best_cat['category']}: {best_cat['avg_adjectives']:.2f} adjectives")
    print(f" šŸ“ Sample: '{best_cat['sample_caption']}'")

    print(f"\nšŸ“‰ Most Challenging Categories:")
    worst_cat = df.iloc[-1]
    print(f" šŸ“ {worst_cat['category']}: {worst_cat['avg_adjectives']:.2f} adjectives")
    print(f" šŸ“ Sample: '{worst_cat['sample_caption']}'")


def benchmark_by_category(model_path=DEFAULT_MODEL_PATH,
                          image_dir=DEFAULT_IMAGE_DIR,
                          output_path=DEFAULT_OUTPUT_PATH):
    """Benchmark across different image categories.

    Loads the BLIP checkpoint at *model_path*, captions every image listed in
    CATEGORY_IMAGES that exists under *image_dir*, counts adjectives in each
    caption with count_adjectives(), prints per-category and ranked summaries,
    and writes the per-category results to *output_path* as JSON.

    Args:
        model_path: Directory passed to ``from_pretrained`` for both the
            processor and the model.
        image_dir: Directory in which the category image files are looked up.
        output_path: Destination of the JSON results file.

    Returns:
        A list with one dict per category that produced at least one caption,
        with keys 'category', 'avg_adjectives', 'samples' and 'sample_caption'.
    """
    print("šŸŽÆ CATEGORY-SPECIFIC BENCHMARKING")
    print("=" * 60)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load our best model
    our_processor = BlipProcessor.from_pretrained(model_path)
    our_model = BlipForConditionalGeneration.from_pretrained(model_path).to(device)

    results = []

    for category, image_files in CATEGORY_IMAGES.items():
        print(f"\nšŸ“Š Testing {category}:")
        print("-" * 40)

        category_adjectives = []
        category_captions = []

        for img_file in image_files:
            img_path = os.path.join(image_dir, img_file)
            if not os.path.exists(img_path):
                # Report the skip so a wrong image directory is not mistaken
                # for a category that simply produced no output.
                print(f" Skipping {img_file}: not found at {img_path}")
                continue

            # Best effort: one unreadable image or failed generation must not
            # abort the whole benchmark, so the error is reported and skipped.
            try:
                caption = _generate_caption(our_processor, our_model, img_path, device)
            except Exception as e:
                print(f" āŒ Error with {img_file}: {e}")
                continue

            adj_count = count_adjectives(caption)
            category_adjectives.append(adj_count)
            category_captions.append(caption)

            print(f" šŸ–¼ļø {img_file}: {adj_count} adjectives")
            print(f" '{caption}'")

        if category_adjectives:
            avg_adj = sum(category_adjectives) / len(category_adjectives)
            results.append({
                'category': category,
                'avg_adjectives': avg_adj,
                'samples': len(category_adjectives),
                'sample_caption': category_captions[0],
            })
            print(f" šŸ“ˆ Category Average: {avg_adj:.2f} adjectives")

    # Generate category analysis
    print("\n" + "=" * 60)
    print("šŸ† CATEGORY PERFORMANCE ANALYSIS")
    print("=" * 60)

    if results:
        _print_category_analysis(results)
    else:
        print("No images could be processed; there is nothing to rank.")

    # Save category results
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"\nšŸ’¾ Category results saved to: {output_path}")
    return results


if __name__ == "__main__":
    benchmark_by_category()