| import torch |
| import time |
| import json |
| import pandas as pd |
| from transformers import BlipProcessor, BlipForConditionalGeneration |
| from PIL import Image |
| import glob |
| import os |
| from datetime import datetime |
|
|
def count_adjectives_comprehensive(text):
    """Count how many distinct words from a fixed descriptive lexicon occur in *text*.

    Matching is case-insensitive and performed on whole words, so ``"grand"``
    is not counted inside ``"grandmother"`` and ``"crisp"`` is not counted
    inside ``"crispy"``. Each lexicon entry contributes at most once, however
    often it is repeated.

    Args:
        text: Caption to inspect. ``None`` or an empty string counts as zero.

    Returns:
        The number of distinct lexicon words present in *text*.
    """
    adjectives = {
        'vivid', 'gleaming', 'rugged', 'tranquil', 'velvety', 'golden',
        'richly', 'detailed', 'cinematic', 'dramatic', 'vibrant', 'serene',
        'majestic', 'luminous', 'textured', 'atmospheric', 'expressive',
        'stunning', 'breathtaking', 'captivating', 'mesmerizing', 'radiant',
        'glowing', 'sparkling', 'pristine', 'ethereal', 'soothing', 'dynamic',
        'brilliant', 'crisp', 'elegant', 'exquisite', 'gorgeous', 'grand',
        'impressive', 'luxurious', 'opulent', 'picturesque', 'refined',
        'splendid', 'sumptuous', 'superb', 'tasteful', 'aesthetic',
    }
    if not text:
        return 0

    # Turn every non-letter into a separator so punctuation and hyphens
    # ("richly-detailed", "vivid,") do not glue onto neighbouring words.
    lowered = text.lower()
    words = set("".join(ch if ch.isalpha() else " " for ch in lowered).split())
    return len(adjectives & words)
|
|
def create_baseline_model(baseline_path="models/blip-base-local"):
    """Load the original BLIP processor and model from a local directory.

    Args:
        baseline_path: Directory holding the locally saved baseline weights.
            Defaults to the location this script has always used.

    Returns:
        A ``(processor, model, display_name)`` tuple, where the display name
        is ``"BLIP-Baseline (Original)"``.

    Raises:
        FileNotFoundError: If *baseline_path* does not exist.
    """
    print("🔄 Creating baseline model from original weights...")

    # Fail early, and name the missing path, before touching transformers.
    if not os.path.exists(baseline_path):
        raise FileNotFoundError(f"Baseline model not found: {baseline_path}")

    # local_files_only keeps the load offline.
    processor = BlipProcessor.from_pretrained(baseline_path, local_files_only=True)
    model = BlipForConditionalGeneration.from_pretrained(baseline_path, local_files_only=True)
    return processor, model, "BLIP-Baseline (Original)"
|
|
def _checkpoint_order_key(path):
    """Return a sort key ordering checkpoint directories by their numeric name parts."""
    name = os.path.basename(os.path.normpath(path))
    # Compare numbers as integers so "epoch-10" ranks after "epoch-9".
    numbers = tuple(int(part) for part in name.split("-") if part.isdigit())
    return numbers, name


def _load_local_blip(model_path, device):
    """Load a BLIP processor/model pair from a local directory and move the model to *device*."""
    processor = BlipProcessor.from_pretrained(model_path, local_files_only=True)
    model = BlipForConditionalGeneration.from_pretrained(model_path, local_files_only=True)
    return processor, model.to(device)


def _report_benchmark(model_name, processor, model, test_images, device):
    """Benchmark one model, print its headline metrics, and return the metrics dict (or None)."""
    stats = run_benchmark(model_name, processor, model, test_images, device)
    if stats:
        print(f" ✅ Adjectives: {stats['avg_adjectives']:.2f}")
        print(f" ⚡ Speed: {stats['avg_inference_ms']:.1f}ms")
        print(f" 📊 Samples: {stats['samples_tested']}")
    return stats


def benchmark_comprehensive(image_glob="/data/coco/train2017/*.jpg", max_images=15):
    """Benchmark the fine-tuned, baseline and Phase 7.2 BLIP models offline.

    Each model is loaded from local files, run over the same image sample via
    ``run_benchmark`` and summarised. A model that fails to load or run is
    reported and skipped. When at least two models produce results they are
    handed to ``generate_benchmark_analysis``.

    Args:
        image_glob: Glob pattern selecting candidate test images.
        max_images: Maximum number of images (taken in sorted path order) to
            benchmark on.

    Returns:
        None. Progress and results are printed.
    """
    print("🎯 VISUAL NARRATOR VLM - OFFLINE BENCHMARKING")
    print("=" * 70)
    print(f"📅 {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"🖥️ Device: {device}")

    # glob order is filesystem-dependent; sort so every run uses the same sample.
    test_images = sorted(glob.glob(image_glob))[:max_images]
    print(f"🖼️ Test images: {len(test_images)}")

    if not test_images:
        # Nothing to measure: skip the expensive model loads entirely.
        print(f"\n❌ No test images matched {image_glob}")
        return

    results = []

    print("\n1. 🎭 OUR VISUAL NARRATOR VLM")
    print("-" * 40)

    try:
        our_model_path = "outputs/phase7_3_large_scale/checkpoint-step-5000-1762322982"
        processor, model = _load_local_blip(our_model_path, device)
        stats = _report_benchmark("Visual-Narrator-VLM", processor, model, test_images, device)
        if stats:
            results.append(stats)
    except Exception as e:  # best-effort: one broken model must not stop the others
        print(f"❌ Our model failed: {e}")

    print("\n2. 📊 BASELINE MODEL (Original BLIP)")
    print("-" * 40)

    try:
        processor, model, base_name = create_baseline_model()
        stats = _report_benchmark(base_name, processor, model.to(device), test_images, device)
        if stats:
            results.append(stats)
    except Exception as e:
        print(f"❌ Baseline failed: {e}")

    print("\n3. 📈 PHASE 7.2 MODEL (Previous Best)")
    print("-" * 40)

    try:
        phase7_2_ckpts = glob.glob("outputs/phase7_optimized/checkpoint-epoch-*")
        if phase7_2_ckpts:
            # Pick the latest checkpoint numerically, not lexicographically.
            phase7_2_path = max(phase7_2_ckpts, key=_checkpoint_order_key)
            processor, model = _load_local_blip(phase7_2_path, device)
            stats = _report_benchmark("Phase7.2-Model", processor, model, test_images, device)
            if stats:
                results.append(stats)
        else:
            print("❌ No Phase 7.2 checkpoints found")
    except Exception as e:
        print(f"❌ Phase 7.2 model failed: {e}")

    # A comparison needs at least two successfully benchmarked models.
    if len(results) >= 2:
        generate_benchmark_analysis(results)
    else:
        print("\n❌ Insufficient results for meaningful comparison")
|
|
def run_benchmark(model_name, processor, model, test_images, device):
    """Caption each test image with one model and summarise adjective use and latency.

    An image that fails to load or caption is reported and skipped; it does
    not abort the run.

    Args:
        model_name: Label stored in the result and shown in log output.
        processor: Object called as ``processor(images=..., return_tensors="pt")``
            and providing ``decode`` (presumably a ``BlipProcessor``).
        model: Object providing ``generate`` (presumably a BLIP captioning model
            already placed on *device*).
        test_images: Iterable of image file paths.
        device: Device the inputs are moved to, e.g. ``"cuda"`` or ``"cpu"``.

    Returns:
        A dict with keys ``model``, ``avg_adjectives``, ``max_adjectives``,
        ``min_adjectives``, ``avg_inference_ms``, ``samples_tested``,
        ``sample_captions`` (first three) and ``all_captions``; or ``None``
        when no image could be captioned.
    """
    print(f" 🧪 Testing {model_name}...")

    on_cuda = str(device).startswith("cuda")
    # Only enable mixed precision on CUDA; on CPU the context is a disabled no-op.
    autocast_device = "cuda" if on_cuda else "cpu"

    adjective_counts = []
    inference_times = []
    captions = []

    for img_path in test_images:
        try:
            # Image.open is lazy; convert() reads the pixels before the file
            # handle is closed by the context manager.
            with Image.open(img_path) as img_file:
                image = img_file.convert("RGB")

            if on_cuda:
                torch.cuda.synchronize()  # do not time previously queued GPU work
            start_time = time.perf_counter()
            inputs = processor(images=image, return_tensors="pt").to(device)

            with torch.amp.autocast(autocast_device, enabled=on_cuda):
                outputs = model.generate(
                    **inputs,
                    max_length=60,
                    num_beams=3,
                    early_stopping=True,
                )

            if on_cuda:
                torch.cuda.synchronize()  # wait for the GPU before stopping the clock
            inference_time = time.perf_counter() - start_time
            caption = processor.decode(outputs[0], skip_special_tokens=True)

            adjective_counts.append(count_adjectives_comprehensive(caption))
            inference_times.append(inference_time)
            captions.append(caption)

        except Exception as e:  # best-effort per image: report and move on
            print(f" ❌ Error on {os.path.basename(img_path)}: {e}")
            continue

    if not adjective_counts:
        return None

    sample_count = len(adjective_counts)
    return {
        'model': model_name,
        'avg_adjectives': sum(adjective_counts) / sample_count,
        'max_adjectives': max(adjective_counts),
        'min_adjectives': min(adjective_counts),
        'avg_inference_ms': sum(inference_times) / len(inference_times) * 1000,
        'samples_tested': sample_count,
        'sample_captions': captions[:3],
        'all_captions': captions,
    }
|
|
def generate_benchmark_analysis(results):
    """Print a ranked comparison of benchmark results and save them as JSON.

    Models are ranked by ``avg_adjectives`` (highest first). When both a
    result whose name contains ``"Visual-Narrator"`` and one containing
    ``"Baseline"`` are present, their relative improvement and sample
    captions are shown as well. The full results plus a summary are written
    to ``comprehensive_benchmark_<timestamp>.json`` in the current directory.

    Args:
        results: List of result dicts as produced by ``run_benchmark``.

    Returns:
        None.
    """
    print("\n" + "=" * 70)
    print("📊 COMPREHENSIVE BENCHMARK ANALYSIS")
    print("=" * 70)

    if not results:
        print("\n❌ Insufficient results for meaningful comparison")
        return

    # Renumber rows after sorting so index labels equal rank positions;
    # otherwise ranks and idxmax/idxmin lookups refer to the pre-sort order.
    df = (
        pd.DataFrame(results)
        .sort_values('avg_adjectives', ascending=False)
        .reset_index(drop=True)
    )

    print("\n🏆 PERFORMANCE RANKING:")
    print("-" * 60)
    for rank, (_, row) in enumerate(df.iterrows(), start=1):
        stars = "⭐" * min(5, int(row['avg_adjectives']))
        print(f"{rank}. {row['model']:25} | "
              f"Adjectives: {row['avg_adjectives']:5.2f} {stars:5} | "
              f"Speed: {row['avg_inference_ms']:6.1f}ms")

    our_model = next((r for r in results if 'Visual-Narrator' in r['model']), None)
    baseline = next((r for r in results if 'Baseline' in r['model']), None)

    # 0 is recorded when the comparison pair is missing; None (JSON null) when
    # the baseline averaged zero and a percentage is therefore undefined.
    improvement = 0
    if our_model and baseline:
        print("\n🎯 KEY IMPROVEMENT:")
        print(f" Our VLM: {our_model['avg_adjectives']:.2f} adjectives")
        print(f" Baseline: {baseline['avg_adjectives']:.2f} adjectives")
        if baseline['avg_adjectives']:
            improvement = ((our_model['avg_adjectives'] - baseline['avg_adjectives'])
                           / baseline['avg_adjectives'] * 100)
            print(f" IMPROVEMENT: {improvement:+.1f}% 🚀")
        else:
            improvement = None
            print(" IMPROVEMENT: n/a (baseline averaged 0 adjectives)")

    print("\n🎨 QUALITY SHOWCASE (Our VLM vs Baseline):")
    print("-" * 60)

    if our_model and baseline:
        print("Our Visual Narrator VLM:")
        for i, caption in enumerate(our_model['sample_captions'][:2], 1):
            adj_count = count_adjectives_comprehensive(caption)
            print(f" {i}. [{adj_count} adj] {caption}")

        print("\nBaseline Model:")
        for i, caption in enumerate(baseline['sample_captions'][:2], 1):
            adj_count = count_adjectives_comprehensive(caption)
            print(f" {i}. [{adj_count} adj] {caption}")

    print("\n📈 STATISTICAL ANALYSIS:")
    print(f" Best single caption: {max(r['max_adjectives'] for r in results)} adjectives")
    # idxmax/idxmin return index labels, so look them up with .loc, not .iloc.
    print(f" Most consistent: {df.loc[df['min_adjectives'].idxmax(), 'model']}")
    print(f" Fastest: {df.loc[df['avg_inference_ms'].idxmin(), 'model']}")

    best_model = df.iloc[0]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"comprehensive_benchmark_{timestamp}.json"

    with open(results_file, 'w', encoding='utf-8') as f:
        json.dump({
            'timestamp': timestamp,
            'results': results,
            'summary': {
                'best_model': str(best_model['model']),
                # Plain float keeps the value JSON-serialisable whatever pandas returns.
                'best_score': float(best_model['avg_adjectives']),
                'improvement_over_baseline': improvement,
            },
        }, f, indent=2)

    print(f"\n💾 Detailed results saved to: {results_file}")

    print("\n✅ DEPLOYMENT RECOMMENDATION:")
    print(f" Use {best_model['model']} for production")
    print(f" Performance: {best_model['avg_adjectives']:.2f} adjectives/description")
    print(f" Speed: {best_model['avg_inference_ms']:.1f}ms per image")
    print(" Quality: Exceptional descriptive density 🌟")
|
|
# Script entry point: run the offline benchmark only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    benchmark_comprehensive()
|
|