| |
| """ |
| Oculus 0.2 Unified Demo |
| |
| Demonstrates all features of the unified Oculus model: |
| - Text mode (captioning, VQA) |
| - Point mode (counting objects) |
| - Box mode (detection with bounding boxes) |
| - Polygon mode (segmentation) |
| - Optional reasoning with thinking traces |
| - Focus system for fine-grained perception |
| """ |
|
|
| import os |
| import sys |
| import requests |
| from pathlib import Path |
| from io import BytesIO |
|
|
| from PIL import Image |
| import torch |
|
|
| |
| sys.path.insert(0, str(Path(__file__).parent)) |
|
|
| from oculus_unified_model import OculusForConditionalGeneration, OculusConfig |
|
|
|
|
def download_image(url: str) -> Image.Image:
    """Fetch the image at ``url`` and return it as an RGB PIL image.

    The request carries a browser-like ``User-Agent`` header and a
    10-second timeout.

    Args:
        url: Address of the image to download.

    Returns:
        The decoded image, converted to ``'RGB'`` mode.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    reply = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=10,
    )
    reply.raise_for_status()

    # Decode straight from memory; nothing is written to disk.
    payload = BytesIO(reply.content)
    decoded = Image.open(payload)
    return decoded.convert('RGB')
|
|
|
|
def print_header(title: str):
    """Print ``title`` as a banner between two 70-character ``=`` rules.

    A blank line is emitted before the banner and the title line is
    prefixed with the 🔮 marker.
    """
    rule = "=" * 70
    # One call, newline-separated: blank line, rule, title line, rule.
    print("", rule, f"🔮 {title}", rule, sep="\n")
|
|
|
|
def print_section(title: str):
    """Print ``title`` as a sub-heading between two 70-character ``─`` rules.

    A blank line is emitted before the first rule and the title is
    printed with a leading space.
    """
    divider = "─" * 70
    # One call, newline-separated: blank line, divider, title line, divider.
    print("", divider, f" {title}", divider, sep="\n")
|
|
|
|
# Images exercised by the demo; each entry has a display name and a source URL.
_TEST_IMAGES = (
    {
        "name": "Cat on Couch",
        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg",
    },
    {
        "name": "Golden Gate Bridge",
        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/GoldenGateBridge-001.jpg/1200px-GoldenGateBridge-001.jpg",
    },
)

# Longest thinking-trace prefix shown before the preview is cut off.
_THINKING_PREVIEW_CHARS = 200

# Closing text printed verbatim once every test image has been processed.
_USAGE_SUMMARY = """
Oculus 0.2 supports:

📝 TEXT MODE
- Image captioning
- Visual question answering
- With optional reasoning traces

📍 POINT MODE
- Object counting
- Point localization

📦 BOX MODE
- Object detection
- Bounding box prediction

🔷 POLYGON MODE
- Semantic segmentation
- Instance segmentation

🧠 REASONING
- Optional thinking traces
- Multi-step reasoning

🔍 FOCUS SYSTEM
- Zoom & crop for fine-grained perception
- Automatic region detection

Usage:
```python
from oculus_unified_model import OculusForConditionalGeneration

model = OculusForConditionalGeneration.from_pretrained("./checkpoints/oculus_coco/final")

# Caption
output = model.generate(image, mode="text", prompt="Describe this")

# VQA with reasoning
output = model.generate(image, mode="text", prompt="What color is it?", think=True)

# Detection
output = model.generate(image, mode="box", prompt="Find cars")

# Segmentation
output = model.generate(image, mode="polygon")
```
"""


def _load_model():
    """Load the checkpoint next to this script, or build a default model."""
    weights_path = Path(__file__).parent / "checkpoints" / "oculus_coco" / "final"

    if weights_path.exists():
        print(f" Found trained weights at: {weights_path}")
        return OculusForConditionalGeneration.from_pretrained(weights_path)

    print(" Using default configuration")
    config = OculusConfig(
        reasoning_enabled=True,
        enable_focus=True,
    )
    return OculusForConditionalGeneration(config)


def _demo_captioning(model, image):
    """Run text mode without reasoning and print the caption."""
    print_section("📝 TEXT MODE - Captioning")

    output = model.generate(
        image=image,
        prompt="Describe this image in detail",
        mode="text",
        think=False,
    )
    print(f" Caption: \"{output.text}\"")


def _demo_reasoning(model, image):
    """Run text mode with ``think=True`` and print the trace preview and answer."""
    print_section("🧠 TEXT MODE - With Reasoning")

    output = model.generate(
        image=image,
        prompt="What is the main subject of this image?",
        mode="text",
        think=True,
    )

    trace = output.thinking_trace
    if trace:
        # Only mark the preview with "..." when something was actually cut;
        # a short trace is shown in full without a misleading ellipsis.
        if len(trace) > _THINKING_PREVIEW_CHARS:
            preview = f"{trace[:_THINKING_PREVIEW_CHARS]}..."
        else:
            preview = f"{trace}"
        print(f" 💭 Thinking: {preview}")
    print(f" Answer: \"{output.text}\"")


def _demo_vqa(model, image):
    """Ask a fixed list of questions in text mode and print each Q/A pair."""
    print_section("❓ TEXT MODE - VQA")

    questions = (
        "What colors are visible in this image?",
        "Is this indoors or outdoors?",
    )
    for question in questions:
        output = model.generate(
            image=image,
            prompt=question,
            mode="text",
        )
        print(f" Q: {question}")
        print(f" A: {output.text}")


def _demo_points(model, image):
    """Run point mode and print the point count plus the first five points."""
    print_section("📍 POINT MODE - Object Counting")

    output = model.generate(
        image=image,
        prompt="Find objects",
        mode="point",
    )

    print(f" Detected {len(output.points)} points")
    top = zip(output.points[:5], output.labels[:5], output.confidences[:5])
    for rank, (pt, label, conf) in enumerate(top, start=1):
        print(f" Point {rank}: {pt} (class={label}, conf={conf:.2f})")


def _demo_boxes(model, image):
    """Run box mode and print the box count plus the first five boxes."""
    print_section("📦 BOX MODE - Object Detection")

    output = model.generate(
        image=image,
        prompt="Detect all objects",
        mode="box",
    )

    print(f" Detected {len(output.boxes)} boxes")
    top = zip(output.boxes[:5], output.labels[:5], output.confidences[:5])
    for rank, (box, label, conf) in enumerate(top, start=1):
        coords = [f'{b:.2f}' for b in box]
        print(f" Box {rank}: {coords} (class={label}, conf={conf:.2f})")


def _demo_polygons(model, image):
    """Run polygon mode and print the mask shape plus the first three regions."""
    print_section("🔷 POLYGON MODE - Segmentation")

    output = model.generate(
        image=image,
        prompt="Segment the scene",
        mode="polygon",
    )

    # The mask may be absent; report 'N/A' instead of failing on ``.shape``.
    mask_shape = output.mask.shape if output.mask is not None else 'N/A'
    print(f" Segmentation mask shape: {mask_shape}")
    print(f" Detected {len(output.polygons)} regions")
    for rank, (poly, label) in enumerate(
        zip(output.polygons[:3], output.labels[:3]), start=1
    ):
        print(f" Region {rank}: class={label}, vertices={len(poly)}")


# Per-image demos, executed in this order for every test image.
_MODE_DEMOS = (
    _demo_captioning,
    _demo_reasoning,
    _demo_vqa,
    _demo_points,
    _demo_boxes,
    _demo_polygons,
)


def demo() -> None:
    """Run the full Oculus demo: load the model, then exercise every mode.

    For each entry in ``_TEST_IMAGES`` the image is downloaded and passed
    through the captioning, reasoning, VQA, point, box and polygon demos.
    A failure on one image (download or any mode) is reported with its
    traceback and the run continues with the next image. A static feature
    and usage summary is printed at the end.
    """
    import traceback

    print_header("OCULUS 0.2 UNIFIED MODEL DEMO")

    print("\n[1] Loading Oculus Model...")
    model = _load_model()
    print(" ✓ Model loaded!")

    for test in _TEST_IMAGES:
        print_header(f"Testing: {test['name']}")

        # Deliberately broad: this is a best-effort demo, so any error on one
        # image is shown and the remaining images still run.
        try:
            print("\n[Downloading image...]")
            image = download_image(test["url"])
            print(f" Image size: {image.size}")

            for run_mode in _MODE_DEMOS:
                run_mode(model, image)

            print("\n ✅ All modes successful!")
        except Exception as e:
            print(f"\n ❌ Error: {e}")
            traceback.print_exc()

    print_header("DEMO COMPLETE")
    print(_USAGE_SUMMARY)
|
|
|
|
# Script entry point: run the demo only on direct execution, not on import.
if __name__ == "__main__":
    demo()
|
|