""" Visual Narrator 3B - Technical Comparison Demo Updated with accurate, verified claims (January 2026) Hugging Face Space: https://huggingface.co/spaces/Ytgetahun/visual-narrator-comparison """ import gradio as gr # ============================================================================= # VERIFIED BENCHMARK DATA (Source: Accurate Benchmark Claims Jan 2026) # ============================================================================= SPEED_DATA = { "Visual Narrator 3B": { "latency_ms": 2.4, "real_time": True, "fps_capable": 400, }, "GPT-4 Turbo": { "latency_ms": 2344, "real_time": False, "fps_capable": 0.43, }, "Claude Opus": { "latency_ms": 3536, "real_time": False, "fps_capable": 0.28, }, } QUALITY_DATA = { "Visual Narrator 3B": {"adj_density": 2.0, "semantic_accuracy": 71.6}, "Claude Sonnet 4.5": {"adj_density": 2.0, "semantic_accuracy": 64.2}, "GPT-4 Turbo": {"adj_density": 2.0, "semantic_accuracy": 66.8}, } COST_DATA = { "Visual Narrator": {"per_frame": 0.00, "monthly_1m": 900}, "GPT-4 Vision": {"per_frame": 0.083, "monthly_1m": 83000}, "Claude Vision": {"per_frame": 0.252, "monthly_1m": 252000}, } # ============================================================================= # UI COMPONENTS # ============================================================================= def create_speed_comparison(): """Generate speed benchmark comparison.""" vn = SPEED_DATA["Visual Narrator 3B"] gpt4 = SPEED_DATA["GPT-4 Turbo"] claude = SPEED_DATA["Claude Opus"] speed_vs_gpt4 = gpt4["latency_ms"] / vn["latency_ms"] speed_vs_claude = claude["latency_ms"] / vn["latency_ms"] return f""" ## Video-to-Text Speed Benchmark **What's measured:** End-to-end latency from video frame input to narration output. | Model | Latency | Speed vs VN | Real-Time? | |-------|---------|-------------|------------| | **Visual Narrator 3B** | **{vn['latency_ms']}ms** | — | Yes ({vn['fps_capable']}+ FPS) | | GPT-4 Turbo | {gpt4['latency_ms']:,}ms | {speed_vs_gpt4:.0f}x slower | No | | Claude Opus | {claude['latency_ms']:,}ms | {speed_vs_claude:.0f}x slower | No | ### What This Proves Visual Narrator can narrate **live video in real-time**. Competitor APIs are limited to batch/offline processing due to 2-3 second latency. ### Methodology Note - Visual Narrator: Local/edge GPU inference - Competitors: Cloud API round-trip (includes network latency) - This reflects real-world deployment conditions """ def create_quality_comparison(): """Generate quality benchmark comparison.""" vn = QUALITY_DATA["Visual Narrator 3B"] claude = QUALITY_DATA["Claude Sonnet 4.5"] gpt4 = QUALITY_DATA["GPT-4 Turbo"] return f""" ## Text-to-Text Quality Benchmark **What's measured:** Descriptive language richness (adjectives per description). | Model | Adj/Description | Semantic Accuracy | |-------|-----------------|-------------------| | **Visual Narrator 3B** | **{vn['adj_density']}** | **{vn['semantic_accuracy']}%** | | Claude Sonnet 4.5 | {claude['adj_density']} | {claude['semantic_accuracy']}% | | GPT-4 Turbo | {gpt4['adj_density']} | {gpt4['semantic_accuracy']}% | ### What This Proves Our language generation quality **matches Claude-tier output**. We don't sacrifice quality for speed. ### Historical Context Early training achieved 3.62 adj/desc (+81% vs Claude). We **intentionally reduced to 2.0** after determining higher density = "fluff". Claude's 2.0 was the **correct quality target**, not something to exceed. """ def create_cost_comparison(): """Generate cost comparison.""" vn = COST_DATA["Visual Narrator"] gpt4 = COST_DATA["GPT-4 Vision"] claude = COST_DATA["Claude Vision"] savings_vs_gpt4 = gpt4["monthly_1m"] / vn["monthly_1m"] savings_vs_claude = claude["monthly_1m"] / vn["monthly_1m"] return f""" ## Economics at Scale **Scenario:** Processing 1 million videos per month | Provider | Cost/Frame | Monthly Cost (1M videos) | |----------|------------|--------------------------| | **Visual Narrator** | **${vn['per_frame']:.2f}** | **${vn['monthly_1m']:,}** (fixed) | | GPT-4 Vision | ${gpt4['per_frame']:.3f} | ${gpt4['monthly_1m']:,} | | Claude Vision | ${claude['per_frame']:.3f} | ${claude['monthly_1m']:,} | ### Cost Advantage - **{savings_vs_gpt4:.0f}x cheaper** than GPT-4 Vision - **{savings_vs_claude:.0f}x cheaper** than Claude Vision - **Zero marginal cost** per additional frame (fixed infrastructure) """ def create_summary(): """Generate executive summary.""" return """ ## The Honest Pitch ### What We Don't Claim - We don't claim to "beat" Claude on language quality - We don't claim "trillion-parameter" comparisons - We don't claim adjective density superiority ### What We Do Claim (Verified) - We **MATCH** premium API quality (2.0 adj/desc) - We process video **976x faster** (2.4ms vs 2,344ms) - We enable **real-time markets** competitors cannot serve - We cost **90-280x less** at scale ### The Unlock > Real-time video narration at premium quality— > a combination no API-based competitor can match. This enables: live broadcasting, streaming accessibility, real-time content creation—markets that API latency blocks. """ def create_sample_output(): """Show sample model output.""" return """ ## Sample Output **Input:** Video frame of urban night scene **Visual Narrator 3B:** > "A sleek automobile navigates the urban landscape at night, > neon lights reflecting off wet pavement as pedestrians > move through crosswalks beneath glowing storefronts." **Characteristics:** - Professional narrative flow - Appropriate descriptive density (not over-decorated) - Spatial and temporal awareness - Suitable for audio description / accessibility """ # ============================================================================= # GRADIO INTERFACE # ============================================================================= with gr.Blocks( title="Visual Narrator 3B - Technical Comparison", theme=gr.themes.Soft() ) as demo: gr.Markdown(""" # Visual Narrator 3B - Technical Comparison **Matching Premium Quality at Real-Time Speed** A specialized 3B parameter model that matches Claude-quality descriptions while enabling real-time video narration that API-based models cannot achieve. --- ### Understanding the Two Benchmark Types | Benchmark | What's Measured | Our Advantage | |-----------|-----------------|---------------| | **Video-to-Text (Speed)** | Frame processing latency | 976x faster | | **Text-to-Text (Quality)** | Descriptive language richness | Parity with Claude | These measure **different capabilities** and should not be conflated. --- """) with gr.Tabs(): with gr.Tab("Speed Benchmark"): gr.Markdown(create_speed_comparison()) with gr.Tab("Quality Benchmark"): gr.Markdown(create_quality_comparison()) with gr.Tab("Cost Analysis"): gr.Markdown(create_cost_comparison()) with gr.Tab("Sample Output"): gr.Markdown(create_sample_output()) with gr.Tab("Summary"): gr.Markdown(create_summary()) gr.Markdown(""" --- ### Links - [Model Repository](https://huggingface.co/Ytgetahun/visual-narrator-llm) - [Full Documentation](https://github.com/yonnastgetahun/visual-narrator-docs) ### Methodology All claims verified and documented in "Visual Narrator 3B - Accurate Benchmark Claims (Jan 2026)" --- *Last updated: January 2026* """) # ============================================================================= # LAUNCH # ============================================================================= if __name__ == "__main__": demo.launch()