"""
Official Benchmarks Leaderboard 2026 - Gradio App

A unified leaderboard aggregating scores from 11 official HuggingFace benchmarks.
"""

import tempfile

import gradio as gr
import pandas as pd
from gradio_rangeslider import RangeSlider

from utils.data_loader import (
    load_leaderboard_data,
    get_benchmark_info,
    load_provider_logos,
)
from utils.filters import filter_data, calculate_stats, parse_benchmark_selections
from utils.formatters import format_for_display, create_empty_table, prepare_export_data
from utils.html_generator import generate_leaderboard_html

# Global data cache, populated by initialize_data() / refresh_data().
leaderboard_data = None
provider_logos = None


def initialize_data():
    """Load initial data on app startup.

    Fills both module-level caches (``leaderboard_data`` and
    ``provider_logos``).

    Returns:
        The freshly loaded leaderboard data.
    """
    global leaderboard_data, provider_logos
    leaderboard_data = load_leaderboard_data()
    provider_logos = load_provider_logos()
    return leaderboard_data


def refresh_data():
    """Reload the leaderboard data and notify the user.

    Only ``leaderboard_data`` is reloaded; ``provider_logos`` keeps its
    current value.

    Returns:
        The result of ``gr.Info(...)``.
    """
    global leaderboard_data
    print("Refreshing data from HuggingFace...")
    leaderboard_data = load_leaderboard_data()
    # This function does not rebuild the table itself; re-rendering with the
    # current filters has to be triggered separately by the caller.
    # NOTE(review): gr.Info presumably shows a toast and returns None - confirm
    # against the Gradio version in use.
    return gr.Info("Data refreshed successfully!")


def _filter_for_selection(
    search_term,
    size_range,
    bench_math,
    bench_knowledge,
    bench_coding,
    bench_vision,
    bench_agent,
    bench_language,
):
    """Resolve the current filter widgets into (selected_benchmarks, filtered_df).

    Shared by ``update_table`` and ``export_to_csv`` so both apply exactly
    the same filtering.

    Returns:
        tuple: ``(selected_benchmarks, filtered_df)``. When no benchmark is
        selected, ``filtered_df`` is ``None`` and ``filter_data`` is not
        called.
    """
    # Extract min and max from range slider tuple
    size_min, size_max = size_range

    # Parse benchmark selections from all checkbox groups
    selected_benchmarks = parse_benchmark_selections(
        bench_math,
        bench_knowledge,
        bench_coding,
        bench_vision,
        bench_agent,
        bench_language,
    )
    if not selected_benchmarks:
        return selected_benchmarks, None

    filtered_df = filter_data(
        leaderboard_data, search_term, size_min, size_max, selected_benchmarks
    )
    return selected_benchmarks, filtered_df


def update_table(
    search_term,
    size_range,
    bench_math,
    bench_knowledge,
    bench_coding,
    bench_vision,
    bench_agent,
    bench_language,
):
    """
    Update the leaderboard table based on all filters.

    Args:
        search_term: Value of the search box.
        size_range: ``(min, max)`` pair from the range slider.
        bench_math, bench_knowledge, bench_coding, bench_vision,
        bench_agent, bench_language: Selected values of each benchmark
            checkbox group.

    Returns:
        tuple: (html_string, num_models, num_benchmarks, num_scores)
    """
    selected_benchmarks, filtered_df = _filter_for_selection(
        search_term,
        size_range,
        bench_math,
        bench_knowledge,
        bench_coding,
        bench_vision,
        bench_agent,
        bench_language,
    )

    # Handle case where no benchmarks are selected
    if not selected_benchmarks:
        empty_html = generate_leaderboard_html(pd.DataFrame(), [], provider_logos)
        return empty_html, 0, 0, 0

    # Calculate statistics
    stats = calculate_stats(filtered_df, selected_benchmarks)

    # Generate HTML table
    html_table = generate_leaderboard_html(
        filtered_df, selected_benchmarks, provider_logos
    )

    return (html_table, stats["models"], stats["benchmarks"], stats["scores"])


def select_all_benchmarks():
    """Select all benchmark checkboxes.

    Returns:
        tuple: One list of benchmark names per checkbox group, in the order
        math, knowledge, coding, vision, agent, language.
    """
    return (
        ["GSM8K", "AIME 2026", "HMMT"],  # Math
        ["MMLU-Pro", "GPQA", "HLE"],  # Knowledge
        ["SWE-V", "SWE-Pro"],  # Coding
        ["olmOCR"],  # Vision
        ["TB 2.0"],  # Agent
        ["EvasionB"],  # Language
    )


def clear_all_benchmarks():
    """Clear all benchmark checkboxes.

    Returns:
        tuple: Six empty lists, one per checkbox group.
    """
    return [], [], [], [], [], []


def export_to_csv(
    search_term,
    size_range,
    bench_math,
    bench_knowledge,
    bench_coding,
    bench_vision,
    bench_agent,
    bench_language,
):
    """
    Export filtered data to CSV file.

    Args:
        search_term: Value of the search box.
        size_range: ``(min, max)`` pair from the range slider.
        bench_math, bench_knowledge, bench_coding, bench_vision,
        bench_agent, bench_language: Selected values of each benchmark
            checkbox group.

    Returns:
        str | None: Path to temporary CSV file, or ``None`` when no
        benchmark is selected.
    """
    selected_benchmarks, filtered_df = _filter_for_selection(
        search_term,
        size_range,
        bench_math,
        bench_knowledge,
        bench_coding,
        bench_vision,
        bench_agent,
        bench_language,
    )

    if not selected_benchmarks:
        return None

    # Prepare for export (without HTML/markdown)
    export_df = prepare_export_data(filtered_df, selected_benchmarks)

    # Reserve a temp path and close the handle *before* pandas opens the file
    # by name: a second open of a still-open NamedTemporaryFile fails on
    # Windows, and the `with` guarantees the handle is released on any error.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".csv"
    ) as tmp_file:
        csv_path = tmp_file.name

    export_df.to_csv(csv_path, index=False)

    return csv_path


# Minimal CSS - only for leaderboard table
custom_css = """
/* Leaderboard table container */
.leaderboard-html-container {
    margin-top: 16px;
}
"""

# JavaScript to enable table sorting.
# The `//` line comment inside the function must end at a newline; if the
# script text is collapsed onto one line it comments out the whole body.
custom_js = """
function() {
    // Load and execute the sorting script
    const script = document.createElement('script');
    script.textContent = `
        let currentSortColumn = null;
        let currentSortDirection = 'desc';

        function sortTable(colIndex) {
            const table = document.querySelector('#leaderboardTable');
            if (!table) return;

            const tbody = table.querySelector('tbody');
            if (!tbody) return;

            const rows = Array.from(tbody.querySelectorAll('tr'));

            if (currentSortColumn === colIndex) {
                currentSortDirection = currentSortDirection === 'desc' ? 'asc' : 'desc';
            } else {
                currentSortColumn = colIndex;
                currentSortDirection = 'desc';
            }

            rows.sort((a, b) => {
                if (colIndex === 0) {
                    const aVal = a.dataset.name || '';
                    const bVal = b.dataset.name || '';
                    return currentSortDirection === 'asc' ? aVal.localeCompare(bVal) : bVal.localeCompare(aVal);
                } else {
                    const aCell = a.cells[colIndex];
                    const bCell = b.cells[colIndex];
                    const aText = aCell ? aCell.textContent.trim() : '';
                    const bText = bCell ? bCell.textContent.trim() : '';
                    const aScore = aText === '—' ? -1 : parseFloat(aText);
                    const bScore = bText === '—' ? -1 : parseFloat(bText);

                    if (isNaN(aScore) && isNaN(bScore)) return 0;
                    if (isNaN(aScore)) return 1;
                    if (isNaN(bScore)) return -1;

                    return currentSortDirection === 'desc' ? bScore - aScore : aScore - bScore;
                }
            });

            rows.forEach(row => tbody.appendChild(row));
            updateSortIndicators(colIndex);
        }

        function updateSortIndicators(colIndex) {
            const headers = document.querySelectorAll('#leaderboardTable thead th');
            headers.forEach((th, index) => {
                const sortArrow = th.querySelector('.sa');
                if (sortArrow) {
                    if (index === colIndex) {
                        sortArrow.textContent = currentSortDirection === 'desc' ? '↓' : '↑';
                        th.classList.add('sorted');
                    } else {
                        sortArrow.textContent = '↕';
                        th.classList.remove('sorted');
                    }
                }
            });
        }
    `;
    document.head.appendChild(script);
}
"""
# Build the Gradio interface with gr.Blocks( title="Official Benchmarks Leaderboard 2026", css=custom_css, js=custom_js ) as app: # Header gr.Markdown("# 🏆 Official Benchmarks Leaderboard 2026") gr.Markdown( "Unified leaderboard for **11 official Hugging Face benchmarks**. " "Compare AI models across math, coding, knowledge, vision, agent, and language tasks." 
) # Statistics row with gr.Row(): stat_models = gr.Number( label="📊 Models", value=0, precision=0, interactive=False ) stat_benchmarks = gr.Number( label="🎯 Benchmarks", value=11, precision=0, interactive=False ) stat_scores = gr.Number( label="✅ Total Scores", value=0, precision=0, interactive=False ) # Quick filter presets with gr.Row(): gr.Markdown("**Quick Filters:**") preset_small = gr.Button("🔹 Small (<10B)", size="lg", variant="secondary") preset_medium = gr.Button("🔸 Medium (10-100B)", size="lg", variant="secondary") preset_large = gr.Button("🔶 Large (100B+)", size="lg", variant="secondary") with gr.Row(): gr.Markdown("**By Category:**") preset_coding = gr.Button("💻 Coding", size="lg", variant="secondary") preset_knowledge = gr.Button("🧠 Knowledge", size="lg", variant="secondary") preset_math = gr.Button("📐 Math", size="lg", variant="secondary") preset_vision = gr.Button("👁️ Vision", size="lg", variant="secondary") preset_agent = gr.Button("🤖 Agent", size="lg", variant="secondary") preset_language = gr.Button("💬 Language", size="lg", variant="secondary") # Filters Section with gr.Accordion("🎛️ Filters & Settings", open=False): # Search, Size Range, and Refresh on same row with gr.Row(): search_box = gr.Textbox( label="🔍 Search", placeholder="Try 'Llama', 'GPT', 'Qwen'...", scale=2 ) size_range = RangeSlider( minimum=0, maximum=1100, value=(0, 1100), step=10, label="📏 Size Range (Billions)", scale=2, ) refresh_btn = gr.Button("🔄 Refresh", scale=1) # Benchmark category filters gr.Markdown("### 🎯 Benchmarks") with gr.Row(): with gr.Column(scale=1): bench_math = gr.CheckboxGroup( choices=["GSM8K", "AIME 2026", "HMMT"], value=["GSM8K", "AIME 2026", "HMMT"], label="📐 Math", ) with gr.Column(scale=1): bench_knowledge = gr.CheckboxGroup( choices=["MMLU-Pro", "GPQA", "HLE"], value=["MMLU-Pro", "GPQA", "HLE"], label="🧠 Knowledge", ) with gr.Column(scale=1): bench_coding = gr.CheckboxGroup( choices=["SWE-V", "SWE-Pro"], value=["SWE-V", "SWE-Pro"], label="💻 Coding", ) 
with gr.Column(scale=1): bench_vision = gr.CheckboxGroup( choices=["olmOCR"], value=[], label="👁️ Vision" ) with gr.Column(scale=1): bench_agent = gr.CheckboxGroup( choices=["TB 2.0"], value=["TB 2.0"], label="🤖 Agent" ) with gr.Column(scale=1): bench_language = gr.CheckboxGroup( choices=["EvasionB"], value=["EvasionB"], label="💬 Language" ) # Quick actions for benchmark selection with gr.Row(): select_all_btn = gr.Button("✓ Select All", size="sm") clear_all_btn = gr.Button("✗ Clear All", size="sm") # Status message for user feedback status_msg = gr.Markdown("", visible=False) # Main leaderboard table gr.Markdown("## 📊 Leaderboard") gr.Markdown("*💡 Tip: Click any column header to sort the table*") leaderboard_table = gr.HTML( value="