""" Official Benchmarks Leaderboard 2026 - Gradio App A unified leaderboard aggregating scores from 11 official HuggingFace benchmarks. """ import gradio as gr import pandas as pd from gradio_rangeslider import RangeSlider from utils.data_loader import ( load_leaderboard_data, get_benchmark_info, load_provider_logos, ) from utils.filters import filter_data, calculate_stats, parse_benchmark_selections from utils.formatters import format_for_display, create_empty_table, prepare_export_data from utils.html_generator import generate_leaderboard_html # Global data cache leaderboard_data = None provider_logos = None def initialize_data(): """Load initial data on app startup.""" global leaderboard_data, provider_logos leaderboard_data = load_leaderboard_data() provider_logos = load_provider_logos() return leaderboard_data def refresh_data(): """Reload data from HuggingFace dataset.""" global leaderboard_data print("Refreshing data from HuggingFace...") leaderboard_data = load_leaderboard_data() # Return updated table with current filters - we'll trigger a full update return gr.Info("Data refreshed successfully!") def update_table( search_term, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ): """ Update the leaderboard table based on all filters. Returns: tuple: (html_string, num_models, num_benchmarks, num_scores) """ # Extract min and max from range slider tuple size_min, size_max = size_range # Parse benchmark selections from all checkbox groups selected_benchmarks = parse_benchmark_selections( bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ) # Handle case where no benchmarks are selected if not selected_benchmarks or len(selected_benchmarks) == 0: empty_html = generate_leaderboard_html(pd.DataFrame(), [], provider_logos) return empty_html, 0, 0, 0 # Filter the data filtered_df = filter_data( leaderboard_data, search_term, size_min, size_max, selected_benchmarks ) # Calculate statistics stats = calculate_stats(filtered_df, selected_benchmarks) # Generate HTML table html_table = generate_leaderboard_html( filtered_df, selected_benchmarks, provider_logos ) return (html_table, stats["models"], stats["benchmarks"], stats["scores"]) def select_all_benchmarks(): """Select all benchmark checkboxes.""" return ( ["GSM8K", "AIME 2026", "HMMT"], # Math ["MMLU-Pro", "GPQA", "HLE"], # Knowledge ["SWE-V", "SWE-Pro"], # Coding ["olmOCR"], # Vision ["TB 2.0"], # Agent ["EvasionB"], # Language ) def clear_all_benchmarks(): """Clear all benchmark checkboxes.""" return [], [], [], [], [], [] def export_to_csv( search_term, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ): """ Export filtered data to CSV file. Returns: str: Path to temporary CSV file """ # Extract min and max from range slider tuple size_min, size_max = size_range # Parse benchmark selections selected_benchmarks = parse_benchmark_selections( bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ) if not selected_benchmarks: return None # Filter the data filtered_df = filter_data( leaderboard_data, search_term, size_min, size_max, selected_benchmarks ) # Prepare for export (without HTML/markdown) export_df = prepare_export_data(filtered_df, selected_benchmarks) # Save to temporary file import tempfile tmp_file = tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv") export_df.to_csv(tmp_file.name, index=False) tmp_file.close() return tmp_file.name # Minimal CSS - only for leaderboard table custom_css = """ /* Leaderboard table container */ .leaderboard-html-container { margin-top: 16px; } """ # JavaScript to enable table sorting custom_js = """ function() { // Load and execute the sorting script const script = document.createElement('script'); script.textContent = ` let currentSortColumn = null; let currentSortDirection = 'desc'; function sortTable(colIndex) { const table = document.querySelector('#leaderboardTable'); if (!table) return; const tbody = table.querySelector('tbody'); if (!tbody) return; const rows = Array.from(tbody.querySelectorAll('tr')); if (currentSortColumn === colIndex) { currentSortDirection = currentSortDirection === 'desc' ? 'asc' : 'desc'; } else { currentSortColumn = colIndex; currentSortDirection = 'desc'; } rows.sort((a, b) => { if (colIndex === 0) { const aVal = a.dataset.name || ''; const bVal = b.dataset.name || ''; return currentSortDirection === 'asc' ? aVal.localeCompare(bVal) : bVal.localeCompare(aVal); } else { const aCell = a.cells[colIndex]; const bCell = b.cells[colIndex]; const aText = aCell ? aCell.textContent.trim() : ''; const bText = bCell ? bCell.textContent.trim() : ''; const aScore = aText === '—' ? -1 : parseFloat(aText); const bScore = bText === '—' ? -1 : parseFloat(bText); if (isNaN(aScore) && isNaN(bScore)) return 0; if (isNaN(aScore)) return 1; if (isNaN(bScore)) return -1; return currentSortDirection === 'desc' ? bScore - aScore : aScore - bScore; } }); rows.forEach(row => tbody.appendChild(row)); updateSortIndicators(colIndex); } function updateSortIndicators(colIndex) { const headers = document.querySelectorAll('#leaderboardTable thead th'); headers.forEach((th, index) => { const sortArrow = th.querySelector('.sa'); if (sortArrow) { if (index === colIndex) { sortArrow.textContent = currentSortDirection === 'desc' ? '↓' : '↑'; th.classList.add('sorted'); } else { sortArrow.textContent = '↕'; th.classList.remove('sorted'); } } }); } `; document.head.appendChild(script); } """ # Build the Gradio interface with gr.Blocks( title="Official Benchmarks Leaderboard 2026", css=custom_css, js=custom_js ) as app: # Header gr.Markdown("# 🏆 Official Benchmarks Leaderboard 2026") gr.Markdown( "Unified leaderboard for **11 official Hugging Face benchmarks**. " "Compare AI models across math, coding, knowledge, vision, agent, and language tasks." ) # Statistics row with gr.Row(): stat_models = gr.Number( label="📊 Models", value=0, precision=0, interactive=False ) stat_benchmarks = gr.Number( label="🎯 Benchmarks", value=11, precision=0, interactive=False ) stat_scores = gr.Number( label="✅ Total Scores", value=0, precision=0, interactive=False ) # Quick filter presets with gr.Row(): gr.Markdown("**Quick Filters:**") preset_small = gr.Button("🔹 Small (<10B)", size="lg", variant="secondary") preset_medium = gr.Button("🔸 Medium (10-100B)", size="lg", variant="secondary") preset_large = gr.Button("🔶 Large (100B+)", size="lg", variant="secondary") with gr.Row(): gr.Markdown("**By Category:**") preset_coding = gr.Button("💻 Coding", size="lg", variant="secondary") preset_knowledge = gr.Button("🧠 Knowledge", size="lg", variant="secondary") preset_math = gr.Button("📐 Math", size="lg", variant="secondary") preset_vision = gr.Button("👁️ Vision", size="lg", variant="secondary") preset_agent = gr.Button("🤖 Agent", size="lg", variant="secondary") preset_language = gr.Button("💬 Language", size="lg", variant="secondary") # Filters Section with gr.Accordion("🎛️ Filters & Settings", open=False): # Search, Size Range, and Refresh on same row with gr.Row(): search_box = gr.Textbox( label="🔍 Search", placeholder="Try 'Llama', 'GPT', 'Qwen'...", scale=2 ) size_range = RangeSlider( minimum=0, maximum=1100, value=(0, 1100), step=10, label="📏 Size Range (Billions)", scale=2, ) refresh_btn = gr.Button("🔄 Refresh", scale=1) # Benchmark category filters gr.Markdown("### 🎯 Benchmarks") with gr.Row(): with gr.Column(scale=1): bench_math = gr.CheckboxGroup( choices=["GSM8K", "AIME 2026", "HMMT"], value=["GSM8K", "AIME 2026", "HMMT"], label="📐 Math", ) with gr.Column(scale=1): bench_knowledge = gr.CheckboxGroup( choices=["MMLU-Pro", "GPQA", "HLE"], value=["MMLU-Pro", "GPQA", "HLE"], label="🧠 Knowledge", ) with gr.Column(scale=1): bench_coding = gr.CheckboxGroup( choices=["SWE-V", "SWE-Pro"], value=["SWE-V", "SWE-Pro"], label="💻 Coding", ) with gr.Column(scale=1): bench_vision = gr.CheckboxGroup( choices=["olmOCR"], value=[], label="👁️ Vision" ) with gr.Column(scale=1): bench_agent = gr.CheckboxGroup( choices=["TB 2.0"], value=["TB 2.0"], label="🤖 Agent" ) with gr.Column(scale=1): bench_language = gr.CheckboxGroup( choices=["EvasionB"], value=["EvasionB"], label="💬 Language" ) # Quick actions for benchmark selection with gr.Row(): select_all_btn = gr.Button("✓ Select All", size="sm") clear_all_btn = gr.Button("✗ Clear All", size="sm") # Status message for user feedback status_msg = gr.Markdown("", visible=False) # Main leaderboard table gr.Markdown("## 📊 Leaderboard") gr.Markdown("*💡 Tip: Click any column header to sort the table*") leaderboard_table = gr.HTML( value="
Loading leaderboard data...
", label="", elem_classes="leaderboard-html-container", ) # Export button with better feedback with gr.Row(): export_btn = gr.Button("📥 Export CSV", size="sm") export_file = gr.File(label="Download", visible=False) # Footer gr.Markdown( "---\n" "**Data Source**: [OpenEvals/leaderboard-data](https://huggingface.co/datasets/OpenEvals/leaderboard-data) | " "**Open Source Models Only** | " "Made with ❤️ by the Benchmarks Team" ) # Define all filter inputs filter_inputs = [ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ] # Define all outputs table_outputs = [leaderboard_table, stat_models, stat_benchmarks, stat_scores] benchmark_outputs = [ bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ] # Event handlers - attach update_table to all filter changes # Use trigger_mode for smoother interactions (debounce on typing) search_box.change( fn=update_table, inputs=filter_inputs, outputs=table_outputs, show_progress="hidden", trigger_mode="always_last", # Debounce search input ) # Other filters update immediately for filter_input in [ size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ]: filter_input.change( fn=update_table, inputs=filter_inputs, outputs=table_outputs, show_progress="minimal", ) # Refresh button - reloads data and updates table def refresh_and_update(*filter_args): refresh_data() return update_table(*filter_args) refresh_btn.click( fn=refresh_and_update, inputs=filter_inputs, outputs=table_outputs, show_progress="full", ) # Select All / Clear All buttons select_all_btn.click(fn=select_all_benchmarks, outputs=benchmark_outputs).then( fn=update_table, inputs=filter_inputs, outputs=table_outputs ) clear_all_btn.click(fn=clear_all_benchmarks, outputs=benchmark_outputs).then( fn=update_table, inputs=filter_inputs, outputs=table_outputs ) # Export button with success message def export_with_feedback(*args): filepath = export_to_csv(*args) return filepath, gr.File(visible=True) export_btn.click( fn=export_with_feedback, inputs=filter_inputs, outputs=[export_file, export_file], show_progress="minimal", ) # Preset filter handlers def apply_small_models(): return "", (0, 10) # search, size_range def apply_medium_models(): return "", (10, 100) def apply_large_models(): return "", (100, 1100) # Category filter functions - deselect all except the chosen category def apply_coding_filter(): return ( "", (0, 1100), [], [], ["SWE-V", "SWE-Pro"], [], [], [], ) # search, size_range, math, knowledge, coding, vision, agent, language def apply_knowledge_filter(): return "", (0, 1100), [], ["MMLU-Pro", "GPQA", "HLE"], [], [], [], [] def apply_math_filter(): return "", (0, 1100), ["GSM8K", "AIME 2026", "HMMT"], [], [], [], [], [] def apply_vision_filter(): return "", (0, 1100), [], [], [], ["olmOCR"], [], [] def apply_agent_filter(): return "", (0, 1100), [], [], [], [], ["TB 2.0"], [] def apply_language_filter(): return "", (0, 1100), [], [], [], [], [], ["EvasionB"] # Size preset handlers preset_small.click(fn=apply_small_models, outputs=[search_box, size_range]).then( fn=update_table, inputs=filter_inputs, outputs=table_outputs ) preset_medium.click(fn=apply_medium_models, outputs=[search_box, size_range]).then( fn=update_table, inputs=filter_inputs, outputs=table_outputs ) preset_large.click(fn=apply_large_models, outputs=[search_box, size_range]).then( fn=update_table, inputs=filter_inputs, outputs=table_outputs ) # Category preset handlers preset_coding.click( fn=apply_coding_filter, outputs=[ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ], ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) preset_knowledge.click( fn=apply_knowledge_filter, outputs=[ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ], ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) preset_math.click( fn=apply_math_filter, outputs=[ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ], ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) preset_vision.click( fn=apply_vision_filter, outputs=[ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ], ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) preset_agent.click( fn=apply_agent_filter, outputs=[ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ], ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) preset_language.click( fn=apply_language_filter, outputs=[ search_box, size_range, bench_math, bench_knowledge, bench_coding, bench_vision, bench_agent, bench_language, ], ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) # Initialize data and populate table on app load def init_wrapper(): initialize_data() return None app.load( fn=init_wrapper, # Load data without returning it outputs=None, ).then(fn=update_table, inputs=filter_inputs, outputs=table_outputs) if __name__ == "__main__": # Initialize data before launching print("Initializing leaderboard app...") initialize_data() print("✓ Data loaded successfully") print("Launching Gradio app...") app.launch(server_name="0.0.0.0", server_port=7860, share=False)