"""Gradio app for the leaderboard Space: loads results, builds the tabs, launches the UI."""

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from gradio_leaderboard import ColumnFilter, Leaderboard, SearchColumns, SelectColumns
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_ES,
    INTRODUCTION_TEXT,
    INTRODUCTION_TEXT_ES,
    LLM_BENCHMARKS_TEXT,
    LLM_BENCHMARKS_TEXT_ES,
    LOGOS,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    Precision,
    WeightType,
    fields,
)
from src.envs import API, DEVICE, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.leaderboard.read_evals import get_raw_eval_results
from src.populate import get_evaluation_queue_df, get_evaluation_time_df, get_leaderboard_df
from src.submission.submit import add_new_eval
from src.tasks import Tasks

# Task catalogue shown in the "Tasks" tab.
tasks_df = pd.read_csv("tasks/tasks.csv")
tasks_df = tasks_df.drop(tasks_df.index[0])  # The first row is the average
tasks_df.drop(columns=["Domain", "Name"], inplace=True)


def restart_space():
    """Ask the Hub API to restart this Space (REPO_ID)."""
    API.restart_space(repo_id=REPO_ID)


# Mirror the request queue and the results datasets locally. If either
# download fails, the Space is restarted instead of continuing.
try:
    snapshot_download(
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()
try:
    snapshot_download(
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        token=TOKEN,
    )
except Exception:
    restart_space()


# Function to check if a user is logged in
def check_login(profile: gr.OAuthProfile | None) -> bool:
    """Return True when an OAuth profile is present, False when it is None."""
    return profile is not None


# Load data once and reuse for both leaderboard and evaluation time
raw_eval_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)

original_df = get_leaderboard_df(
    results_path=EVAL_RESULTS_PATH,
    requests_path=EVAL_REQUESTS_PATH,
    cols=COLS,
    benchmark_cols=BENCHMARK_COLS,
    raw_data=raw_eval_data,
)
leaderboard_df = original_df.copy()
print(f"leaderboard_df: {leaderboard_df}")

# Reuse raw_data to avoid re-reading files
evaluation_time_df = get_evaluation_time_df(
    results_path=EVAL_RESULTS_PATH, requests_path=EVAL_REQUESTS_PATH, raw_data=raw_eval_data
)
print(f"evaluation_time_df: {evaluation_time_df}")

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(save_path=EVAL_REQUESTS_PATH, cols=EVAL_COLS)


def init_leaderboard(
    dataframe: pd.DataFrame, language: str | None = None, is_time_tab: bool = False
) -> Leaderboard:
    """
    Initialize leaderboard component for different tab types.

    The dataframe is reduced to and reordered by the columns relevant to the
    requested tab, and the column datatypes are derived from that final column
    order so that they stay positionally aligned with the displayed columns.

    Args:
        dataframe: The dataframe to display
        language: Language code (ES, CA, EU, GL, VA, PT) for language-specific tabs, None for Summary tab
        is_time_tab: If True, configure for Time/CO2 tab with time-specific columns and filters

    Returns:
        A configured ``Leaderboard`` component (non-interactive).
    """
    base_cols = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]
    # Column name -> declared display type; used to build ``datatypes`` in the
    # same order as the (reordered) dataframe columns.
    col_to_type = {c.name: c.type for c in fields(AutoEvalColumn)}

    # Prepare dataframe and select_columns based on tab type
    if is_time_tab:
        # Time/CO2 tab: filter, sort, and reorder columns
        sum_cols = [
            AutoEvalColumn.sum_t.name,
            AutoEvalColumn.sum_t_es.name,
            AutoEvalColumn.sum_t_ca.name,
            AutoEvalColumn.sum_t_eu.name,
            AutoEvalColumn.sum_t_gl.name,
            AutoEvalColumn.sum_t_va.name,
            AutoEvalColumn.sum_t_pt.name,
            AutoEvalColumn.sum_kg_co2.name,
            AutoEvalColumn.sum_kg_co2_es.name,
            AutoEvalColumn.sum_kg_co2_ca.name,
            AutoEvalColumn.sum_kg_co2_eu.name,
            AutoEvalColumn.sum_kg_co2_gl.name,
            AutoEvalColumn.sum_kg_co2_va.name,
            AutoEvalColumn.sum_kg_co2_pt.name,
        ]

        # Filter out models with no time information
        sum_cols_in_df = [c for c in sum_cols if c in dataframe.columns]
        if sum_cols_in_df:
            dataframe = dataframe[dataframe[sum_cols_in_df].notna().any(axis=1)]

        # Sort by Sum t descending (highest on top)
        if AutoEvalColumn.sum_t.name in dataframe.columns:
            dataframe = dataframe.sort_values(by=AutoEvalColumn.sum_t.name, ascending=False, na_position="last")

        # Reorder columns: base, filter columns, sum columns, task columns (time and CO2 interleaved)
        filter_cols = [
            AutoEvalColumn.model_type.name,
            AutoEvalColumn.precision.name,
            AutoEvalColumn.params.name,
        ]
        filter_cols_in_df = [c for c in filter_cols if c in dataframe.columns]

        # Interleave per-task columns: "t <task>", "kg CO2 <task>", "t <task>", ...
        interleaved_task_cols = []
        for task in Tasks:
            time_col = f"t {task.value.col_name}"
            co2_col = f"kg CO2 {task.value.col_name}"
            if time_col in dataframe.columns:
                interleaved_task_cols.append(time_col)
            if co2_col in dataframe.columns:
                interleaved_task_cols.append(co2_col)

        ordered_columns = (
            base_cols + filter_cols_in_df + sum_cols_in_df + interleaved_task_cols + [AutoEvalColumn.dummy.name]
        )
        dataframe = dataframe[[c for c in ordered_columns if c in dataframe.columns]]

        # Default selection: only sum columns (time and CO2) selected, tasks unselected
        default_selection = base_cols + sum_cols_in_df + [AutoEvalColumn.dummy.name]
        cant_deselect = [
            AutoEvalColumn.model_type_symbol.name,
            AutoEvalColumn.model.name,
            AutoEvalColumn.dummy.name,
        ]

        # Create datatypes matching column order; per-task time/CO2 columns are
        # not AutoEvalColumn fields, so they fall back to "number".
        datatypes = [col_to_type.get(col, "number") for col in dataframe.columns]
    else:
        # Regular tabs (Summary and language-specific)
        if language:
            filtered_columns = [
                c.name
                for c in fields(AutoEvalColumn)
                if not c.hidden and (c.task_language == "" or c.task_language == language)
            ]
        else:
            filtered_columns = [
                c.name
                for c in fields(AutoEvalColumn)
                if not c.hidden
                and (c.task_language == "" or c.task_language == "ALL" or c.task_domain == "average")
            ]

        if language:
            # For language tabs: Average (not selected), Sum t (not selected), Avg language (selected),
            # Sum t language (selected), Sum kg CO2 language (selected), all language tasks (selected)
            lang_sum_col_map = {
                "ES": AutoEvalColumn.sum_t_es.name,
                "CA": AutoEvalColumn.sum_t_ca.name,
                "EU": AutoEvalColumn.sum_t_eu.name,
                "GL": AutoEvalColumn.sum_t_gl.name,
                "VA": AutoEvalColumn.sum_t_va.name,
                "PT": AutoEvalColumn.sum_t_pt.name,
            }
            lang_sum_col = lang_sum_col_map.get(language, AutoEvalColumn.sum_t.name)

            lang_co2_col_map = {
                "ES": AutoEvalColumn.sum_kg_co2_es.name,
                "CA": AutoEvalColumn.sum_kg_co2_ca.name,
                "EU": AutoEvalColumn.sum_kg_co2_eu.name,
                "GL": AutoEvalColumn.sum_kg_co2_gl.name,
                "VA": AutoEvalColumn.sum_kg_co2_va.name,
                "PT": AutoEvalColumn.sum_kg_co2_pt.name,
            }
            lang_co2_col = lang_co2_col_map.get(language, AutoEvalColumn.sum_kg_co2.name)

            lang_category_map = {
                "ES": AutoEvalColumn.es.name,
                "CA": AutoEvalColumn.ca.name,
                "EU": AutoEvalColumn.eu.name,
                "GL": AutoEvalColumn.gl.name,
                "VA": AutoEvalColumn.va.name,
                "PT": AutoEvalColumn.pt.name,
            }
            # None when ``language`` is not one of the supported codes.
            lang_category_col = lang_category_map.get(language)

            # Reorder columns: base, Average, Sum t, then language-specific columns in desired order
            language_task_cols = [
                c.name
                for c in fields(AutoEvalColumn)
                if c.task_language == language and c.task_domain != "average"
            ]
            ordered_columns = (
                base_cols
                + [
                    AutoEvalColumn.average.name,  # Avg Performance (not selected)
                    AutoEvalColumn.sum_t.name,  # Sum t (not selected)
                    AutoEvalColumn.sum_kg_co2.name,  # Sum kg CO2 (not selected)
                    lang_category_col,  # Avg language (selected)
                    lang_sum_col,  # Sum t language (selected)
                    lang_co2_col,  # Sum kg CO2 language (selected)
                ]
                + language_task_cols  # all language tasks (selected)
                + [AutoEvalColumn.dummy.name]
            )
            # Add any other filtered columns that aren't already in ordered_columns
            ordered_columns = ordered_columns + [c for c in filtered_columns if c not in ordered_columns]
            # dict.fromkeys de-duplicates while keeping order: for an unsupported
            # language code the fallback sum columns repeat the global ones, and
            # selecting a name twice would duplicate the column.
            dataframe = dataframe[[c for c in dict.fromkeys(ordered_columns) if c in dataframe.columns]]

            # Sort by language average value (descending) - highest on top
            if lang_category_col in dataframe.columns:
                dataframe = dataframe.sort_values(by=lang_category_col, ascending=False, na_position="last")

            # Avg language, Sum t language, Sum kg CO2 language (all selected);
            # a missing (None) language average column is skipped.
            lang_selected_cols = [c for c in (lang_category_col, lang_sum_col, lang_co2_col) if c is not None]
            default_selection = list(
                dict.fromkeys(
                    base_cols
                    + lang_selected_cols
                    + language_task_cols  # all language tasks (selected)
                    + [AutoEvalColumn.dummy.name]
                )
            )
        else:
            # For Summary tab: Average, per-language averages, Sum t (overall and per language),
            # Sum kg CO2 (overall and per language), then the dummy column.
            ordered_columns = (
                base_cols
                + [
                    AutoEvalColumn.average.name,
                    AutoEvalColumn.es.name,
                    AutoEvalColumn.ca.name,
                    AutoEvalColumn.eu.name,
                    AutoEvalColumn.gl.name,
                    AutoEvalColumn.va.name,
                    AutoEvalColumn.pt.name,
                    AutoEvalColumn.sum_t.name,
                    AutoEvalColumn.sum_t_es.name,
                    AutoEvalColumn.sum_t_ca.name,
                    AutoEvalColumn.sum_t_eu.name,
                    AutoEvalColumn.sum_t_gl.name,
                    AutoEvalColumn.sum_t_va.name,
                    AutoEvalColumn.sum_t_pt.name,
                    AutoEvalColumn.sum_kg_co2.name,
                    AutoEvalColumn.sum_kg_co2_es.name,
                    AutoEvalColumn.sum_kg_co2_ca.name,
                    AutoEvalColumn.sum_kg_co2_eu.name,
                    AutoEvalColumn.sum_kg_co2_gl.name,
                    AutoEvalColumn.sum_kg_co2_va.name,
                    AutoEvalColumn.sum_kg_co2_pt.name,
                ]
                + [AutoEvalColumn.dummy.name]
            )
            # Add any other filtered columns that aren't already in ordered_columns
            ordered_columns = ordered_columns + [c for c in filtered_columns if c not in ordered_columns]
            dataframe = dataframe[[c for c in ordered_columns if c in dataframe.columns]]

            default_selection = (
                base_cols
                + [
                    AutoEvalColumn.average.name,
                    AutoEvalColumn.es.name,
                    AutoEvalColumn.ca.name,
                    AutoEvalColumn.eu.name,
                    AutoEvalColumn.gl.name,
                    AutoEvalColumn.va.name,
                    AutoEvalColumn.pt.name,
                    AutoEvalColumn.sum_t.name,
                    AutoEvalColumn.sum_kg_co2.name,
                    # Sum t and Sum kg CO2 per language are available but not selected by default
                ]
                + [AutoEvalColumn.dummy.name]
            )

        # The dataframe was reordered above, so the datatypes are looked up per
        # column in dataframe order rather than in field-declaration order;
        # otherwise the positional type list would not match the columns.
        datatypes = [col_to_type.get(col, "str") for col in dataframe.columns]
        cant_deselect = [c.name for c in fields(AutoEvalColumn) if c.never_hidden or c.dummy]

    # Build shared components (same for all tabs)
    # Only include license in secondary_columns if it exists in the dataframe
    secondary_columns = []
    if AutoEvalColumn.license.name in dataframe.columns:
        secondary_columns = [AutoEvalColumn.license.name]
        placeholder = "Search by model name or license. To search by license, type 'license:'"
    else:
        placeholder = "Search by model name"

    search_columns = SearchColumns(
        primary_column=AutoEvalColumn.model.name,
        secondary_columns=secondary_columns,
        placeholder=placeholder,
        label="Search",
    )

    hide_columns = [c.name for c in fields(AutoEvalColumn) if c.hidden]

    filter_columns = [
        ColumnFilter(
            AutoEvalColumn.model_type.name,
            type="checkboxgroup",
            label="Model types",
            info="Tipos de modelo",
        ),
        ColumnFilter(
            AutoEvalColumn.precision.name,
            type="checkboxgroup",
            label="Precision",
            info="Precisión",
        ),
        ColumnFilter(
            AutoEvalColumn.params.name,
            type="slider",
            min=0.01,
            max=150,
            label="Number of parameters (B)",
            info="Número de parámetros (B)",
        ),
    ]

    # Only add still_on_hub filter if the column exists in the dataframe
    if AutoEvalColumn.still_on_hub.name in dataframe.columns:
        filter_columns.append(
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name,
                type="boolean",
                label="Deleted/incomplete",
                info="Borrado/incompleto",
                default=False,
            )
        )

    bool_checkboxgroup_label = "Show models"

    # Single return statement
    return Leaderboard(
        value=dataframe,
        datatype=datatypes,
        select_columns=SelectColumns(
            default_selection=default_selection,
            cant_deselect=cant_deselect,
            label="Columns to display",
            info="Columnas que mostrar",
        ),
        search_columns=search_columns,
        hide_columns=hide_columns,
        filter_columns=filter_columns,
        bool_checkboxgroup_label=bool_checkboxgroup_label,
        interactive=False,
    )


# Design interface
demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.Markdown(INTRODUCTION_TEXT_ES, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 La Leaderboard", elem_id="llm-benchmark-tab", id=1):
            # One sub-tab per view: summary, each language, and time/CO2.
            with gr.Tabs(elem_classes="tab-buttons") as tabs:
                with gr.TabItem("Summary", elem_id="llm-benchmark-tab-table", id=1):
                    leaderboard = init_leaderboard(leaderboard_df)
                with gr.TabItem("Spanish", elem_id="llm-benchmark-tab-table", id=2):
                    leaderboard = init_leaderboard(leaderboard_df, language="ES")
                with gr.TabItem("Catalan", elem_id="llm-benchmark-tab-table", id=3):
                    leaderboard = init_leaderboard(leaderboard_df, language="CA")
                with gr.TabItem("Basque", elem_id="llm-benchmark-tab-table", id=4):
                    leaderboard = init_leaderboard(leaderboard_df, language="EU")
                with gr.TabItem("Galician", elem_id="llm-benchmark-tab-table", id=5):
                    leaderboard = init_leaderboard(leaderboard_df, language="GL")
                with gr.TabItem("Valencian", elem_id="llm-benchmark-tab-table", id=6):
                    leaderboard = init_leaderboard(leaderboard_df, language="VA")
                with gr.TabItem("Portuguese", elem_id="llm-benchmark-tab-table", id=7):
                    leaderboard = init_leaderboard(leaderboard_df, language="PT")
                with gr.TabItem("Time / CO2", elem_id="llm-benchmark-tab-table", id=8):
                    evaluation_time_leaderboard = init_leaderboard(evaluation_time_df, is_time_tab=True)

        with gr.TabItem("💡 Info", elem_id="llm-benchmark-tab", id=2):
            with gr.Tabs(elem_classes="tab-buttons") as tabs:
                with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
                with gr.TabItem("ES", elem_id="llm-benchmark-tab-table", id=2):
                    gr.Markdown(LLM_BENCHMARKS_TEXT_ES, elem_classes="markdown-text")

        with gr.TabItem("📝 Tasks", elem_id="llm-benchmark-tab", id=3):
            Leaderboard(
                value=tasks_df,
                datatype="markdown",
                select_columns=SelectColumns(
                    default_selection=["Task", "Description", "Language", "Donated By"],
                    cant_deselect=["Task"],
                    label="Columns to display",
                    info="Columnas que mostrar",
                ),
                search_columns=SearchColumns(
                    primary_column="Task",
                    secondary_columns=["Description"],
                    placeholder="Search by task name",
                    label="Search",
                ),
                hide_columns=None,
                filter_columns=[
                    ColumnFilter(
                        "Language",
                        type="checkboxgroup",
                        label="Language",
                        info="Lengua",
                    )
                ],
                interactive=False,
            )

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab", id=4):
            with gr.Column():
                with gr.Row():
                    with gr.Tabs(elem_classes="tab-buttons") as tabs:
                        with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
                            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
                        with gr.TabItem("ES", elem_id="llm-benchmark-tab-table", id=2):
                            gr.Markdown(EVALUATION_QUEUE_TEXT_ES, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

            # NOTE(review): the login button is placed in the same row as the
            # heading; confirm this matches the intended layout.
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
                login_button = gr.LoginButton(elem_id="oauth-button")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16" if DEVICE != "cpu" else "float32",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            # Input order here is the positional argument order passed to add_new_eval.
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

    # Logos are laid out in two rows: the first seven, then the rest.
    with gr.Row():
        for logo_path in LOGOS[:7]:
            gr.Image(
                value=logo_path,
                show_label=False,
                show_download_button=False,
                show_share_button=False,
                show_fullscreen_button=False,
            )
    with gr.Row():
        for logo_path in LOGOS[7:]:
            gr.Image(
                value=logo_path,
                show_label=False,
                show_download_button=False,
                show_share_button=False,
                show_fullscreen_button=False,
            )

# Restart the Space every 30 minutes (1800 s) via a background job.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0")