| | import gradio as gr |
| | from gradio_rangeslider import RangeSlider |
| |
|
| | import core as core |
| | from style import CSS, LANG_SYMBOLS, T_SYMBOLS, TITLE |
| |
|
| |
|
def create_model_controls():
    """Build the model filter row: a type selector next to a size range slider.

    Must be called inside an active ``gr.Blocks`` context, because the
    components attach themselves to whatever layout is currently open.

    Returns:
        tuple: ``(model_types, model_sizes)`` where ``model_types`` is the
        ``gr.CheckboxGroup`` of model kinds and ``model_sizes`` is the
        ``RangeSlider`` for the parameter count in billions.
    """
    pretrained_symbol = T_SYMBOLS["pretrained"]
    chat_symbol = T_SYMBOLS["chat"]

    # (label shown to the user, value reported to the callbacks)
    type_choices = [
        (f"Pretrained {pretrained_symbol}", pretrained_symbol),
        (f"Chat {chat_symbol}", chat_symbol),
    ]

    with gr.Row():
        with gr.Column():
            model_types = gr.CheckboxGroup(
                choices=type_choices,
                # Every symbol known to T_SYMBOLS starts out selected.
                value=list(T_SYMBOLS.values()),
                label="Select model type",
            )
        with gr.Column():
            model_sizes = RangeSlider(
                label="Select the number of parameters (B)",
                minimum=0,
                maximum=150,
                value=(7, 8),
            )

    return model_types, model_sizes
| |
|
| |
|
def create_language_controls(lang_choices):
    """Build the language selector with "deselect all" / "select all" buttons.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        lang_choices: Language identifiers to offer. Each one is labelled
            with its entry in ``LANG_SYMBOLS`` when present, otherwise with
            the identifier itself. All of them start out selected.

    Returns:
        The ``gr.CheckboxGroup`` holding the selected languages.
    """
    with gr.Row():
        langs_bar = gr.CheckboxGroup(
            choices=[(LANG_SYMBOLS.get(lang, lang), lang) for lang in lang_choices],
            value=lang_choices,
            label="Select languages to average over",
            elem_id="column-select",
            interactive=True,
            scale=6,
        )
        with gr.Column(scale=1):
            # The clear button is wired to langs_bar by its constructor, so
            # no reference to it needs to be kept.
            gr.ClearButton(
                langs_bar,
                value="Deselect all languages",
                size="sm",
                scale=1,
            )
            select = gr.Button(
                value="Select all languages",
                size="sm",
                scale=1,
            )
            # Returning a CheckboxGroup from the handler updates the existing
            # component's value back to the full list.
            select.click(
                lambda: gr.CheckboxGroup(value=lang_choices),
                inputs=[],
                outputs=langs_bar,
            )
    return langs_bar
| |
|
| |
|
def create_task_controls(tab_id):
    """Build the task selector with "deselect all" / "select all" buttons.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        tab_id: Index passed to ``core.get_selected_task_type`` to decide
            which task groups this selector offers.

    Returns:
        The ``gr.CheckboxGroup`` holding the task groups to show.
    """

    def available_tasks():
        # Single place that spells out the lookup used for the initial state
        # and for the "select all" handler.
        return core.get_available_task_groups(core.get_selected_task_type(tab_id), True)

    # Looked up once at build time; the same groups serve as both the offered
    # choices and the initial selection.
    initial_tasks = available_tasks()

    with gr.Row():
        shown_tasks = gr.CheckboxGroup(
            choices=initial_tasks,
            value=initial_tasks,
            label="Select tasks to show",
            elem_id="column-select",
            interactive=True,
            scale=50,
        )
        # The clear button is wired to shown_tasks by its constructor, so no
        # reference to it needs to be kept.
        gr.ClearButton(
            shown_tasks,
            value="Deselect all tasks",
            size="sm",
            scale=1,
        )
        select = gr.Button(
            value="Select all tasks",
            size="sm",
            scale=1,
        )
        # Re-query on every click, as before, rather than reusing the list
        # captured at build time.
        select.click(
            lambda: gr.CheckboxGroup(value=available_tasks()),
            inputs=[],
            outputs=shown_tasks,
        )
    return shown_tasks
| |
|
| |
|
# ---------------------------------------------------------------------------
# Application layout and event wiring.
#
# Builds one tab per benchmark, each with its own search box, filters and
# results table, then hooks every filter (and the initial page load) up to
# core.update_df so the table is recomputed whenever an input changes.
# ---------------------------------------------------------------------------

theme = gr.themes.Default(
    primary_hue="blue",
).set(
    button_border_width="*block_border_width",
)

# Emoji are written as escapes so they survive the file being re-saved or
# copied with the wrong encoding.
# NOTE(review): these two were inferred from a mis-decoded copy of this file
# (they showed up as "π"); confirm against the intended UI.
_MEDAL = "\U0001F3C5"  # sports medal
_MAGNIFIER = "\U0001F50D"  # magnifying glass


def _search_box():
    """Return a new "Search models" textbox; every tab gets its own instance."""
    # NOTE(review): all search boxes share elem_id "search-bar". Kept as-is
    # because the stylesheet may target that id.
    return gr.Textbox(
        label="Search models",
        placeholder=f" {_MAGNIFIER} Separate multiple queries with ';' and press ENTER...",
        show_label=True,
        elem_id="search-bar",
    )


demo = gr.Blocks(css=CSS, theme=theme)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(
        "This is a collection of multilingual evaluation results obtained using our fork of the LM-evaluation-harness (https://github.com/OpenGPTX/lm-evaluation-harness), based on V1 of the https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard. "
        "Note that currently, benchmarks are available in 21 European languages (Irish, Maltese, Croatian missing).",
        elem_classes="markdown-text",
    )

    # Not read by any handler in this file; kept because it is a module-level
    # name that other code may rely on.
    selected_tab = gr.State(value=0)

    # NOTE(review): the nesting of the layout contexts below is the
    # conventional reading of the statement order; confirm against the
    # rendered UI.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem(
            f"{_MEDAL} LLM accuracy benchmark",
            elem_id="llm-benchmark-tab-table-acc",
            id=0,
        ) as acc:
            with gr.Column():
                with gr.Row():
                    search_bar = _search_box()

                model_types, model_sizes = create_model_controls()
                langs_bar = create_language_controls(core.languages_list)
                shown_tasks = create_task_controls(0)

                leaderboard_table = gr.Dataframe(datatype=["str", "markdown", "number"])

        with gr.TabItem(
            f"{_MEDAL} LLM accuracy benchmark (Zero-Shot)",
            elem_id="llm-benchmark-tab-table-acc-zeroshot",
            id=3,
        ) as acc_zero_shot:
            with gr.Column():
                with gr.Row():
                    search_bar_zero_shot = _search_box()

                model_types_zero_shot, model_sizes_zero_shot = create_model_controls()
                langs_bar_zero_shot = create_language_controls(core.languages_list)
                shown_tasks_zero_shot = create_task_controls(1)
                leaderboard_table_zero_shot = gr.Dataframe(datatype=["str", "markdown", "number"])

        # TODO(review): the leading "π" in the next two tab labels looks like
        # a mis-decoded emoji; the intended glyph cannot be recovered from
        # this file, so the text is left untouched.
        with gr.TabItem(
            "π LLM translation benchmark",
            elem_id="llm-benchmark-tab-table-misc",
            id=1,
        ) as misc:
            with gr.Column():
                with gr.Row():
                    search_bar_misc = _search_box()

                model_types_misc, model_sizes_misc = create_model_controls()
                langs_bar_misc = create_language_controls(core.languages_list)
                shown_tasks_misc = create_task_controls(3)
                leaderboard_table_misc = gr.Dataframe(datatype=["str", "markdown", "number"])

        with gr.TabItem(
            "π LLM MT-Bench benchmark",
            elem_id="llm-benchmark-tab-table-mtbench",
            id=2,
        ) as mtbench:
            with gr.Column():
                with gr.Row():
                    search_bar_mtbench = _search_box()
                langs_bar_mtbench = create_language_controls(core.mt_bench_language_list)
                leaderboard_table_mtbench = gr.Dataframe(datatype=["str", "markdown", "number"])

    # Positional arguments handed to core.update_df for each tab. Each list is
    # built once and shared by every listener of that tab and by the page-load
    # event, so the constant gr.State holders are created only once per tab.
    #
    # NOTE(review): the numbering is not uniform -- the translation tab uses
    # create_task_controls(3) but passes 2 here, and the MT-Bench tab passes 3
    # here but looks up task type 2. Values are preserved exactly; confirm
    # against core.update_df / core.get_selected_task_type.
    acc_inputs = [
        gr.State(value=0),
        shown_tasks,
        search_bar,
        langs_bar,
        model_sizes,
        gr.State(value=True),
        model_types,
    ]
    zero_shot_inputs = [
        gr.State(value=1),
        shown_tasks_zero_shot,
        search_bar_zero_shot,
        langs_bar_zero_shot,
        model_sizes_zero_shot,
        gr.State(value=False),
        model_types_zero_shot,
    ]
    misc_inputs = [
        gr.State(value=2),
        shown_tasks_misc,
        search_bar_misc,
        langs_bar_misc,
        model_sizes_misc,
        gr.State(value=False),
        model_types_misc,
    ]
    # NOTE(review): only five inputs here, so the constant False lands in the
    # position where the other tabs pass their size slider -- confirm that
    # core.update_df expects this.
    mtbench_inputs = [
        gr.State(value=3),
        gr.State(value=core.get_available_task_groups(core.get_selected_task_type(2), False)),
        search_bar_mtbench,
        langs_bar_mtbench,
        gr.State(value=False),
    ]

    # (listeners that trigger a refresh, update_df inputs, table to refresh)
    refresh_specs = [
        (
            (
                search_bar.submit,
                langs_bar.change,
                shown_tasks.change,
                model_types.change,
                model_sizes.change,
            ),
            acc_inputs,
            leaderboard_table,
        ),
        (
            (
                search_bar_zero_shot.submit,
                model_types_zero_shot.change,
                langs_bar_zero_shot.change,
                shown_tasks_zero_shot.change,
                model_sizes_zero_shot.change,
            ),
            zero_shot_inputs,
            leaderboard_table_zero_shot,
        ),
        (
            (
                search_bar_misc.submit,
                langs_bar_misc.change,
                shown_tasks_misc.change,
                model_types_misc.change,
                model_sizes_misc.change,
            ),
            misc_inputs,
            leaderboard_table_misc,
        ),
        (
            (
                search_bar_mtbench.submit,
                langs_bar_mtbench.change,
            ),
            mtbench_inputs,
            leaderboard_table_mtbench,
        ),
    ]

    # Recompute a tab's table whenever one of its controls fires.
    for listeners, inputs, table in refresh_specs:
        for listen in listeners:
            listen(core.update_df, inputs, table)

    # Fill every table once when the page is first loaded. Registered after
    # all control listeners, matching the original registration order.
    for _listeners, inputs, table in refresh_specs:
        demo.load(
            fn=core.update_df,
            inputs=inputs,
            outputs=table,
        )

demo.launch()
| |
|