import gradio as gr
import pandas as pd
import plotly.express as px

from model_handler import ModelHandler
from data_handler import (
    unified_exam_result_table,
    mmlu_result_table,
    unified_exam_chart,
    mmlu_chart,
)

# Cached leaderboard data, populated at startup and by the "Refresh Data" button.
global_unified_exam_df = None
global_mmlu_df = None
global_output_armenian = None
global_output_mmlu = None


def refresh_data():
    """Re-fetch benchmark results and rebuild the leaderboard tables and charts."""
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    return (
        global_output_armenian,
        global_output_mmlu,
        unified_exam_chart(global_output_armenian, 'Average'),
        mmlu_chart(global_output_mmlu, 'Average'),
    )


def main():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    # Load the initial leaderboard data before building the UI.
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history, and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                    """
                )
                table_output_armenian = gr.DataFrame(value=global_output_armenian)
                plot_column_dropdown_unified_exam = gr.Dropdown(
                    choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'],
                    value='Average',
                    label='Select Column to Plot',
                )
                # The plot re-renders whenever the dropdown selection changes.
                plot_output_armenian = gr.Plot(
                    lambda column: unified_exam_chart(global_output_armenian, column),
                    inputs=plot_column_dropdown_unified_exam,
                )
            with gr.TabItem("MMLU-Pro-Hy"):
                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
                    """
                )
                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
                subject_cols = [
                    'Average', 'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics',
                    'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics',
                    'Psychology', 'Other',
                ]
                plot_column_dropdown_mmlu = gr.Dropdown(
                    choices=subject_cols,
                    value='Average',
                    label='Select Column to Plot',
                )
                plot_output_mmlu = gr.Plot(
                    lambda column: mmlu_chart(global_output_mmlu, column),
                    inputs=plot_column_dropdown_mmlu,
                )
            with gr.TabItem("About"):
                gr.Markdown("# About the Benchmark")
                gr.Markdown(
                    """
                    This benchmark evaluates Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.

                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia.
                    """
                )
                gr.Image(
                    "logo.png",
                    width=200,
                    show_label=False,
                    show_download_button=False,
                    show_fullscreen_button=False,
                    show_share_button=False,
                )
                gr.Markdown(
                    """
                    - [Website](https://metric.am/)
                    - [Hugging Face](https://huggingface.co/Metric-AI)

                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark and adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
                    """
                )
                gr.Markdown("## Submission Guide")
                gr.Markdown(
                    """
                    To submit a model for evaluation, please follow these steps:

                    1. **Evaluate your model**:
                        - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
                    2. **Format your submission file**:
                        - After evaluation, you will get a `result.json` file. Ensure the file follows this format:
                        ```json
                        {
                            "mmlu_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ],
                            "unified_exam_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ]
                        }
                        ```
                    3. **Submit your model**:
                        - Add the `arm_bench` tag and the `result.json` file to your model card.
                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
                    """
                )
                gr.Markdown("## Contributing")
                gr.Markdown(
                    """
                    You can contribute to this benchmark in several ways:

                    - Providing API credits for evaluating API-based models.
                    - Citing our work in your research and publications.
                    - Contributing to the development of the benchmark itself.
                    """
                )

        refresh_button = gr.Button("Refresh Data")
        refresh_button.click(
            fn=refresh_data,
            outputs=[table_output_armenian, table_output_mmlu, plot_output_armenian, plot_output_mmlu],
        )

    app.launch(share=True, debug=True)


if __name__ == "__main__":
    main()
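

# ---------------------------------------------------------------------------
# Optional helper (a minimal sketch, not used by the app above): submitters can
# run this locally to sanity-check that the `result.json` produced by the
# evaluation script matches the structure described in the Submission Guide,
# i.e. "mmlu_results" and "unified_exam_results" lists of {"category", "score"}
# entries. The function name and the checks performed are illustrative
# assumptions, not part of the official evaluation script.
def validate_result_file(path: str) -> None:
    """Raise ValueError if `path` does not look like a valid submission file."""
    import json

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for key in ("mmlu_results", "unified_exam_results"):
        entries = data.get(key)
        if not isinstance(entries, list) or not entries:
            raise ValueError(f"'{key}' must be a non-empty list")
        for entry in entries:
            if "category" not in entry or "score" not in entry:
                raise ValueError(f"malformed entry in '{key}': {entry}")
    print(f"{path} looks valid for submission.")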