import gradio as gr
import pandas as pd
import plotly.express as px

from model_handler import ModelHandler
from data_handler import (
    unified_exam_result_table,
    mmlu_result_table,
    unified_exam_chart,
    mmlu_chart,
)

# Cached leaderboard data, populated at startup and by the "Refresh Data" button.
global_unified_exam_df = None
global_mmlu_df = None
global_output_armenian = None
global_output_mmlu = None


def refresh_data():
    """Re-fetch benchmark results and rebuild the leaderboard tables and charts."""
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    return (
        global_output_armenian,
        global_output_mmlu,
        unified_exam_chart(global_output_armenian, 'Average'),
        mmlu_chart(global_output_mmlu, 'Average'),
    )


def main():
    global global_mmlu_df, global_unified_exam_df, global_output_armenian, global_output_mmlu

    # Load the initial leaderboard data before building the UI.
    model_handler = ModelHandler()
    global_mmlu_df, global_unified_exam_df = model_handler.get_arm_bench_data()
    global_output_armenian = unified_exam_result_table(global_unified_exam_df)
    global_output_mmlu = mmlu_result_table(global_mmlu_df)

    with gr.Blocks() as app:
        with gr.Tabs():
            with gr.TabItem("Armenian Unified Exams"):
                gr.Markdown("# Armenian Unified Test Exams")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on Armenian Unified Test Exams for Armenian language and literature, Armenian history, and mathematics. The scoring system is a 20-point scale, where 0-8 is a Fail, 8-18 is a Pass, and 18-20 is a Distinction.
                    """
                )
                table_output_armenian = gr.DataFrame(value=global_output_armenian)
                plot_column_dropdown_unified_exam = gr.Dropdown(
                    choices=['Average', 'Armenian language and literature', 'Armenian history', 'Mathematics'],
                    value='Average',
                    label='Select Column to Plot',
                )
                # The plot re-renders whenever the dropdown selection changes.
                plot_output_armenian = gr.Plot(
                    lambda column: unified_exam_chart(global_output_armenian, column),
                    inputs=plot_column_dropdown_unified_exam,
                )
            with gr.TabItem("MMLU-Pro-Hy"):
                gr.Markdown("# MMLU-Pro Translated to Armenian (MMLU-Pro-Hy)")
                gr.Markdown(
                    """
                    This benchmark contains results of various Language Models on the MMLU-Pro benchmark, translated into Armenian. MMLU-Pro is a massive multi-task test in MCQA format. The scores represent accuracy.
                    """
                )
                table_output_mmlu = gr.DataFrame(value=global_output_mmlu)
                subject_cols = [
                    'Average', 'Biology', 'Business', 'Chemistry', 'Computer Science', 'Economics',
                    'Engineering', 'Health', 'History', 'Law', 'Math', 'Philosophy', 'Physics',
                    'Psychology', 'Other',
                ]
                plot_column_dropdown_mmlu = gr.Dropdown(
                    choices=subject_cols,
                    value='Average',
                    label='Select Column to Plot',
                )
                plot_output_mmlu = gr.Plot(
                    lambda column: mmlu_chart(global_output_mmlu, column),
                    inputs=plot_column_dropdown_mmlu,
                )
            with gr.TabItem("About"):
                gr.Markdown("# About the Benchmark")
                gr.Markdown(
                    """
                    This benchmark evaluates Language Models on Armenian-specific tasks, including Armenian Unified Test Exams and a translated version of the MMLU-Pro benchmark (MMLU-Pro-Hy). It is designed to measure the models' understanding and generation capabilities in the Armenian language.

                    **Creator Company:** Metric AI Research Lab, Yerevan, Armenia.
                    """
                )
                gr.Image(
                    "logo.png",
                    width=200,
                    show_label=False,
                    show_download_button=False,
                    show_fullscreen_button=False,
                    show_share_button=False,
                )
                gr.Markdown(
                    """
                    - [Website](https://metric.am/)
                    - [Hugging Face](https://huggingface.co/Metric-AI)

                    MMLU-Pro-Hy is a massive multi-task test in MCQA format, inspired by the original MMLU benchmark and adapted for the Armenian language. The Armenian Unified Exams benchmark allows for comparison with human-level knowledge.
                    """
                )
                gr.Markdown("## Submission Guide")
                gr.Markdown(
                    """
                    To submit a model for evaluation, please follow these steps:

                    1. **Evaluate your model**:
                        - Follow the evaluation script provided here: [https://github.com/Anania-AI/Arm-LLM-Benchmark](https://github.com/Anania-AI/Arm-LLM-Benchmark)
                    2. **Format your submission file**:
                        - After evaluation, you will get a `result.json` file. Ensure the file follows this format:
                        ```json
                        {
                            "mmlu_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ],
                            "unified_exam_results": [
                                {
                                    "category": "category_name",
                                    "score": score_value
                                },
                                ...
                            ]
                        }
                        ```
                    3. **Submit your model**:
                        - Add the `arm_bench` tag and the `result.json` file to your model card.
                        - Click on the "Refresh Data" button in this app, and you will see your model's results.
                    """
                )
                gr.Markdown("## Contributing")
                gr.Markdown(
                    """
                    You can contribute to this benchmark in several ways:

                    - Providing API credits for evaluating API-based models.
                    - Citing our work in your research and publications.
                    - Contributing to the development of the benchmark itself.
                    """
                )

        refresh_button = gr.Button("Refresh Data")
        refresh_button.click(
            fn=refresh_data,
            outputs=[table_output_armenian, table_output_mmlu, plot_output_armenian, plot_output_mmlu],
        )

    app.launch(share=True, debug=True)


if __name__ == "__main__":
    main()
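

# ---------------------------------------------------------------------------
# Optional helper (a minimal sketch, not used by the app above): submitters can
# run this locally to sanity-check that the `result.json` produced by the
# evaluation script matches the structure described in the Submission Guide,
# i.e. "mmlu_results" and "unified_exam_results" lists of {"category", "score"}
# entries. The function name and the checks performed are illustrative
# assumptions, not part of the official evaluation script.
def validate_result_file(path: str) -> None:
    """Raise ValueError if `path` does not look like a valid submission file."""
    import json

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for key in ("mmlu_results", "unified_exam_results"):
        entries = data.get(key)
        if not isinstance(entries, list) or not entries:
            raise ValueError(f"'{key}' must be a non-empty list")
        for entry in entries:
            if "category" not in entry or "score" not in entry:
                raise ValueError(f"malformed entry in '{key}': {entry}")
    print(f"{path} looks valid for submission.")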