import gradio as gr import pandas as pd from apscheduler.schedulers.background import BackgroundScheduler # Removed Hugging Face Hub imports as they are not needed for the simplified leaderboard # from huggingface_hub import snapshot_download, HfApi from src.about import ( # Assuming these still exist and are relevant for other tabs CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TITLE, ) from src.display.css_html_js import custom_css # Keep custom CSS # Removed utils imports related to the old leaderboard # from src.display.utils import (...) from src.envs import REPO_ID # Keep if needed for restart_space or other functions # Removed constants related to old data paths and repos if not needed elsewhere # from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN # Removed old data processing functions # from src.populate import get_evaluation_queue_df, get_leaderboard_df from src.submission.submit import add_new_eval # Keep submission logic # --- Elo Leaderboard Configuration --- # Data from the table provided by the user data = [ {'model': 'gpt-4o-mini', 'MLE-Lite_Elo': 753, 'Tabular_Elo': 839, 'NLP_Elo': 758, 'CV_Elo': 754, 'Overall': 778}, {'model': 'gpt-4o', 'MLE-Lite_Elo': 830, 'Tabular_Elo': 861, 'NLP_Elo': 903, 'CV_Elo': 761, 'Overall': 841}, {'model': 'o3-mini', 'MLE-Lite_Elo': 1108, 'Tabular_Elo': 1019, 'NLP_Elo': 1056, 'CV_Elo': 1207, 'Overall': 1096}, # Renamed 'DeepSeek-v3' to match previous list - adjust if needed {'model': 'deepseek-v3', 'MLE-Lite_Elo': 1004, 'Tabular_Elo': 1015, 'NLP_Elo': 1028, 'CV_Elo': 1067, 'Overall': 1023}, # Renamed 'DeepSeek-r1' to match previous list - adjust if needed {'model': 'deepseek-r1', 'MLE-Lite_Elo': 1137, 'Tabular_Elo': 1053, 'NLP_Elo': 1103, 'CV_Elo': 1083, 'Overall': 1100}, # Renamed 'Gemini-2.0-Flash' to match previous list - adjust if needed {'model': 'gemini-2.0-flash', 'MLE-Lite_Elo': 847, 'Tabular_Elo': 923, 'NLP_Elo': 860, 'CV_Elo': 978, 'Overall': 895}, # Renamed 'Gemini-2.0-Pro' to match previous list - adjust if needed {'model': 'gemini-2.0-pro', 'MLE-Lite_Elo': 1064, 'Tabular_Elo': 1139, 'NLP_Elo': 1028, 'CV_Elo': 973, 'Overall': 1054}, # Renamed 'Gemini-2.5-Pro' to match previous list - adjust if needed {'model': 'gemini-2.5-pro', 'MLE-Lite_Elo': 1257, 'Tabular_Elo': 1150, 'NLP_Elo': 1266, 'CV_Elo': 1177, 'Overall': 1214}, ] # Create a master DataFrame master_df = pd.DataFrame(data) # Define categories for selection (user-facing) CATEGORIES = ["MLE-Lite", "Tabular", "NLP", "CV", "Overall"] DEFAULT_CATEGORY = "Overall" # Set a default category # Map user-facing categories to DataFrame column names category_to_column = { "MLE-Lite": "MLE-Lite_Elo", "Tabular": "Tabular_Elo", "NLP": "NLP_Elo", "CV": "CV_Elo", "Overall": "Overall" } # --- Helper function to update leaderboard --- def update_leaderboard(category): """ Selects the relevant columns for the category, renames the score column to 'Elo Score', sorts by score descending, and returns the DataFrame. """ score_column = category_to_column.get(category) if score_column is None or score_column not in master_df.columns: # Fallback if category or column is invalid print(f"Warning: Invalid category '{category}' or column '{score_column}'. Falling back to default.") score_column = category_to_column[DEFAULT_CATEGORY] if score_column not in master_df.columns: # Check fallback column too return pd.DataFrame({"Model": [], "Elo Score": []}) # Return empty if still invalid # Select model and the specific score column df = master_df[['model', score_column]].copy() # Rename the score column to 'Elo Score' for consistent display df.rename(columns={score_column: 'Elo Score'}, inplace=True) # Sort by 'Elo Score' descending df.sort_values(by='Elo Score', ascending=False, inplace=True) # Reset index for cleaner display (optional) df.reset_index(drop=True, inplace=True) return df # --- Mock/Placeholder functions/data for other tabs --- # (Same as previous version - providing empty data) print("Warning: Evaluation queue data fetching is disabled/mocked due to leaderboard changes.") finished_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"]) running_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"]) pending_eval_queue_df = pd.DataFrame(columns=["Model", "Status", "Requested", "Started"]) EVAL_COLS = ["Model", "Status", "Requested", "Started"] # Define for the dataframe headers EVAL_TYPES = ["str", "str", "str", "str"] # Define for the dataframe types # --- Keep restart function if relevant --- # (Same as previous version) def restart_space(): print(f"Attempting to restart space: {REPO_ID}") # Replace with your actual space restart mechanism if needed # --- Gradio App Definition --- demo = gr.Blocks(css=custom_css) with demo: gr.HTML(TITLE) gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text") with gr.Tabs(elem_classes="tab-buttons") as tabs: with gr.TabItem("🏅 MLE-Dojo Benchmark", elem_id="llm-benchmark-tab-table", id=0): with gr.Column(): gr.Markdown("## Model Elo Rankings") # New title for the section category_selector = gr.Radio( choices=CATEGORIES, label="Select Category to Sort By", # Updated label value=DEFAULT_CATEGORY, # Default selection interactive=True, container=False, ) leaderboard_df_component = gr.Dataframe( # Initialize with sorted data for the default category value=update_leaderboard(DEFAULT_CATEGORY), headers=["Model", "Elo Score"], datatype=["str", "number"], interactive=False, # Adjust row count based on the number of models row_count=(len(master_df), "fixed"), col_count=(2, "fixed"), ) # Link the radio button change to the update function category_selector.change( fn=update_leaderboard, inputs=category_selector, outputs=leaderboard_df_component ) with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2): # (Content unchanged) gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text") # with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3): # # (Content unchanged, still uses potentially empty/mock queue data) # with gr.Column(): # with gr.Row(): # gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") # with gr.Column(): # with gr.Accordion( # f"✅ Finished Evaluations ({len(finished_eval_queue_df)})", # open=False, # ): # with gr.Row(): # finished_eval_table = gr.components.Dataframe( # value=finished_eval_queue_df, # headers=EVAL_COLS, # datatype=EVAL_TYPES, # row_count=5, # ) # with gr.Accordion( # f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})", # open=False, # ): # with gr.Row(): # running_eval_table = gr.components.Dataframe( # value=running_eval_queue_df, # headers=EVAL_COLS, # datatype=EVAL_TYPES, # row_count=5, # ) # with gr.Accordion( # f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", # open=False, # ): # with gr.Row(): # pending_eval_table = gr.components.Dataframe( # value=pending_eval_queue_df, # headers=EVAL_COLS, # datatype=EVAL_TYPES, # row_count=5, # ) # with gr.Row(): # gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text") # with gr.Row(): # # Submission form - kept as is # with gr.Column(): # model_name_textbox = gr.Textbox(label="Model name") # revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") # model_type = gr.Dropdown( # choices=["Type A", "Type B", "Type C"], # Example choices # label="Model type", # multiselect=False, # value=None, # interactive=True, # ) # with gr.Column(): # precision = gr.Dropdown( # choices=["float16", "bfloat16", "float32", "int8"], # Example choices # label="Precision", # multiselect=False, # value="float16", # interactive=True, # ) # weight_type = gr.Dropdown( # choices=["Original", "Adapter", "Delta"], # Example choices # label="Weights type", # multiselect=False, # value="Original", # interactive=True, # ) # base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)") # submit_button = gr.Button("Submit Eval") # submission_result = gr.Markdown() # submit_button.click( # add_new_eval, # [ # model_name_textbox, # base_model_name_textbox, # revision_name_textbox, # precision, # weight_type, # model_type, # ], # submission_result, # ) with gr.Row(): with gr.Accordion("📙 Citation", open=False): # (Content unchanged) citation_button = gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) # --- Keep scheduler if relevant --- # scheduler = BackgroundScheduler() # scheduler.add_job(restart_space, "interval", seconds=1800) # Restart every 30 mins # scheduler.start() # --- Launch the app --- demo.launch()