|
|
|
|
|
|
|
|
|
|
|
import json |
|
import random |
|
import pandas as pd |
|
import streamlit as st |
|
from datasets import load_dataset |
|
from datasets import get_dataset_config_names |
|
|
|
# Page title.
st.title("Code:blue[Arena]")

# Lookup table used by the rest of the page: problem key -> dataset row.
# Every stored row additionally carries a 'type' field naming its source
# dataset ("leetcode" or "apps").
problem_dict = {}

# Venus (LeetCode) rows are keyed by their title.
with st.spinner("Loading Venus data...", show_time=True):
    venus_ds = load_dataset("Elfsong/leetcode_data", split='train')
    problem_dict.update(
        (row["title"], {**row, "type": "leetcode"})
        for row in venus_ds
    )

# APPS rows are keyed by "apps_<problem_id>".
with st.spinner("Loading APPS data...", show_time=True):
    apps_ds = load_dataset("Elfsong/APPS_Python", split='test')
    problem_dict.update(
        (f'apps_{row["problem_id"]}', {**row, "type": "apps"})
        for row in apps_ds
    )

# Total number of problems across both datasets.
problem_count = len(problem_dict)
|
|
|
|
|
# Routing: when the URL carries a "problem" query parameter, render the detail
# page for that single problem; otherwise render the tabbed overview.
if "problem" in st.query_params:
    problem_id = str(st.query_params["problem"])
    problem_instance = problem_dict.get(problem_id)

    # The id comes straight from the URL, so it may not match any loaded
    # problem. Report that on the page instead of failing with a KeyError.
    if problem_instance is None:
        st.error(f"Unknown problem: {problem_id}")
        st.stop()

    problem_type = problem_instance['type']

    st.header(problem_id)

    with st.expander("Problem Description"):
        # The statement is stored under a different key in each dataset.
        if problem_type == "leetcode":
            st.markdown(problem_instance["question_content"])
        elif problem_type == "apps":
            st.markdown(problem_instance["problem_content"])

    with st.expander("Test Cases"):
        # 'test_cases' is a JSON string; each decoded entry is read through
        # its 'input' and 'output' keys.
        test_cases = json.loads(problem_instance["test_cases"])
        df = pd.DataFrame(
            {
                "input": [test_case['input'] for test_case in test_cases],
                "output": [test_case['output'] for test_case in test_cases],
            }
        )
        st.dataframe(
            df,
            column_config={
                "input": st.column_config.TextColumn("Input"),
                "output": st.column_config.TextColumn("Output"),
            },
            column_order=("input", "output"),
        )

    with st.expander("Test Case Generator"):
        if problem_type == "leetcode":
            # Only the first 20 lines of the generator source are shown,
            # preceded by a notice explaining the truncation.
            prompt = "# For now, we only disclose the top 20 lines of the test case generator.\n# the full version will be released after the paper review process.\n"
            generator_lines = problem_instance["test_case_generator"].split("\n")
            test_case_generator = "\n".join(generator_lines[:20])
            st.code(prompt + test_case_generator)
        else:
            st.code("Stay tuned!")

else:
    tab_problem, tab_submission, tab_model, tab_about = st.tabs(
        ["Problems", "Submissions", "Models", "About"]
    )

    with tab_problem:
        with st.spinner("Loading Framework...", show_time=True):
            # Materialise the rows once so every column is built from the
            # same sequence.
            problems = list(problem_dict.values())
            link_prefix = "https://huggingface.co/spaces/Elfsong/CodeArena/?problem="
            df = pd.DataFrame(
                {
                    "problem_id": [int(problem['problem_id']) for problem in problems],
                    "difficulty": [str(problem['difficulty']) for problem in problems],
                    "type": [str(problem['type']) for problem in problems],
                    # The link's query value mirrors the key scheme used when
                    # problem_dict was filled: title for leetcode rows,
                    # "apps_<problem_id>" otherwise.
                    "problem_link": [
                        link_prefix + (
                            str(problem['title'])
                            if problem['type'] == "leetcode"
                            else f'apps_{problem["problem_id"]}'
                        )
                        for problem in problems
                    ],
                    # Placeholder data: 20 random points per problem.
                    "acceptance_rate": [
                        [random.randint(0, 100) for _ in range(20)]
                        for _ in problems
                    ],
                }
            )
            st.dataframe(
                df,
                column_config={
                    "problem_id": st.column_config.NumberColumn("Problem ID", width='small'),
                    "difficulty": st.column_config.TextColumn("Difficulty", width='small'),
                    "type": st.column_config.TextColumn("Type", width='small'),
                    "acceptance_rate": st.column_config.LineChartColumn("Acceptance Rate", y_min=0, y_max=100),
                    "problem_link": st.column_config.LinkColumn("Link", display_text="Open", width='small'),
                },
                height=800,
                column_order=("problem_id", "difficulty", "type", "acceptance_rate", "problem_link"),
                hide_index=True,
            )

    with tab_submission:
        st.header("Submissions")
        # Each config of the evaluation dataset is offered as a selectable model.
        models = get_dataset_config_names("Elfsong/Venus_Model_Evaluation")
        model_name = st.selectbox("Which model you are looking for?", models, placeholder="Select a model...")
        st.write("You selected:", model_name)

        with st.spinner("Loading Data...", show_time=True):
            ds = load_dataset("Elfsong/Venus_Model_Evaluation", model_name, split='train')
            df = pd.DataFrame(
                {
                    "problem_id": [int(problem['problem_id']) for problem in ds],
                    "solution": [str(problem['solution']) for problem in ds],
                }
            )
            st.dataframe(
                df,
                column_config={
                    "problem_id": st.column_config.NumberColumn("Problem ID", width='small'),
                    # Documented width presets are "small", "medium" and
                    # "large"; "big" is not one of them.
                    "solution": st.column_config.TextColumn("Solution", width='large'),
                },
                height=800,
                column_order=("problem_id", "solution"),
                hide_index=True,
            )

    with tab_model:
        model_list = [
            "deepSeek-Coder",
            "GPT-4o",
            "Claude-3-5-sonnet",
            "Gemini-1.5-flash",
            "DeepSeek-Coder-V2-Lite",
            "Claude-3-Opus",
            "Gemini-1.5-pro",
            "Llama-3.1-8B",
            "Llama-3-8B",
            "GPT-4-Turbo",
            "GPT-3.5-Turbo",
            "Mistral-Nemo",
            "CodeLlama-13b",
            "Claude-3-Haiku",
            "Mistral-7B-v0.3",
            "Codestral-22B-v0.1",
            "Claude-3-sonnet",
            "CodeLlama-34b",
            "CodeLlama-7b",
        ]

        # Placeholder leaderboard: every metric is zero and progress is random.
        zero_column = [0] * len(model_list)
        df = pd.DataFrame(
            {
                "model_name": list(model_list),
                "dynamic_point": list(zero_column),
                "pass@1": list(zero_column),
                "beyond@t": list(zero_column),
                "beyond@m": list(zero_column),
                # random.randint includes both endpoints, so the upper bound is
                # problem_count itself to stay within the progress bar's
                # max_value below.
                "model_progress": [random.randint(0, problem_count) for _ in model_list],
            }
        )

        st.dataframe(
            df,
            column_config={
                "model_name": st.column_config.TextColumn("Model Name"),
                "dynamic_point": st.column_config.NumberColumn("Dynamic Point"),
                "pass@1": st.column_config.NumberColumn("Pass@1"),
                "beyond@t": st.column_config.NumberColumn("Beyond@Time"),
                "beyond@m": st.column_config.NumberColumn("Beyond@Memory"),
                "model_progress": st.column_config.ProgressColumn("Progress", min_value=0, max_value=problem_count, format="compact"),
            },
            # column_order takes DataFrame column keys, not display labels.
            column_order=("model_name", "dynamic_point", "pass@1", "beyond@t", "beyond@m", "model_progress"),
            height=800,
        )

    with tab_about:
        st.write("Hello World!")
        st.write("This is the new version of Code Arena. Refer to [Monolith](https://github.com/Elfsong/Monolith) for instructions on how to submit code.")
        st.write("🚧 WIP: We will update real data very soon!")
|
|