File size: 1,790 Bytes
76050e6 43b9e03 4821c71 76050e6 b54190c 4bec130 76050e6 43b9e03 76050e6 4821c71 76050e6 4bec130 76050e6 4bec130 43b9e03 76050e6 43b9e03 76050e6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
import streamlit as st
from draw_utils import PAGE_MARKDOWN, PAGE_INFO, LENGTHS
from draw_utils import load_results, style_dataframe
# Page-level setup, executed on import of this script: use the wide layout
# and set the browser tab title, then emit the shared page markup.
st.set_page_config(
    page_title="Leaderboard App",
    layout="wide",
)
# unsafe_allow_html=True lets any HTML contained in PAGE_MARKDOWN render as-is.
st.markdown(PAGE_MARKDOWN, unsafe_allow_html=True)
def _select_task_rows(df, task, columns, search_term="", required_columns=()):
    """Return the rows of *df* for one task, ready for display.

    Args:
        df: Results frame with a ``task`` column and every name in *columns*.
        task: Value of ``df.task`` to keep.
        columns: Columns to keep, in display order; must include
            ``"model_name"`` when *search_term* is non-empty.
        search_term: Case-insensitive substring to look for in
            ``model_name``. Empty means "no filtering".
        required_columns: Columns that must all be non-missing; rows with a
            missing value in any of them are dropped.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    task_df = df.loc[df.task == task, columns]

    if required_columns:
        has_gap = task_df[list(required_columns)].isna().any(axis=1)
        task_df = task_df.loc[~has_gap]

    if search_term:
        # regex=False: the term comes straight from a text box, so characters
        # such as "(" or "+" must be matched literally instead of being
        # compiled as a regular expression (which can raise re.error).
        # na=False: rows with a missing model name simply do not match.
        matches = task_df["model_name"].str.contains(
            search_term, case=False, regex=False, na=False
        )
        task_df = task_df.loc[matches]

    return task_df.reset_index(drop=True)


def draw_leaderboard() -> None:
    """Render the leaderboard page.

    Loads the results via ``load_results()``, draws the title, the info
    text and a model search box, then creates one tab per task. Each tab shows
    the styled results table for that task, filtered by the search term.
    """
    df = load_results()
    avg_task = 'avg(qa1-5)'
    tasks = [avg_task] + [f"qa{i}" for i in range(1, 11)]
    # NOTE(review): the non-ASCII text in the column names and in the title
    # below looks mis-encoded in this copy of the file. It is left untouched
    # because it must match the column names returned by load_results() --
    # confirm against the data.
    columns = ["model_name", "β€32k", "β€128k"] + LENGTHS
    row_height = 35  # pixels per table row, used to size the table to its content

    st.title("πππͺ‘πβ BABILong Leaderboard π")
    st.markdown(PAGE_INFO)
    st.subheader("Evaluation results:")
    st.text('Each tab corresponds to a task, avg - averaged scores over qa1-5 tasks.')
    st.markdown('Predictions of all evaluated models: '
                '[BABILong evals](https://huggingface.co/datasets/RMT-team/babilong_evals)')

    # Surrounding whitespace is almost always accidental; ignore it.
    search_term = st.text_input("Search models:", "").strip()

    tabs = st.tabs(tasks)
    for task, tab in zip(tasks, tabs):
        with tab:
            # On the averaged tab only, do not display models that have a
            # missing value in any of the first five selected columns
            # (original intent: hide models lacking the short-context evals).
            required = columns[:5] if task == avg_task else ()
            task_df = _select_task_rows(df, task, columns, search_term, required)

            # One extra row accounts for the header.
            height = (len(task_df) + 1) * row_height
            styled_df = style_dataframe(task_df).format(precision=1)
            st.dataframe(
                styled_df,
                width=1070,
                height=height,
                column_config={
                    "model_name": st.column_config.Column(width=260, pinned=True)
                },
            )
# Script entry point: draw the page only when this file is executed as the
# main script, not when it is imported as a module.
if __name__ == "__main__":
    draw_leaderboard()
|