import json
from datetime import datetime, date

import gradio as gr
import plotly.graph_objects as go


def create_big_five_capex_plot() -> go.Figure:
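    """Build a stacked bar chart of quarterly capital expenditures for Microsoft, Google, Meta and Amazon.

    Each line of big_five_capex.jsonl is expected to hold a JSON object with a
    "Quarter" key plus one spending figure (in millions of USD) per company.
    """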
    with open("big_five_capex.jsonl", "r") as file:
        data = [json.loads(line) for line in file if line.strip()]

    quarters: list[str] = [entry["Quarter"] for entry in data]
    companies = ['Microsoft', 'Google', 'Meta', 'Amazon']
    colors = ['#80bb00', '#ee161f', '#0065e3', '#ff6200']

    x_positions = list(range(len(quarters)))

    traces = []
    for company, color in zip(companies, colors):
        y_data = [entry[company] for entry in data]
        traces.append(go.Bar(
            name=company,
            x=x_positions,
            y=y_data,
            marker_color=color
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode="stack",
        title="Capital Expenditures of Amazon, Meta, Google and Microsoft in Millions of USD per Quarter",
        xaxis_title="Quarter",
        yaxis_title="Capital Expenditures (Millions USD)",
        xaxis=dict(
            tickmode='array',
            tickvals=x_positions,
            ticktext=quarters
        ),
        height=800
    )

    # Place a vertical marker between 2023 Q1 and 2023 Q2; fall back to the
    # left edge if either quarter is missing from the data.
    try:
        idx_q1 = quarters.index("2023 Q1")
        idx_q2 = quarters.index("2023 Q2")
        vline_x = (idx_q1 + idx_q2) / 2
    except ValueError:
        vline_x = 0
    fig.add_shape(
        type="line",
        xref="x",
        yref="paper",
        x0=vline_x,
        y0=0,
        x1=vline_x,
        y1=1,
        line=dict(
            color="black",
            dash="dot",
            width=2
        )
    )

    fig.add_annotation(
        x=vline_x,
        y=1.05,
        xref="x",
        yref="paper",
        text="AI arms race begins",
        showarrow=False,
        font=dict(
            color="black",
            size=12
        ),
        align="center"
    )

    return fig


def create_simple_plot(data_path: str,
                       name: str,
                       subtitle: str,
                       start_date: date, end_date: date,
                       min_value: int = 0, max_value: int = 100,
                       labeled_horizontal_lines: dict[str, float] | None = None) -> go.Figure:
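    """Plot the best benchmark score to date as a step line over model release dates.

    data_path points to a JSONL leaderboard with "model" and "score" fields;
    release dates are looked up in models.jsonl ("Name", "Release Date").
    labeled_horizontal_lines optionally maps annotation labels (e.g. human
    baselines) to y-values drawn as dotted horizontal lines.
    """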
    leaderboard = []
    with open(data_path, 'r') as file:
        for line in file:
            leaderboard.append(json.loads(line))

    models = []
    with open("models.jsonl", 'r') as file:
        for line in file:
            models.append(json.loads(line))

    data = []
    for entry in leaderboard:
        model_name = entry['model']
        score = entry['score']
        model_info = next((m for m in models if m['Name'] == model_name), None)
        if model_info:
            release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
            data.append({'model': model_name, 'score': score, 'release_date': release_date})
        else:
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")

    data.sort(key=lambda x: x['release_date'])
    # Running maximum: the y-value at each release date is the best score achieved so far.
    x_dates = [d['release_date'] for d in data]
    y_scores = []
    max_score = 0
    for entry in data:
        if entry['score'] > max_score:
            max_score = entry['score']
        y_scores.append(max_score)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_scores,
        mode='lines',
        line=dict(shape='hv', width=2),
        name='Best Score to Date'
    ))

    # Mark each model that set a new best score at its release date.
    for i, entry in enumerate(data):
        if i == 0 or y_scores[i] > y_scores[i - 1]:
            fig.add_trace(go.Scatter(
                x=[entry['release_date']],
                y=[entry['score']],
                mode='markers+text',
                marker=dict(size=10),
                text=[entry['model']],
                textposition="top center",
                name=entry['model']
            ))
    fig.update_layout(
        title=f'{name} Over Time<br><sup>{subtitle}</sup>',
        xaxis_title='Publication or Release Date',
        yaxis_title=name,
        hovermode='x unified',
        xaxis=dict(
            range=[start_date, end_date],
            type='date'
        ),
        yaxis=dict(
            range=[min_value, max_value]
        ),
        height=800
    )

    if labeled_horizontal_lines:
        for label, y_value in labeled_horizontal_lines.items():
            fig.add_hline(
                y=y_value,
                line_dash="dot",
                line_color="black",
                annotation_text=label,
                annotation_position="right",
                annotation=dict(
                    font_size=12,
                    font_color="black",
                    xanchor="left",
                    yanchor="middle",
                    xshift=10
                )
            )

    return fig


with gr.Blocks() as demo:
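    # One tab per benchmark, grouped under "System Performance Over Time",
    # plus a "Finance" tab. Each plot is built lazily when its tab is selected.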
    with gr.Tab("System Performance Over Time"):
        with gr.Tab("Legend"):
            legend_markdown: gr.Markdown = gr.Markdown(
                value="""
## Benchmarks and Top Scores

| Benchmark | Top Score |
|-----------|-----------|
| BigCodeBench | 🟠 36% |
| Simple Bench | 🟠 42% |
| PlanBench | 🟠 53% |
| GAIA | 🟡 65% |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
| GPQA | 🟡 76% |
| ZebraLogic | 🟡 81% |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
| ZeroEval | 🟡 86% |
| MATH-L5 | 🟡 89% |
| MMLU-Redux | 🟢 93% |
| CRUX | 🟢 96% |

## Colors

| Color | Score Range |
|-------|------------|
| 🔴 Red | Below 30% |
| 🟠 Orange | 30% to 60% |
| 🟡 Yellow | 60% to 90% |
| 🟢 Green | Above 90% |"""
            )
        with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
            bigcodebench_plot: gr.Plot = gr.Plot()
            bigcodebench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [BigCodeBench Leaderboard](https://bigcode-bench.github.io/)"""
            )
        with gr.Tab("🟠 Simple Bench") as simple_bench_tab:
            simple_bench_plot: gr.Plot = gr.Plot()
            simple_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
            )
        with gr.Tab("🟠 PlanBench") as planbench_tab:
            planbench_plot: gr.Plot = gr.Plot()
            planbench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
            )
        with gr.Tab("🟡 GAIA") as gaia_tab:
            gaia_plot: gr.Plot = gr.Plot()
            gaia_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
            )
        with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
            with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
                arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
            with gr.Tab("🟡 Public Eval") as arc_agi_public_eval_tab:
                arc_agi_public_eval_plot: gr.Plot = gr.Plot()
            arc_agi_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
            )
        with gr.Tab("🟡 GPQA") as gpqa_tab:
            gpqa_plot: gr.Plot = gr.Plot()
            gpqa_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
            )
        with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
            zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
            zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
            zeroeval_average_plot: gr.Plot = gr.Plot()
            zeroeval_average_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟡 MATH-L5") as zeroeval_math_l5_tab:
            zeroeval_math_l5_plot: gr.Plot = gr.Plot()
            zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
            zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
            zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟢 CRUX") as zeroeval_crux_tab:
            zeroeval_crux_plot: gr.Plot = gr.Plot()
            zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("Codeforces") as codeforces_tab:
            codeforces_plot: gr.Plot = gr.Plot()
        with gr.Tab("OpenCompass", visible=False):
            opencompass_plot: gr.Plot = gr.Plot()
            opencompass_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [OpenCompass LLM Leaderboard](https://huggingface.co/spaces/opencompass/opencompass-llm-leaderboard)"""
            )
        with gr.Tab("SWE-bench", visible=False):
            swe_bench_plot: gr.Plot = gr.Plot()
            swe_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
            )
        with gr.Tab("WebArena", visible=False):
            webarena_plot: gr.Plot = gr.Plot()
            webarena_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
            )
    with gr.Tab("Finance") as finance_tab:
        with gr.Tab("Big Tech Capex") as big_five_capex_tab:
            big_five_capex_plot: gr.Plot = gr.Plot()
        with gr.Tab("NVIDIA Revenue", visible=False) as nvidia_revenue:
            nvidia_revenue_plot: gr.Plot = gr.Plot()
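
    # Build each figure lazily: a tab's .select event recreates its plot, with
    # gr.State objects supplying the per-benchmark arguments to create_simple_plot.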
    big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    arc_agi_public_eval_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("arc_agi_leaderboard.jsonl"),
                                           gr.State("ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                           gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                           gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                           gr.State(0), gr.State(100),
                                           gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
                                   outputs=arc_agi_public_eval_plot)
    arc_agi_tab.select(fn=create_simple_plot,
                       inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                               gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                               gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                               gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                               gr.State(0), gr.State(100),
                               gr.State({"MTurkers": 77})],
                       outputs=arc_agi_semi_private_eval_plot)
    arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
                                         inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                                                 gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                                 gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                                 gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                                 gr.State(0), gr.State(100),
                                                 gr.State({"MTurkers": 77})],
                                         outputs=arc_agi_semi_private_eval_plot)
    finance_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    simple_bench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("simple_bench_leaderboard.jsonl"),
                                    gr.State("Simple Bench Score"),
                                    gr.State("\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
                                    gr.State(date(2024, 4, 1)), gr.State(date(2025, 1, 1)),
                                    gr.State(0), gr.State(100),
                                    gr.State({"Humans": 83.7})],
                            outputs=simple_bench_plot)
    codeforces_tab.select(fn=create_simple_plot,
                          inputs=[gr.State("codeforces_leaderboard.jsonl"),
                                  gr.State("Codeforces Rating"),
                                  gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
                                  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                  gr.State(0), gr.State(4000),
                                  gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
                          outputs=codeforces_plot)
    planbench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("planbench_leaderboard.jsonl"),
                                 gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
                                 gr.State("\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
                                 gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
                         outputs=planbench_plot)
    bigcodebench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
                                    gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
                                    gr.State("\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
                                    gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
                            outputs=bigcodebench_plot)
    gaia_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gaia_leaderboard.jsonl"),
                            gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
                            gr.State("\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
                            gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(0), gr.State(100),
                            gr.State({"Humans": 92})],
                    outputs=gaia_plot)
    gpqa_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gpqa_leaderboard.jsonl"),
                            gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
                            gr.State("\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
                            gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(25), gr.State(100),
                            gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
                    outputs=gpqa_plot)
    zeroeval_average_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
                                        gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-L5) Score"),
                                        gr.State("\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_average_plot)
    zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
                                           gr.State("ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
                                           gr.State("\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_mmlu_redux_plot)
    zeroeval_zebralogic_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
                                           gr.State("ZeroEval ZebraLogic Score"),
                                           gr.State("\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_zebralogic_plot)
    zeroeval_crux_tab.select(fn=create_simple_plot,
                             inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
                                     gr.State("ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
                                     gr.State("\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
                                     gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                             outputs=zeroeval_crux_plot)
    zeroeval_math_l5_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
                                        gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
                                        gr.State("\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_math_l5_plot)
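

# Running this script directly launches the Gradio app; the *.jsonl leaderboard
# and capex files are expected in the working directory.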
if __name__ == "__main__":
    demo.launch()