import json
from datetime import datetime, date

import gradio as gr
import plotly.graph_objects as go


def create_big_five_capex_plot() -> go.Figure:
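    """Build a stacked bar chart of quarterly capital expenditures for Microsoft, Google, Meta and Amazon.

    Each line of big_five_capex.jsonl is expected to hold a JSON object with a
    "Quarter" key plus one spending figure (in millions of USD) per company.
    """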
    with open("big_five_capex.jsonl", "r") as file:
        data = [json.loads(line) for line in file if line.strip()]

    quarters: list[str] = [entry["Quarter"] for entry in data]
    companies = ['Microsoft', 'Google', 'Meta', 'Amazon']
    colors = ['#80bb00', '#ee161f', '#0065e3', '#ff6200']

    x_positions = list(range(len(quarters)))

    traces = []
    for company, color in zip(companies, colors):
        y_data = [entry[company] for entry in data]
        traces.append(go.Bar(
            name=company,
            x=x_positions,
            y=y_data,
            marker_color=color
        ))

    fig = go.Figure(data=traces)
    fig.update_layout(
        barmode="stack",
        title="Capital Expenditures of Amazon, Meta, Google and Microsoft in Millions of USD per Quarter",
        xaxis_title="Quarter",
        yaxis_title="Capital Expenditures (Millions USD)",
        xaxis=dict(
            tickmode='array',
            tickvals=x_positions,
            ticktext=quarters
        ),
        height=800
    )

    # Place a vertical marker between 2023 Q1 and 2023 Q2; fall back to the
    # left edge if either quarter is missing from the data.
    try:
        idx_q1 = quarters.index("2023 Q1")
        idx_q2 = quarters.index("2023 Q2")
        vline_x = (idx_q1 + idx_q2) / 2
    except ValueError:
        vline_x = 0
    fig.add_shape(
        type="line",
        xref="x",
        yref="paper",
        x0=vline_x,
        y0=0,
        x1=vline_x,
        y1=1,
        line=dict(
            color="black",
            dash="dot",
            width=2
        )
    )

    fig.add_annotation(
        x=vline_x,
        y=1.05,
        xref="x",
        yref="paper",
        text="AI arms race begins",
        showarrow=False,
        font=dict(
            color="black",
            size=12
        ),
        align="center"
    )

    return fig


def create_simple_plot(data_path: str,
                       name: str,
                       subtitle: str,
                       start_date: date, end_date: date,
                       min_value: int = 0, max_value: int = 100,
                       labeled_horizontal_lines: dict[str, float] | None = None) -> go.Figure:
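    """Plot the best benchmark score to date as a step line over model release dates.

    data_path points to a JSONL leaderboard with "model" and "score" fields;
    release dates are looked up in models.jsonl ("Name", "Release Date").
    labeled_horizontal_lines optionally maps annotation labels (e.g. human
    baselines) to y-values drawn as dotted horizontal lines.
    """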
    leaderboard = []
    with open(data_path, 'r') as file:
        for line in file:
            leaderboard.append(json.loads(line))

    models = []
    with open("models.jsonl", 'r') as file:
        for line in file:
            models.append(json.loads(line))

    data = []
    for entry in leaderboard:
        model_name = entry['model']
        score = entry['score']
        model_info = next((m for m in models if m['Name'] == model_name), None)
        if model_info:
            release_date = datetime.strptime(model_info['Release Date'], "%Y-%m-%d")
            data.append({'model': model_name, 'score': score, 'release_date': release_date})
        else:
            print(f"[WARNING] Model '{model_name}' not found in models.jsonl")

    data.sort(key=lambda x: x['release_date'])
    # Running maximum: the y-value at each release date is the best score achieved so far.
    x_dates = [d['release_date'] for d in data]
    y_scores = []
    max_score = 0
    for entry in data:
        if entry['score'] > max_score:
            max_score = entry['score']
        y_scores.append(max_score)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=x_dates,
        y=y_scores,
        mode='lines',
        line=dict(shape='hv', width=2),
        name='Best Score to Date'
    ))

    # Mark each model that set a new best score at its release date.
    for i, entry in enumerate(data):
        if i == 0 or y_scores[i] > y_scores[i - 1]:
            fig.add_trace(go.Scatter(
                x=[entry['release_date']],
                y=[entry['score']],
                mode='markers+text',
                marker=dict(size=10),
                text=[entry['model']],
                textposition="top center",
                name=entry['model']
            ))
    fig.update_layout(
        title=f'{name} Over Time<br><sup>{subtitle}</sup>',
        xaxis_title='Publication or Release Date',
        yaxis_title=name,
        hovermode='x unified',
        xaxis=dict(
            range=[start_date, end_date],
            type='date'
        ),
        yaxis=dict(
            range=[min_value, max_value]
        ),
        height=800
    )

    if labeled_horizontal_lines:
        for label, y_value in labeled_horizontal_lines.items():
            fig.add_hline(
                y=y_value,
                line_dash="dot",
                line_color="black",
                annotation_text=label,
                annotation_position="right",
                annotation=dict(
                    font_size=12,
                    font_color="black",
                    xanchor="left",
                    yanchor="middle",
                    xshift=10
                )
            )

    return fig


with gr.Blocks() as demo:
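    # One tab per benchmark, grouped under "System Performance Over Time",
    # plus a "Finance" tab. Each plot is built lazily when its tab is selected.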
    with gr.Tab("System Performance Over Time"):
        with gr.Tab("Legend"):
            legend_markdown: gr.Markdown = gr.Markdown(
                value="""
## Benchmarks and Top Scores

| Benchmark | Top Score |
|-----------|-----------|
| BigCodeBench | 🟠 36% |
| Simple Bench | 🟠 42% |
| PlanBench | 🟠 53% |
| GAIA | 🟡 65% |
| ARC-AGI-Pub (Semi-Private Eval) | 🟡 76% |
| GPQA | 🟡 76% |
| ZebraLogic | 🟡 81% |
| ARC-AGI-Pub (Public Eval) | 🟡 83% |
| ZeroEval | 🟡 86% |
| MATH-L5 | 🟡 89% |
| MMLU-Redux | 🟢 93% |
| CRUX | 🟢 96% |

## Colors

| Color | Score Range |
|-------|------------|
| 🔴 Red | Below 30% |
| 🟠 Orange | 30% to 60% |
| 🟡 Yellow | 60% to 90% |
| 🟢 Green | Above 90% |"""
            )
        with gr.Tab("🟠 BigCodeBench") as bigcodebench_tab:
            bigcodebench_plot: gr.Plot = gr.Plot()
            bigcodebench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [BigCodeBench Leaderboard](https://bigcode-bench.github.io/)"""
            )
        with gr.Tab("🟠 Simple Bench") as simple_bench_tab:
            simple_bench_plot: gr.Plot = gr.Plot()
            simple_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SimpleBench Leaderboard](https://simple-bench.com/)"""
            )
        with gr.Tab("🟠 PlanBench") as planbench_tab:
            planbench_plot: gr.Plot = gr.Plot()
            planbench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Valmeekam et al. 2024](https://arxiv.org/abs/2409.13373)"""
            )
        with gr.Tab("🟡 GAIA") as gaia_tab:
            gaia_plot: gr.Plot = gr.Plot()
            gaia_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)"""
            )
        with gr.Tab("🟡 ARC-AGI-Pub") as arc_agi_tab:
            with gr.Tab("🟡 Semi-Private Eval") as arc_agi_semi_private_eval_tab:
                arc_agi_semi_private_eval_plot: gr.Plot = gr.Plot()
            with gr.Tab("🟡 Public Eval") as arc_agi_public_eval_tab:
                arc_agi_public_eval_plot: gr.Plot = gr.Plot()
            arc_agi_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ARC Prize 2024](https://arcprize.org/2024-results)"""
            )
        with gr.Tab("🟡 GPQA") as gpqa_tab:
            gpqa_plot: gr.Plot = gr.Plot()
            gpqa_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [Epoch AI Benchmarking Dashboard](https://epoch.ai/data/ai-benchmarking-dashboard)"""
            )
        with gr.Tab("🟡 ZebraLogic") as zeroeval_zebralogic_tab:
            zeroeval_zebralogic_plot: gr.Plot = gr.Plot()
            zeroeval_zebralogic_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟡 ZeroEval") as zeroeval_average_tab:
            zeroeval_average_plot: gr.Plot = gr.Plot()
            zeroeval_average_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟡 MATH-L5") as zeroeval_math_l5_tab:
            zeroeval_math_l5_plot: gr.Plot = gr.Plot()
            zeroeval_math_l5_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟢 MMLU-Redux") as zeroeval_mmlu_redux_tab:
            zeroeval_mmlu_redux_plot: gr.Plot = gr.Plot()
            zeroeval_mmlu_redux_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("🟢 CRUX") as zeroeval_crux_tab:
            zeroeval_crux_plot: gr.Plot = gr.Plot()
            zeroeval_crux_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [ZeroEval Leaderboard](https://huggingface.co/spaces/allenai/ZeroEval)"""
            )
        with gr.Tab("Codeforces") as codeforces_tab:
            codeforces_plot: gr.Plot = gr.Plot()
        with gr.Tab("OpenCompass", visible=False):
            opencompass_plot: gr.Plot = gr.Plot()
            opencompass_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [OpenCompass LLM Leaderboard](https://huggingface.co/spaces/opencompass/opencompass-llm-leaderboard)"""
            )
        with gr.Tab("SWE-bench", visible=False):
            swe_bench_plot: gr.Plot = gr.Plot()
            swe_bench_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [SWE-bench Leaderboard](https://www.swebench.com/)"""
            )
        with gr.Tab("WebArena", visible=False):
            webarena_plot: gr.Plot = gr.Plot()
            webarena_markdown: gr.Markdown = gr.Markdown(
                value="""Source: [X-WebArena-Leaderboard](https://docs.google.com/spreadsheets/d/1M801lEpBbKSNwP-vDBkC_pF7LdyGU1f_ufZb_NWNBZQ)"""
            )
    with gr.Tab("Finance") as finance_tab:
        with gr.Tab("Big Tech Capex") as big_five_capex_tab:
            big_five_capex_plot: gr.Plot = gr.Plot()
        with gr.Tab("NVIDIA Revenue", visible=False) as nvidia_revenue:
            nvidia_revenue_plot: gr.Plot = gr.Plot()
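
    # Build each figure lazily: a tab's .select event recreates its plot, with
    # gr.State objects supplying the per-benchmark arguments to create_simple_plot.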
    big_five_capex_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    arc_agi_public_eval_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("arc_agi_leaderboard.jsonl"),
                                           gr.State("ARC-AGI-Pub Score (Public Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                           gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                           gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                           gr.State(0), gr.State(100),
                                           gr.State({"Humans\n(LeGris et al. 2024)": 64.2})],
                                   outputs=arc_agi_public_eval_plot)
    arc_agi_tab.select(fn=create_simple_plot,
                       inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                               gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                               gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                               gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                               gr.State(0), gr.State(100),
                               gr.State({"MTurkers": 77})],
                       outputs=arc_agi_semi_private_eval_plot)
    arc_agi_semi_private_eval_tab.select(fn=create_simple_plot,
                                         inputs=[gr.State("arc_agi_semi_private_eval_leaderboard.jsonl"),
                                                 gr.State("ARC-AGI-Pub Score (Semi-Private Eval, $20 Compute Budget per Task, General-Purpose Systems)"),
                                                 gr.State("\"ARC can be seen as a general artificial intelligence benchmark, as a program synthesis benchmark, or as a psychometric intelligence test.\" (Chollet, 2019)"),
                                                 gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                                 gr.State(0), gr.State(100),
                                                 gr.State({"MTurkers": 77})],
                                         outputs=arc_agi_semi_private_eval_plot)
    finance_tab.select(fn=create_big_five_capex_plot, outputs=big_five_capex_plot)
    simple_bench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("simple_bench_leaderboard.jsonl"),
                                    gr.State("Simple Bench Score"),
                                    gr.State("\"multiple-choice text benchmark [...] [including] over 200 questions covering spatio-temporal reasoning, social intelligence, and what we call linguistic adversarial robustness\" (Philip & Hemang, 2024)"),
                                    gr.State(date(2024, 4, 1)), gr.State(date(2025, 1, 1)),
                                    gr.State(0), gr.State(100),
                                    gr.State({"Humans": 83.7})],
                            outputs=simple_bench_plot)
    codeforces_tab.select(fn=create_simple_plot,
                          inputs=[gr.State("codeforces_leaderboard.jsonl"),
                                  gr.State("Codeforces Rating"),
                                  gr.State("\"[Codeforces] is a platform where [programming] contests are held regularly, the participant's skills are reflected by their rating [...] The rating is a modification of Elo rating\" (Mirzayanov, 2011)"),
                                  gr.State(date(2024, 5, 1)), gr.State(date(2025, 1, 1)),
                                  gr.State(0), gr.State(4000),
                                  gr.State({"Pupil": 1200, "Specialist": 1400, "Expert": 1600, "Candidate Master": 1900, "Master": 2100, "International Master": 2300, "Grandmaster": 2400, "International Grandmaster": 2600, "Legendary Grandmaster": 3000})],
                          outputs=codeforces_plot)
    planbench_tab.select(fn=create_simple_plot,
                         inputs=[gr.State("planbench_leaderboard.jsonl"),
                                 gr.State("PlanBench Score (Mystery Blocksworld, 0-shot)"),
                                 gr.State("\"benchmark suite based on the kinds of domains used in the automated planning community [...] to test the capabilities of LLMs in planning or reasoning about actions and change.\" (Valmeekam et al. 2022)"),
                                 gr.State(date(2023, 3, 1)), gr.State(date(2024, 9, 20))],
                         outputs=planbench_plot)
    bigcodebench_tab.select(fn=create_simple_plot,
                            inputs=[gr.State("bigcodebench_hard_average_leaderboard.jsonl"),
                                    gr.State("BigCodeBench Score (Hard, Average of Complete and Instruct)"),
                                    gr.State("\"benchmark that challenges LLMs to invoke multiple function calls as tools from 139 libraries and 7 domains for 1,140 fine-grained tasks\" (Zhuo et al. 2024)"),
                                    gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1))],
                            outputs=bigcodebench_plot)
    gaia_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gaia_leaderboard.jsonl"),
                            gr.State("General AI Assistants (GAIA) Benchmark Score (Test Set, Average)"),
                            gr.State("\"real-world questions that require a set of fundamental abilities such as reasoning, multi-modality handling, web browsing, and generally tool-use proficiency\" (Mialon et al. 2023)"),
                            gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(0), gr.State(100),
                            gr.State({"Humans": 92})],
                    outputs=gaia_plot)
    gpqa_tab.select(fn=create_simple_plot,
                    inputs=[gr.State("gpqa_leaderboard.jsonl"),
                            gr.State("Graduate-Level Google-Proof Q&A (GPQA) Benchmark Score"),
                            gr.State("\"challenging dataset of 448 multiple-choice questions written by domain experts in biology, physics, and chemistry [that] are high-quality and extremely difficult\" (Rein et al. 2023)"),
                            gr.State(date(2023, 6, 1)), gr.State(date(2025, 1, 1)),
                            gr.State(25), gr.State(100),
                            gr.State({"Highly skilled non-expert validators": 34, "PhD-level domain experts": 65})],
                    outputs=gpqa_plot)
    zeroeval_average_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_average_leaderboard.jsonl"),
                                        gr.State("ZeroEval Average (MMLU-Redux, ZebraLogic, CRUX and MATH-L5) Score"),
                                        gr.State("\"a simple unified framework for evaluating language models on various tasks\" (Ai2, 2024)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_average_plot)
    zeroeval_mmlu_redux_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_mmlu_redux_leaderboard.jsonl"),
                                           gr.State("ZeroEval MMLU-Redux (Massive Multitask Language Understanding) Score"),
                                           gr.State("\"knowledge reasoning\" (Ai2, 2024); \"subset of 3,000 manually re-annotated questions across 30 MMLU subjects\" (Gema et al. 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_mmlu_redux_plot)
    zeroeval_zebralogic_tab.select(fn=create_simple_plot,
                                   inputs=[gr.State("zeroeval_zebralogic_leaderboard.jsonl"),
                                           gr.State("ZeroEval ZebraLogic Score"),
                                           gr.State("\"logical reasoning\" (Ai2, 2024); \"Each example is a Logic Grid Puzzle [...] often used to test humans' logical reasoning abilities\" (Lin, 2024)"),
                                           gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                   outputs=zeroeval_zebralogic_plot)
    zeroeval_crux_tab.select(fn=create_simple_plot,
                             inputs=[gr.State("zeroeval_crux_leaderboard.jsonl"),
                                     gr.State("ZeroEval CRUX (Code Reasoning, Understanding, and eXecution Evaluation) Score"),
                                     gr.State("\"code reasoning\" (Ai2, 2024); \"benchmark consisting of 800 Python functions (3-13 lines). Each function comes with [...] two natural tasks: input prediction and output prediction.\" (Gu et al. 2024)"),
                                     gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                             outputs=zeroeval_crux_plot)
    zeroeval_math_l5_tab.select(fn=create_simple_plot,
                                inputs=[gr.State("zeroeval_math_l5_leaderboard.jsonl"),
                                        gr.State("ZeroEval MATH-L5 (Difficulty Level 5 of MATH) Score"),
                                        gr.State("\"math reasoning\" (Ai2, 2024); \"dataset of 12,500 challenging competition mathematics problems. [...] a subject’s hardest problems are assigned a difficulty level of ‘5.’\" (Hendrycks et al. 2021)"),
                                        gr.State(date(2023, 3, 1)), gr.State(date(2025, 1, 1))],
                                outputs=zeroeval_math_l5_plot)
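

# Running this script directly launches the Gradio app; the *.jsonl leaderboard
# and capex files are expected in the working directory.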
if __name__ == "__main__":
    demo.launch()