Spaces:

zsyJosh
/

stark

Sleeping

stark / app.py

Shiyu Zhao

Update space

4c504d3 10 months ago

15 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import os
	import re
	from datetime import datetime
	import json

	# Data dictionaries for leaderboard
	# Scores displayed under the "Synthesized (full)" tab.
	# Every list is positionally aligned with 'Method' (one entry per method).
	# Metric columns are named '<DATASET>_<METRIC>'; format_dataframe() selects
	# columns by the dataset prefix and keeps only the part after the last '_'.
	data_synthesized_full = {
	'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2'],
	'STARK-AMAZON_Hit@1': [44.94, 15.29, 30.96, 26.56, 39.16, 40.93, 21.74, 42.08, 40.07, 46.10],
	'STARK-AMAZON_Hit@5': [67.42, 47.93, 51.06, 50.01, 62.73, 64.37, 41.65, 66.87, 64.98, 66.02],
	'STARK-AMAZON_R@20': [53.77, 44.49, 41.95, 52.05, 53.29, 54.28, 33.22, 56.52, 55.12, 53.44],
	'STARK-AMAZON_MRR': [55.30, 30.20, 40.66, 37.75, 50.35, 51.60, 31.47, 53.46, 51.55, 55.51],
	'STARK-MAG_Hit@1': [25.85, 10.51, 21.96, 12.88, 29.08, 30.06, 18.01, 37.90, 25.92, 31.18],
	'STARK-MAG_Hit@5': [45.25, 35.23, 36.50, 39.01, 49.61, 50.58, 34.85, 56.74, 50.43, 46.42],
	'STARK-MAG_R@20': [45.69, 42.11, 35.32, 46.97, 48.36, 50.49, 35.46, 46.40, 50.80, 43.94],
	'STARK-MAG_MRR': [34.91, 21.34, 29.14, 29.12, 38.62, 39.66, 26.10, 47.25, 36.94, 38.39],
	'STARK-PRIME_Hit@1': [12.75, 4.46, 6.53, 8.85, 12.63, 10.85, 10.10, 15.57, 15.10, 11.75],
	'STARK-PRIME_Hit@5': [27.92, 21.85, 15.67, 21.35, 31.49, 30.23, 22.49, 33.42, 33.56, 23.85],
	'STARK-PRIME_R@20': [31.25, 30.13, 16.52, 29.63, 36.00, 37.83, 26.34, 39.09, 38.05, 25.04],
	'STARK-PRIME_MRR': [19.84, 12.38, 11.05, 14.73, 21.41, 19.99, 16.12, 24.11, 23.49, 17.39]
	}

	# Scores displayed under the "Synthesized (10%)" tab.
	# Same column layout as the full-split table, with two extra rows for the
	# 'Claude3 Reranker' and 'GPT4 Reranker' methods (12 methods in total).
	data_synthesized_10 = {
	'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
	'STARK-AMAZON_Hit@1': [42.68, 16.46, 30.09, 25.00, 39.02, 43.29, 18.90, 43.29, 40.85, 44.31, 45.49, 44.79],
	'STARK-AMAZON_Hit@5': [67.07, 50.00, 49.27, 48.17, 64.02, 67.68, 37.80, 71.34, 62.80, 65.24, 71.13, 71.17],
	'STARK-AMAZON_R@20': [54.48, 42.15, 41.91, 51.65, 49.30, 56.04, 34.73, 56.14, 52.47, 51.00, 53.77, 55.35],
	'STARK-AMAZON_MRR': [54.02, 30.20, 39.30, 36.87, 50.32, 54.20, 28.76, 55.07, 51.54, 55.07, 55.91, 55.69],
	'STARK-MAG_Hit@1': [27.81, 11.65, 22.89, 12.03, 28.20, 34.59, 19.17, 38.35, 25.56, 31.58, 36.54, 40.90],
	'STARK-MAG_Hit@5': [45.48, 36.84, 37.26, 37.97, 52.63, 50.75, 33.46, 58.64, 50.37, 47.36, 53.17, 58.18],
	'STARK-MAG_R@20': [44.59, 42.30, 44.16, 47.98, 49.25, 50.75, 29.85, 46.38, 53.03, 45.72, 48.36, 48.60],
	'STARK-MAG_MRR': [35.97, 21.82, 30.00, 28.70, 38.55, 42.90, 26.06, 48.25, 36.82, 38.98, 44.15, 49.00],
	'STARK-PRIME_Hit@1': [13.93, 5.00, 6.78, 7.14, 15.36, 12.14, 9.29, 16.79, 15.36, 15.00, 17.79, 18.28],
	'STARK-PRIME_Hit@5': [31.07, 23.57, 16.15, 17.14, 31.07, 31.42, 20.7, 34.29, 32.86, 26.07, 36.90, 37.28],
	'STARK-PRIME_R@20': [32.84, 30.50, 17.07, 32.95, 37.88, 37.34, 25.54, 41.11, 40.99, 27.78, 35.57, 34.05],
	'STARK-PRIME_MRR': [21.68, 13.50, 11.42, 16.27, 23.50, 21.23, 15.00, 24.99, 23.70, 19.98, 26.27, 26.55]
	}

	# Scores displayed under the "Human-Generated" tab.
	# Same 12 methods and column layout as the 10% synthesized table.
	# NOTE(review): the BM25 entry of 'STARK-AMAZON_MRR' (18.79) is lower than
	# its 'STARK-AMAZON_Hit@1' (27.16). If these are the usual Hit@1 / MRR
	# percentages, MRR cannot be below Hit@1, so this value looks like a
	# transcription error -- verify against the published results.
	data_human_generated = {
	'Method': ['BM25', 'DPR (roberta)', 'ANCE (roberta)', 'QAGNN (roberta)', 'ada-002', 'voyage-l2-instruct', 'LLM2Vec', 'GritLM-7b', 'multi-ada-002', 'ColBERTv2', 'Claude3 Reranker', 'GPT4 Reranker'],
	'STARK-AMAZON_Hit@1': [27.16, 16.05, 25.93, 22.22, 39.50, 35.80, 29.63, 40.74, 46.91, 33.33, 53.09, 50.62],
	'STARK-AMAZON_Hit@5': [51.85, 39.51, 54.32, 49.38, 64.19, 62.96, 46.91, 71.60, 72.84, 55.56, 74.07, 75.31],
	'STARK-AMAZON_R@20': [29.23, 15.23, 23.69, 21.54, 35.46, 33.01, 21.21, 36.30, 40.22, 29.03, 35.46, 35.46],
	'STARK-AMAZON_MRR': [18.79, 27.21, 37.12, 31.33, 52.65, 47.84, 38.61, 53.21, 58.74, 43.77, 62.11, 61.06],
	'STARK-MAG_Hit@1': [32.14, 4.72, 25.00, 20.24, 28.57, 22.62, 16.67, 34.52, 23.81, 33.33, 38.10, 36.90],
	'STARK-MAG_Hit@5': [41.67, 9.52, 30.95, 26.19, 41.67, 36.90, 28.57, 44.04, 41.67, 36.90, 45.24, 46.43],
	'STARK-MAG_R@20': [32.46, 25.00, 27.24, 28.76, 35.95, 32.44, 21.74, 34.57, 39.85, 30.50, 35.95, 35.95],
	'STARK-MAG_MRR': [37.42, 7.90, 27.98, 25.53, 35.81, 29.68, 21.59, 38.72, 31.43, 35.97, 42.00, 40.65],
	'STARK-PRIME_Hit@1': [22.45, 2.04, 7.14, 6.12, 17.35, 16.33, 9.18, 25.51, 24.49, 15.31, 28.57, 28.57],
	'STARK-PRIME_Hit@5': [41.84, 9.18, 13.27, 13.27, 34.69, 32.65, 21.43, 41.84, 39.80, 26.53, 46.94, 44.90],
	'STARK-PRIME_R@20': [42.32, 10.69, 11.72, 17.62, 41.09, 39.01, 26.77, 48.10, 47.21, 25.56, 41.61, 41.61],
	'STARK-PRIME_MRR': [30.37, 7.05, 10.07, 9.39, 26.35, 24.33, 15.24, 34.28, 32.98, 19.67, 36.32, 34.82]
	}

	# Build one DataFrame per leaderboard table (one row per method, one
	# column per '<DATASET>_<METRIC>' key). The order of the targets matches
	# the order of the source dictionaries.
	df_synthesized_full, df_synthesized_10, df_human_generated = (
	    pd.DataFrame(score_table)
	    for score_table in (
	        data_synthesized_full,
	        data_synthesized_10,
	        data_human_generated,
	    )
	)

	# Model families offered by the "Model types" checkbox filter.
	# Keys are the checkbox labels; each value lists the 'Method' names that
	# belong to that family and are kept when the label is selected.
	model_types = {
	    'Sparse Retriever': [
	        'BM25',
	    ],
	    'Small Dense Retrievers': [
	        'DPR (roberta)',
	        'ANCE (roberta)',
	        'QAGNN (roberta)',
	    ],
	    'LLM-based Dense Retrievers': [
	        'ada-002',
	        'voyage-l2-instruct',
	        'LLM2Vec',
	        'GritLM-7b',
	    ],
	    'Multivector Retrievers': [
	        'multi-ada-002',
	        'ColBERTv2',
	    ],
	    'LLM Rerankers': [
	        'Claude3 Reranker',
	        'GPT4 Reranker',
	    ],
	}

	# Submission form validation functions
	def validate_email(email_str):
	    """Return True when *email_str* holds one or more valid-looking addresses.

	    Addresses are separated by ';' and surrounding whitespace is ignored.
	    Empty segments (for example from a trailing ';') are skipped, but at
	    least one non-empty address must be present.

	    Args:
	        email_str: Raw text from the contact e-mail field.

	    Returns:
	        bool: True if every non-empty segment matches the address pattern
	        and there is at least one such segment; False otherwise, including
	        when *email_str* is not a string.
	    """
	    if not isinstance(email_str, str):
	        return False

	    email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')

	    # Skip blank segments so "a@x.org;" is accepted instead of failing on
	    # the empty string that follows the trailing separator.
	    emails = [part.strip() for part in email_str.split(';')]
	    emails = [email for email in emails if email]
	    if not emails:
	        return False

	    # fullmatch anchors both ends without the "$ matches before a final
	    # newline" subtlety of match().
	    return all(email_pattern.fullmatch(email) is not None for email in emails)

	def validate_github_url(url):
	    """Return True when *url* has the form of a GitHub repository URL.

	    Accepted shape: ``http(s)://[www.]github.com/<owner>/<repo>[/]``.

	    Args:
	        url: Raw text from the code repository field.

	    Returns:
	        bool: True if the whole string matches the pattern; False
	        otherwise, including when *url* is not a string.
	    """
	    if not isinstance(url, str):
	        return False

	    github_pattern = re.compile(
	        r'https?:\/\/(?:www\.)?github\.com\/[\w-]+\/[\w.-]+\/?'
	    )
	    # fullmatch is used instead of match() with '^...$' because '$' also
	    # matches just before a trailing newline, which would let
	    # "https://github.com/a/b\n" through.
	    return github_pattern.fullmatch(url) is not None

	def validate_csv(file_obj):
	    """Check that an uploaded prediction CSV has the expected structure.

	    The file must contain 'query_id' and 'pred_rank' columns, and the
	    'pred_rank' value of the first row must be a list (or the string form
	    of a list literal) with at least 20 entries. Only the first row is
	    inspected.

	    Args:
	        file_obj: Object whose ``name`` attribute is the path of the CSV.

	    Returns:
	        tuple[bool, str]: ``(is_valid, message)``.
	    """
	    # Local import keeps this block self-contained; ast is standard library.
	    import ast

	    try:
	        df = pd.read_csv(file_obj.name)
	        required_cols = ['query_id', 'pred_rank']

	        if not all(col in df.columns for col in required_cols):
	            return False, "CSV must contain 'query_id' and 'pred_rank' columns"

	        try:
	            first_rank = df['pred_rank'].iloc[0]
	            if isinstance(first_rank, str):
	                # SECURITY: the file is user-uploaded, so its content must
	                # never be passed to eval(). literal_eval only accepts
	                # Python literals and cannot execute code.
	                first_rank = ast.literal_eval(first_rank)
	        except (ValueError, SyntaxError, TypeError, IndexError,
	                MemoryError, RecursionError):
	            # IndexError covers a CSV with a header but no rows; the rest
	            # are the errors literal_eval raises on malformed input.
	            return False, "Invalid pred_rank format"

	        if not isinstance(first_rank, list) or len(first_rank) < 20:
	            return False, "pred_rank must be a list with at least 20 candidates"

	        return True, "Valid CSV file"
	    except Exception as e:
	        # Boundary handler: any read/parse failure is reported to the user
	        # rather than crashing the request.
	        return False, f"Error processing CSV: {str(e)}"

	def save_submission(submission_data):
	    """Write *submission_data* as JSON under ``submissions/`` and return its ID.

	    The ID is ``<team>_<YYYYmmdd_HHMMSS>``, where ``<team>`` is the
	    submission's 'team_name' reduced to letters, digits, '_' and '-'.

	    Args:
	        submission_data: JSON-serialisable dict containing at least the
	            key 'team_name'.

	    Returns:
	        str: The submission ID (the file name without the '.json' suffix).

	    Raises:
	        KeyError: If 'team_name' is missing.
	        OSError: If the directory or file cannot be written.
	        TypeError: If *submission_data* is not JSON-serialisable.
	    """
	    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

	    # team_name comes straight from the web form. Strip anything that could
	    # act as a path component ('/', '\\', '..') so the file always lands
	    # inside the submissions directory.
	    safe_team = re.sub(r'[^A-Za-z0-9_-]+', '_', str(submission_data['team_name']))
	    safe_team = safe_team.strip('_') or 'team'
	    submission_id = f"{safe_team}_{timestamp}"

	    os.makedirs("submissions", exist_ok=True)
	    submission_path = os.path.join("submissions", f"{submission_id}.json")
	    with open(submission_path, 'w', encoding='utf-8') as f:
	        json.dump(submission_data, f, indent=4)

	    return submission_id

	# Leaderboard functions
	def filter_by_model_type(df, selected_types):
	    """Keep only the rows of *df* whose 'Method' belongs to a selected family.

	    Args:
	        df: Leaderboard DataFrame with a 'Method' column.
	        selected_types: Iterable of keys of ``model_types``.

	    Returns:
	        The matching rows, or an empty frame with the same columns when
	        nothing is selected.
	    """
	    if not selected_types:
	        return df.head(0)

	    # Flatten the method lists of every selected family into one list.
	    selected_models = []
	    for family in selected_types:
	        selected_models.extend(model_types[family])

	    keep_row = df['Method'].isin(selected_models)
	    return df[keep_row]

	def format_dataframe(df, dataset):
	    """Build the display table for one dataset.

	    Selects 'Method' plus every column whose name contains *dataset*,
	    shortens each column name to the text after its last '_', and sorts the
	    rows by 'MRR' in descending order. The input frame is not modified.

	    Args:
	        df: Leaderboard DataFrame with '<DATASET>_<METRIC>' columns.
	        dataset: Dataset prefix to select, e.g. 'STARK-MAG'.

	    Returns:
	        A new DataFrame sorted by 'MRR', highest first.
	    """
	    wanted = ['Method']
	    wanted.extend(name for name in df.columns if dataset in name)

	    table = df[wanted].copy()
	    # rsplit leaves names without '_' (i.e. 'Method') unchanged.
	    table.columns = [name.rsplit('_', 1)[-1] for name in table.columns]
	    return table.sort_values('MRR', ascending=False)

	def update_tables(selected_types):
	    """Recompute all nine leaderboard tables for the selected model families.

	    Args:
	        selected_types: Keys of ``model_types`` chosen in the filter.

	    Returns:
	        list: Nine DataFrames ordered by source table (synthesized full,
	        synthesized 10%, human-generated) and, within each, by dataset
	        (AMAZON, MAG, PRIME).
	    """
	    sources = (df_synthesized_full, df_synthesized_10, df_human_generated)
	    # Filter each source once, then fan out over the three datasets.
	    filtered = [filter_by_model_type(source, selected_types) for source in sources]

	    return [
	        format_dataframe(table, f"STARK-{dataset}")
	        for table in filtered
	        for dataset in ('AMAZON', 'MAG', 'PRIME')
	    ]

	def process_submission(
	    method_name, team_name, dataset, split, contact_email,
	    code_repo, csv_file, model_description, hardware, paper_link
	):
	    """Validate a leaderboard submission, evaluate it and store the result.

	    Args:
	        method_name: Display name of the method (at most 25 characters).
	        team_name: Name of the submitting team (at most 25 characters).
	        dataset: Dataset identifier chosen in the form.
	        split: Evaluation split chosen in the form.
	        contact_email: One or more ';'-separated e-mail addresses.
	        code_repo: GitHub repository URL.
	        csv_file: Uploaded prediction file; its ``name`` attribute is used
	            as the path.
	        model_description: Free-text description of the model.
	        hardware: Free-text hardware description.
	        paper_link: Optional link to a paper.

	    Returns:
	        str: A status message. Failures are reported as text starting with
	        "Error"; no exception is propagated to the caller.
	    """
	    # Input validation. Model description and hardware are marked as
	    # required ('*') in the form, so they are checked together with the
	    # other mandatory fields. Whitespace-only text counts as missing.
	    required_fields = (
	        method_name, team_name, dataset, split, contact_email,
	        code_repo, csv_file, model_description, hardware,
	    )
	    if any(
	        not value or (isinstance(value, str) and not value.strip())
	        for value in required_fields
	    ):
	        return "Error: Please fill in all required fields"

	    # Length validation
	    if len(method_name) > 25:
	        return "Error: Method name must be 25 characters or less"
	    if len(team_name) > 25:
	        return "Error: Team name must be 25 characters or less"
	    if not validate_email(contact_email):
	        return "Error: Invalid email format"
	    if not validate_github_url(code_repo):
	        return "Error: Invalid GitHub repository URL"

	    # Validate CSV file
	    csv_valid, csv_message = validate_csv(csv_file)
	    if not csv_valid:
	        return f"Error with CSV file: {csv_message}"

	    # Process CSV file through evaluation pipeline
	    try:
	        # NOTE(review): compute_metrics is neither defined nor imported in
	        # the visible part of this file. If it is really missing, the
	        # resulting NameError is caught by the handler below and every
	        # submission ends with "Error processing submission: ..." --
	        # confirm where this function is supposed to come from.
	        results = compute_metrics(
	            csv_file.name,
	            dataset=dataset.lower(),
	            split=split,
	            num_workers=4
	        )

	        if isinstance(results, str) and results.startswith("Error"):
	            return f"Evaluation error: {results}"

	        # Prepare submission data
	        submission_data = {
	            "method_name": method_name,
	            "team_name": team_name,
	            "dataset": dataset,
	            "split": split,
	            "contact_email": contact_email,
	            "code_repo": code_repo,
	            "model_description": model_description,
	            "hardware": hardware,
	            "paper_link": paper_link,
	            "results": results,
	            "status": "pending_review",
	            "submission_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	        }

	        # Save submission
	        submission_id = save_submission(submission_data)

	        return f"""
	Submission successful! Your submission ID is: {submission_id}

	Evaluation Results:
	Hit@1: {results['hit@1']:.2f}
	Hit@5: {results['hit@5']:.2f}
	Recall@20: {results['recall@20']:.2f}
	MRR: {results['mrr']:.2f}

	Your submission is pending review. You will receive an email notification once the review is complete.
	"""

	    except Exception as e:
	        # Boundary handler: evaluation or storage failures are shown to the
	        # user as a status message instead of raising inside the UI callback.
	        return f"Error processing submission: {str(e)}"

	# CSS styling
	# Custom stylesheet handed to gr.Blocks(css=css). It contains three rules:
	#   * table header cells may wrap their text (white-space: normal),
	#   * the --cell-width-1 custom property of tables is set to 250px,
	#   * the content of each row's second cell scrolls horizontally when it
	#     overflows.
	css = """
	table > thead {
	white-space: normal
	}

	table {
	--cell-width-1: 250px
	}

	table > tbody > tr > td:nth-child(2) > div {
	overflow-x: auto
	}
	"""

	def add_submission_form(demo):
	    """Append the "Submit Your Results" form to an existing Blocks app.

	    The form has two columns of inputs followed by a submit button and a
	    read-only status box. Clicking the button calls ``process_submission``
	    with the ten form values and shows its return value in the status box.

	    Args:
	        demo: The ``gr.Blocks`` instance to extend; it is re-entered as a
	            context manager so the new components become part of it.
	    """
	    with demo:
	        gr.Markdown("---")
	        gr.Markdown("## Submit Your Results")
	        gr.Markdown("""
	Submit your results to be included in the leaderboard. Please ensure your submission meets all requirements.
	For questions, contact stark-qa@cs.stanford.edu
	""")

	        with gr.Row():
	            # Left column: identity of the submission.
	            with gr.Column():
	                method_box = gr.Textbox(
	                    label="Method Name (max 25 chars)*",
	                    placeholder="e.g., MyRetrievalModel-v1",
	                )
	                team_box = gr.Textbox(
	                    label="Team Name (max 25 chars)*",
	                    placeholder="e.g., Stanford NLP",
	                )
	                dataset_choice = gr.Dropdown(
	                    choices=["amazon", "mag", "prime"],
	                    label="Dataset*",
	                    value="amazon",
	                )
	                split_choice = gr.Dropdown(
	                    choices=["test", "test-0.1", "human_generated_eval"],
	                    label="Split*",
	                    value="test",
	                )
	                email_box = gr.Textbox(
	                    label="Contact Email(s)*",
	                    placeholder="email@example.com; another@example.com",
	                )

	            # Right column: artefacts and description.
	            with gr.Column():
	                repo_box = gr.Textbox(
	                    label="Code Repository*",
	                    placeholder="https://github.com/username/repository",
	                )
	                prediction_file = gr.File(
	                    label="Prediction CSV*",
	                    file_types=[".csv"],
	                )
	                description_box = gr.Textbox(
	                    label="Model Description*",
	                    lines=3,
	                    placeholder="Briefly describe how your retriever model works...",
	                )
	                hardware_box = gr.Textbox(
	                    label="Hardware Specifications*",
	                    placeholder="e.g., 4x NVIDIA A100 80GB",
	                )
	                paper_box = gr.Textbox(
	                    label="Paper Link (Optional)",
	                    placeholder="https://arxiv.org/abs/...",
	                )

	        submit_button = gr.Button("Submit", variant="primary")
	        status_box = gr.Textbox(label="Submission Status", interactive=False)

	        # Order must match the positional parameters of process_submission.
	        form_inputs = [
	            method_box, team_box, dataset_choice, split_choice, email_box,
	            repo_box, prediction_file, description_box, hardware_box, paper_box,
	        ]
	        submit_button.click(
	            process_submission,
	            inputs=form_inputs,
	            outputs=status_box,
	        )

	# Main application
	if __name__ == "__main__":
	    with gr.Blocks(css=css) as demo:
	        gr.Markdown("# Semi-structured Retrieval Benchmark (STaRK) Leaderboard")
	        gr.Markdown("Refer to the [STaRK paper](https://arxiv.org/pdf/2404.13207) for details on metrics, tasks and models.")

	        family_names = list(model_types.keys())
	        with gr.Row():
	            model_type_filter = gr.CheckboxGroup(
	                choices=family_names,
	                value=list(family_names),
	                label="Model types",
	                interactive=True,
	            )

	        # One table per (source tab, dataset tab) pair. The creation order
	        # must match the order of the list returned by update_tables().
	        all_dfs = []
	        with gr.Tabs():
	            for tab_name in ("Synthesized (full)", "Synthesized (10%)", "Human-Generated"):
	                with gr.TabItem(tab_name):
	                    with gr.Tabs():
	                        for dataset in ('AMAZON', 'MAG', 'PRIME'):
	                            with gr.TabItem(dataset):
	                                all_dfs.append(gr.DataFrame(interactive=False))

	        # Refresh every table when the filter changes and once on page load.
	        model_type_filter.change(
	            update_tables,
	            inputs=[model_type_filter],
	            outputs=all_dfs,
	        )
	        demo.load(
	            update_tables,
	            inputs=[model_type_filter],
	            outputs=all_dfs,
	        )

	    # Add submission form. The helper re-opens the app with `with demo:`,
	    # so it is called after the Blocks context above has been closed.
	    add_submission_form(demo)

	    demo.launch()