Spaces:

AdnanElAssadi
/

MTEB-Human-Eval-Demo

Sleeping

App Files Files Community

MTEB-Human-Eval-Demo / app.py

AdnanElAssadi

Update app.py

9069a07 verified 5 months ago

raw

history blame

21.2 kB

	import gradio as gr
	import json
	import os
	from pathlib import Path

	def create_reranking_interface(task_data):
	    """Build the Gradio interface used to collect human rerankings for one task.

	    Args:
	        task_data: Task description, read through the keys "task_name",
	            "instructions" and "samples". Every sample is read through
	            "id", "query" and "candidates" (the documents to rank).

	    Returns:
	        The ``gr.Blocks`` object that contains the whole interface.

	    Raises:
	        ValueError: If the task contains no samples.

	    Annotations are kept in memory and written to
	    ``<task_name>_human_results.json``. Each annotation stores ``rankings``
	    such that ``rankings[doc_idx]`` is the 1-based rank the annotator gave
	    to candidate ``doc_idx``.
	    """
	    samples = task_data["samples"]
	    if not samples:
	        raise ValueError("Task contains no samples to annotate.")

	    max_docs = 20  # number of document slots rendered per query
	    output_path = f"{task_data['task_name']}_human_results.json"
	    results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
	    completed_samples = {s["id"]: False for s in samples}

	    # Display order of the active sample: current_order[slot] is the index of
	    # the candidate shown in that slot.
	    # NOTE: this lives in the closure, so it is shared by every handler (and
	    # therefore by every browser session served by this interface).
	    current_order = []

	    def progress_label():
	        """Return the "Progress: done/total" text."""
	        return f"Progress: {sum(completed_samples.values())}/{len(samples)}"

	    def find_position(sample_id):
	        """Return the index of the first sample with this id, or None."""
	        return next((pos for pos, s in enumerate(samples) if s["id"] == sample_id), None)

	    def slot_properties(docs, slot):
	        """Return the textbox properties for one slot under current_order."""
	        if slot < len(current_order) and slot < len(docs):
	            doc_idx = current_order[slot]
	            return {
	                "value": f"[RANK {slot+1}] Document {doc_idx+1}: {docs[doc_idx]}",
	                "visible": True,
	                # rank-specific class selects the border colour in the CSS
	                "elem_classes": ["document-text", f"rank-{slot+1}"],
	            }
	        return {"value": "", "visible": False}

	    def render_documents(docs):
	        """Return one update per document slot, in slot order."""
	        return [gr.update(**slot_properties(docs, slot)) for slot in range(max_docs)]

	    def unchanged_documents():
	        """Return no-op updates for every document slot."""
	        return [gr.update() for _ in range(max_docs)]

	    def load_sample(position):
	        """Activate samples[position]; return (status, progress, query, doc updates)."""
	        nonlocal current_order

	        sample = samples[position]
	        docs = sample["candidates"]
	        current_order = list(range(len(docs)))

	        # Restore the saved order if this sample was already annotated.
	        saved = next((a for a in results["annotations"] if a["sample_id"] == sample["id"]), None)
	        if saved and "rankings" in saved:
	            current_order = _rankings_to_order(saved["rankings"])

	        status = f"Viewing query {position + 1} of {len(samples)}"
	        if completed_samples[sample["id"]]:
	            status += " (already completed)"
	        return status, progress_label(), sample["query"], render_documents(docs)

	    def write_results():
	        """Write every annotation collected so far to output_path."""
	        with open(output_path, "w", encoding="utf-8") as f:
	            json.dump(results, f, indent=2)

	    def save_ranking(sample_id):
	        """Store the displayed order as the annotation for sample_id.

	        Returns the (status, progress) texts for the UI.
	        """
	        if not current_order:
	            return "⚠️ No document ordering found", progress_label()
	        if sample_id not in completed_samples:
	            return "Query not found", progress_label()

	        annotation = {"sample_id": sample_id, "rankings": _order_to_rankings(current_order)}
	        for pos, existing in enumerate(results["annotations"]):
	            if existing["sample_id"] == sample_id:
	                results["annotations"][pos] = annotation
	                break
	        else:
	            results["annotations"].append(annotation)
	        completed_samples[sample_id] = True

	        # Saving to disk is best-effort: the annotation is already in memory.
	        try:
	            write_results()
	        except Exception as exc:
	            return f"✅ Rankings saved in memory (file save failed: {exc})", progress_label()
	        return "✅ Rankings saved successfully (in memory and to file)", progress_label()

	    def save_results():
	        """Write all collected results to disk and return a status text."""
	        try:
	            write_results()
	        except OSError as exc:
	            return f"❌ Could not save results to {output_path}: {exc}"
	        return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"

	    def reset_order(sample_id):
	        """Put the documents of sample_id back into their original order."""
	        nonlocal current_order

	        position = find_position(sample_id)
	        if position is None:
	            return unchanged_documents()
	        docs = samples[position]["candidates"]
	        current_order = list(range(len(docs)))
	        return render_documents(docs)

	    def make_move_handler(slot, offset):
	        """Create the click handler moving the document in `slot` by `offset` slots."""
	        def move(sample_id):
	            target = slot + offset
	            # Only swap when both the slot and its neighbour exist.
	            if slot < len(current_order) and 0 <= target < len(current_order):
	                current_order[slot], current_order[target] = current_order[target], current_order[slot]

	            position = find_position(sample_id)
	            if position is None:
	                return unchanged_documents()
	            return render_documents(samples[position]["candidates"])
	        return move

	    def make_nav_handler(step):
	        """Create the click handler that moves `step` queries forward or backward."""
	        def navigate(current_id):
	            position = find_position(current_id)
	            if position is None:
	                return [current_id, "Query not found", progress_label(), ""] + unchanged_documents()

	            # Clamp so that the first/last query simply stays selected.
	            target = min(max(position + step, 0), len(samples) - 1)
	            status, progress, query, doc_updates = load_sample(target)
	            return [samples[target]["id"], status, progress, query] + doc_updates
	        return navigate

	    # Activate the first sample so the components can be created with their
	    # initial content already in place.
	    initial_status, initial_progress, initial_query, _ = load_sample(0)
	    first_docs = samples[0]["candidates"]

	    with gr.Blocks(theme=gr.themes.Soft(), css="""
	    .document-text {
	        font-size: 16px;
	        padding: 10px;
	        border-radius: 8px;
	        background-color: #f8f9fa;
	        border-left: 5px solid #4CAF50;
	    }
	    .query-box {
	        background-color: #e3f2fd;
	        padding: 16px;
	        border-radius: 8px;
	        border-left: 5px solid #2196F3;
	        font-size: 18px;
	        margin-bottom: 20px;
	    }
	    .progress-indicator {
	        font-weight: bold;
	        text-align: center;
	        padding: 12px;
	        background-color: #f1f8e9;
	        border-radius: 8px;
	        margin: 10px 0;
	    }
	    .status-box {
	        font-weight: bold;
	        text-align: center;
	    }
	    .ranking-box {
	        border: 1px solid #e0e0e0;
	        border-radius: 8px;
	        padding: 15px;
	        margin-bottom: 12px;
	        transition: all 0.3s;
	    }
	    .ranking-box:hover {
	        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
	    }
	    /* Add different colors for rank levels */
	    .rank-1, .rank-2, .rank-3 {
	        border-left: 5px solid #4CAF50; /* Green for top ranks */
	    }
	    .rank-4, .rank-5, .rank-6, .rank-7 {
	        border-left: 5px solid #FFC107; /* Yellow for mid ranks */
	    }
	    .rank-8, .rank-9, .rank-10 {
	        border-left: 5px solid #FF5722; /* Orange for lower ranks */
	    }
	    .rank-11, .rank-12, .rank-13, .rank-14, .rank-15, .rank-16, .rank-17, .rank-18, .rank-19, .rank-20 {
	        border-left: 5px solid #9E9E9E; /* Gray for lowest ranks */
	    }
	    """) as demo:
	        gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")

	        with gr.Accordion("Instructions", open=True):
	            # Assembled from explicit lines so that multi-line task
	            # instructions cannot disturb the indentation of the Markdown.
	            gr.Markdown(
	                "## Task Instructions\n"
	                "\n"
	                f"{task_data['instructions']}\n"
	                "\n"
	                "### How to use this interface:\n"
	                "1. Read the query at the top\n"
	                "2. Review each document in the list\n"
	                "3. Use the \"Move Up\" and \"Move Down\" buttons to arrange documents by relevance\n"
	                "   (most relevant at the top, least relevant at the bottom)\n"
	                "4. Click \"Submit Rankings\" when you're done with the current query\n"
	                "5. Use \"Previous\" and \"Next\" to navigate between queries\n"
	                "6. Click \"Save All Results\" periodically to ensure your work is saved\n"
	            )

	        # Per-session id of the query being annotated; handlers receive it as
	        # an event input rather than reading the component's initial value.
	        current_sample_id = gr.State(value=samples[0]["id"])

	        with gr.Row(elem_classes=["progress-indicator"]):
	            progress_text = gr.Textbox(label="Progress", value=initial_progress, interactive=False)
	            status_box = gr.Textbox(label="Status", value=initial_status, interactive=False, elem_classes=["status-box"])

	        with gr.Group():
	            gr.Markdown("## Query:")
	            query_text = gr.Textbox(value=initial_query, label="", interactive=False, elem_classes=["query-box"])

	        gr.Markdown("## Documents (Arrange in order of relevance, most relevant at top):")

	        document_containers = []
	        move_buttons = []

	        with gr.Column():
	            gr.Markdown("""
	            ### Instructions for Ranking:
	            - Documents are initially shown in their original order
	            - The most relevant document should be at the TOP (Rank 1)
	            - Use the "Move Up" and "Move Down" buttons to rearrange documents
	            - The rank is shown at the beginning of each document: [RANK X]
	            - When you're satisfied with the order, click "Submit Rankings"
	            """)

	            for slot in range(max_docs):
	                with gr.Group():
	                    with gr.Box():
	                        with gr.Row():
	                            gr.Markdown(f"Document #{slot+1}")
	                            with gr.Column(scale=1):
	                                up_btn = gr.Button("⬆️ Move Up", size="sm", variant="primary")
	                                down_btn = gr.Button("⬇️ Move Down", size="sm", variant="secondary")

	                        initial_props = slot_properties(first_docs, slot)
	                        initial_props.setdefault("elem_classes", ["document-text"])
	                        doc_text = gr.Textbox(
	                            label="",
	                            interactive=False,
	                            elem_id=f"doc-text-{slot}",
	                            **initial_props,
	                        )
	                document_containers.append(doc_text)
	                move_buttons.append((up_btn, down_btn))

	        # Wire the move buttons only now: every handler returns one update per
	        # slot, so the output list must already contain all slots.
	        for slot, (up_btn, down_btn) in enumerate(move_buttons):
	            up_btn.click(
	                make_move_handler(slot, -1),
	                inputs=[current_sample_id],
	                outputs=document_containers,
	            )
	            down_btn.click(
	                make_move_handler(slot, 1),
	                inputs=[current_sample_id],
	                outputs=document_containers,
	            )

	        with gr.Row(equal_height=True):
	            prev_btn = gr.Button("← Previous Query", size="lg", variant="secondary")
	            submit_btn = gr.Button("💾 Submit Rankings", size="lg", variant="primary")
	            next_btn = gr.Button("Next Query →", size="lg", variant="secondary")

	        with gr.Row():
	            save_btn = gr.Button("💾 Save All Results", variant="primary", size="lg")
	            clear_btn = gr.Button("🔄 Reset Order", variant="secondary", size="lg")

	        # Navigation refreshes the id, status, progress, query and documents
	        # in a single step.
	        nav_outputs = [current_sample_id, status_box, progress_text, query_text] + document_containers
	        prev_btn.click(make_nav_handler(-1), inputs=[current_sample_id], outputs=nav_outputs)
	        next_btn.click(make_nav_handler(1), inputs=[current_sample_id], outputs=nav_outputs)

	        submit_btn.click(
	            save_ranking,
	            inputs=[current_sample_id],
	            outputs=[status_box, progress_text],
	        )

	        clear_btn.click(
	            reset_order,
	            inputs=[current_sample_id],
	            outputs=document_containers,
	        )

	        save_btn.click(save_results, outputs=[status_box])

	    return demo


	def _order_to_rankings(order):
	    """Turn a display order into ranks: result[doc_idx] is that document's 1-based rank."""
	    rankings = [0] * len(order)
	    for position, doc_idx in enumerate(order):
	        rankings[doc_idx] = position + 1
	    return rankings


	def _rankings_to_order(rankings):
	    """Turn per-document ranks back into a display order (best rank first)."""
	    return sorted(range(len(rankings)), key=lambda doc_idx: rankings[doc_idx])

	# Just use a simplified version with the main demo functionality
	def get_task_file():
	    """Return the name of the task file the demo should load.

	    The default "AskUbuntuDupQuestions_human_eval.json" is preferred when it
	    exists in the current directory. Otherwise the alphabetically first
	    file whose name ends in "_human_eval.json" is used.

	    Returns:
	        The file name, relative to the current directory.

	    Raises:
	        FileNotFoundError: If the current directory holds no task file.
	    """
	    default_task = "AskUbuntuDupQuestions_human_eval.json"
	    if os.path.exists(default_task):
	        return default_task

	    # os.listdir() yields entries in arbitrary order; sorting makes the
	    # fallback choice reproducible across runs and platforms.
	    task_files = sorted(name for name in os.listdir(".") if name.endswith("_human_eval.json"))
	    if task_files:
	        return task_files[0]

	    raise FileNotFoundError("No task files found. Please ensure there's a *_human_eval.json file in the current directory.")

	# Main app with simplified structure
	# Top-level application: loads one task file and embeds its reranking
	# interface, or shows the loading error instead.
	with gr.Blocks(theme=gr.themes.Soft()) as app:
	    gr.Markdown("# MTEB Human Evaluation Demo")

	    try:
	        # Load the task file
	        task_file = get_task_file()

	        # JSON is read as UTF-8 explicitly instead of relying on the
	        # platform's default text encoding.
	        with open(task_file, "r", encoding="utf-8") as f:
	            task_data = json.load(f)

	        # Show which task is currently loaded
	        gr.Markdown(f"Current Task: {task_data['task_name']} ({len(task_data['samples'])} samples)")

	        # Display the interface
	        reranking_demo = create_reranking_interface(task_data)
	    except Exception as e:
	        # Startup boundary: any failure is shown on the page so the app can
	        # still launch.
	        gr.Markdown(f"Error loading task: {str(e)}")
	        error_details = gr.Textbox(
	            value=str(e),
	            label="Error Details",
	            interactive=False
	        )

	if __name__ == "__main__":
	    app.launch()