import gradio as gr
import json
import os
from pathlib import Path
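

# Expected task file layout (inferred from the fields this app reads; other keys are ignored):
# {
#   "task_name": "...",
#   "instructions": "...",
#   "samples": [
#     {"id": ..., "query": "...", "candidates": ["doc 1", "doc 2", ...]},
#     ...
#   ]
# }
# Annotations are written to <task_name>_human_results.json in the form
# {"task_name": ..., "task_type": "reranking",
#  "annotations": [{"sample_id": ..., "rankings": [rank of candidate 0, rank of candidate 1, ...]}]}.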
def create_reranking_interface(task_data):
    """Create a Gradio interface for reranking evaluation using per-document rank dropdowns."""
    samples = task_data["samples"]
    results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
    completed_samples = {s["id"]: False for s in samples}

    # Define helper functions before UI elements are created
    def generate_sortable_html(candidates, existing_ranks=None):
        """Generate HTML with simple dropdowns for ranking."""
        # Use existing ranks if available
        ranks = [0] * len(candidates)
        if existing_ranks and len(existing_ranks) == len(candidates):
            ranks = existing_ranks.copy()

        # Generate a unique ID for this set of dropdowns to avoid conflicts
        import random
        import time
        dropdown_group_id = f"rank_group_{int(time.time())}_{random.randint(1000, 9999)}"

        html = f"""
        <div class="ranking-simple">
            <input type="hidden" id="rank-order-state" value="">
            <div class="rank-instructions">Select a rank (1-{len(candidates)}) for each document.</div>
        """

        # Add each document with a dropdown selector
        import html as html_escaper
        for i, doc in enumerate(candidates):
            escaped_doc = html_escaper.escape(doc)
            current_rank = ranks[i] if ranks[i] > 0 else i + 1

            html += f"""
            <div class="rank-item" data-doc-id="{i}">
                <div class="rank-selector">
                    <select class="rank-dropdown" data-doc-id="{i}" onchange="updateRankOrder('{dropdown_group_id}')">
            """

            # Add options 1 through N
            for rank in range(1, len(candidates) + 1):
                selected = "selected" if rank == current_rank else ""
                html += f'<option value="{rank}" {selected}>{rank}</option>'

            html += f"""
                    </select>
                </div>
                <div class="doc-content">{escaped_doc}</div>
            </div>
            """

        # Add the JavaScript for handling rank updates
        html += f"""
        <script>
        // Function to update the hidden state when dropdowns change
        function updateRankOrder(groupId) {{
            const items = document.querySelectorAll('.rank-item');
            const selectedRanks = new Map();
            const docOrder = [];

            // First collect all selected ranks
            items.forEach(item => {{
                const docId = parseInt(item.getAttribute('data-doc-id'));
                const dropdown = item.querySelector('.rank-dropdown');
                const rank = parseInt(dropdown.value);
                selectedRanks.set(docId, rank);
            }});

            // Sort documents by their selected rank
            const sortedDocs = Array.from(selectedRanks.entries())
                .sort((a, b) => a[1] - b[1])
                .map(entry => entry[0]);

            // Update the order state
            const orderInput = document.querySelector('#current-order textarea');
            if (orderInput) {{
                orderInput.value = JSON.stringify(sortedDocs);
                const event = new Event('input', {{ bubbles: true }});
                orderInput.dispatchEvent(event);
            }}
        }}

        // Initialize on page load
        document.addEventListener('DOMContentLoaded', function() {{
            updateRankOrder('{dropdown_group_id}');
        }});

        // Backup initialization for iframe environments
        setTimeout(function() {{
            updateRankOrder('{dropdown_group_id}');
        }}, 1000);
        </script>
        </div>
        """
        return html
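
    # The hidden order state holds a JSON list of document indices sorted by their
    # selected rank (rank 1 first); save_ranking() inverts it into a per-document rank
    # list, e.g. order [2, 0, 1] -> rankings [2, 3, 1].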
    def save_ranking(order_json, sample_id):
        """Save the current ranking to results."""
        try:
            if not order_json or order_json == "[]":
                return "⚠️ Assign a rank to each document before submitting.", gr.update()

            order = json.loads(order_json)
            num_candidates = len(next(s["candidates"] for s in samples if s["id"] == sample_id))
            if len(order) != num_candidates:
                return f"⚠️ Ranking order length mismatch. Expected {num_candidates}, got {len(order)}.", gr.update()

            rankings = [0] * num_candidates
            for rank_minus_1, doc_idx in enumerate(order):
                if doc_idx < num_candidates:
                    rankings[doc_idx] = rank_minus_1 + 1
                else:
                    raise ValueError(f"Invalid document index {doc_idx} found in order.")

            if sorted(rankings) != list(range(1, num_candidates + 1)):
                return "⚠️ Ranking validation failed. Ranks are not 1 to N.", gr.update()

            annotation = {"sample_id": sample_id, "rankings": rankings}
            existing_idx = next((i for i, a in enumerate(results["annotations"]) if a["sample_id"] == sample_id), None)
            if existing_idx is not None:
                results["annotations"][existing_idx] = annotation
            else:
                results["annotations"].append(annotation)
            completed_samples[sample_id] = True

            output_path = f"{task_data['task_name']}_human_results.json"
            with open(output_path, "w") as f:
                json.dump(results, f, indent=2)

            return f"✅ Rankings saved successfully ({len(results['annotations'])}/{len(samples)} completed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
        except json.JSONDecodeError:
            return "⚠️ Error decoding ranking order. Please try again.", gr.update()
        except Exception as e:
            import traceback
            print(traceback.format_exc())
            return f"Error saving ranking: {str(e)}", gr.update()

    def load_sample(sample_id):
        """Load a sample into the interface."""
        try:
            sample = next((s for s in samples if s["id"] == sample_id), None)
            if not sample:
                # One update per output component: query, document list, order state, progress, status
                return gr.update(), gr.update(), gr.update(value="[]"), gr.update(), gr.update()

            existing_ranking = next((anno["rankings"] for anno in results["annotations"] if anno["sample_id"] == sample_id), None)
            new_html = generate_sortable_html(sample["candidates"], existing_ranking)
            status = "Ready to rank" if not completed_samples.get(sample_id, False) else "Already ranked"
            progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
            return sample["query"], new_html, "[]", progress, status
        except Exception as e:
            return gr.update(), gr.update(), gr.update(value="[]"), gr.update(), gr.update(value=f"Error loading sample: {str(e)}")

    def next_sample_id(current_id):
        current_idx = next((i for i, s in enumerate(samples) if s["id"] == current_id), -1)
        if current_idx == -1:
            return current_id
        next_idx = min(current_idx + 1, len(samples) - 1)
        return samples[next_idx]["id"]

    def prev_sample_id(current_id):
        current_idx = next((i for i, s in enumerate(samples) if s["id"] == current_id), -1)
        if current_idx == -1:
            return current_id
        prev_idx = max(current_idx - 1, 0)
        return samples[prev_idx]["id"]

    def save_results():
        output_path = f"{task_data['task_name']}_human_results.json"
        try:
            with open(output_path, "w") as f:
                json.dump(results, f, indent=2)
            return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
        except Exception as e:
            return f"⚠️ Error saving results file: {str(e)}"
    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown(f"# {task_data['task_name']} - Human Reranking Evaluation")

        with gr.Accordion("Instructions", open=True):
            gr.Markdown("""
            ## Task Instructions

            {instructions}

            ### How to use this interface:
            1. Read the query at the top
            2. Assign a rank to each document with its dropdown, based on relevance to the query
            3. Rank 1 = most relevant, Rank 2 = second most relevant, etc.
            4. Click "Submit Rankings" when you're done with the current query
            5. Use "Previous" and "Next" to navigate between queries
            6. Click "Save All Results" periodically to ensure your work is saved
            """.format(instructions=task_data["instructions"]))

        current_sample_id = gr.State(value=samples[0]["id"])

        with gr.Row():
            progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
            status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)

        with gr.Group():
            gr.Markdown("## Query:")
            query_text = gr.Textbox(value=samples[0]["query"], label="", interactive=False)
            gr.Markdown("## Documents to Rank (Select a Rank for Each):")
            sortable_list = gr.HTML(generate_sortable_html(samples[0]["candidates"], []), elem_id="sortable-list-container")
            order_state = gr.Textbox(value="[]", visible=False, elem_id="current-order")

        with gr.Row():
            prev_btn = gr.Button("← Previous Query", size="sm", elem_id="prev-btn")
            submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary", elem_id="submit-btn")
            next_btn = gr.Button("Next Query →", size="sm", elem_id="next-btn")

        save_btn = gr.Button("💾 Save All Results", variant="secondary")

        ranking_css = """
        <style>
        /* Simple dropdown ranking styles */
        .ranking-simple {
            width: 100%;
            max-width: 100%;
            margin: 0 auto;
        }
        .rank-instructions {
            margin-bottom: 15px;
            padding: 10px;
            background-color: #f0f9ff;
            border-left: 4px solid #3b82f6;
            border-radius: 4px;
        }
        .rank-item {
            display: flex;
            align-items: flex-start;
            padding: 12px;
            margin-bottom: 10px;
            background: white;
            border: 1px solid #e0e0e0;
            border-radius: 6px;
        }
        .rank-selector {
            margin-right: 15px;
            min-width: 70px;
        }
        .rank-dropdown {
            width: 60px;
            padding: 6px;
            border: 1px solid #d1d5db;
            border-radius: 4px;
            background-color: white;
            font-size: 14px;
        }
        .doc-content {
            flex: 1;
            line-height: 1.5;
            padding: 5px 0;
        }
        </style>
        """
        gr.HTML(ranking_css)

        submit_btn.click(
            save_ranking,
            inputs=[order_state, current_sample_id],
            outputs=[status_box, progress_text]
        )

        next_btn.click(
            next_sample_id, inputs=[current_sample_id], outputs=[current_sample_id]
        ).then(
            load_sample,
            inputs=[current_sample_id],
            outputs=[query_text, sortable_list, order_state, progress_text, status_box]
        )

        prev_btn.click(
            prev_sample_id, inputs=[current_sample_id], outputs=[current_sample_id]
        ).then(
            load_sample,
            inputs=[current_sample_id],
            outputs=[query_text, sortable_list, order_state, progress_text, status_box]
        )

        save_btn.click(save_results, outputs=[status_box])

        demo.load(lambda: load_sample(samples[0]["id"]),
                  outputs=[query_text, sortable_list, order_state, progress_text, status_box])

    return demo
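

# Standalone usage sketch (illustrative): build the reranking interface directly from a
# prepared task file instead of going through the tabbed app below.
#
#     with open("AskUbuntuDupQuestions_human_eval.json") as f:
#         create_reranking_interface(json.load(f)).launch()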

# Main app with file upload capability
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# MTEB Human Evaluation Demo")

    with gr.Tabs():
        with gr.TabItem("Demo"):
            gr.Markdown("""
            ## MTEB Human Evaluation Interface

            This interface allows you to evaluate the relevance of documents for reranking tasks.
            """)

            # Function to get the most recent task file
            def get_latest_task_file():
                # Check first in uploaded_tasks directory
                os.makedirs("uploaded_tasks", exist_ok=True)
                uploaded_tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
                if uploaded_tasks:
                    # Sort by modification time, newest first
                    uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
                    return os.path.join("uploaded_tasks", uploaded_tasks[0])
                # Fall back to default example
                return "AskUbuntuDupQuestions_human_eval.json"

            # Load the task file
            task_file = get_latest_task_file()

            try:
                with open(task_file, "r") as f:
                    task_data = json.load(f)

                # Show which task is currently loaded
                gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")

                # Display the interface
                reranking_demo = create_reranking_interface(task_data)
            except Exception as e:
                gr.Markdown(f"**Error loading task: {str(e)}**")
                gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")

        with gr.TabItem("Upload & Evaluate"):
            gr.Markdown("""
            ## Upload Your Own Task File

            If you have a prepared task file, you can upload it here to create an evaluation interface.
            """)

            with gr.Row():
                with gr.Column():
                    file_input = gr.File(label="Upload a task file (JSON)")
                    load_btn = gr.Button("Load Task")
                    message = gr.Textbox(label="Status", interactive=False)

                    # Add task list for previously uploaded tasks
                    gr.Markdown("### Previous Uploads")

                    # Function to list existing task files in the tasks directory
                    def list_task_files():
                        os.makedirs("uploaded_tasks", exist_ok=True)
                        tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
                        if not tasks:
                            return "No task files uploaded yet."
                        return "\n".join([f"- {t}" for t in tasks])

                    task_list = gr.Markdown(list_task_files())
                    refresh_btn = gr.Button("Refresh List")

                    # Add results management section
                    gr.Markdown("### Results Management")

                    # Function to list existing result files
                    def list_result_files():
                        results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
                        if not results:
                            return "No result files available yet."
                        result_links = []
                        for r in results:
                            # Calculate completion stats
                            try:
                                with open(r, "r") as f:
                                    result_data = json.load(f)
                                annotation_count = len(result_data.get("annotations", []))
                                task_name = result_data.get("task_name", "Unknown")
                                result_links.append(f"- {r} ({annotation_count} annotations for {task_name})")
                            except Exception:
                                result_links.append(f"- {r}")
                        return "\n".join(result_links)

                    results_list = gr.Markdown(list_result_files())
                    download_results_btn = gr.Button("Download Results")

                # Right side - will contain the actual interface
                with gr.Column():
                    task_container = gr.HTML()

            # Handle file upload and storage
            def handle_upload(file):
                if not file:
                    return "Please upload a task file", task_list.value, task_container.value
                try:
                    # Create directory if it doesn't exist
                    os.makedirs("uploaded_tasks", exist_ok=True)

                    # Read the uploaded file
                    with open(file.name, "r") as f:
                        task_data = json.load(f)

                    # Validate task format
                    if "task_name" not in task_data or "samples" not in task_data:
                        return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value

                    # Save to a consistent location
                    task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
                    with open(task_filename, "w") as f:
                        json.dump(task_data, f, indent=2)

                    # Instead of trying to create the interface here,
                    # we'll return a message with instructions
                    return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
                    <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
                        <h3>Task uploaded successfully!</h3>
                        <p>Task Name: {task_data['task_name']}</p>
                        <p>Samples: {len(task_data['samples'])}</p>
                        <p>To evaluate this task:</p>
                        <ol>
                            <li>Refresh the app</li>
                            <li>The Demo tab will now use your uploaded task</li>
                            <li>Complete your evaluations</li>
                            <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
                        </ol>
                    </div>
                    """
                except Exception as e:
                    return f"Error processing task file: {str(e)}", task_list.value, task_container.value

            # Function to prepare results for download
            def prepare_results_for_download():
                results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
                if not results:
                    return None

                # Create a zip file with all results
                import zipfile
                zip_path = "mteb_human_eval_results.zip"
                with zipfile.ZipFile(zip_path, 'w') as zipf:
                    for r in results:
                        zipf.write(r)
                return zip_path

            # Connect events
            load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
            refresh_btn.click(list_task_files, outputs=[task_list])
            download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])

        with gr.TabItem("Results Management"):
            gr.Markdown("""
            ## Manage Evaluation Results

            View, download, and analyze your evaluation results.
            """)

            # Function to load and display result stats
            def get_result_stats():
                results = [f for f in os.listdir(".") if f.endswith("_human_results.json")]
                if not results:
                    return "No result files available yet."
                stats = []
                for r in results:
                    try:
                        with open(r, "r") as f:
                            result_data = json.load(f)
                        task_name = result_data.get("task_name", "Unknown")
                        annotations = result_data.get("annotations", [])
                        annotation_count = len(annotations)

                        # Calculate completion percentage
                        sample_ids = set(a.get("sample_id") for a in annotations)

                        # Try to get the total sample count from the corresponding task file
                        total_samples = 0
                        task_file = f"uploaded_tasks/{task_name}_task.json"
                        if os.path.exists(task_file):
                            with open(task_file, "r") as f:
                                task_data = json.load(f)
                            total_samples = len(task_data.get("samples", []))

                        completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
                        stats.append(f"### {task_name}\n- Annotations: {annotation_count}\n- Completion: {completion}\n- File: {r}")
                    except Exception as e:
                        stats.append(f"### {r}\n- Error loading results: {str(e)}")
                return "\n\n".join(stats)

            result_stats = gr.Markdown(get_result_stats())
            refresh_results_btn = gr.Button("Refresh Results")

            # Add download options
            with gr.Row():
                with gr.Column():
                    download_all_btn = gr.Button("Download All Results (ZIP)")
                with gr.Column():
                    result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download", value=None)
                    download_selected_btn = gr.Button("Download Selected")

            # Add results visualization placeholder
            gr.Markdown("### Results Visualization")
            gr.Markdown("*Visualization features will be added in a future update.*")

            # Connect events
            refresh_results_btn.click(get_result_stats, outputs=[result_stats])

            # Function to prepare all results for download as ZIP
            def prepare_all_results():
                import zipfile
                zip_path = "mteb_human_eval_results.zip"
                with zipfile.ZipFile(zip_path, 'w') as zipf:
                    for r in [f for f in os.listdir(".") if f.endswith("_human_results.json")]:
                        zipf.write(r)
                return zip_path

            # Function to return a single result file
            def get_selected_result(filename):
                if not filename:
                    return None
                if os.path.exists(filename):
                    return filename
                return None

            # Update dropdown when refreshing results
            def update_result_dropdown():
                return gr.update(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")])

            refresh_results_btn.click(update_result_dropdown, outputs=[result_select])
            download_all_btn.click(prepare_all_results, outputs=[gr.File(label="Download All Results")])
            download_selected_btn.click(get_selected_result, inputs=[result_select], outputs=[gr.File(label="Download Selected Result")])

if __name__ == "__main__":
    demo.launch()