Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
|
|
2 |
import json
|
3 |
import os
|
4 |
from pathlib import Path
|
|
|
5 |
|
6 |
def create_reranking_interface(task_data):
|
7 |
"""Create a Gradio interface for reranking evaluation."""
|
@@ -9,10 +10,29 @@ def create_reranking_interface(task_data):
|
|
9 |
results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
|
10 |
completed_samples = {s["id"]: False for s in samples}
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
def save_ranking(rankings, sample_id):
|
13 |
"""Save the current set of rankings."""
|
14 |
try:
|
15 |
# Check if all documents have rankings
|
|
|
|
|
|
|
16 |
all_ranked = all(r is not None and r != "" for r in rankings)
|
17 |
if not all_ranked:
|
18 |
return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
@@ -42,17 +62,19 @@ def create_reranking_interface(task_data):
|
|
42 |
|
43 |
completed_samples[sample_id] = True
|
44 |
|
45 |
-
#
|
46 |
try:
|
47 |
output_path = f"{task_data['task_name']}_human_results.json"
|
48 |
with open(output_path, "w") as f:
|
49 |
json.dump(results, f, indent=2)
|
50 |
-
return f"✅ Rankings saved successfully
|
51 |
-
except:
|
52 |
# If file saving fails, still mark as success since we saved in memory
|
|
|
53 |
return f"✅ Rankings saved in memory (file save failed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
54 |
except Exception as e:
|
55 |
# Return specific error message
|
|
|
56 |
return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
57 |
|
58 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
@@ -71,14 +93,16 @@ def create_reranking_interface(task_data):
|
|
71 |
4. Each document must have a unique rank
|
72 |
5. Click "Submit Rankings" when you're done with the current query
|
73 |
6. Use "Previous" and "Next" to navigate between queries
|
74 |
-
7.
|
75 |
-
""".format(instructions=task_data
|
76 |
|
77 |
current_sample_id = gr.State(value=samples[0]["id"])
|
|
|
78 |
|
79 |
with gr.Row():
|
80 |
progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
|
81 |
status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
|
|
|
82 |
|
83 |
with gr.Group():
|
84 |
gr.Markdown("## Query:")
|
@@ -86,38 +110,73 @@ def create_reranking_interface(task_data):
|
|
86 |
|
87 |
gr.Markdown("## Documents to Rank:")
|
88 |
|
89 |
-
# Create document displays and ranking
|
90 |
doc_containers = []
|
91 |
-
|
|
|
92 |
|
93 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
for i, doc in enumerate(samples[0]["candidates"]):
|
95 |
with gr.Row():
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
with gr.Row():
|
110 |
prev_btn = gr.Button("← Previous Query", size="sm")
|
111 |
submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
|
112 |
next_btn = gr.Button("Next Query →", size="sm")
|
113 |
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
def load_sample(sample_id):
|
117 |
"""Load a specific sample into the interface."""
|
118 |
sample = next((s for s in samples if s["id"] == sample_id), None)
|
119 |
if not sample:
|
120 |
-
return [query_text.value] + [d.value for d in doc_containers] + [""] * len(
|
121 |
|
122 |
# Update query
|
123 |
new_query = sample["query"]
|
@@ -129,7 +188,7 @@ def create_reranking_interface(task_data):
|
|
129 |
new_docs.append(doc)
|
130 |
|
131 |
# Initialize rankings
|
132 |
-
new_rankings = [""] * len(
|
133 |
|
134 |
# Check if this sample has already been annotated
|
135 |
existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
|
@@ -147,7 +206,34 @@ def create_reranking_interface(task_data):
|
|
147 |
if completed_samples[sample_id]:
|
148 |
new_status += " (already completed)"
|
149 |
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
def next_sample(current_id):
|
153 |
"""Load the next sample."""
|
@@ -176,49 +262,120 @@ def create_reranking_interface(task_data):
|
|
176 |
def save_results():
|
177 |
"""Save all collected results to a file."""
|
178 |
output_path = f"{task_data['task_name']}_human_results.json"
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
|
183 |
-
#
|
184 |
-
def
|
185 |
-
|
186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
sample_id = args[-1]
|
188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
189 |
|
190 |
-
# Connect events
|
191 |
submit_btn.click(
|
192 |
-
|
193 |
-
inputs=
|
194 |
outputs=[status_box, progress_text]
|
195 |
)
|
196 |
|
|
|
197 |
next_btn.click(
|
198 |
-
|
199 |
-
inputs=[current_sample_id],
|
200 |
-
outputs=[current_sample_id]
|
201 |
).then(
|
202 |
load_sample,
|
203 |
inputs=[current_sample_id],
|
204 |
-
outputs=[query_text] + doc_containers +
|
205 |
)
|
206 |
|
207 |
prev_btn.click(
|
208 |
-
|
209 |
-
inputs=[current_sample_id],
|
210 |
-
outputs=[current_sample_id]
|
211 |
).then(
|
212 |
load_sample,
|
213 |
inputs=[current_sample_id],
|
214 |
-
outputs=[query_text] + doc_containers +
|
215 |
)
|
216 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
save_btn.click(save_results, outputs=[status_box])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
|
219 |
return demo
|
220 |
|
221 |
-
# Main app with file upload capability
|
222 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
223 |
gr.Markdown("# MTEB Human Evaluation Demo")
|
224 |
|
@@ -239,25 +396,49 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
239 |
if uploaded_tasks:
|
240 |
# Sort by modification time, newest first
|
241 |
uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
|
242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
|
244 |
-
#
|
245 |
-
return
|
246 |
|
247 |
# Load the task file
|
248 |
task_file = get_latest_task_file()
|
249 |
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
|
|
|
|
|
|
|
|
261 |
gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
|
262 |
|
263 |
with gr.TabItem("Upload & Evaluate"):
|
@@ -282,7 +463,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
282 |
tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
|
283 |
if not tasks:
|
284 |
return "No task files uploaded yet."
|
285 |
-
return "\n".join([f"-
|
286 |
|
287 |
task_list = gr.Markdown(list_task_files())
|
288 |
refresh_btn = gr.Button("Refresh List")
|
@@ -316,11 +497,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
316 |
# Right side - will contain the actual interface
|
317 |
with gr.Column(scale=2):
|
318 |
task_container = gr.HTML()
|
|
|
319 |
|
320 |
# Handle file upload and storage
|
321 |
def handle_upload(file):
|
322 |
if not file:
|
323 |
-
return "Please upload a task file", task_list.value, task_container.value
|
324 |
|
325 |
try:
|
326 |
# Create directory if it doesn't exist
|
@@ -332,15 +514,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
332 |
|
333 |
# Validate task format
|
334 |
if "task_name" not in task_data or "samples" not in task_data:
|
335 |
-
return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value
|
336 |
|
337 |
# Save to a consistent location
|
338 |
task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
|
339 |
with open(task_filename, "w") as f:
|
340 |
json.dump(task_data, f, indent=2)
|
341 |
|
342 |
-
#
|
343 |
-
|
|
|
|
|
|
|
|
|
|
|
344 |
return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
|
345 |
<div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
|
346 |
<h3>Task uploaded successfully!</h3>
|
@@ -354,9 +541,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
354 |
<li>Results will be saved as {task_data['task_name']}_human_results.json</li>
|
355 |
</ol>
|
356 |
</div>
|
357 |
-
"""
|
358 |
except Exception as e:
|
359 |
-
return f"Error processing task file: {str(e)}", task_list.value, task_container.value
|
360 |
|
361 |
# Function to prepare results for download
|
362 |
def prepare_results_for_download():
|
@@ -374,7 +561,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
374 |
return zip_path
|
375 |
|
376 |
# Connect events
|
377 |
-
load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
|
378 |
refresh_btn.click(list_task_files, outputs=[task_list])
|
379 |
download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
|
380 |
|
@@ -406,11 +593,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
406 |
|
407 |
# Try to get the total sample count from the corresponding task file
|
408 |
total_samples = 0
|
|
|
|
|
409 |
task_file = f"uploaded_tasks/{task_name}_task.json"
|
410 |
if os.path.exists(task_file):
|
411 |
with open(task_file, "r") as f:
|
412 |
task_data = json.load(f)
|
413 |
total_samples = len(task_data.get("samples", []))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
|
415 |
completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
|
416 |
|
@@ -429,10 +625,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
429 |
result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
|
430 |
download_selected_btn = gr.Button("Download Selected")
|
431 |
|
432 |
-
# Add results visualization placeholder
|
433 |
-
gr.Markdown("### Results Visualization")
|
434 |
-
gr.Markdown("*Visualization features will be added in a future update.*")
|
435 |
-
|
436 |
# Connect events
|
437 |
refresh_results_btn.click(get_result_stats, outputs=[result_stats])
|
438 |
|
|
|
2 |
import json
|
3 |
import os
|
4 |
from pathlib import Path
|
5 |
+
import time
|
6 |
|
7 |
def create_reranking_interface(task_data):
|
8 |
"""Create a Gradio interface for reranking evaluation."""
|
|
|
10 |
results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
|
11 |
completed_samples = {s["id"]: False for s in samples}
|
12 |
|
13 |
+
# Load existing results if available
|
14 |
+
# Restore annotations from a previous session, if a results file exists,
# and mark the corresponding samples as completed.
output_path = f"{task_data['task_name']}_human_results.json"
if os.path.exists(output_path):
    try:
        with open(output_path, "r") as f:
            saved_results = json.load(f)
        if "annotations" in saved_results:
            results["annotations"] = saved_results["annotations"]
            # Update completed_samples based on loaded data
            for annotation in saved_results["annotations"]:
                sample_id = annotation.get("sample_id")
                # Compare against None explicitly: a plain truthiness test
                # would skip legitimate falsy ids such as 0 or "".
                if sample_id is not None and sample_id in completed_samples:
                    completed_samples[sample_id] = True
    except Exception as e:
        # Best effort: a corrupt or unreadable results file must not
        # prevent the interface from being built.
        print(f"Error loading existing results: {e}")
|
28 |
+
|
29 |
def save_ranking(rankings, sample_id):
|
30 |
"""Save the current set of rankings."""
|
31 |
try:
|
32 |
# Check if all documents have rankings
|
33 |
+
if not rankings or len(rankings) == 0:
|
34 |
+
return "⚠️ No rankings provided", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
35 |
+
|
36 |
all_ranked = all(r is not None and r != "" for r in rankings)
|
37 |
if not all_ranked:
|
38 |
return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
|
|
62 |
|
63 |
completed_samples[sample_id] = True
|
64 |
|
65 |
+
# Always save to file for redundancy
|
66 |
try:
|
67 |
output_path = f"{task_data['task_name']}_human_results.json"
|
68 |
with open(output_path, "w") as f:
|
69 |
json.dump(results, f, indent=2)
|
70 |
+
return f"✅ Rankings saved successfully", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
71 |
+
except Exception as file_error:
|
72 |
# If file saving fails, still mark as success since we saved in memory
|
73 |
+
print(f"File save error: {file_error}")
|
74 |
return f"✅ Rankings saved in memory (file save failed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
75 |
except Exception as e:
|
76 |
# Return specific error message
|
77 |
+
print(f"Save ranking error: {e}")
|
78 |
return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
|
79 |
|
80 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
93 |
4. Each document must have a unique rank
|
94 |
5. Click "Submit Rankings" when you're done with the current query
|
95 |
6. Use "Previous" and "Next" to navigate between queries
|
96 |
+
7. Your rankings are automatically saved when you submit or navigate
|
97 |
+
""".format(instructions=task_data.get("instructions", "Rank documents by their relevance to the query.")))
|
98 |
|
99 |
current_sample_id = gr.State(value=samples[0]["id"])
|
100 |
+
current_state = gr.State(value={"auto_save_enabled": True, "last_saved": time.time()})
|
101 |
|
102 |
with gr.Row():
|
103 |
progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
|
104 |
status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
|
105 |
+
auto_save_toggle = gr.Checkbox(label="Auto-save when navigating", value=True)
|
106 |
|
107 |
with gr.Group():
|
108 |
gr.Markdown("## Query:")
|
|
|
110 |
|
111 |
gr.Markdown("## Documents to Rank:")
|
112 |
|
113 |
+
# Create document displays and ranking inputs in synchronized pairs
|
114 |
doc_containers = []
|
115 |
+
ranking_inputs = []
|
116 |
+
validation_indicators = []
|
117 |
|
118 |
with gr.Column():
|
119 |
+
# Quick ranking tools
|
120 |
+
with gr.Row():
|
121 |
+
gr.Markdown("### Quick Ranking Options:")
|
122 |
+
sequential_btn = gr.Button("Rank in Order (1,2,3...)")
|
123 |
+
reverse_btn = gr.Button("Reverse Order (n,n-1,...)")
|
124 |
+
clear_btn = gr.Button("Clear All Rankings")
|
125 |
+
|
126 |
+
# Document display with better UI for ranking
|
127 |
for i, doc in enumerate(samples[0]["candidates"]):
|
128 |
with gr.Row():
|
129 |
+
with gr.Column(scale=4):
|
130 |
+
doc_box = gr.Textbox(
|
131 |
+
value=doc,
|
132 |
+
label=f"Document {i+1}",
|
133 |
+
interactive=False
|
134 |
+
)
|
135 |
+
doc_containers.append(doc_box)
|
136 |
+
|
137 |
+
with gr.Column(scale=1):
|
138 |
+
# Use Radio buttons for ranking rather than dropdowns
|
139 |
+
# This provides a more visual and error-resistant interface
|
140 |
+
rank_input = gr.Radio(
|
141 |
+
choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
|
142 |
+
label=f"Rank",
|
143 |
+
value="",
|
144 |
+
interactive=True
|
145 |
+
)
|
146 |
+
ranking_inputs.append(rank_input)
|
147 |
+
|
148 |
+
# Add validation indicator
|
149 |
+
with gr.Column(scale=1, min_width=50):
|
150 |
+
validation = gr.HTML(value="")
|
151 |
+
validation_indicators.append(validation)
|
152 |
|
153 |
with gr.Row():
|
154 |
prev_btn = gr.Button("← Previous Query", size="sm")
|
155 |
submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
|
156 |
next_btn = gr.Button("Next Query →", size="sm")
|
157 |
|
158 |
+
with gr.Row():
|
159 |
+
save_btn = gr.Button("💾 Save All Results", variant="secondary")
|
160 |
+
results_info = gr.HTML(value=f"<p>Results will be saved to <code>{task_data['task_name']}_human_results.json</code></p>")
|
161 |
+
|
162 |
+
def validate_rankings(*rankings):
    """Build a status marker for every rank plus an overall validity flag.

    A rank that is ``None`` or the empty string is marked "⚠️"; any other
    value is marked "✓".

    Returns:
        A ``(markers, all_valid)`` tuple: ``markers`` holds one marker per
        supplied rank, and ``all_valid`` is True only when no rank is missing.
    """
    markers = ["⚠️" if rank is None or rank == "" else "✓" for rank in rankings]
    # One warning marker is enough to make the whole set invalid.
    return markers, "⚠️" not in markers
|
174 |
|
175 |
def load_sample(sample_id):
|
176 |
"""Load a specific sample into the interface."""
|
177 |
sample = next((s for s in samples if s["id"] == sample_id), None)
|
178 |
if not sample:
|
179 |
+
return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_inputs) + validation_indicators + [sample_id, progress_text.value, status_box.value]
|
180 |
|
181 |
# Update query
|
182 |
new_query = sample["query"]
|
|
|
188 |
new_docs.append(doc)
|
189 |
|
190 |
# Initialize rankings
|
191 |
+
new_rankings = [""] * len(ranking_inputs)
|
192 |
|
193 |
# Check if this sample has already been annotated
|
194 |
existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
|
|
|
206 |
if completed_samples[sample_id]:
|
207 |
new_status += " (already completed)"
|
208 |
|
209 |
+
# Initialize validation indicators
|
210 |
+
validation_results, _ = validate_rankings(*new_rankings)
|
211 |
+
|
212 |
+
return [new_query] + new_docs + new_rankings + validation_results + [sample_id, new_progress, new_status]
|
213 |
+
|
214 |
+
def auto_save_and_navigate(direction, current_id, auto_save, *rankings):
    """Optionally save the current rankings, then move to a neighbouring sample.

    Args:
        direction: "next" moves forward; any other value moves backward.
        current_id: Id of the sample currently shown.
        auto_save: When truthy, the rankings are saved first, but only if
            every rank has been filled in.
        *rankings: Current ranking values; only the first
            ``len(ranking_inputs)`` entries are used.

    Returns:
        A ``(new_sample_id, status_message, progress_message)`` tuple. The
        status message is empty when nothing was saved.
    """
    ranks = rankings[:len(ranking_inputs)]

    # Defaults used when no save takes place.
    status_msg = ""
    progress_msg = f"Progress: {sum(completed_samples.values())}/{len(samples)}"

    if auto_save:
        _, complete = validate_rankings(*ranks)
        if complete:
            status_msg, progress_msg = save_ranking(ranks, current_id)

    # Pick the navigation helper that matches the requested direction.
    if direction == "next":
        target_id = next_sample(current_id)
    else:
        target_id = prev_sample(current_id)

    return target_id, status_msg, progress_msg
|
237 |
|
238 |
def next_sample(current_id):
|
239 |
"""Load the next sample."""
|
|
|
262 |
def save_results():
    """Dump all collected annotations to the task's results JSON file.

    Returns:
        A status string: a success message naming the file and the number of
        annotations written, or an error message if anything raised.
    """
    output_path = f"{task_data['task_name']}_human_results.json"
    try:
        with open(output_path, "w") as handle:
            json.dump(results, handle, indent=2)
        # NOTE(review): this mutates the dict stored on the State component
        # object itself - confirm that is the intended place for the timestamp.
        current_state.value["last_saved"] = time.time()
        annotation_count = len(results['annotations'])
        return f"✅ Results saved to {output_path} ({annotation_count} annotations)"
    except Exception as err:
        return f"Error saving results: {str(err)}"
|
272 |
+
|
273 |
+
# Function to assign sequential ranks
|
274 |
+
def assign_sequential_ranks():
    """Return the rank strings "1", "2", ... - one per candidate of the first sample."""
    count = len(samples[0]["candidates"])
    return [str(position) for position in range(1, count + 1)]
|
276 |
|
277 |
+
# Function to assign reverse ranks
|
278 |
+
def assign_reverse_ranks():
    """Return the rank strings "n", "n-1", ..., "1" for the first sample's n candidates."""
    total = len(samples[0]["candidates"])
    return [str(rank) for rank in range(total, 0, -1)]
|
281 |
+
|
282 |
+
# Function to clear all rankings
|
283 |
+
def clear_rankings():
    """Return one empty-string rank per candidate of the first sample."""
    return [""] * len(samples[0]["candidates"])
|
285 |
+
|
286 |
+
# Define a function that collects all ranking values and validates them
|
287 |
+
def submit_rankings(*args):
    """Validate the submitted ranks and save them for the current sample.

    Args:
        *args: The value of every ranking input, followed by the current
            sample id as the final element.

    Returns:
        A ``(status, progress)`` pair of strings for the status and progress
        text boxes.
    """
    # Report live progress. ``progress_text.value`` only holds the value the
    # textbox was constructed with, so returning it would reset the display.
    progress = f"Progress: {sum(completed_samples.values())}/{len(samples)}"

    if len(args) < 1:
        return "Error: No arguments provided", progress

    # Verify we have enough rankings (all ranking inputs plus the sample id).
    if len(args) < len(ranking_inputs) + 1:
        return "Error: Not enough ranking inputs provided", progress

    sample_id = args[-1]
    rankings = args[:len(ranking_inputs)]

    # The validation indicators are not outputs of this handler, so they
    # cannot be refreshed from here; only the validity flag is needed.
    _, all_valid = validate_rankings(*rankings)
    if not all_valid:
        return "⚠️ Please assign a rank to all documents before submitting", progress

    # Save the validated rankings
    status, progress = save_ranking(rankings, sample_id)
    return status, progress
|
313 |
|
314 |
+
# Connect events - Direct input/output connections for reliability
|
315 |
submit_btn.click(
    submit_rankings,
    inputs=ranking_inputs + [current_sample_id],
    outputs=[status_box, progress_text]
)

# Apply auto-save before navigation if enabled.
# Event inputs must be components, so the navigation direction is bound in a
# small wrapper instead of being listed as a raw string in ``inputs``.
next_btn.click(
    lambda current_id, auto_save, *rankings: auto_save_and_navigate(
        "next", current_id, auto_save, *rankings
    ),
    inputs=[current_sample_id, auto_save_toggle] + ranking_inputs,
    outputs=[current_sample_id, status_box, progress_text]
).then(
    load_sample,
    inputs=[current_sample_id],
    outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
)

prev_btn.click(
    lambda current_id, auto_save, *rankings: auto_save_and_navigate(
        "prev", current_id, auto_save, *rankings
    ),
    inputs=[current_sample_id, auto_save_toggle] + ranking_inputs,
    outputs=[current_sample_id, status_box, progress_text]
).then(
    load_sample,
    inputs=[current_sample_id],
    outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
)

# Connect quick ranking buttons
sequential_btn.click(
    assign_sequential_ranks,
    outputs=ranking_inputs
)

reverse_btn.click(
    assign_reverse_ranks,
    outputs=ranking_inputs
)

clear_btn.click(
    clear_rankings,
    outputs=ranking_inputs
)

# Connect save button
save_btn.click(save_results, outputs=[status_box])

# Add validation on ranking changes.
# validate_rankings returns (markers, all_valid); only the markers map onto
# the indicator components, so the flag is dropped here rather than routed
# into a throwaway State created on every loop iteration.
for ranking in ranking_inputs:
    ranking.change(
        lambda *ranks: validate_rankings(*ranks)[0],
        inputs=ranking_inputs,
        outputs=validation_indicators
    )

# Set up auto-save feature.
# Merge into the existing state so other keys (e.g. "last_saved") survive
# the toggle instead of being replaced by a one-key dict.
auto_save_toggle.change(
    lambda enabled, state: {**(state or {}), "auto_save_enabled": enabled},
    inputs=[auto_save_toggle, current_state],
    outputs=[current_state]
)
|
375 |
|
376 |
return demo
|
377 |
|
378 |
+
# Main app with file upload capability and improved task management
|
379 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
380 |
gr.Markdown("# MTEB Human Evaluation Demo")
|
381 |
|
|
|
396 |
if uploaded_tasks:
|
397 |
# Sort by modification time, newest first
|
398 |
uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
|
399 |
+
task_path = os.path.join("uploaded_tasks", uploaded_tasks[0])
|
400 |
+
|
401 |
+
# Verify this is a valid task file
|
402 |
+
try:
|
403 |
+
with open(task_path, "r") as f:
|
404 |
+
task_data = json.load(f)
|
405 |
+
if "task_name" in task_data and "samples" in task_data:
|
406 |
+
return task_path
|
407 |
+
except:
|
408 |
+
pass
|
409 |
+
|
410 |
+
# Look for task files in the current directory
|
411 |
+
current_dir_tasks = [f for f in os.listdir(".") if f.endswith("_human_eval.json")]
|
412 |
+
if current_dir_tasks:
|
413 |
+
# Sort by modification time, newest first
|
414 |
+
current_dir_tasks.sort(key=lambda x: os.path.getmtime(x), reverse=True)
|
415 |
+
return current_dir_tasks[0]
|
416 |
+
|
417 |
+
# Fall back to fixed example if available
|
418 |
+
if os.path.exists("AskUbuntuDupQuestions_human_eval.json"):
|
419 |
+
return "AskUbuntuDupQuestions_human_eval.json"
|
420 |
|
421 |
+
# No valid task file found
|
422 |
+
return None
|
423 |
|
424 |
# Load the task file
|
425 |
task_file = get_latest_task_file()
|
426 |
|
427 |
+
if task_file:
|
428 |
+
try:
|
429 |
+
with open(task_file, "r") as f:
|
430 |
+
task_data = json.load(f)
|
431 |
+
|
432 |
+
# Show which task is currently loaded
|
433 |
+
gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
|
434 |
+
|
435 |
+
# Display the interface
|
436 |
+
reranking_demo = create_reranking_interface(task_data)
|
437 |
+
except Exception as e:
|
438 |
+
gr.Markdown(f"**Error loading task: {str(e)}**")
|
439 |
+
gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
|
440 |
+
else:
|
441 |
+
gr.Markdown("**No task file found**")
|
442 |
gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
|
443 |
|
444 |
with gr.TabItem("Upload & Evaluate"):
|
|
|
463 |
tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
|
464 |
if not tasks:
|
465 |
return "No task files uploaded yet."
|
466 |
+
return "\n".join([f"- {t}" for t in tasks])
|
467 |
|
468 |
task_list = gr.Markdown(list_task_files())
|
469 |
refresh_btn = gr.Button("Refresh List")
|
|
|
497 |
# Right side - will contain the actual interface
|
498 |
with gr.Column(scale=2):
|
499 |
task_container = gr.HTML()
|
500 |
+
loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)
|
501 |
|
502 |
# Handle file upload and storage
|
503 |
def handle_upload(file):
|
504 |
if not file:
|
505 |
+
return "Please upload a task file", task_list.value, task_container.value, loaded_task_info.value
|
506 |
|
507 |
try:
|
508 |
# Create directory if it doesn't exist
|
|
|
514 |
|
515 |
# Validate task format
|
516 |
if "task_name" not in task_data or "samples" not in task_data:
|
517 |
+
return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value, loaded_task_info.value
|
518 |
|
519 |
# Save to a consistent location
|
520 |
task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
|
521 |
with open(task_filename, "w") as f:
|
522 |
json.dump(task_data, f, indent=2)
|
523 |
|
524 |
+
# Show task info
|
525 |
+
task_info = {
|
526 |
+
"task_name": task_data["task_name"],
|
527 |
+
"samples": len(task_data["samples"]),
|
528 |
+
"file_path": task_filename
|
529 |
+
}
|
530 |
+
|
531 |
return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
|
532 |
<div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
|
533 |
<h3>Task uploaded successfully!</h3>
|
|
|
541 |
<li>Results will be saved as {task_data['task_name']}_human_results.json</li>
|
542 |
</ol>
|
543 |
</div>
|
544 |
+
""", task_info
|
545 |
except Exception as e:
|
546 |
+
return f"Error processing task file: {str(e)}", task_list.value, task_container.value, loaded_task_info.value
|
547 |
|
548 |
# Function to prepare results for download
|
549 |
def prepare_results_for_download():
|
|
|
561 |
return zip_path
|
562 |
|
563 |
# Connect events
|
564 |
+
load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container, loaded_task_info])
|
565 |
refresh_btn.click(list_task_files, outputs=[task_list])
|
566 |
download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
|
567 |
|
|
|
593 |
|
594 |
# Try to get the total sample count from the corresponding task file
|
595 |
total_samples = 0
|
596 |
+
|
597 |
+
# Try uploaded_tasks directory first
|
598 |
task_file = f"uploaded_tasks/{task_name}_task.json"
|
599 |
if os.path.exists(task_file):
|
600 |
with open(task_file, "r") as f:
|
601 |
task_data = json.load(f)
|
602 |
total_samples = len(task_data.get("samples", []))
|
603 |
+
else:
|
604 |
+
# Try human_eval file in current directory
|
605 |
+
task_file = f"{task_name}_human_eval.json"
|
606 |
+
if os.path.exists(task_file):
|
607 |
+
with open(task_file, "r") as f:
|
608 |
+
task_data = json.load(f)
|
609 |
+
total_samples = len(task_data.get("samples", []))
|
610 |
|
611 |
completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
|
612 |
|
|
|
625 |
result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
|
626 |
download_selected_btn = gr.Button("Download Selected")
|
627 |
|
|
|
|
|
|
|
|
|
628 |
# Connect events
|
629 |
refresh_results_btn.click(get_result_stats, outputs=[result_stats])
|
630 |
|