AdnanElAssadi committed
Commit 77dbca6 · verified · 1 Parent(s): dbcd9e2

Update app.py

Files changed (1)
  1. app.py +261 -69
app.py CHANGED
@@ -2,6 +2,7 @@ import gradio as gr
 import json
 import os
 from pathlib import Path
+import time
 
 def create_reranking_interface(task_data):
     """Create a Gradio interface for reranking evaluation."""
@@ -9,10 +10,29 @@ def create_reranking_interface(task_data):
     results = {"task_name": task_data["task_name"], "task_type": "reranking", "annotations": []}
     completed_samples = {s["id"]: False for s in samples}
 
+    # Load existing results if available
+    output_path = f"{task_data['task_name']}_human_results.json"
+    if os.path.exists(output_path):
+        try:
+            with open(output_path, "r") as f:
+                saved_results = json.load(f)
+            if "annotations" in saved_results:
+                results["annotations"] = saved_results["annotations"]
+                # Update completed_samples based on loaded data
+                for annotation in saved_results["annotations"]:
+                    sample_id = annotation.get("sample_id")
+                    if sample_id and sample_id in completed_samples:
+                        completed_samples[sample_id] = True
+        except Exception as e:
+            print(f"Error loading existing results: {e}")
+
     def save_ranking(rankings, sample_id):
         """Save the current set of rankings."""
         try:
             # Check if all documents have rankings
+            if not rankings or len(rankings) == 0:
+                return "⚠️ No rankings provided", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
             all_ranked = all(r is not None and r != "" for r in rankings)
             if not all_ranked:
                 return "⚠️ Please assign a rank to all documents before submitting", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
@@ -42,17 +62,19 @@ def create_reranking_interface(task_data):
 
             completed_samples[sample_id] = True
 
-            # Try to save to file, but continue even if it fails
+            # Always save to file for redundancy
             try:
                 output_path = f"{task_data['task_name']}_human_results.json"
                 with open(output_path, "w") as f:
                     json.dump(results, f, indent=2)
-                return f"✅ Rankings saved successfully (in memory and to file)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
-            except:
+                return f"✅ Rankings saved successfully", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+            except Exception as file_error:
                 # If file saving fails, still mark as success since we saved in memory
+                print(f"File save error: {file_error}")
                 return f"✅ Rankings saved in memory (file save failed)", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
         except Exception as e:
             # Return specific error message
+            print(f"Save ranking error: {e}")
             return f"Error: {str(e)}", f"Progress: {sum(completed_samples.values())}/{len(samples)}"
 
     with gr.Blocks(theme=gr.themes.Soft()) as demo:
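
Note: the resume logic added above and the file writes in save_ranking share one results file per task. A minimal sketch of that file's shape follows, written as a Python literal; the top-level keys ("task_name", "task_type", "annotations") and the per-annotation "sample_id" are taken from this diff, while the "rankings" payload is an assumption, since the code that builds each annotation lies outside the changed hunks:

    # Hypothetical contents of <task_name>_human_results.json
    results = {
        "task_name": "AskUbuntuDupQuestions",
        "task_type": "reranking",
        "annotations": [
            # "rankings" is an assumed field name; only "sample_id" is read by the new code
            {"sample_id": "sample_1", "rankings": ["2", "1", "3"]},
        ],
    }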
@@ -71,14 +93,16 @@ def create_reranking_interface(task_data):
         4. Each document must have a unique rank
         5. Click "Submit Rankings" when you're done with the current query
         6. Use "Previous" and "Next" to navigate between queries
-        7. Click "Save All Results" periodically to ensure your work is saved
-        """.format(instructions=task_data["instructions"]))
+        7. Your rankings are automatically saved when you submit or navigate
+        """.format(instructions=task_data.get("instructions", "Rank documents by their relevance to the query.")))
 
         current_sample_id = gr.State(value=samples[0]["id"])
+        current_state = gr.State(value={"auto_save_enabled": True, "last_saved": time.time()})
 
         with gr.Row():
             progress_text = gr.Textbox(label="Progress", value=f"Progress: 0/{len(samples)}", interactive=False)
             status_box = gr.Textbox(label="Status", value="Ready to start evaluation", interactive=False)
+            auto_save_toggle = gr.Checkbox(label="Auto-save when navigating", value=True)
 
         with gr.Group():
             gr.Markdown("## Query:")
@@ -86,38 +110,73 @@ def create_reranking_interface(task_data):
 
         gr.Markdown("## Documents to Rank:")
 
-        # Create document displays and ranking dropdowns in synchronized pairs
+        # Create document displays and ranking inputs in synchronized pairs
         doc_containers = []
-        ranking_dropdowns = []
+        ranking_inputs = []
+        validation_indicators = []
 
         with gr.Column():
+            # Quick ranking tools
+            with gr.Row():
+                gr.Markdown("### Quick Ranking Options:")
+                sequential_btn = gr.Button("Rank in Order (1,2,3...)")
+                reverse_btn = gr.Button("Reverse Order (n,n-1,...)")
+                clear_btn = gr.Button("Clear All Rankings")
+
+            # Document display with better UI for ranking
             for i, doc in enumerate(samples[0]["candidates"]):
                 with gr.Row():
-                    doc_box = gr.Textbox(
-                        value=doc,
-                        label=f"Document {i+1}",
-                        interactive=False
-                    )
-                    dropdown = gr.Dropdown(
-                        choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
-                        label=f"Rank",
-                        value=""
-                    )
-                    doc_containers.append(doc_box)
-                    ranking_dropdowns.append(dropdown)
+                    with gr.Column(scale=4):
+                        doc_box = gr.Textbox(
+                            value=doc,
+                            label=f"Document {i+1}",
+                            interactive=False
+                        )
+                        doc_containers.append(doc_box)
+
+                    with gr.Column(scale=1):
+                        # Use Radio buttons for ranking rather than dropdowns
+                        # This provides a more visual and error-resistant interface
+                        rank_input = gr.Radio(
+                            choices=[str(j) for j in range(1, len(samples[0]["candidates"])+1)],
+                            label=f"Rank",
+                            value="",
+                            interactive=True
+                        )
+                        ranking_inputs.append(rank_input)
+
+                    # Add validation indicator
+                    with gr.Column(scale=1, min_width=50):
+                        validation = gr.HTML(value="")
+                        validation_indicators.append(validation)
 
         with gr.Row():
             prev_btn = gr.Button("← Previous Query", size="sm")
             submit_btn = gr.Button("Submit Rankings", size="lg", variant="primary")
             next_btn = gr.Button("Next Query →", size="sm")
 
-        save_btn = gr.Button("💾 Save All Results", variant="secondary")
+        with gr.Row():
+            save_btn = gr.Button("💾 Save All Results", variant="secondary")
+            results_info = gr.HTML(value=f"<p>Results will be saved to <code>{task_data['task_name']}_human_results.json</code></p>")
+
+        def validate_rankings(*rankings):
+            """Validate rankings and update indicators."""
+            results = []
+            all_valid = True
+            for rank in rankings:
+                if rank is None or rank == "":
+                    results.append("⚠️")
+                    all_valid = False
+                else:
+                    results.append("✓")
+
+            return results, all_valid
 
         def load_sample(sample_id):
             """Load a specific sample into the interface."""
             sample = next((s for s in samples if s["id"] == sample_id), None)
             if not sample:
-                return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_dropdowns) + [current_sample_id.value, progress_text.value, status_box.value]
+                return [query_text.value] + [d.value for d in doc_containers] + [""] * len(ranking_inputs) + validation_indicators + [sample_id, progress_text.value, status_box.value]
 
             # Update query
             new_query = sample["query"]
@@ -129,7 +188,7 @@ def create_reranking_interface(task_data):
                 new_docs.append(doc)
 
             # Initialize rankings
-            new_rankings = [""] * len(ranking_dropdowns)
+            new_rankings = [""] * len(ranking_inputs)
 
             # Check if this sample has already been annotated
             existing_annotation = next((a for a in results["annotations"] if a["sample_id"] == sample_id), None)
@@ -147,7 +206,34 @@ def create_reranking_interface(task_data):
             if completed_samples[sample_id]:
                 new_status += " (already completed)"
 
-            return [new_query] + new_docs + new_rankings + [sample["id"], new_progress, new_status]
+            # Initialize validation indicators
+            validation_results, _ = validate_rankings(*new_rankings)
+
+            return [new_query] + new_docs + new_rankings + validation_results + [sample_id, new_progress, new_status]
+
+        def auto_save_and_navigate(direction, current_id, auto_save, *rankings):
+            """Save rankings if auto-save is enabled, then navigate."""
+            # Extract rankings (remove validation indicators)
+            actual_rankings = rankings[:len(ranking_inputs)]
+
+            # If auto-save is enabled, try to save the current rankings
+            status_msg = ""
+            progress_msg = f"Progress: {sum(completed_samples.values())}/{len(samples)}"
+
+            if auto_save:
+                # Only save if all rankings are provided
+                validation_results, all_valid = validate_rankings(*actual_rankings)
+                if all_valid:
+                    status_msg, progress_msg = save_ranking(actual_rankings, current_id)
+
+            # Navigate to the next/previous sample
+            if direction == "next":
+                new_id = next_sample(current_id)
+            else:
+                new_id = prev_sample(current_id)
+
+            # Return the new sample ID and status message
+            return new_id, status_msg, progress_msg
 
         def next_sample(current_id):
             """Load the next sample."""
@@ -176,49 +262,120 @@ def create_reranking_interface(task_data):
         def save_results():
             """Save all collected results to a file."""
             output_path = f"{task_data['task_name']}_human_results.json"
-            with open(output_path, "w") as f:
-                json.dump(results, f, indent=2)
-            return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
+            try:
+                with open(output_path, "w") as f:
+                    json.dump(results, f, indent=2)
+                current_state.value["last_saved"] = time.time()
+                return f"✅ Results saved to {output_path} ({len(results['annotations'])} annotations)"
+            except Exception as e:
+                return f"Error saving results: {str(e)}"
+
+        # Function to assign sequential ranks
+        def assign_sequential_ranks():
+            return [str(i+1) for i in range(len(samples[0]["candidates"]))]
 
-        # Define a wrapper function that collects all the dropdown values into a list
-        def save_ranking_wrapper(*args):
-            # The last argument is the sample_id, all others are rankings
-            rankings = args[:-1]
+        # Function to assign reverse ranks
+        def assign_reverse_ranks():
+            n = len(samples[0]["candidates"])
+            return [str(n-i) for i in range(n)]
+
+        # Function to clear all rankings
+        def clear_rankings():
+            return ["" for _ in range(len(samples[0]["candidates"]))]
+
+        # Define a function that collects all ranking values and validates them
+        def submit_rankings(*args):
+            # Get the last argument (sample_id) and the rankings
+            if len(args) < 1:
+                return "Error: No arguments provided", progress_text.value
+
+            # Verify we have enough rankings
+            if len(args) < len(ranking_inputs) + 1:
+                return "Error: Not enough ranking inputs provided", progress_text.value
+
             sample_id = args[-1]
-            return save_ranking(rankings, sample_id)
+            rankings = args[:len(ranking_inputs)]
+
+            # First validate the rankings
+            validation_results, all_valid = validate_rankings(*rankings)
+
+            # Update validation indicators
+            for i, result in enumerate(validation_results):
+                validation_indicators[i].update(value=result)
+
+            # If not all valid, return error message
+            if not all_valid:
+                return "⚠️ Please assign a rank to all documents before submitting", progress_text.value
+
+            # Save the validated rankings
+            status, progress = save_ranking(rankings, sample_id)
+            return status, progress
 
-        # Connect events
+        # Connect events - Direct input/output connections for reliability
         submit_btn.click(
-            save_ranking_wrapper,
-            inputs=ranking_dropdowns + [current_sample_id],
+            submit_rankings,
+            inputs=ranking_inputs + [current_sample_id],
             outputs=[status_box, progress_text]
         )
 
+        # Apply auto-save before navigation if enabled
         next_btn.click(
-            next_sample,
-            inputs=[current_sample_id],
-            outputs=[current_sample_id]
+            auto_save_and_navigate,
+            inputs=["next", current_sample_id, auto_save_toggle] + ranking_inputs,
+            outputs=[current_sample_id, status_box, progress_text]
         ).then(
             load_sample,
             inputs=[current_sample_id],
-            outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
+            outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
         )
 
         prev_btn.click(
-            prev_sample,
-            inputs=[current_sample_id],
-            outputs=[current_sample_id]
+            auto_save_and_navigate,
+            inputs=["prev", current_sample_id, auto_save_toggle] + ranking_inputs,
+            outputs=[current_sample_id, status_box, progress_text]
        ).then(
             load_sample,
             inputs=[current_sample_id],
-            outputs=[query_text] + doc_containers + ranking_dropdowns + [current_sample_id, progress_text, status_box]
+            outputs=[query_text] + doc_containers + ranking_inputs + validation_indicators + [current_sample_id, progress_text, status_box]
         )
 
+        # Connect quick ranking buttons
+        sequential_btn.click(
+            assign_sequential_ranks,
+            outputs=ranking_inputs
+        )
+
+        reverse_btn.click(
+            assign_reverse_ranks,
+            outputs=ranking_inputs
+        )
+
+        clear_btn.click(
+            clear_rankings,
+            outputs=ranking_inputs
+        )
+
+        # Connect save button
         save_btn.click(save_results, outputs=[status_box])
+
+        # Add validation on ranking changes
+        for i, ranking in enumerate(ranking_inputs):
+            ranking.change(
+                validate_rankings,
+                inputs=ranking_inputs,
+                outputs=validation_indicators + [gr.State(value=None)]  # Add dummy output to match function return
+            )
+
+        # Set up auto-save feature
+        auto_save_toggle.change(
+            lambda x: {"auto_save_enabled": x},
+            inputs=[auto_save_toggle],
+            outputs=[current_state]
+        )
 
     return demo
 
-# Main app with file upload capability
+# Main app with file upload capability and improved task management
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# MTEB Human Evaluation Demo")
 
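
Note: the wiring in this hunk leans on Gradio's event chaining: .click() returns an event handle, and .then() registers a follow-up that runs after the first callback finishes, which is how "save, then reload the sample" is sequenced for next_btn and prev_btn. A minimal, self-contained sketch of the pattern (assumes gradio>=3.16, where .then() is available; all names are illustrative):

    import gradio as gr

    with gr.Blocks() as chain_demo:
        counter = gr.State(value=0)
        display = gr.Textbox(label="Value")
        step_btn = gr.Button("Step")
        # The first callback updates the state; .then() runs only afterwards,
        # mirroring the save-then-reload sequence used above.
        step_btn.click(lambda n: n + 1, inputs=counter, outputs=counter).then(
            lambda n: f"Counter is now {n}", inputs=counter, outputs=display
        )

    # chain_demo.launch()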
 
@@ -239,25 +396,49 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             if uploaded_tasks:
                 # Sort by modification time, newest first
                 uploaded_tasks.sort(key=lambda x: os.path.getmtime(os.path.join("uploaded_tasks", x)), reverse=True)
-                return os.path.join("uploaded_tasks", uploaded_tasks[0])
+                task_path = os.path.join("uploaded_tasks", uploaded_tasks[0])
+
+                # Verify this is a valid task file
+                try:
+                    with open(task_path, "r") as f:
+                        task_data = json.load(f)
+                    if "task_name" in task_data and "samples" in task_data:
+                        return task_path
+                except:
+                    pass
+
+            # Look for task files in the current directory
+            current_dir_tasks = [f for f in os.listdir(".") if f.endswith("_human_eval.json")]
+            if current_dir_tasks:
+                # Sort by modification time, newest first
+                current_dir_tasks.sort(key=lambda x: os.path.getmtime(x), reverse=True)
+                return current_dir_tasks[0]
 
-            # Fall back to default example
-            return "AskUbuntuDupQuestions_human_eval.json"
+            # Fall back to fixed example if available
+            if os.path.exists("AskUbuntuDupQuestions_human_eval.json"):
+                return "AskUbuntuDupQuestions_human_eval.json"
+
+            # No valid task file found
+            return None
 
         # Load the task file
         task_file = get_latest_task_file()
 
-        try:
-            with open(task_file, "r") as f:
-                task_data = json.load(f)
-
-            # Show which task is currently loaded
-            gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
-
-            # Display the interface
-            reranking_demo = create_reranking_interface(task_data)
-        except Exception as e:
-            gr.Markdown(f"**Error loading task: {str(e)}**")
+        if task_file:
+            try:
+                with open(task_file, "r") as f:
+                    task_data = json.load(f)
+
+                # Show which task is currently loaded
+                gr.Markdown(f"**Current Task: {task_data['task_name']}** ({len(task_data['samples'])} samples)")
+
+                # Display the interface
+                reranking_demo = create_reranking_interface(task_data)
+            except Exception as e:
+                gr.Markdown(f"**Error loading task: {str(e)}**")
+                gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
+        else:
+            gr.Markdown("**No task file found**")
             gr.Markdown("Please upload a valid task file in the 'Upload & Evaluate' tab.")
 
         with gr.TabItem("Upload & Evaluate"):
@@ -282,7 +463,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 tasks = [f for f in os.listdir("uploaded_tasks") if f.endswith(".json")]
                 if not tasks:
                     return "No task files uploaded yet."
-                return "\n".join([f"- [{t}](javascript:selectTask('{t}'))" for t in tasks])
+                return "\n".join([f"- {t}" for t in tasks])
 
             task_list = gr.Markdown(list_task_files())
             refresh_btn = gr.Button("Refresh List")
@@ -316,11 +497,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 # Right side - will contain the actual interface
                 with gr.Column(scale=2):
                     task_container = gr.HTML()
+                    loaded_task_info = gr.JSON(label="Loaded Task Information", visible=False)
 
             # Handle file upload and storage
             def handle_upload(file):
                 if not file:
-                    return "Please upload a task file", task_list.value, task_container.value
+                    return "Please upload a task file", task_list.value, task_container.value, loaded_task_info.value
 
                 try:
                     # Create directory if it doesn't exist
@@ -332,15 +514,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
                     # Validate task format
                     if "task_name" not in task_data or "samples" not in task_data:
-                        return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value
+                        return "Invalid task file format. Must contain 'task_name' and 'samples' fields.", task_list.value, task_container.value, loaded_task_info.value
 
                     # Save to a consistent location
                     task_filename = f"uploaded_tasks/{task_data['task_name']}_task.json"
                     with open(task_filename, "w") as f:
                         json.dump(task_data, f, indent=2)
 
-                    # Instead of trying to create the interface here,
-                    # we'll return a message with instructions
+                    # Show task info
+                    task_info = {
+                        "task_name": task_data["task_name"],
+                        "samples": len(task_data["samples"]),
+                        "file_path": task_filename
+                    }
+
                     return f"Task '{task_data['task_name']}' uploaded successfully with {len(task_data['samples'])} samples. Please refresh the app and use the Demo tab to evaluate it.", list_task_files(), f"""
                     <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px;">
                     <h3>Task uploaded successfully!</h3>
@@ -354,9 +541,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     <li>Results will be saved as {task_data['task_name']}_human_results.json</li>
                     </ol>
                     </div>
-                    """
+                    """, task_info
                 except Exception as e:
-                    return f"Error processing task file: {str(e)}", task_list.value, task_container.value
+                    return f"Error processing task file: {str(e)}", task_list.value, task_container.value, loaded_task_info.value
 
             # Function to prepare results for download
             def prepare_results_for_download():
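
Note: handle_upload only requires "task_name" and "samples" at the top level; the per-sample "id", "query", and "candidates" fields and the optional "instructions" key are what the reranking interface itself reads. A minimal task file that would pass this validation, sketched as a Python literal with illustrative values:

    # Hypothetical contents of an uploaded task file
    task_data = {
        "task_name": "MyRerankingTask",
        "instructions": "Rank each document by how well it answers the query.",
        "samples": [
            {
                "id": "sample_1",
                "query": "How do I install updates?",
                "candidates": [
                    "Run the package manager and apply all pending updates.",
                    "An unrelated document.",
                ],
            },
        ],
    }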
@@ -374,7 +561,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 return zip_path
 
             # Connect events
-            load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container])
+            load_btn.click(handle_upload, inputs=[file_input], outputs=[message, task_list, task_container, loaded_task_info])
             refresh_btn.click(list_task_files, outputs=[task_list])
             download_results_btn.click(prepare_results_for_download, outputs=[gr.File(label="Download Results")])
 
@@ -406,11 +593,20 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
                 # Try to get the total sample count from the corresponding task file
                 total_samples = 0
+
+                # Try uploaded_tasks directory first
                 task_file = f"uploaded_tasks/{task_name}_task.json"
                 if os.path.exists(task_file):
                     with open(task_file, "r") as f:
                         task_data = json.load(f)
                     total_samples = len(task_data.get("samples", []))
+                else:
+                    # Try human_eval file in current directory
+                    task_file = f"{task_name}_human_eval.json"
+                    if os.path.exists(task_file):
+                        with open(task_file, "r") as f:
+                            task_data = json.load(f)
+                        total_samples = len(task_data.get("samples", []))
 
                 completion = f"{len(sample_ids)}/{total_samples}" if total_samples else f"{len(sample_ids)} samples"
 
@@ -429,10 +625,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             result_select = gr.Dropdown(choices=[f for f in os.listdir(".") if f.endswith("_human_results.json")], label="Select Result to Download")
             download_selected_btn = gr.Button("Download Selected")
 
-            # Add results visualization placeholder
-            gr.Markdown("### Results Visualization")
-            gr.Markdown("*Visualization features will be added in a future update.*")
-
             # Connect events
             refresh_results_btn.click(get_result_stats, outputs=[result_stats])
 
 