Spaces:
Running
on
Zero
Running
on
Zero
Refine vote options
Browse files
app.py
CHANGED
@@ -7,18 +7,56 @@ import time
|
|
7 |
import numpy as np
|
8 |
from utils.data_loader import get_random_example
|
9 |
from utils.models import generate_summaries, model_names
|
10 |
-
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
|
11 |
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
|
12 |
from utils.vote_logger import save_vote_details
|
13 |
from utils.shared import generation_interrupt
|
14 |
|
15 |
feedback_options = {
|
16 |
-
"left": [
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
}
|
23 |
|
24 |
def weighted_sample_without_replacement(population, weights, k=2):
|
@@ -248,7 +286,7 @@ def show_loading_state():
|
|
248 |
gr.update(visible=False), # feedback_section
|
249 |
gr.update(interactive=False), # submit_button
|
250 |
gr.update(visible=False), # results_reveal_area
|
251 |
-
gr.update(interactive=
|
252 |
None # Reset selected_winner
|
253 |
]
|
254 |
|
@@ -268,6 +306,15 @@ def update_ui_for_new_context(example):
|
|
268 |
False
|
269 |
]
|
270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
def cleanup_on_disconnect():
|
272 |
print(f"Browser disconnected. Cleaning up resources...")
|
273 |
generation_interrupt.set()
|
@@ -321,13 +368,14 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
321 |
show_results_state = gr.State(False)
|
322 |
results_agg = gr.State(load_leaderboard_data())
|
323 |
show_full_context = gr.State(False)
|
|
|
324 |
faq_expanded = gr.State(False) # State for FAQ toggle
|
325 |
|
326 |
with gr.Tabs() as tabs:
|
327 |
with gr.TabItem("Arena", id="arena-tab"):
|
328 |
-
gr.Markdown("#
|
329 |
gr.Markdown("""
|
330 |
-
🏟️ This arena evaluates how well
|
331 |
|
332 |
📝 Instructions:
|
333 |
- **Click the "Get a Question" button** to load a random question with context
|
@@ -372,6 +420,8 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
372 |
# Model comparison section - initially hidden
|
373 |
with gr.Column(visible=False, elem_id="model-section") as model_section:
|
374 |
gr.Markdown("---")
|
|
|
|
|
375 |
gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
|
376 |
|
377 |
with gr.Row(elem_id="summary-containers"):
|
@@ -395,6 +445,15 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
395 |
autoscroll=False,
|
396 |
elem_id="summary-b-display"
|
397 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
398 |
|
399 |
# Voting section - initially hidden
|
400 |
with gr.Column(visible=False, elem_id="voting-section") as voting_section:
|
@@ -431,7 +490,7 @@ with gr.Blocks(theme=gr.themes.Default(
|
|
431 |
|
432 |
with gr.TabItem("Leaderboard", id="leaderboard-tab"):
|
433 |
gr.Markdown("# SLM RAG Leaderboard", elem_classes="orange-title")
|
434 |
-
gr.
|
435 |
|
436 |
with gr.Group(elem_id="leaderboard-info"):
|
437 |
gr.Markdown("""### About Elo Ratings
|
@@ -458,6 +517,13 @@ The Elo rating system provides a more accurate ranking than simple win rates:
|
|
458 |
inputs=[current_example, show_full_context],
|
459 |
outputs=[show_full_context, context_display, context_toggle_btn]
|
460 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
461 |
|
462 |
# Initialize UI to empty state on load
|
463 |
demo.load(
|
@@ -497,6 +563,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
|
|
497 |
inputs=[current_example],
|
498 |
outputs=[query_display, context_description, context_display,
|
499 |
context_toggle_btn, show_full_context]
|
|
|
|
|
|
|
|
|
|
|
500 |
).then(
|
501 |
# IMPORTANT: Explicitly hide FAQ here
|
502 |
fn=hide_faq_section,
|
@@ -541,6 +612,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
|
|
541 |
inputs=[current_example],
|
542 |
outputs=[query_display, context_description, context_display,
|
543 |
context_toggle_btn, show_full_context]
|
|
|
|
|
|
|
|
|
|
|
544 |
).then(
|
545 |
# IMPORTANT: Explicitly hide FAQ here too
|
546 |
fn=hide_faq_section,
|
|
|
7 |
import numpy as np
|
8 |
from utils.data_loader import get_random_example
|
9 |
from utils.models import generate_summaries, model_names
|
10 |
+
from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html, toggle_reference_answer
|
11 |
from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
|
12 |
from utils.vote_logger import save_vote_details
|
13 |
from utils.shared import generation_interrupt
|
14 |
|
15 |
# Checkbox labels offered to the voter, keyed by the vote outcome
# ("left", "right", "tie" or "neither").
# NOTE(review): presumably consumed by update_feedback from utils.ui_helpers
# to populate the feedback checkboxes - confirm against the caller.
feedback_options = {
    # Model A chosen: positive statements about Model A only.
    "left": [
        "Model A: Answers the question completely",
        "Model A: Information is accurate and correct",
        "Model A: Stays on topic and relevant",
        "Model A: Clear and well-written response",
        "Model A: Appropriately says 'I don't know' without enough info",
        "Model A: Asks helpful follow-up questions when unclear",
    ],
    # Model B chosen: the same positive statements, about Model B.
    "right": [
        "Model B: Answers the question completely",
        "Model B: Information is accurate and correct",
        "Model B: Stays on topic and relevant",
        "Model B: Clear and well-written response",
        "Model B: Appropriately says 'I don't know' without enough info",
        "Model B: Asks helpful follow-up questions when unclear",
    ],
    # Tie: the "left" entries followed by the "right" entries.
    "tie": [
        "Model A: Answers the question completely",
        "Model A: Information is accurate and correct",
        "Model A: Stays on topic and relevant",
        "Model A: Clear and well-written response",
        "Model A: Appropriately says 'I don't know' without enough info",
        "Model A: Asks helpful follow-up questions when unclear",
        "Model B: Answers the question completely",
        "Model B: Information is accurate and correct",
        "Model B: Stays on topic and relevant",
        "Model B: Clear and well-written response",
        "Model B: Appropriately says 'I don't know' without enough info",
        "Model B: Asks helpful follow-up questions when unclear",
    ],
    # Neither: criticisms of Model A, then the same criticisms of Model B.
    "neither": [
        "Model A: Incomplete or missing key information",
        "Model A: Contains incorrect or made-up information",
        "Model A: Goes off-topic or irrelevant",
        "Model A: Poorly written or confusing",
        "Model A: Should have admitted uncertainty without enough info",
        "Model A: Should have asked clarifying questions but didn't",
        "Model B: Incomplete or missing key information",
        "Model B: Contains incorrect or made-up information",
        "Model B: Goes off-topic or irrelevant",
        "Model B: Poorly written or confusing",
        "Model B: Should have admitted uncertainty without enough info",
        "Model B: Should have asked clarifying questions but didn't",
    ],
}
|
61 |
|
62 |
def weighted_sample_without_replacement(population, weights, k=2):
|
|
|
286 |
gr.update(visible=False), # feedback_section
|
287 |
gr.update(interactive=False), # submit_button
|
288 |
gr.update(visible=False), # results_reveal_area
|
289 |
+
gr.update(interactive=True), # random_question_btn - KEEP ACTIVE during inference
|
290 |
None # Reset selected_winner
|
291 |
]
|
292 |
|
|
|
306 |
False
|
307 |
]
|
308 |
|
309 |
+
def reset_reference_section():
    """Return the values that put the reference-answer section back into its hidden state.

    Intended to run when a new question is loaded. The returned list is
    positional and must line up with the event's ``outputs``:
    ``[show_reference_answer, reference_content, reference_toggle_btn,
    reference_answer_display]``.

    Returns:
        list: the new ``show_reference_answer`` state (``False``) followed by
        three ``gr.update(...)`` payloads.
    """
    return [
        False,  # show_reference_answer: section is collapsed again
        gr.update(visible=False),  # reference_content: hide the answer row
        gr.update(value="▶ Show Reference Answer"),  # reference_toggle_btn: restore collapsed label
        gr.update(value=""),  # reference_answer_display: clear the previous question's answer
    ]
|
317 |
+
|
318 |
def cleanup_on_disconnect():
|
319 |
print(f"Browser disconnected. Cleaning up resources...")
|
320 |
generation_interrupt.set()
|
|
|
368 |
show_results_state = gr.State(False)
|
369 |
results_agg = gr.State(load_leaderboard_data())
|
370 |
show_full_context = gr.State(False)
|
371 |
+
show_reference_answer = gr.State(False) # NEW: State for reference answer toggle
|
372 |
faq_expanded = gr.State(False) # State for FAQ toggle
|
373 |
|
374 |
with gr.Tabs() as tabs:
|
375 |
with gr.TabItem("Arena", id="arena-tab"):
|
376 |
+
gr.Markdown("# SLM RAG Arena - Compare and Find The Best Sub-5B Models for RAG")
|
377 |
gr.Markdown("""
|
378 |
+
🏟️ This arena evaluates how well small language models (under 5B) answer questions based on document contexts.
|
379 |
|
380 |
📝 Instructions:
|
381 |
- **Click the "Get a Question" button** to load a random question with context
|
|
|
420 |
# Model comparison section - initially hidden
|
421 |
with gr.Column(visible=False, elem_id="model-section") as model_section:
|
422 |
gr.Markdown("---")
|
423 |
+
|
424 |
+
# NEW: Model comparison header (simple)
|
425 |
gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
|
426 |
|
427 |
with gr.Row(elem_id="summary-containers"):
|
|
|
445 |
autoscroll=False,
|
446 |
elem_id="summary-b-display"
|
447 |
)
|
448 |
+
|
449 |
+
|
450 |
+
# NEW: Reference Answer Toggle (exactly like FAQ style)
|
451 |
+
with gr.Row(elem_id="reference-toggle-row"):
|
452 |
+
reference_toggle_btn = gr.Button("▶ Show Reference Answer", elem_classes=["faq-toggle-button"])
|
453 |
+
|
454 |
+
# Reference Answer Content - initially hidden (exactly like FAQ)
|
455 |
+
with gr.Row(visible=False, elem_id="reference-content") as reference_content:
|
456 |
+
reference_answer_display = gr.Markdown("", elem_classes="faq-text")
|
457 |
|
458 |
# Voting section - initially hidden
|
459 |
with gr.Column(visible=False, elem_id="voting-section") as voting_section:
|
|
|
490 |
|
491 |
with gr.TabItem("Leaderboard", id="leaderboard-tab"):
|
492 |
gr.Markdown("# SLM RAG Leaderboard", elem_classes="orange-title")
|
493 |
+
gr.HTML('View performance statistics for all models ranked by Elo rating. <br><br><a href="https://docs.google.com/forms/d/e/1FAIpQLSeUZoy43MlpK8-tJS4a6n5Q8PAKf-8Twdui5ybU18t0e2UuVA/viewform" class="form-link" target="_blank" rel="noopener noreferrer">Submit a new model request</a>')
|
494 |
|
495 |
with gr.Group(elem_id="leaderboard-info"):
|
496 |
gr.Markdown("""### About Elo Ratings
|
|
|
517 |
inputs=[current_example, show_full_context],
|
518 |
outputs=[show_full_context, context_display, context_toggle_btn]
|
519 |
)
|
520 |
+
|
521 |
+
# NEW: Reference answer toggle functionality (exactly like FAQ)
|
522 |
+
reference_toggle_btn.click(
|
523 |
+
fn=toggle_reference_answer,
|
524 |
+
inputs=[show_reference_answer, current_example],
|
525 |
+
outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
|
526 |
+
)
|
527 |
|
528 |
# Initialize UI to empty state on load
|
529 |
demo.load(
|
|
|
563 |
inputs=[current_example],
|
564 |
outputs=[query_display, context_description, context_display,
|
565 |
context_toggle_btn, show_full_context]
|
566 |
+
).then(
|
567 |
+
# NEW: Reset reference section when loading new question
|
568 |
+
fn=reset_reference_section,
|
569 |
+
inputs=[],
|
570 |
+
outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
|
571 |
).then(
|
572 |
# IMPORTANT: Explicitly hide FAQ here
|
573 |
fn=hide_faq_section,
|
|
|
612 |
inputs=[current_example],
|
613 |
outputs=[query_display, context_description, context_display,
|
614 |
context_toggle_btn, show_full_context]
|
615 |
+
).then(
|
616 |
+
# NEW: Reset reference section when trying another question
|
617 |
+
fn=reset_reference_section,
|
618 |
+
inputs=[],
|
619 |
+
outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
|
620 |
).then(
|
621 |
# IMPORTANT: Explicitly hide FAQ here too
|
622 |
fn=hide_faq_section,
|