aizip-dev committed · verified
Commit fcbb39d · Parent: 68e1082

Refine vote options

Files changed (1):
  1. app.py +87 -11
app.py CHANGED
@@ -7,18 +7,56 @@ import time
 import numpy as np
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
-from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
+from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html, toggle_reference_answer
 from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
 from utils.vote_logger import save_vote_details
 from utils.shared import generation_interrupt
 
 feedback_options = {
-    "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
-    "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
-    "tie": ["Model A: Complete", "Model A: Accurate", "Model A: Relevant", "Model A: Well written", "Model A: Correct refusal (if applicable)",
-            "Model B: Complete", "Model B: Accurate", "Model B: Relevant", "Model B: Well written", "Model B: Corrent refusal (if applicable)"],
-    "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
-                "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
+    "left": [
+        "Model A: Answers the question completely",
+        "Model A: Information is accurate and correct",
+        "Model A: Stays on topic and relevant",
+        "Model A: Clear and well-written response",
+        "Model A: Appropriately says 'I don't know' without enough info",
+        "Model A: Asks helpful follow-up questions when unclear"
+    ],
+    "right": [
+        "Model B: Answers the question completely",
+        "Model B: Information is accurate and correct",
+        "Model B: Stays on topic and relevant",
+        "Model B: Clear and well-written response",
+        "Model B: Appropriately says 'I don't know' without enough info",
+        "Model B: Asks helpful follow-up questions when unclear"
+    ],
+    "tie": [
+        "Model A: Answers the question completely",
+        "Model A: Information is accurate and correct",
+        "Model A: Stays on topic and relevant",
+        "Model A: Clear and well-written response",
+        "Model A: Appropriately says 'I don't know' without enough info",
+        "Model A: Asks helpful follow-up questions when unclear",
+        "Model B: Answers the question completely",
+        "Model B: Information is accurate and correct",
+        "Model B: Stays on topic and relevant",
+        "Model B: Clear and well-written response",
+        "Model B: Appropriately says 'I don't know' without enough info",
+        "Model B: Asks helpful follow-up questions when unclear"
+    ],
+    "neither": [
+        "Model A: Incomplete or missing key information",
+        "Model A: Contains incorrect or made-up information",
+        "Model A: Goes off-topic or irrelevant",
+        "Model A: Poorly written or confusing",
+        "Model A: Should have admitted uncertainty without enough info",
+        "Model A: Should have asked clarifying questions but didn't",
+        "Model B: Incomplete or missing key information",
+        "Model B: Contains incorrect or made-up information",
+        "Model B: Goes off-topic or irrelevant",
+        "Model B: Poorly written or confusing",
+        "Model B: Should have admitted uncertainty without enough info",
+        "Model B: Should have asked clarifying questions but didn't"
+    ]
 }
 
 def weighted_sample_without_replacement(population, weights, k=2):
@@ -248,7 +286,7 @@ def show_loading_state():
         gr.update(visible=False), # feedback_section
         gr.update(interactive=False), # submit_button
         gr.update(visible=False), # results_reveal_area
-        gr.update(interactive=False), # random_question_btn
+        gr.update(interactive=True), # random_question_btn - KEEP ACTIVE during inference
         None # Reset selected_winner
     ]
@@ -268,6 +306,15 @@ def update_ui_for_new_context(example):
         False
     ]
 
+def reset_reference_section():
+    """Reset reference answer section to hidden state when loading new question"""
+    return [
+        False, # Reset show_reference_answer state to False
+        gr.update(visible=False), # Hide reference content (like FAQ)
+        gr.update(value="▶ Show Reference Answer"), # Reset button text (like FAQ)
+        gr.update(value="") # Clear reference content
+    ]
+
 def cleanup_on_disconnect():
     print(f"Browser disconnected. Cleaning up resources...")
     generation_interrupt.set()
@@ -321,13 +368,14 @@ with gr.Blocks(theme=gr.themes.Default(
     show_results_state = gr.State(False)
     results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
+    show_reference_answer = gr.State(False) # NEW: State for reference answer toggle
     faq_expanded = gr.State(False) # State for FAQ toggle
 
     with gr.Tabs() as tabs:
         with gr.TabItem("Arena", id="arena-tab"):
-            gr.Markdown("# Small Language Model RAG Arena")
+            gr.Markdown("# SLM RAG Arena - Compare and Find The Best Sub-5B Models for RAG")
             gr.Markdown("""
-            🏟️ This arena evaluates how well SLMs (under 5B) answer questions based on document contexts.
+            🏟️ This arena evaluates how well small language models (under 5B) answer questions based on document contexts.
 
             📝 Instructions:
             - **Click the "Get a Question" button** to load a random question with context
@@ -372,6 +420,8 @@ with gr.Blocks(theme=gr.themes.Default(
             # Model comparison section - initially hidden
             with gr.Column(visible=False, elem_id="model-section") as model_section:
                 gr.Markdown("---")
+
+                # NEW: Model comparison header (simple)
                 gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
 
                 with gr.Row(elem_id="summary-containers"):
@@ -395,6 +445,15 @@ with gr.Blocks(theme=gr.themes.Default(
                         autoscroll=False,
                         elem_id="summary-b-display"
                     )
+
+
+                # NEW: Reference Answer Toggle (exactly like FAQ style)
+                with gr.Row(elem_id="reference-toggle-row"):
+                    reference_toggle_btn = gr.Button("▶ Show Reference Answer", elem_classes=["faq-toggle-button"])
+
+                # Reference Answer Content - initially hidden (exactly like FAQ)
+                with gr.Row(visible=False, elem_id="reference-content") as reference_content:
+                    reference_answer_display = gr.Markdown("", elem_classes="faq-text")
 
             # Voting section - initially hidden
             with gr.Column(visible=False, elem_id="voting-section") as voting_section:
@@ -431,7 +490,7 @@ with gr.Blocks(theme=gr.themes.Default(
 
         with gr.TabItem("Leaderboard", id="leaderboard-tab"):
             gr.Markdown("# SLM RAG Leaderboard", elem_classes="orange-title")
-            gr.Markdown("View performance statistics for all models ranked by Elo rating.")
+            gr.HTML('View performance statistics for all models ranked by Elo rating. <br><br><a href="https://docs.google.com/forms/d/e/1FAIpQLSeUZoy43MlpK8-tJS4a6n5Q8PAKf-8Twdui5ybU18t0e2UuVA/viewform" class="form-link" target="_blank" rel="noopener noreferrer">Submit a new model request</a>')
 
             with gr.Group(elem_id="leaderboard-info"):
                 gr.Markdown("""### About Elo Ratings
@@ -458,6 +517,13 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )
+
+    # NEW: Reference answer toggle functionality (exactly like FAQ)
+    reference_toggle_btn.click(
+        fn=toggle_reference_answer,
+        inputs=[show_reference_answer, current_example],
+        outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
+    )
 
     # Initialize UI to empty state on load
     demo.load(
@@ -497,6 +563,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[current_example],
         outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
+    ).then(
+        # NEW: Reset reference section when loading new question
+        fn=reset_reference_section,
+        inputs=[],
+        outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
     ).then(
         # IMPORTANT: Explicitly hide FAQ here
         fn=hide_faq_section,
@@ -541,6 +612,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[current_example],
         outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
+    ).then(
+        # NEW: Reset reference section when trying another question
+        fn=reset_reference_section,
+        inputs=[],
+        outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
     ).then(
         # IMPORTANT: Explicitly hide FAQ here too
        fn=hide_faq_section,
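
The wiring above assumes a `toggle_reference_answer` helper in `utils/ui_helpers.py`, which this commit imports but does not include. Below is a minimal sketch of what it might look like, inferred from the `reference_toggle_btn.click(...)` call: the four-element return matches its `outputs` list, while the argument order, the "Answer" field name on the example dict, and the "▼ Hide" label are assumptions, not code from this repo.

    import gradio as gr

    # Hypothetical sketch of utils/ui_helpers.toggle_reference_answer — not part
    # of this commit. Signature inferred from the .click() wiring:
    #   inputs=[show_reference_answer, current_example]
    #   outputs=[show_reference_answer, reference_content,
    #            reference_toggle_btn, reference_answer_display]
    def toggle_reference_answer(is_showing, example):
        """Flip the reference-answer panel open/closed, mirroring the FAQ toggle."""
        if is_showing:
            # Collapse: hide the row, restore the "Show" label, clear the text
            return [
                False,
                gr.update(visible=False),
                gr.update(value="▶ Show Reference Answer"),
                gr.update(value=""),
            ]
        # Expand: reveal the row, swap the label, and fill in the reference answer
        answer = (example or {}).get("Answer", "_No reference answer available._")  # assumed field name
        return [
            True,
            gr.update(visible=True),
            gr.update(value="▼ Hide Reference Answer"),
            gr.update(value=answer),
        ]

Clearing the Markdown on collapse mirrors `reset_reference_section`, so a stale answer never flashes when the next question loads.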