aizip-dev committed · verified
Commit fcbb39d · Parent: 68e1082

Refine vote options

Files changed (1):
  1. app.py +87 -11
app.py CHANGED
@@ -7,18 +7,56 @@ import time
 import numpy as np
 from utils.data_loader import get_random_example
 from utils.models import generate_summaries, model_names
-from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html
+from utils.ui_helpers import toggle_context_display, update_feedback, get_context_html, toggle_reference_answer
 from utils.leaderboard import load_leaderboard_data, submit_vote_with_elo, generate_leaderboard_html
 from utils.vote_logger import save_vote_details
 from utils.shared import generation_interrupt
 
 feedback_options = {
-    "left": ["Model A: More complete", "Model A: More accurate", "Model A: More relevant", "Model A: Better written", "Model A: Better refusal (if applicable)"],
-    "right": ["Model B: More complete", "Model B: More accurate", "Model B: More relevant", "Model B: Better written", "Model B: Better refusal (if applicable)"],
-    "tie": ["Model A: Complete", "Model A: Accurate", "Model A: Relevant", "Model A: Well written", "Model A: Correct refusal (if applicable)",
-            "Model B: Complete", "Model B: Accurate", "Model B: Relevant", "Model B: Well written", "Model B: Corrent refusal (if applicable)"],
-    "neither": ["Model A: Incomplete", "Model A: Hallucinate", "Model A: Irrelevant", "Model A: Incorrect refusal (if applicable)",
-                "Model B: Incomplete", "Model B: Hallucinate", "Model B: Irrelevant", "Model B: Incorrect refusal (if applicable)"]
+    "left": [
+        "Model A: Answers the question completely",
+        "Model A: Information is accurate and correct",
+        "Model A: Stays on topic and relevant",
+        "Model A: Clear and well-written response",
+        "Model A: Appropriately says 'I don't know' without enough info",
+        "Model A: Asks helpful follow-up questions when unclear"
+    ],
+    "right": [
+        "Model B: Answers the question completely",
+        "Model B: Information is accurate and correct",
+        "Model B: Stays on topic and relevant",
+        "Model B: Clear and well-written response",
+        "Model B: Appropriately says 'I don't know' without enough info",
+        "Model B: Asks helpful follow-up questions when unclear"
+    ],
+    "tie": [
+        "Model A: Answers the question completely",
+        "Model A: Information is accurate and correct",
+        "Model A: Stays on topic and relevant",
+        "Model A: Clear and well-written response",
+        "Model A: Appropriately says 'I don't know' without enough info",
+        "Model A: Asks helpful follow-up questions when unclear",
+        "Model B: Answers the question completely",
+        "Model B: Information is accurate and correct",
+        "Model B: Stays on topic and relevant",
+        "Model B: Clear and well-written response",
+        "Model B: Appropriately says 'I don't know' without enough info",
+        "Model B: Asks helpful follow-up questions when unclear"
+    ],
+    "neither": [
+        "Model A: Incomplete or missing key information",
+        "Model A: Contains incorrect or made-up information",
+        "Model A: Goes off-topic or irrelevant",
+        "Model A: Poorly written or confusing",
+        "Model A: Should have admitted uncertainty without enough info",
+        "Model A: Should have asked clarifying questions but didn't",
+        "Model B: Incomplete or missing key information",
+        "Model B: Contains incorrect or made-up information",
+        "Model B: Goes off-topic or irrelevant",
+        "Model B: Poorly written or confusing",
+        "Model B: Should have admitted uncertainty without enough info",
+        "Model B: Should have asked clarifying questions but didn't"
+    ]
 }
 
 def weighted_sample_without_replacement(population, weights, k=2):
@@ -248,7 +286,7 @@ def show_loading_state():
         gr.update(visible=False), # feedback_section
         gr.update(interactive=False), # submit_button
         gr.update(visible=False), # results_reveal_area
-        gr.update(interactive=False), # random_question_btn
+        gr.update(interactive=True), # random_question_btn - KEEP ACTIVE during inference
         None # Reset selected_winner
     ]
@@ -268,6 +306,15 @@ def update_ui_for_new_context(example):
         False
     ]
 
+def reset_reference_section():
+    """Reset reference answer section to hidden state when loading new question"""
+    return [
+        False, # Reset show_reference_answer state to False
+        gr.update(visible=False), # Hide reference content (like FAQ)
+        gr.update(value="▶ Show Reference Answer"), # Reset button text (like FAQ)
+        gr.update(value="") # Clear reference content
+    ]
+
 def cleanup_on_disconnect():
     print(f"Browser disconnected. Cleaning up resources...")
     generation_interrupt.set()
@@ -321,13 +368,14 @@ with gr.Blocks(theme=gr.themes.Default(
     show_results_state = gr.State(False)
     results_agg = gr.State(load_leaderboard_data())
     show_full_context = gr.State(False)
+    show_reference_answer = gr.State(False) # NEW: State for reference answer toggle
     faq_expanded = gr.State(False) # State for FAQ toggle
 
     with gr.Tabs() as tabs:
         with gr.TabItem("Arena", id="arena-tab"):
-            gr.Markdown("# Small Language Model RAG Arena")
+            gr.Markdown("# SLM RAG Arena - Compare and Find The Best Sub-5B Models for RAG")
             gr.Markdown("""
-            🏟️ This arena evaluates how well SLMs (under 5B) answer questions based on document contexts.
+            🏟️ This arena evaluates how well small language models (under 5B) answer questions based on document contexts.
 
             📝 Instructions:
             - **Click the "Get a Question" button** to load a random question with context
@@ -372,6 +420,8 @@ with gr.Blocks(theme=gr.themes.Default(
             # Model comparison section - initially hidden
             with gr.Column(visible=False, elem_id="model-section") as model_section:
                 gr.Markdown("---")
+
+                # NEW: Model comparison header (simple)
                 gr.Markdown("### 🔍 Compare Models - Are these Grounded, Complete Answers or Correct Rejections?", elem_classes="section-heading")
 
                 with gr.Row(elem_id="summary-containers"):
@@ -395,6 +445,15 @@ with gr.Blocks(theme=gr.themes.Default(
                         autoscroll=False,
                         elem_id="summary-b-display"
                     )
+
+
+                # NEW: Reference Answer Toggle (exactly like FAQ style)
+                with gr.Row(elem_id="reference-toggle-row"):
+                    reference_toggle_btn = gr.Button("▶ Show Reference Answer", elem_classes=["faq-toggle-button"])
+
+                # Reference Answer Content - initially hidden (exactly like FAQ)
+                with gr.Row(visible=False, elem_id="reference-content") as reference_content:
+                    reference_answer_display = gr.Markdown("", elem_classes="faq-text")
 
             # Voting section - initially hidden
             with gr.Column(visible=False, elem_id="voting-section") as voting_section:
@@ -431,7 +490,7 @@ with gr.Blocks(theme=gr.themes.Default(
 
         with gr.TabItem("Leaderboard", id="leaderboard-tab"):
             gr.Markdown("# SLM RAG Leaderboard", elem_classes="orange-title")
-            gr.Markdown("View performance statistics for all models ranked by Elo rating.")
+            gr.HTML('View performance statistics for all models ranked by Elo rating. <br><br><a href="https://docs.google.com/forms/d/e/1FAIpQLSeUZoy43MlpK8-tJS4a6n5Q8PAKf-8Twdui5ybU18t0e2UuVA/viewform" class="form-link" target="_blank" rel="noopener noreferrer">Submit a new model request</a>')
 
             with gr.Group(elem_id="leaderboard-info"):
                 gr.Markdown("""### About Elo Ratings
@@ -458,6 +517,13 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[current_example, show_full_context],
         outputs=[show_full_context, context_display, context_toggle_btn]
     )
+
+    # NEW: Reference answer toggle functionality (exactly like FAQ)
+    reference_toggle_btn.click(
+        fn=toggle_reference_answer,
+        inputs=[show_reference_answer, current_example],
+        outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
+    )
 
     # Initialize UI to empty state on load
     demo.load(
@@ -497,6 +563,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[current_example],
         outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
+    ).then(
+        # NEW: Reset reference section when loading new question
+        fn=reset_reference_section,
+        inputs=[],
+        outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
     ).then(
         # IMPORTANT: Explicitly hide FAQ here
         fn=hide_faq_section,
@@ -541,6 +612,11 @@ The Elo rating system provides a more accurate ranking than simple win rates:
         inputs=[current_example],
         outputs=[query_display, context_description, context_display,
                  context_toggle_btn, show_full_context]
+    ).then(
+        # NEW: Reset reference section when trying another question
+        fn=reset_reference_section,
+        inputs=[],
+        outputs=[show_reference_answer, reference_content, reference_toggle_btn, reference_answer_display]
     ).then(
         # IMPORTANT: Explicitly hide FAQ here too
        fn=hide_faq_section,
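
The wiring above assumes a `toggle_reference_answer` helper in `utils/ui_helpers.py`, which this commit imports but does not include. Below is a minimal sketch of what it might look like, inferred from the `reference_toggle_btn.click(...)` call: the four-element return matches its `outputs` list, while the argument order, the "Answer" field name on the example dict, and the "▼ Hide" label are assumptions, not code from this repo.

    import gradio as gr

    # Hypothetical sketch of utils/ui_helpers.toggle_reference_answer — not part
    # of this commit. Signature inferred from the .click() wiring:
    #   inputs=[show_reference_answer, current_example]
    #   outputs=[show_reference_answer, reference_content,
    #            reference_toggle_btn, reference_answer_display]
    def toggle_reference_answer(is_showing, example):
        """Flip the reference-answer panel open/closed, mirroring the FAQ toggle."""
        if is_showing:
            # Collapse: hide the row, restore the "Show" label, clear the text
            return [
                False,
                gr.update(visible=False),
                gr.update(value="▶ Show Reference Answer"),
                gr.update(value=""),
            ]
        # Expand: reveal the row, swap the label, and fill in the reference answer
        answer = (example or {}).get("Answer", "_No reference answer available._")  # assumed field name
        return [
            True,
            gr.update(visible=True),
            gr.update(value="▼ Hide Reference Answer"),
            gr.update(value=answer),
        ]

Clearing the Markdown on collapse mirrors `reset_reference_section`, so a stale answer never flashes when the next question loads.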