Spaces:

MaziyarPanahi
/

FACTS-Leaderboard

Running

App Files Community

MaziyarPanahi committed on May 28

Commit

2139f5c

1 Parent(s): 09aab35

refactor filters and style

Browse files

Files changed (1) hide show

app.py +265 -31

app.py CHANGED Viewed

@@ -37,7 +37,9 @@ def get_size_category(size):
 df["Size_Category"] = df["Size"].apply(get_size_category)
-def filter_and_search_models(search_query, size_ranges, sort_by):
     """Filter and search models based on user inputs"""
     filtered_df = df.copy()
@@ -52,6 +54,32 @@ def filter_and_search_models(search_query, size_ranges, sort_by):
     if size_ranges and len(size_ranges) > 0:
         filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]
     # Sort by selected metric
     if sort_by in filtered_df.columns:
         filtered_df = filtered_df.sort_values(sort_by, ascending=False)
@@ -139,37 +167,60 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
     with gr.Tabs():
         with gr.TabItem("Leaderboard"):
-            # Filters at the top
             with gr.Row():
-                with gr.Column(scale=2):
-                    search_box = gr.Textbox(
-                        label="Model Search",
-                        placeholder="Search for a model name...",
-                        value="",
                     )
-                with gr.Column(scale=1):
-                    sort_dropdown = gr.Dropdown(
                         choices=[
-                            "Combined Score",
-                            "Separate Grounding Score",
-                            "Separate Quality Score",
                         ],
-                        value="Combined Score",
-                        label="Sort by",
-                        elem_classes="sort-dropdown",
                     )
-            # Size filters in a row
-            with gr.Row():
-                gr.Markdown("**Filter by Model Size:**")
-                size_checkboxes = gr.CheckboxGroup(
-                    choices=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
-                    value=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
-                    label="",
-                    elem_classes="size-filter",
-                    container=False,
-                )
             # Model count
             total_models = gr.Markdown(f"**Showing {len(df)} models**")
@@ -181,6 +232,7 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
                         "",
                         ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
                         "Combined Score",
                     )
                 ),
                 elem_id="leaderboard-table",
@@ -219,7 +271,8 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
             ### Key Modifications:
             - **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
             - **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
-            - **Focus on Open Models**: Evaluates open-source models often missing from mainstream leaderboards for medical domain
             ### Why Medical Domain?
             Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
@@ -244,27 +297,33 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
             )
     # Update table when filters change
-    def update_table(search, sizes, sort_by):
-        filtered_df = filter_and_search_models(search, sizes, sort_by)
         model_count = f"**Showing {len(filtered_df)} models**"
         return create_html_table(filtered_df), model_count
     # Connect all inputs to the update function
     search_box.change(
         fn=update_table,
-        inputs=[search_box, size_checkboxes, sort_dropdown],
         outputs=[results_table, total_models],
     )
     size_checkboxes.change(
         fn=update_table,
-        inputs=[search_box, size_checkboxes, sort_dropdown],
         outputs=[results_table, total_models],
     )
     sort_dropdown.change(
         fn=update_table,
-        inputs=[search_box, size_checkboxes, sort_dropdown],
         outputs=[results_table, total_models],
     )
@@ -418,6 +477,181 @@ with gr.Blocks(title="FACTS Grounding Leaderboard", theme=gr.themes.Base()) as a
         border-color: #0d6efd !important;
         box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
     }
     """
 # Launch the app

 df["Size_Category"] = df["Size"].apply(get_size_category)
+def filter_and_search_models(
+    search_query, size_ranges, sort_by, architecture_filters=None
+):
     """Filter and search models based on user inputs"""
     filtered_df = df.copy()
     if size_ranges and len(size_ranges) > 0:
         filtered_df = filtered_df[filtered_df["Size_Category"].isin(size_ranges)]
+    # Apply architecture filter
+    if architecture_filters and len(architecture_filters) > 0:
+        architecture_mask = pd.Series(
+            [False] * len(filtered_df), index=filtered_df.index
+        )
+        for arch in architecture_filters:
+            if arch == "llama":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "meta-llama", case=False, na=False
+                )
+            elif arch == "deepseek":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "deepseek", case=False, na=False
+                )
+            elif arch == "qwen":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "Qwen", case=False, na=False
+                )
+            elif arch == "google":
+                architecture_mask |= filtered_df["Model Name"].str.contains(
+                    "google", case=False, na=False
+                )
+        filtered_df = filtered_df[architecture_mask]
     # Sort by selected metric
     if sort_by in filtered_df.columns:
         filtered_df = filtered_df.sort_values(sort_by, ascending=False)
     with gr.Tabs():
         with gr.TabItem("Leaderboard"):
+            # Top section with search and filters
             with gr.Row():
+                # Left side - All Filters
+                with gr.Column(scale=1):
+                    gr.Markdown("### 🎛️ **Filter & Sort Options**")
+                    # Sort dropdown with modern styling
+                    with gr.Row():
+                        sort_dropdown = gr.Dropdown(
+                            choices=[
+                                ("🏆 Combined Score", "Combined Score"),
+                                ("🎯 Grounding Score", "Separate Grounding Score"),
+                                ("📊 Quality Score", "Separate Quality Score"),
+                            ],
+                            value="Combined Score",
+                            label="Sort by Metric",
+                            elem_classes="sort-dropdown-modern",
+                            container=True,
+                        )
+                    # Size filters
+                    gr.Markdown("**📏 Filter by Model Size:**")
+                    size_checkboxes = gr.CheckboxGroup(
+                        choices=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
+                        value=["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
+                        label="",
+                        elem_classes="size-filter",
+                        container=False,
                     )
+                    # Model architecture filters
+                    gr.Markdown("**🏗️ Filter by Model Architecture:**")
+                    architecture_checkboxes = gr.CheckboxGroup(
                         choices=[
+                            ("🤖 DeepSeek", "deepseek"),
+                            ("🐧 Qwen", "qwen"),
+                            ("🦙 Llama", "llama"),
+                            ("🔷 Gemma", "google"),
                         ],
+                        value=["llama", "deepseek", "qwen", "google"],
+                        label="",
+                        elem_classes="architecture-filter",
+                        container=False,
                     )
+                # Right side - Search
+                with gr.Column(scale=1):
+                    gr.Markdown("### 🔍 **Search Models**")
+                    search_box = gr.Textbox(
+                        label="",
+                        placeholder="Search for a model name (e.g., Llama, Qwen, DeepSeek)...",
+                        value="",
+                        elem_classes="search-input",
+                    )
             # Model count
             total_models = gr.Markdown(f"**Showing {len(df)} models**")
                         "",
                         ["0-5B", "5-10B", "10-20B", "20-40B", "40-80B", ">80B"],
                         "Combined Score",
+                        ["llama", "deepseek", "qwen", "google"],
                     )
                 ),
                 elem_id="leaderboard-table",
             ### Key Modifications:
             - **Domain-Specific**: Uses only the 236 medical examples from the original 860-example dataset
             - **Single Judge Model**: Employs Gemini 1.5 Flash as the sole evaluator (vs. the original's ensemble of 3 models)
+            - **Focus on Accessibility**: Tests Qwen 3 1.7B, demonstrating that smaller models can be benchmarked on this important task
+            - **Streamlined Process**: Simplified evaluation pipeline suitable for resource-constrained environments
             ### Why Medical Domain?
             Medical information requires exceptional accuracy and grounding. By focusing on this domain, we can assess how well smaller models handle critical healthcare information while strictly adhering to provided sources—a crucial capability for safe medical AI applications.
             )
     # Update table when filters change
+    def update_table(search, sizes, sort_by, arch_filters):
+        filtered_df = filter_and_search_models(search, sizes, sort_by, arch_filters)
         model_count = f"**Showing {len(filtered_df)} models**"
         return create_html_table(filtered_df), model_count
     # Connect all inputs to the update function
     search_box.change(
         fn=update_table,
+        inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
         outputs=[results_table, total_models],
     )
     size_checkboxes.change(
         fn=update_table,
+        inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
         outputs=[results_table, total_models],
     )
     sort_dropdown.change(
         fn=update_table,
+        inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
+        outputs=[results_table, total_models],
+    )
+    architecture_checkboxes.change(
+        fn=update_table,
+        inputs=[search_box, size_checkboxes, sort_dropdown, architecture_checkboxes],
         outputs=[results_table, total_models],
     )
         border-color: #0d6efd !important;
         box-shadow: 0 2px 4px rgba(13, 110, 253, 0.2) !important;
     }
+    .architecture-filter {
+        margin-top: 10px;
+    }
+    .architecture-filter > div {
+        display: flex !important;
+        flex-wrap: wrap !important;
+        gap: 8px !important;
+        align-items: center !important;
+    }
+    .architecture-filter label {
+        display: flex !important;
+        align-items: center !important;
+        border-radius: 8px !important;
+        padding: 8px 12px !important;
+        margin: 0 !important;
+        cursor: pointer !important;
+        transition: all 0.2s ease !important;
+        font-weight: 500 !important;
+        font-size: 14px !important;
+        min-width: 140px !important;
+        justify-content: center !important;
+        border: 2px solid !important;
+    }
+    .architecture-filter label:hover {
+        transform: translateY(-1px);
+        box-shadow: 0 2px 8px rgba(0,0,0,0.1) !important;
+    }
+    .architecture-filter input[type="checkbox"] {
+        display: none !important;
+    }
+    /* Llama styling */
+    .architecture-filter label:nth-child(1) {
+        background: #fffbf0 !important;
+        border-color: #f7e6a3 !important;
+        color: #8b4513 !important;
+    }
+    .architecture-filter label:nth-child(1):has(input[type="checkbox"]:checked) {
+        background: #f4a261 !important;
+        border-color: #f4a261 !important;
+        color: white !important;
+        box-shadow: 0 2px 4px rgba(244, 162, 97, 0.3) !important;
+    }
+    /* DeepSeek styling */
+    .architecture-filter label:nth-child(2) {
+        background: #f0f8ff !important;
+        border-color: #b3d9ff !important;
+        color: #1e40af !important;
+    }
+    .architecture-filter label:nth-child(2):has(input[type="checkbox"]:checked) {
+        background: #3b82f6 !important;
+        border-color: #3b82f6 !important;
+        color: white !important;
+        box-shadow: 0 2px 4px rgba(59, 130, 246, 0.3) !important;
+    }
+    /* Qwen styling */
+    .architecture-filter label:nth-child(3) {
+        background: #f5fff5 !important;
+        border-color: #b3ffb3 !important;
+        color: #15803d !important;
+    }
+    .architecture-filter label:nth-child(3):has(input[type="checkbox"]:checked) {
+        background: #22c55e !important;
+        border-color: #22c55e !important;
+        color: white !important;
+        box-shadow: 0 2px 4px rgba(34, 197, 94, 0.3) !important;
+    }
+    /* Google styling */
+    .architecture-filter label:nth-child(4) {
+        background: #fff0f5 !important;
+        border-color: #ffb3d9 !important;
+        color: #be185d !important;
+    }
+    .architecture-filter label:nth-child(4):has(input[type="checkbox"]:checked) {
+        background: #ec4899 !important;
+        border-color: #ec4899 !important;
+        color: white !important;
+        box-shadow: 0 2px 4px rgba(236, 72, 153, 0.3) !important;
+    }
+    /* Search and Filter Section Styling */
+    .search-input input {
+        border: 2px solid #e9ecef !important;
+        border-radius: 12px !important;
+        padding: 12px 16px !important;
+        font-size: 14px !important;
+        transition: all 0.3s ease !important;
+        background: linear-gradient(135deg, #f8f9fa 0%, #ffffff 100%) !important;
+    }
+    .search-input input:focus {
+        border-color: #6366f1 !important;
+        box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.1) !important;
+        background: white !important;
+    }
+    .search-input input::placeholder {
+        color: #6b7280 !important;
+        font-style: italic !important;
+    }
+    /* Modern Sort Dropdown Styling */
+    .sort-dropdown-modern label {
+        font-weight: 600 !important;
+        color: #374151 !important;
+        margin-bottom: 8px !important;
+    }
+    .sort-dropdown-modern .wrap {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+        border-radius: 12px !important;
+        padding: 2px !important;
+        border: none !important;
+    }
+    .sort-dropdown-modern select {
+        background: white !important;
+        border: none !important;
+        border-radius: 10px !important;
+        padding: 12px 16px !important;
+        font-size: 14px !important;
+        font-weight: 500 !important;
+        color: #374151 !important;
+        cursor: pointer !important;
+        transition: all 0.3s ease !important;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
+    }
+    .sort-dropdown-modern select:hover {
+        box-shadow: 0 4px 8px rgba(0,0,0,0.15) !important;
+        transform: translateY(-1px) !important;
+    }
+    .sort-dropdown-modern select:focus {
+        outline: none !important;
+        box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.2) !important;
+    }
+    /* Section Headers */
+    h3 {
+        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+        -webkit-background-clip: text !important;
+        -webkit-text-fill-color: transparent !important;
+        background-clip: text !important;
+        margin-bottom: 12px !important;
+    }
+    /* Centered Architecture Section */
+    .centered-title {
+        text-align: center !important;
+    }
+    .centered-filter > div {
+        display: flex !important;
+        flex-wrap: wrap !important;
+        gap: 8px !important;
+        align-items: center !important;
+        justify-content: center !important;
+    }
+    .size-filter {
+        margin-top: 10px;
+    }
     """
 # Launch the app